Commit 5c70ef66 authored by dugupeiwen

update roc

parent 1fb0017a
"""
An HSA dGPU-backed ndarray is recognized by checking the __hsa_memory__
attribute on the object. If it exists and evaluates to True, the object must
define shape, strides, dtype and size attributes similar to a NumPy ndarray.
"""
import warnings
import math
import copy
import weakref
from ctypes import c_void_p
import numpy as np
from numba.roc.hsadrv import driver as _driver
from numba.roc.hsadrv import devices
from numba.core import types
from .error import HsaContextMismatchError
from numba.misc import dummyarray
from numba.np import numpy_support
def is_hsa_ndarray(obj):
"Check if an object is a HSA ndarray"
return getattr(obj, '__hsa_ndarray__', False)
def verify_hsa_ndarray_interface(obj):
"Verify the HSA ndarray interface for an obj"
require_hsa_ndarray(obj)
def requires_attr(attr, typ):
if not hasattr(obj, attr):
raise AttributeError(attr)
if not isinstance(getattr(obj, attr), typ):
raise AttributeError('%s must be of type %s' % (attr, typ))
requires_attr('shape', tuple)
requires_attr('strides', tuple)
requires_attr('dtype', np.dtype)
requires_attr('size', int)
def require_hsa_ndarray(obj):
"Raises ValueError if is_hsa_ndarray(obj) evaluates False"
if not is_hsa_ndarray(obj):
raise ValueError('require an hsa ndarray object')
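# Illustrative sketch (the class below is hypothetical, not part of this
# module): any object that carries the __hsa_ndarray__ marker and NumPy-like
# metadata satisfies the duck-typed interface checked above.
def _example_hsa_ndarray_protocol():
    class FakeDeviceArray(object):
        __hsa_ndarray__ = True
        __hsa_memory__ = True
        shape = (4,)
        strides = (8,)
        dtype = np.dtype(np.float64)
        size = 4
    obj = FakeDeviceArray()
    require_hsa_ndarray(obj)           # passes: marker evaluates to True
    verify_hsa_ndarray_interface(obj)  # passes: all attributes type-check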
class DeviceNDArrayBase(object):
"""Base class for an on dGPU NDArray representation cf. numpy.ndarray
"""
__hsa_memory__ = True
__hsa_ndarray__ = True # As a result, a dgpu_data attribute must be present
def __init__(self, shape, strides, dtype, dgpu_data=None):
"""
Args
----
shape
array shape.
strides
array strides.
dtype
data type as numpy.dtype.
dgpu_data
user provided device memory for the ndarray data buffer
"""
if isinstance(shape, int):
shape = (shape,)
if isinstance(strides, int):
strides = (strides,)
self.ndim = len(shape)
if len(strides) != self.ndim:
raise ValueError('strides do not match ndim')
self._dummy = dummyarray.Array.from_desc(0, shape, strides,
dtype.itemsize)
self.shape = tuple(shape)
self.strides = tuple(strides)
self.dtype = np.dtype(dtype)
self.size = int(np.prod(self.shape))
# prepare dgpu memory
if self.size > 0:
if dgpu_data is None:
from numba.roc.api import _memory_size_from_info
self.alloc_size = _memory_size_from_info(self.shape,
self.strides, self.dtype.itemsize)
# find a coarse region on the dGPU
dgpu_data = devices.get_context().mempoolalloc(self.alloc_size)
else: # we have some preallocated dgpu_memory
sz = getattr(dgpu_data, '_hsa_memsize_', None)
if sz is None:
raise ValueError('dgpu_data has no _hsa_memsize_ attribute')
assert sz >= 0
self.alloc_size = sz
else:
dgpu_data = None
self.alloc_size = 0
self.dgpu_data = dgpu_data
@property
def _context(self):
return self.dgpu_data.context
@property
def _numba_type_(self):
"""
Magic attribute expected by Numba to get the numba type that
represents this object.
"""
dtype = numpy_support.from_dtype(self.dtype)
return types.Array(dtype, self.ndim, 'A')
@property
def device_ctypes_pointer(self):
"""Returns the ctypes pointer to the GPU data buffer
"""
if self.dgpu_data is None:
return c_void_p(0)
else:
return self.dgpu_data.device_ctypes_pointer
def copy_to_device(self, ary, stream=None, context=None):
"""Copy `ary` to `self`.
If `ary` is an HSA memory object, perform a device-to-device transfer.
Otherwise, perform a host-to-device transfer.
If `stream` is a stream object, an async copy is used.
"""
if ary.size == 0:
# Nothing to do
return
if context is not None:
if self.dgpu_data is not None:
expect, got = self._context, context
if expect.unproxy != got.unproxy:
raise HsaContextMismatchError(expect=expect, got=got)
else:
context = self._context
# TODO: Worry about multiple dGPUs
#if _driver.is_device_memory(ary):
# sz = min(self.alloc_size, ary.alloc_size)
# _driver.device_to_device(self, ary, sz)
#else:
# sz = min(_driver.host_memory_size(ary), self.alloc_size)
sz = self.alloc_size
# host_to_dGPU(context, dst, src, size):
if stream is None:
_driver.hsa.implicit_sync()
if isinstance(ary, DeviceNDArray):
_driver.dGPU_to_dGPU(self._context, self, ary, sz)
else:
_driver.host_to_dGPU(self._context, self, ary, sz)
else:
if isinstance(ary, DeviceNDArray):
_driver.async_dGPU_to_dGPU(dst_ctx=self._context,
src_ctx=ary._context,
dst=self, src=ary, size=sz,
stream=stream)
else:
_driver.async_host_to_dGPU(dst_ctx=self._context,
src_ctx=devices.get_cpu_context(),
dst=self, src=ary, size=sz,
stream=stream)
def copy_to_host(self, ary=None, stream=None):
"""Copy ``self`` to ``ary`` or create a new Numpy ndarray
if ``ary`` is ``None``.
The transfer is synchronous: the function returns after the copy
is finished.
Always returns the host array.
Example::
import numpy as np
from numba import hsa
arr = np.arange(1000)
d_arr = hsa.to_device(arr)
my_kernel[100, 100](d_arr)
result_array = d_arr.copy_to_host()
"""
if ary is None: # destination does not exist
hostary = np.empty(shape=self.alloc_size, dtype=np.byte)
else: # destination does exist, it's `ary`, check it
if ary.dtype != self.dtype:
raise TypeError('incompatible dtype')
if ary.shape != self.shape:
scalshapes = (), (1,)
if not (ary.shape in scalshapes and self.shape in scalshapes):
raise TypeError('incompatible shape; device %s; host %s' %
(self.shape, ary.shape))
if ary.strides != self.strides:
scalstrides = (), (self.dtype.itemsize,)
if not (ary.strides in scalstrides and
self.strides in scalstrides):
raise TypeError('incompatible strides; device %s; host %s' %
(self.strides, ary.strides))
hostary = ary # this is supposed to be a ptr for writing
# a location for the data exists as `hostary`
assert self.alloc_size >= 0, "Negative memory size"
context = self._context
# copy the data from the device to the hostary
if self.alloc_size != 0:
sz = self.alloc_size
if stream is None:
_driver.hsa.implicit_sync()
_driver.dGPU_to_host(context, hostary, self, sz)
else:
_driver.async_dGPU_to_host(dst_ctx=devices.get_cpu_context(),
src_ctx=self._context,
dst=hostary, src=self,
size=sz, stream=stream)
# if the location for the data was originally None
# then create a new ndarray and plumb in the new memory
if ary is None:
if self.size == 0:
hostary = np.ndarray(shape=self.shape, dtype=self.dtype,
buffer=hostary)
else:
hostary = np.ndarray(shape=self.shape, dtype=self.dtype,
strides=self.strides, buffer=hostary)
else: # else hostary points to ary and now has the right memory
hostary = ary
return hostary
def as_hsa_arg(self):
"""Returns a device memory object that is used as the argument.
"""
return self.dgpu_data
class DeviceNDArray(DeviceNDArrayBase):
'''
An on-dGPU array type
'''
def is_f_contiguous(self):
'''
Return true if the array is Fortran-contiguous.
'''
return self._dummy.is_f_contig
def is_c_contiguous(self):
'''
Return true if the array is C-contiguous.
'''
return self._dummy.is_c_contig
def reshape(self, *newshape, **kws):
"""
Reshape the array without changing its contents, similarly to
:meth:`numpy.ndarray.reshape`. Example::
d_arr = d_arr.reshape(20, 50, order='F')
"""
if len(newshape) == 1 and isinstance(newshape[0], (tuple, list)):
newshape = newshape[0]
cls = type(self)
if newshape == self.shape:
# nothing to do
return cls(shape=self.shape, strides=self.strides,
dtype=self.dtype, dgpu_data=self.dgpu_data)
newarr, extents = self._dummy.reshape(*newshape, **kws)
if extents == [self._dummy.extent]:
return cls(shape=newarr.shape, strides=newarr.strides,
dtype=self.dtype, dgpu_data=self.dgpu_data)
else:
raise NotImplementedError("operation requires copying")
def ravel(self, order='C'):
'''
Flatten the array without changing its contents, similar to
:meth:`numpy.ndarray.ravel`.
'''
cls = type(self)
newarr, extents = self._dummy.ravel(order=order)
if extents == [self._dummy.extent]:
return cls(shape=newarr.shape, strides=newarr.strides,
dtype=self.dtype, dgpu_data=self.dgpu_data)
else:
raise NotImplementedError("operation requires copying")
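# Usage sketch, assuming a ROC-capable machine and that numba.roc exposes
# to_device (mirroring the copy_to_host example above): reshape() and ravel()
# only relabel the extents of the existing allocation; a request that would
# need data movement raises NotImplementedError instead of silently copying.
def _example_device_reshape():
    import numpy as np
    from numba import roc
    d_arr = roc.to_device(np.arange(100).reshape(10, 10))
    flat = d_arr.ravel()           # view over the same dgpu_data
    square = flat.reshape(20, 5)   # still no copy
    assert square.dgpu_data is d_arr.dgpu_data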
class HostArray(np.ndarray):
__hsa_memory__ = True
@property
def device_ctypes_pointer(self):
return self.ctypes.data_as(c_void_p)
def from_array_like(ary, dgpu_data=None):
"Create a DeviceNDArray object that is like ary."
if ary.ndim == 0:
ary = ary.reshape(1)
return DeviceNDArray(ary.shape, ary.strides, ary.dtype,
dgpu_data=dgpu_data)
errmsg_contiguous_buffer = ("Array contains non-contiguous buffer and cannot "
"be transferred as a single memory region. Please "
"ensure contiguous buffer with numpy "
".ascontiguousarray()")
def _single_buffer(ary):
i = np.argmax(ary.strides)
size = ary.strides[i] * ary.shape[i]
return size == ary.nbytes
def sentry_contiguous(ary):
if not ary.flags['C_CONTIGUOUS'] and not ary.flags['F_CONTIGUOUS']:
if ary.strides[0] == 0:
# Broadcasted, ensure inner contiguous
return sentry_contiguous(ary[0])
elif _single_buffer(ary):
return True
else:
raise ValueError(errmsg_contiguous_buffer)
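# Behaviour sketch for the sentry: contiguous layouts and single-buffer views
# pass, while a strided view with gaps between elements is rejected.
def _example_sentry_contiguous():
    a = np.arange(16).reshape(4, 4)
    sentry_contiguous(a)      # OK: C-contiguous
    sentry_contiguous(a.T)    # OK: F-contiguous
    try:
        sentry_contiguous(a[:, ::2])   # gaps between the columns
    except ValueError:
        pass    # rejected with errmsg_contiguous_buffer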
def auto_device(obj, context, stream=None, copy=True):
"""
Create a DeviceArray like obj and optionally copy data from
host to device. If obj already represents device memory, it is returned and
no copy is made.
"""
if _driver.is_device_memory(obj): # it's already on the dGPU
return obj, False
else: # needs to be copied to the dGPU
sentry_contiguous(obj)
devobj = from_array_like(obj)
if copy:
devobj.copy_to_device(obj, stream=stream, context=context)
return devobj, True
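# Usage sketch: auto_device is the funnel that lets callers pass either host
# or device arrays; the returned flag reports whether a device copy was made
# (and therefore whether results must be copied back afterwards).
def _example_auto_device():
    ctx = devices.get_context()
    host_ary = np.arange(10)
    devary, conv = auto_device(host_ary, ctx)
    assert conv                           # host input: a transfer happened
    same, conv2 = auto_device(devary, ctx)
    assert same is devary and not conv2   # device input: returned as-is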
"""
Expose each GPU device directly
"""
import functools
from .driver import hsa as driver, Context as _Context
from numba.roc import servicelib
class _culist(object):
"""A thread local list of GPU instances
"""
def __init__(self):
self._lst = None
@property
def _gpus(self):
if not self._lst:
self._lst = self._init_gpus()
return self._lst
def _init_gpus(self):
gpus = []
for com in driver.components:
gpus.append(CU(com))
return gpus
def __getitem__(self, item):
return self._gpus[item]
def append(self, item):
return self._gpus.append(item)
def __len__(self):
return len(self._gpus)
def __nonzero__(self):
return bool(self._gpus)
def __iter__(self):
return iter(self._gpus)
__bool__ = __nonzero__
def reset(self):
for gpu in self:
gpu.reset()
@property
def current(self):
"""Get the current GPU object associated with the thread
"""
return _custack.top
cus = _culist()
del _culist
class CU(object):
def __init__(self, cu):
self._cu = cu
self._context = None
def __getattr__(self, key):
"""Redirect to self._gpu
"""
if key.startswith('_'):
raise AttributeError(key)
return getattr(self._cu, key)
def __repr__(self):
return repr(self._cu)
def associate_context(self):
"""Associate the context of this GPU to the running thread
"""
# No context was created for this GPU
if self._context is None:
self._context = self._cu.create_context()
return self._context
def __enter__(self):
self.associate_context()
_custack.push(self)
def __exit__(self, exc_type, exc_val, exc_tb):
assert _get_device() is self
self._context.pop()
_custack.pop()
def reset(self):
if self._context:
self._context.reset()
self._context = None
_cpu_context = None
def get_cpu_context():
global _cpu_context
if _cpu_context is None:
cpu_agent = [a for a in driver.agents if not a.is_component][0]
_cpu_context = _Context(cpu_agent)
return _cpu_context
def get_gpu(i):
return cus[i]
def get_num_gpus():
return len(cus)
_custack = servicelib.TLStack()
def _get_device(devnum=0):
"""Get the current device or use a device by device number.
"""
if not _custack:
_custack.push(get_gpu(devnum))
return _custack.top
def get_context(devnum=0):
"""Get the current device or use a device by device number, and
return the HSA context.
"""
return _get_device(devnum=devnum).associate_context()
def get_all_contexts():
return [get_context(i) for i in range(get_num_gpus())]
def require_context(fn):
"""
A decorator to ensure a context for the HSA subsystem
"""
@functools.wraps(fn)
def _require_cu_context(*args, **kws):
get_context()
return fn(*args, **kws)
return _require_cu_context
def reset():
cus.reset()
_custack.clear()
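# Usage sketch for this module: contexts are created lazily, once per CU, and
# cached on the wrapper, so repeated lookups return the same object.
def _example_device_management():
    ctx = get_context()             # device 0, created on first use
    assert ctx is get_context()     # cached: the same context comes back
    assert get_num_gpus() == len(get_all_contexts())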
"""
HSA driver bridge implementation
"""
from collections.abc import Sequence
import sys
import atexit
import os
import ctypes
import struct
import traceback
import weakref
import logging
from contextlib import contextmanager
from collections import defaultdict, deque
from functools import total_ordering
from numba import mviewbuf
from numba.core import utils, config
from .error import HsaSupportError, HsaDriverError, HsaApiError
from numba.roc.hsadrv import enums, enums_ext, drvapi
import numpy as np
_logger = logging.getLogger(__name__)
class HsaKernelTimedOut(HsaDriverError):
pass
def _device_type_to_string(device):
try:
return ['CPU', 'GPU', 'DSP'][device]
except IndexError:
return 'Unknown'
DEFAULT_HSA_DRIVER = '/opt/rocm/lib/libhsa-runtime64.so'
def _find_driver():
envpath = os.environ.get('NUMBA_HSA_DRIVER', DEFAULT_HSA_DRIVER)
if envpath == '0':
# Force fail
_raise_driver_not_found()
# Determine DLL type
if (struct.calcsize('P') != 8
or sys.platform == 'win32'
or sys.platform == 'darwin'):
_raise_platform_not_supported()
else:
# Assume to be *nix like and 64 bit
dlloader = ctypes.CDLL
dldir = ['/usr/lib', '/usr/lib64']
dlname = 'libhsa-runtime64.so'
if envpath is not None:
try:
envpath = os.path.abspath(envpath)
except ValueError:
raise HsaSupportError("NUMBA_HSA_DRIVER %s is not a valid path" %
envpath)
if not os.path.isfile(envpath):
raise HsaSupportError("NUMBA_HSA_DRIVER %s is not a valid file "
"path. Note it must be a filepath of the .so/"
".dll/.dylib or the driver" % envpath)
candidates = [envpath]
else:
# First search for the name in the default library path.
# If that is not found, try the specific path.
candidates = [dlname] + [os.path.join(x, dlname) for x in dldir]
# Load the driver; Collect driver error information
path_not_exist = []
driver_load_error = []
for path in candidates:
try:
dll = dlloader(path)
except OSError as e:
# Problem opening the DLL
path_not_exist.append(not os.path.isfile(path))
driver_load_error.append(e)
else:
return dll
# Problem loading driver
if all(path_not_exist):
_raise_driver_not_found()
else:
errmsg = '\n'.join(str(e) for e in driver_load_error)
_raise_driver_error(errmsg)
PLATFORM_NOT_SUPPORTED_ERROR = """
HSA is not currently supported on this platform ({0}).
"""
def _raise_platform_not_supported():
raise HsaSupportError(PLATFORM_NOT_SUPPORTED_ERROR.format(sys.platform))
DRIVER_NOT_FOUND_MSG = """
The HSA runtime library cannot be found.
If you are sure that HSA is installed, try setting the environment
variable NUMBA_HSA_DRIVER with the file path of the HSA runtime shared
library.
"""
def _raise_driver_not_found():
raise HsaSupportError(DRIVER_NOT_FOUND_MSG)
DRIVER_LOAD_ERROR_MSG = """
An HSA runtime library was found, but failed to load with error:
%s
"""
def _raise_driver_error(e):
raise HsaSupportError(DRIVER_LOAD_ERROR_MSG % e)
MISSING_FUNCTION_ERRMSG = """driver missing function: %s.
"""
class Recycler(object):
def __init__(self):
self._garbage = []
self.enabled = True
def free(self, obj):
self._garbage.append(obj)
self.service()
def _cleanup(self):
for obj in self._garbage:
obj._finalizer(obj)
del self._garbage[:]
def service(self):
if self.enabled:
if len(self._garbage) > 10:
self._cleanup()
def drain(self):
self._cleanup()
self.enabled = False
# The Driver ###########################################################
class Driver(object):
"""
Driver API functions are lazily bound.
"""
_singleton = None
_agent_map = None
_api_prototypes = drvapi.API_PROTOTYPES # avoid premature GC at exit
_hsa_properties = {
'version_major': (enums.HSA_SYSTEM_INFO_VERSION_MAJOR, ctypes.c_uint16),
'version_minor': (enums.HSA_SYSTEM_INFO_VERSION_MINOR, ctypes.c_uint16),
'timestamp': (enums.HSA_SYSTEM_INFO_TIMESTAMP, ctypes.c_uint64),
'timestamp_frequency': (enums.HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, ctypes.c_uint16),
'signal_max_wait': (enums.HSA_SYSTEM_INFO_SIGNAL_MAX_WAIT, ctypes.c_uint64),
}
def __new__(cls):
obj = cls._singleton
if obj is not None:
return obj
else:
obj = object.__new__(cls)
cls._singleton = obj
return obj
def __init__(self):
try:
if config.DISABLE_HSA:
raise HsaSupportError("HSA disabled by user")
self.lib = _find_driver()
self.is_initialized = False
self.initialization_error = None
except HsaSupportError as e:
self.is_initialized = True
self.initialization_error = e
self._agent_map = None
self._programs = {}
self._recycler = Recycler()
self._active_streams = weakref.WeakSet()
def _initialize_api(self):
if self.is_initialized:
return
self.is_initialized = True
try:
self.hsa_init()
except HsaApiError as e:
self.initialization_error = e
raise HsaDriverError("Error at driver init: \n%s:" % e)
else:
@atexit.register
def shutdown():
try:
for agent in self.agents:
agent.release()
except AttributeError:
# no agents were initialised,
# so self.agents isn't present
pass
else:
self._recycler.drain()
def _initialize_agents(self):
if self._agent_map is not None:
return
self._initialize_api()
agent_ids = []
def on_agent(agent_id, ctxt):
agent_ids.append(agent_id)
return enums.HSA_STATUS_SUCCESS
callback = drvapi.HSA_ITER_AGENT_CALLBACK_FUNC(on_agent)
self.hsa_iterate_agents(callback, None)
agent_map = dict((agent_id, Agent(agent_id)) for agent_id in agent_ids)
self._agent_map = agent_map
@property
def is_available(self):
self._initialize_api()
return self.initialization_error is None
@property
def agents(self):
self._initialize_agents()
return self._agent_map.values()
def create_program(self, model=enums.HSA_MACHINE_MODEL_LARGE,
profile=enums.HSA_PROFILE_FULL,
rounding_mode=enums.HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT,
options=None):
program = drvapi.hsa_ext_program_t()
assert options is None
self.hsa_ext_program_create(model, profile, rounding_mode,
options, ctypes.byref(program))
return Program(program)
def create_signal(self, initial_value, consumers=None):
if consumers is None:
consumers = tuple(self.agents)
consumers_len = len(consumers)
consumers_type = drvapi.hsa_agent_t * consumers_len
consumers = consumers_type(*[c._id for c in consumers])
result = drvapi.hsa_signal_t()
self.hsa_signal_create(initial_value, consumers_len, consumers,
ctypes.byref(result))
return Signal(result.value)
def __getattr__(self, fname):
# Initialize driver
self._initialize_api()
# First try if it is an hsa property
try:
enum, typ = self._hsa_properties[fname]
result = typ()
self.hsa_system_get_info(enum, ctypes.byref(result))
return result.value
except KeyError:
pass
# if not a property... try if it is an api call
try:
proto = self._api_prototypes[fname]
except KeyError:
raise AttributeError(fname)
if self.initialization_error is not None:
raise HsaSupportError("Error at driver init: \n%s:" %
self.initialization_error)
# Find function in driver library
libfn = self._find_api(fname)
for key, val in proto.items():
setattr(libfn, key, val)
def driver_wrapper(fn):
def wrapped(*args, **kwargs):
_logger.debug('call driver api: %s', fname)
return fn(*args, **kwargs)
return wrapped
retval = driver_wrapper(libfn)
setattr(self, fname, retval)
return retval
def _find_api(self, fname):
# Try regular
try:
return getattr(self.lib, fname)
except AttributeError:
pass
# Not found.
# Delay missing function error to use
def absent_function(*args, **kws):
raise HsaDriverError(MISSING_FUNCTION_ERRMSG % fname)
setattr(self, fname, absent_function)
return absent_function
@property
def components(self):
"""Returns a ordered list of components
The first device should be picked first
"""
return list(filter(lambda a: a.is_component, reversed(sorted(
self.agents))))
def create_stream(self):
st = Stream()
self._active_streams.add(st)
return st
def implicit_sync(self):
"""
Implicit synchronization for all asynchronous streams
across all devices.
"""
_logger.info("implicit sync")
for st in self._active_streams:
st.synchronize()
hsa = Driver()
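# Behaviour sketch: attribute access on the singleton either answers a system
# property query (via hsa_system_get_info) or lazily binds a C entry point;
# __getattr__ then caches the bound wrapper on the instance, so only the
# first access pays the lookup cost.
def _example_lazy_binding():
    major = hsa.version_major           # property: HSA_SYSTEM_INFO_VERSION_MAJOR
    fn = hsa.hsa_signal_create          # API call: found, wrapped and cached
    assert fn is hsa.hsa_signal_create  # later accesses hit the cached attribute
    return major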
class HsaWrapper(object):
def __getattr__(self, fname):
try:
enum, typ = self._hsa_properties[fname]
except KeyError:
raise AttributeError(
"%r object has no attribute %r" % (self.__class__, fname))
func = getattr(hsa, self._hsa_info_function)
result = typ()
is_array_type = hasattr(typ, '_length_')
# if the result is not a ctypes array, pass it by reference
result_buff = result if is_array_type else ctypes.byref(result)
func(self._id, enum, result_buff)
if not is_array_type or typ._type_ == ctypes.c_char:
return result.value
else:
return list(result)
def __dir__(self):
return sorted(set(dir(type(self)) +
list(self.__dict__.keys()) +
list(self._hsa_properties.keys())))
@total_ordering
class Agent(HsaWrapper):
"""Abstracts a HSA compute agent.
This will wrap and provide an OO interface for hsa_agent_t C-API elements
"""
# Note this will be handled in a rather unconventional way. When agents get
# initialized by the driver, a set of instances for all the available agents
# will be created. After that creation, the __new__ and __init__ methods will
# be replaced, and the constructor will act as a mapping from an agent_id to
# the equivalent Agent object. Any attempt to create an Agent with a non
# existing agent_id will result in an error.
#
# the logic for this resides in Driver._initialize_agents
_hsa_info_function = 'hsa_agent_get_info'
_hsa_properties = {
'name': (enums.HSA_AGENT_INFO_NAME, ctypes.c_char * 64),
'vendor_name': (enums.HSA_AGENT_INFO_VENDOR_NAME, ctypes.c_char * 64),
'feature': (enums.HSA_AGENT_INFO_FEATURE, drvapi.hsa_agent_feature_t),
'wavefront_size': (
enums.HSA_AGENT_INFO_WAVEFRONT_SIZE, ctypes.c_uint32),
'workgroup_max_dim': (
enums.HSA_AGENT_INFO_WORKGROUP_MAX_DIM, ctypes.c_uint16 * 3),
'grid_max_dim': (enums.HSA_AGENT_INFO_GRID_MAX_DIM, drvapi.hsa_dim3_t),
'grid_max_size': (enums.HSA_AGENT_INFO_GRID_MAX_SIZE, ctypes.c_uint32),
'fbarrier_max_size': (
enums.HSA_AGENT_INFO_FBARRIER_MAX_SIZE, ctypes.c_uint32),
'queues_max': (enums.HSA_AGENT_INFO_QUEUES_MAX, ctypes.c_uint32),
'queue_max_size': (
enums.HSA_AGENT_INFO_QUEUE_MAX_SIZE, ctypes.c_uint32),
'queue_type': (
enums.HSA_AGENT_INFO_QUEUE_TYPE, drvapi.hsa_queue_type_t),
'node': (enums.HSA_AGENT_INFO_NODE, ctypes.c_uint32),
'_device': (enums.HSA_AGENT_INFO_DEVICE, drvapi.hsa_device_type_t),
'cache_size': (enums.HSA_AGENT_INFO_CACHE_SIZE, ctypes.c_uint32 * 4),
'isa': (enums.HSA_AGENT_INFO_ISA, drvapi.hsa_isa_t),
}
def __init__(self, agent_id):
# This init will only happen when initializing the agents. After
# the agent initialization the instances of this class are considered
# initialized and locked, so this method will be removed.
self._id = agent_id
self._recycler = hsa._recycler
self._queues = set()
self._initialize_regions()
self._initialize_mempools()
@property
def device(self):
return _device_type_to_string(self._device)
@property
def is_component(self):
return (self.feature & enums.HSA_AGENT_FEATURE_KERNEL_DISPATCH) != 0
@property
def regions(self):
return self._regions
@property
def mempools(self):
return self._mempools
@property
def wavebits(self):
"""
log2(wavefront_size)
"""
# assume wavefront_size will always be a power of 2
return bin(self.wavefront_size)[::-1].index('1')
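# Worked example of the trick above: bin(64) == '0b1000000', so the reversed
# string is '0000001b0' and .index('1') == 6, i.e. log2(64) for a 64-lane
# wavefront.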
def _initialize_regions(self):
region_ids = []
def on_region(region_id, ctxt):
region_ids.append(region_id)
return enums.HSA_STATUS_SUCCESS
callback = drvapi.HSA_AGENT_ITERATE_REGIONS_CALLBACK_FUNC(on_region)
hsa.hsa_agent_iterate_regions(self._id, callback, None)
self._regions = _RegionList([MemRegion.instance_for(self, region_id)
for region_id in region_ids])
def _initialize_mempools(self):
mempool_ids = []
def on_region(_id, ctxt=None):
mempool_ids.append(_id)
return enums.HSA_STATUS_SUCCESS
callback = drvapi.HSA_AMD_AGENT_ITERATE_MEMORY_POOLS_CALLBACK(on_region)
hsa.hsa_amd_agent_iterate_memory_pools(self._id, callback, None)
self._mempools = _RegionList([MemPool.instance_for(self, mempool_id)
for mempool_id in mempool_ids])
def _create_queue(self, size, callback=None, data=None,
private_segment_size=None, group_segment_size=None,
queue_type=None):
assert queue_type is not None
assert size <= self.queue_max_size
cb_typ = drvapi.HSA_QUEUE_CALLBACK_FUNC
cb = ctypes.cast(None, cb_typ) if callback is None else cb_typ(callback)
result = ctypes.POINTER(drvapi.hsa_queue_t)()
private_segment_size = (ctypes.c_uint32(-1)
if private_segment_size is None
else private_segment_size)
group_segment_size = (ctypes.c_uint32(-1)
if group_segment_size is None
else group_segment_size)
hsa.hsa_queue_create(self._id, size, queue_type, cb, data,
private_segment_size, group_segment_size,
ctypes.byref(result))
q = Queue(self, result)
self._queues.add(q)
return weakref.proxy(q)
def create_queue_single(self, *args, **kwargs):
kwargs['queue_type'] = enums.HSA_QUEUE_TYPE_SINGLE
return self._create_queue(*args, **kwargs)
def create_queue_multi(self, *args, **kwargs):
kwargs['queue_type'] = enums.HSA_QUEUE_TYPE_MULTI
return self._create_queue(*args, **kwargs)
def release(self):
"""
Release all resources
Called at system teardown
"""
for q in list(self._queues):
q.release()
def release_queue(self, queue):
self._queues.remove(queue)
self._recycler.free(queue)
def __repr__(self):
return "<HSA agent ({0}): {1} {2} '{3}'{4}>".format(self._id,
self.device,
self.vendor_name,
self.name,
" (component)" if self.is_component else "")
def _rank(self):
return (self.is_component, self.grid_max_size, self._device)
def __lt__(self, other):
if isinstance(other, Agent):
return self._rank() < other._rank()
else:
return NotImplemented
def __eq__(self, other):
if isinstance(other, Agent):
return self._rank() == other._rank()
else:
return NotImplemented
def __hash__(self):
return hash(self._rank())
def create_context(self):
return Context(self)
class _RegionList(Sequence):
__slots__ = '_all', 'globals', 'readonlys', 'privates', 'groups'
def __init__(self, lst):
self._all = tuple(lst)
self.globals = tuple(x for x in lst if x.kind == 'global')
self.readonlys = tuple(x for x in lst if x.kind == 'readonly')
self.privates = tuple(x for x in lst if x.kind == 'private')
self.groups = tuple(x for x in lst if x.kind == 'group')
def __len__(self):
return len(self._all)
def __contains__(self, item):
return item in self._all
def __reversed__(self):
return reversed(self._all)
def __getitem__(self, idx):
return self._all[idx]
class MemPool(HsaWrapper):
"""Abstracts a HSA mem pool.
This will wrap and provide an OO interface for hsa_amd_memory_pool_t
C-API elements
"""
_hsa_info_function = 'hsa_amd_memory_pool_get_info'
_hsa_properties = {
'segment': (
enums_ext.HSA_AMD_MEMORY_POOL_INFO_SEGMENT,
drvapi.hsa_amd_segment_t
),
'_flags': (
enums_ext.HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS,
ctypes.c_uint32
),
'size': (enums_ext.HSA_AMD_MEMORY_POOL_INFO_SIZE,
ctypes.c_size_t),
'alloc_allowed': (enums_ext.HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
ctypes.c_bool),
'alloc_granule': (enums_ext.HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE,
ctypes.c_size_t),
'alloc_alignment': (enums_ext.HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT,
ctypes.c_size_t),
'accessible_by_all': (enums_ext.HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL,
ctypes.c_bool),
}
_segment_name_map = {
enums_ext.HSA_AMD_SEGMENT_GLOBAL: 'global',
enums_ext.HSA_AMD_SEGMENT_READONLY: 'readonly',
enums_ext.HSA_AMD_SEGMENT_PRIVATE: 'private',
enums_ext.HSA_AMD_SEGMENT_GROUP: 'group',
}
def __init__(self, agent, pool):
"""Do not instantiate MemPool objects directly, use the factory class
method 'instance_for' to ensure MemPool identity"""
self._id = pool
self._owner_agent = agent
self._as_parameter_ = self._id
@property
def kind(self):
return self._segment_name_map[self.segment]
@property
def agent(self):
return self._owner_agent
def supports(self, check_flag):
"""
Determines if a given feature is supported by this MemPool.
Feature flags are found in "./enums_ext.py" under:
* hsa_amd_memory_pool_global_flag_t
Params:
check_flag: Feature flag to test
"""
if self.kind == 'global':
return self._flags & check_flag
else:
return False
def allocate(self, nbytes):
assert self.alloc_allowed
assert nbytes >= 0
buff = ctypes.c_void_p()
flags = ctypes.c_uint32(0) # From API docs "Must be 0"!
hsa.hsa_amd_memory_pool_allocate(self._id, nbytes, flags, ctypes.byref(buff))
if buff.value is None:
raise HsaDriverError("Failed to allocate from {}".format(self))
return buff
_instance_dict = {}
@classmethod
def instance_for(cls, owner, _id):
try:
return cls._instance_dict[_id]
except KeyError:
new_instance = cls(owner, _id)
cls._instance_dict[_id] = new_instance
return new_instance
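# Identity sketch: instance_for memoizes on the raw pool id, so repeated
# lookups hand back the very same wrapper object (agent and pool_id here are
# assumed, already-initialized values).
def _example_mempool_identity(agent, pool_id):
    a = MemPool.instance_for(agent, pool_id)
    b = MemPool.instance_for(agent, pool_id)
    assert a is b   # same id, same MemPool instance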
class MemRegion(HsaWrapper):
"""Abstracts a HSA memory region.
This will wrap and provide an OO interface for hsa_region_t C-API elements
"""
_hsa_info_function = 'hsa_region_get_info'
_hsa_properties = {
'segment': (
enums.HSA_REGION_INFO_SEGMENT,
drvapi.hsa_region_segment_t
),
'_flags': (
enums.HSA_REGION_INFO_GLOBAL_FLAGS,
drvapi.hsa_region_global_flag_t
),
'host_accessible': (enums_ext.HSA_AMD_REGION_INFO_HOST_ACCESSIBLE,
ctypes.c_bool),
'size': (enums.HSA_REGION_INFO_SIZE,
ctypes.c_size_t),
'alloc_max_size': (enums.HSA_REGION_INFO_ALLOC_MAX_SIZE,
ctypes.c_size_t),
'alloc_alignment': (enums.HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
ctypes.c_size_t),
'alloc_granule': (enums.HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
ctypes.c_size_t),
'alloc_allowed': (enums.HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
ctypes.c_bool),
}
_segment_name_map = {
enums.HSA_REGION_SEGMENT_GLOBAL: 'global',
enums.HSA_REGION_SEGMENT_READONLY: 'readonly',
enums.HSA_REGION_SEGMENT_PRIVATE: 'private',
enums.HSA_REGION_SEGMENT_GROUP: 'group',
}
def __init__(self, agent, region_id):
"""Do not instantiate MemRegion objects directly, use the factory class
method 'instance_for' to ensure MemRegion identity"""
self._id = region_id
self._owner_agent = agent
self._as_parameter_ = self._id
@property
def kind(self):
return self._segment_name_map[self.segment]
@property
def agent(self):
return self._owner_agent
def supports(self, check_flag):
"""
Determines if a given feature is supported by this MemRegion.
Feature flags are found in "./enums.py" under:
* hsa_region_global_flag_t
Params:
check_flag: Feature flag to test
"""
if self.kind == 'global':
return self._flags & check_flag
else:
return False
def allocate(self, nbytes):
assert self.alloc_allowed
assert nbytes <= self.alloc_max_size
assert nbytes >= 0
buff = ctypes.c_void_p()
hsa.hsa_memory_allocate(self._id, nbytes, ctypes.byref(buff))
return buff
def free(self, ptr):
hsa.hsa_memory_free(ptr)
_instance_dict = {}
@classmethod
def instance_for(cls, owner, _id):
try:
return cls._instance_dict[_id]
except KeyError:
new_instance = cls(owner, _id)
cls._instance_dict[_id] = new_instance
return new_instance
class Queue(object):
def __init__(self, agent, queue_ptr):
"""The id in a queue is a pointer to the queue object returned by hsa_queue_create.
The Queue object has ownership on that queue object"""
self._agent = weakref.proxy(agent)
self._id = queue_ptr
self._as_parameter_ = self._id
self._finalizer = hsa.hsa_queue_destroy
def release(self):
self._agent.release_queue(self)
def __getattr__(self, fname):
return getattr(self._id.contents, fname)
@contextmanager
def _get_packet(self, packet_type):
# Write AQL packet at the calculated queue index address
queue_struct = self._id.contents
queue_mask = queue_struct.size - 1
assert (ctypes.sizeof(packet_type) ==
ctypes.sizeof(drvapi.hsa_kernel_dispatch_packet_t))
packet_array_t = (packet_type * queue_struct.size)
# Obtain the current queue write index
index = hsa.hsa_queue_add_write_index_acq_rel(self._id, 1)
while True:
read_offset = hsa.hsa_queue_load_read_index_acquire(self._id)
if read_offset <= index < read_offset + queue_struct.size:
break
queue_offset = index & queue_mask
queue = packet_array_t.from_address(queue_struct.base_address)
packet = queue[queue_offset]
# zero init
ctypes.memset(ctypes.addressof(packet), 0, ctypes.sizeof(packet_type))
yield packet
# The write index was already incremented by add_write_index above;
# ring the doorbell to notify the packet processor
hsa.hsa_signal_store_release(self._id.contents.doorbell_signal, index)
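# Index arithmetic sketch for _get_packet: queue sizes are powers of two, so
# 'index & (size - 1)' wraps the ever-increasing write index into a slot.
# E.g. with size == 8, write index 11 maps to slot 11 & 7 == 3; the loop
# above spins while that slot's packet has not yet been consumed
# (i.e. until read_offset <= index < read_offset + size holds).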
def insert_barrier(self, dep_signal):
with self._get_packet(drvapi.hsa_barrier_and_packet_t) as packet:
# Populate packet
packet.dep_signal0 = dep_signal._id
header = 0
header |= enums.HSA_FENCE_SCOPE_SYSTEM << enums.HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE
header |= enums.HSA_FENCE_SCOPE_SYSTEM << enums.HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE
header |= enums.HSA_PACKET_TYPE_BARRIER_AND << enums.HSA_PACKET_HEADER_TYPE
header |= 1 << enums.HSA_PACKET_HEADER_BARRIER
# Original example calls for an atomic store.
# Since we are on x86, store of aligned 16 bit is atomic.
# The C code is
# __atomic_store_n((uint16_t*)(&dispatch_packet->header), header, __ATOMIC_RELEASE);
packet.header = header
def dispatch(self, symbol, kernargs,
workgroup_size=None,
grid_size=None,
signal=None):
_logger.info("dispatch %s", symbol.name)
dims = len(workgroup_size)
assert dims == len(grid_size)
assert 0 < dims <= 3
assert grid_size >= workgroup_size
if workgroup_size > tuple(self._agent.workgroup_max_dim)[:dims]:
msg = "workgroupsize is too big {0} > {1}"
raise HsaDriverError(msg.format(workgroup_size,
tuple(self._agent.workgroup_max_dim)[:dims]))
s = signal if signal is not None else hsa.create_signal(1)
# Note: following vector_copy.c
with self._get_packet(drvapi.hsa_kernel_dispatch_packet_t) as packet:
# Populate packet
packet.setup |= dims << enums.HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS
packet.workgroup_size_x = workgroup_size[0]
packet.workgroup_size_y = workgroup_size[1] if dims > 1 else 1
packet.workgroup_size_z = workgroup_size[2] if dims > 2 else 1
packet.grid_size_x = grid_size[0]
packet.grid_size_y = grid_size[1] if dims > 1 else 1
packet.grid_size_z = grid_size[2] if dims > 2 else 1
packet.completion_signal = s._id
packet.kernel_object = symbol.kernel_object
packet.kernarg_address = (0 if kernargs is None
else kernargs.value)
packet.private_segment_size = symbol.private_segment_size
packet.group_segment_size = symbol.group_segment_size
header = 0
header |= enums.HSA_FENCE_SCOPE_SYSTEM << enums.HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE
header |= enums.HSA_FENCE_SCOPE_SYSTEM << enums.HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE
header |= enums.HSA_PACKET_TYPE_KERNEL_DISPATCH << enums.HSA_PACKET_HEADER_TYPE
# Original example calls for an atomic store.
# Since we are on x86, store of aligned 16 bit is atomic.
# The C code is
# __atomic_store_n((uint16_t*)(&dispatch_packet->header), header, __ATOMIC_RELEASE);
packet.header = header
# Wait on the dispatch completion signal
# synchronous if no signal was provided
if signal is None:
_logger.info('wait for synchronous kernel to complete')
timeout = 10
if not s.wait_until_ne_one(timeout=timeout):
msg = "Kernel timed out after {timeout} second"
raise HsaKernelTimedOut(msg.format(timeout=timeout))
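# Header packing sketch, assuming the standard HSA 1.0 enum values
# (KERNEL_DISPATCH == 2, HEADER_TYPE at bit offset 0, SYSTEM fence scope == 2
# at acquire/release offsets 9 and 11): the header above works out to
# (2 << 9) | (2 << 11) | 2 == 0x1402, stored with a single aligned
# 16-bit write.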
def __dir__(self):
return sorted(set(dir(self._id.contents) +
list(self.__dict__.keys())))
def owned(self):
return ManagedQueueProxy(self)
class ManagedQueueProxy(object):
def __init__(self, queue):
self._queue = weakref.ref(queue)
def __getattr__(self, item):
return getattr(self._queue(), item)
class Signal(object):
"""The id for the signal is going to be the hsa_signal_t returned by create_signal.
Lifetime of the underlying signal will be tied with this object".
Note that it is likely signals will have lifetime issues."""
def __init__(self, signal_id):
self._id = signal_id
self._as_parameter_ = self._id
weakref.finalize(self, hsa.hsa_signal_destroy, self._id)
def load_relaxed(self):
return hsa.hsa_signal_load_relaxed(self._id)
def load_acquire(self):
return hsa.hsa_signal_load_acquire(self._id)
def wait_until_ne_one(self, timeout=None):
"""
Returns a boolean to indicate whether the wait timed out
"""
one = 1
mhz = 10 ** 6
if timeout is None:
# Infinite
expire = -1 # UINT_MAX
else:
# timeout as seconds
expire = timeout * hsa.timestamp_frequency * mhz
# XXX: using an active wait instead of a blocked wait seems to avoid a hang in docker
hsa.hsa_signal_wait_acquire(self._id, enums.HSA_SIGNAL_CONDITION_NE,
one, expire,
enums.HSA_WAIT_STATE_ACTIVE)
return self.load_relaxed() != one
class BrigModule(object):
def __init__(self, brig_buffer):
"""
Take a byte buffer of a Brig module
"""
buf = ctypes.create_string_buffer(brig_buffer)
self._buffer = buf
self._id = ctypes.cast(ctypes.addressof(buf),
drvapi.hsa_ext_module_t)
@classmethod
def from_file(cls, file_name):
with open(file_name, 'rb') as fin:
buf = fin.read()
return BrigModule(buf)
def __len__(self):
return len(self._buffer)
def __repr__(self):
return "<BrigModule id={0} size={1}bytes>".format(hex(id(self)),
len(self))
class Program(object):
def __init__(self, model=enums.HSA_MACHINE_MODEL_LARGE,
profile=enums.HSA_PROFILE_FULL,
rounding_mode=enums.HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT,
options=None, version_major=1, version_minor=0):
self._id = drvapi.hsa_ext_program_t()
assert options is None
def check_fptr_return(hsa_status):
if hsa_status is not enums.HSA_STATUS_SUCCESS:
msg = ctypes.c_char_p()
hsa.hsa_status_string(hsa_status, ctypes.byref(msg))
_logger.info(msg.value.decode("utf-8"))
sys.exit(-hsa_status)
support = ctypes.c_bool(0)
hsa.hsa_system_extension_supported(enums.HSA_EXTENSION_FINALIZER,
version_major,
version_minor,
ctypes.byref(support))
assert support.value, ('HSA system extension %s.%s not supported' %
(version_major, version_minor))
# struct of function pointers
self._ftabl = drvapi.hsa_ext_finalizer_1_00_pfn_t()
# populate struct
hsa.hsa_system_get_extension_table(enums.HSA_EXTENSION_FINALIZER,
version_major,
version_minor,
ctypes.byref(self._ftabl))
ret = self._ftabl.hsa_ext_program_create(model, profile,
rounding_mode, options,
ctypes.byref(self._id))
check_fptr_return(ret)
self._as_parameter_ = self._id
weakref.finalize(self, self._ftabl.hsa_ext_program_destroy,
self._id)
def add_module(self, module):
self._ftabl.hsa_ext_program_add_module(self._id, module._id)
def finalize(self, isa, callconv=0, options=None):
"""
The program object is safe to be deleted after ``finalize``.
"""
code_object = drvapi.hsa_code_object_t()
control_directives = drvapi.hsa_ext_control_directives_t()
ctypes.memset(ctypes.byref(control_directives), 0,
ctypes.sizeof(control_directives))
self._ftabl.hsa_ext_program_finalize(self._id,
isa,
callconv,
control_directives,
options,
enums.HSA_CODE_OBJECT_TYPE_PROGRAM,
ctypes.byref(code_object))
return CodeObject(code_object)
class CodeObject(object):
def __init__(self, code_object):
self._id = code_object
self._as_parameter_ = self._id
weakref.finalize(self, hsa.hsa_code_object_destroy, self._id)
class Executable(object):
def __init__(self):
ex = drvapi.hsa_executable_t()
hsa.hsa_executable_create(enums.HSA_PROFILE_FULL,
enums.HSA_EXECUTABLE_STATE_UNFROZEN,
None,
ctypes.byref(ex))
self._id = ex
self._as_parameter_ = self._id
weakref.finalize(self, hsa.hsa_executable_destroy, self._id)
def load(self, agent, code_object):
hsa.hsa_executable_load_code_object(self._id, agent._id,
code_object._id, None)
def freeze(self):
"""Freeze executable before we can query for symbol"""
hsa.hsa_executable_freeze(self._id, None)
def get_symbol(self, agent, name):
symbol = drvapi.hsa_executable_symbol_t()
hsa.hsa_executable_get_symbol(self._id, None,
ctypes.create_string_buffer(
name.encode('ascii')),
agent._id, 0,
ctypes.byref(symbol))
return Symbol(name, symbol)
class Symbol(HsaWrapper):
_hsa_info_function = 'hsa_executable_symbol_get_info'
_hsa_properties = {
'kernel_object': (
enums.HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
ctypes.c_uint64,
),
'kernarg_segment_size': (
enums.HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
ctypes.c_uint32,
),
'group_segment_size': (
enums.HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
ctypes.c_uint32,
),
'private_segment_size': (
enums.HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
ctypes.c_uint32,
),
}
def __init__(self, name, symbol_id):
self._id = symbol_id
self.name = name
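# End-to-end sketch of the finalization pipeline these classes implement,
# assuming `agent` is a kernel-dispatch-capable Agent and `brig_path` points
# at a finalizable BRIG module:
def _example_finalize_pipeline(agent, brig_path, kernel_name):
    module = BrigModule.from_file(brig_path)
    program = Program()
    program.add_module(module)
    code = program.finalize(agent.isa)
    ex = Executable()
    ex.load(agent, code)
    ex.freeze()                                 # freeze before symbol lookup
    return ex.get_symbol(agent, kernel_name)    # ready for Queue.dispatch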
class MemoryPointer(object):
__hsa_memory__ = True
def __init__(self, context, pointer, size, finalizer=None):
assert isinstance(context, Context)
self.context = context
self.device_pointer = pointer
self.size = size
self._hsa_memsize_ = size
self.finalizer = finalizer
self.is_managed = finalizer is not None
self.is_alive = True
self.refct = 0
def __del__(self):
try:
if self.is_managed and self.is_alive:
self.finalizer()
except:
traceback.print_exc()
def own(self):
return OwnedPointer(weakref.proxy(self))
def free(self):
"""
Forces the device memory to the trash.
"""
if self.is_managed:
if not self.is_alive:
raise RuntimeError("Freeing dead memory")
self.finalizer()
self.is_alive = False
def view(self):
pointer = self.device_pointer.value
view = MemoryPointer(self.context, pointer, self.size)
return OwnedPointer(weakref.proxy(self), view)
@property
def device_ctypes_pointer(self):
return self.device_pointer
def allow_access_to(self, *agents):
"""
Grant access to given *agents*.
Upon return, only the listed agents and the owner agent have direct
access to this pointer.
"""
ct = len(agents)
if ct == 0:
return
agent_array = (ct * drvapi.hsa_agent_t)(*[a._id for a in agents])
hsa.hsa_amd_agents_allow_access(ct, agent_array, None,
self.device_pointer)
class HostMemory(mviewbuf.MemAlloc):
def __init__(self, context, owner, pointer, size):
self.context = context
self.owned = owner
self.size = size
self.host_pointer = pointer
self.handle = self.host_pointer
# For buffer interface
self._buflen_ = self.size
self._bufptr_ = self.host_pointer.value
def own(self):
return self
class OwnedPointer(object):
def __init__(self, memptr, view=None):
self._mem = memptr
self._mem.refct += 1
if view is None:
self._view = self._mem
else:
assert not view.is_managed
self._view = view
def __del__(self):
try:
self._mem.refct -= 1
assert self._mem.refct >= 0
if self._mem.refct == 0:
self._mem.free()
except ReferenceError:
pass
except:
traceback.print_exc()
def __getattr__(self, fname):
"""Proxy MemoryPointer methods
"""
return getattr(self._view, fname)
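# Lifetime sketch: MemoryPointer keeps a refct that each OwnedPointer bumps
# on construction and drops on collection; the allocation is freed only when
# the last owner dies (`memptr` is an assumed MemoryPointer from memalloc).
def _example_ownership(memptr):
    owner_a = memptr.own()   # refct -> 1
    owner_b = memptr.own()   # refct -> 2
    del owner_a              # refct -> 1: allocation stays alive
    del owner_b              # refct -> 0: MemoryPointer.free() runs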
class Context(object):
"""
A context is associated with a component
"""
"""
Parameters:
agent the agent, and instance of the class Agent
"""
# a weak set of active Stream objects
_active_streams = weakref.WeakSet()
def __init__(self, agent):
self._agent = weakref.proxy(agent)
if self._agent.is_component: # only components have queues
qs = agent.queue_max_size
defq = self._agent.create_queue_multi(qs, callback=self._callback)
self._defaultqueue = defq.owned()
self.allocations = utils.UniqueDict()
# get pools
coarse_flag = enums_ext.HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED
fine_flag = enums_ext.HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED
alloc_mps = [mp for mp in agent.mempools.globals if mp.alloc_allowed]
self._coarsegrain_mempool = None
self._finegrain_mempool = None
for mp in alloc_mps:
if mp.supports(coarse_flag):
self._coarsegrain_mempool = mp
if mp.supports(fine_flag):
self._finegrain_mempool = mp
def _callback(self, status, queue):
drvapi._check_error(status, queue)
sys.exit(1)
@property
def unproxy(self):
# This is a trick to help handle weakproxy comparison with actual
# instance.
# See https://stackoverflow.com/a/49319989 for inspiration and the
# whole page for more general discussion.
return self
@property
def default_queue(self):
return self._defaultqueue
@property
def agent(self):
return self._agent
@property
def coarsegrain_mempool(self):
if self._coarsegrain_mempool is None:
msg = 'coarsegrain mempool is not available in {}'.format(self._agent)
raise ValueError(msg)
return self._coarsegrain_mempool
@property
def finegrain_mempool(self):
if self._finegrain_mempool is None:
msg = 'finegrain mempool is not available in {}'.format(self._agent)
raise ValueError(msg)
return self._finegrain_mempool
def memalloc(self, nbytes, memTypeFlags=None, hostAccessible=True):
"""
Allocates memory.
Parameters:
nbytes the number of bytes to allocate.
memTypeFlags the flags for which the memory region must have support,\
due to the inherent rawness of the underlying call, the\
validity of the flag is not checked, cf. C language.
hostAccessible boolean as to whether the region in which the\
allocation takes place should be host accessible
"""
hw = self._agent.device
all_reg = self._agent.regions
flag_ok_r = list() # regions which pass the memTypeFlags test
regions = list()
# don't support DSP
if hw == "GPU" or hw == "CPU":
# check user requested flags
if memTypeFlags is not None:
for r in all_reg:
count = 0
for flags in memTypeFlags:
if r.supports(flags):
count += 1
if count == len(memTypeFlags):
flag_ok_r.append(r)
else:
flag_ok_r = all_reg
# check system required flags for allocation
for r in flag_ok_r:
# check the mem region is coarse grained if dGPU present
# TODO: this probably ought to explicitly check for a dGPU.
if (hw == "GPU" and
not r.supports(enums.HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)):
continue
# check accessibility criteria
if hostAccessible:
if r.host_accessible:
regions.append(r)
else:
if not r.host_accessible:
regions.append(r)
else:
raise RuntimeError("Unknown device type string \"%s\"" % hw)
assert len(regions) > 0, "No suitable memory regions found."
# walk through valid regions trying to malloc until there's none left
mem = None
for region in regions:
try:
mem = region.allocate(nbytes)
except HsaApiError: # try next memory region if an allocation fails
pass
else: # allocation succeeded, stop looking for memory
break
if mem is None:
raise RuntimeError("Memory allocation failed. No agent/region "
"combination could meet allocation constraints "
"(hardware = %s, size = %s, flags = %s)."
% (hw, nbytes, memTypeFlags))
fin = _make_mem_finalizer(hsa.hsa_memory_free)
ret = MemoryPointer(weakref.proxy(self), mem, nbytes,
finalizer=fin(self, mem))
if mem.value is None:
raise RuntimeError("MemoryPointer has no value")
self.allocations[mem.value] = ret
return ret.own()
def mempoolalloc(self, nbytes, allow_access_to=(), finegrain=False):
"""
Allocates memory in a memory pool.
Parameters:
*nbytes* the number of bytes to allocate.
*allow_access_to* agents that should be granted access to the allocation.
*finegrain* whether to allocate from the fine-grained pool.
"""
mempool = (self.finegrain_mempool
if finegrain
else self.coarsegrain_mempool)
buff = mempool.allocate(nbytes)
fin = _make_mem_finalizer(hsa.hsa_amd_memory_pool_free)
mp = MemoryPointer(weakref.proxy(self), buff, nbytes,
finalizer=fin(self, buff))
mp.allow_access_to(*allow_access_to)
self.allocations[buff.value] = mp
return mp.own()
def memhostalloc(self, size, finegrain, allow_access_to):
mem = self.mempoolalloc(size, allow_access_to=allow_access_to,
finegrain=finegrain)
return HostMemory(weakref.proxy(self), owner=mem,
pointer=mem.device_pointer, size=mem.size)
class Stream(object):
"""
An asynchronous stream for async API
"""
def __init__(self):
self._signals = deque()
self._callbacks = defaultdict(list)
def _add_signal(self, signal):
"""
Add a signal that corresponds to an async task.
"""
# XXX: too many pending signals seem to cause async copy to hang
if len(self._signals) > 100:
self._sync(50)
self._signals.append(signal)
def _add_callback(self, callback):
assert callable(callback)
self._callbacks[self._get_last_signal()].append(callback)
def _get_last_signal(self):
"""
Get the last signal.
"""
return self._signals[-1] if self._signals else None
def synchronize(self):
"""
Synchronize the stream.
"""
self._sync(len(self._signals))
def _sync(self, limit):
ct = 0
while self._signals:
if ct >= limit:
break
sig = self._signals.popleft()
if sig.load_relaxed() == 1:
sig.wait_until_ne_one()
for cb in self._callbacks[sig]:
cb()
del self._callbacks[sig]
ct += 1
@contextmanager
def auto_synchronize(self):
'''
A context manager that waits for all commands in this stream to execute
and commits any pending memory transfers upon exiting the context.
'''
yield self
self.synchronize()
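# Usage sketch: pair a Stream with the async copy helpers below so that all
# pending transfers are committed when the block exits.
def _example_stream_usage():
    stream = hsa.create_stream()
    with stream.auto_synchronize():
        pass   # enqueue async_host_to_dGPU / async_dGPU_to_host work here
    # on exit, synchronize() drained every pending signal and ran callbacks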
def _make_mem_finalizer(dtor):
"""
finalises memory
Parameters:
dtor a function that will delete/free held memory from a reference
Returns:
Finalising function
"""
def mem_finalize(context, handle):
allocations = context.allocations
sync = hsa.implicit_sync
def core():
_logger.info("Current allocations: %s", allocations)
if allocations:
_logger.info("Attempting delete on %s" % handle.value)
del allocations[handle.value]
sync() # implicit sync
dtor(handle)
return core
return mem_finalize
def device_pointer(obj):
"Get the device pointer as an integer"
return device_ctypes_pointer(obj).value
def device_ctypes_pointer(obj):
"Get the ctypes object for the device pointer"
if obj is None:
return c_void_p(0)
require_device_memory(obj)
return obj.device_ctypes_pointer
def is_device_memory(obj):
"""All HSA dGPU memory object is recognized as an instance with the
attribute "__hsa_memory__" defined and its value evaluated to True.
All HSA memory object should also define an attribute named
"device_pointer" which value is an int(or long) object carrying the pointer
value of the device memory address. This is not tested in this method.
"""
return getattr(obj, '__hsa_memory__', False)
def require_device_memory(obj):
"""A sentry for methods that accept HSA memory object.
"""
if not is_device_memory(obj):
raise Exception("Not a HSA memory object.")
def host_pointer(obj):
"""
NOTE: The underlying data pointer from the host data buffer is used and
it must not be changed until the operation, which can be asynchronous,
completes.
"""
if isinstance(obj, int):
return obj
forcewritable = isinstance(obj, np.void)
return mviewbuf.memoryview_get_buffer(obj, forcewritable)
def host_to_dGPU(context, dst, src, size):
"""
Copy data from a host memory region to a dGPU.
Parameters:
context the dGPU context
dst a pointer to the destination location in dGPU memory
src a pointer to the source location in host memory
size the size (in bytes) of data to transfer
"""
_logger.info("CPU->dGPU")
if size < 0:
raise ValueError("Invalid size given: %s" % size)
hsa.hsa_memory_copy(device_pointer(dst), host_pointer(src), size)
def dGPU_to_host(context, dst, src, size):
"""
Copy data from a dGPU memory region to the host.
Parameters:
context the dGPU context
dst a pointer to the destination location in host memory
src a pointer to the source location in dGPU memory
size the size (in bytes) of data to transfer
"""
_logger.info("dGPU->CPU")
if size < 0:
raise ValueError("Invalid size given: %s" % size)
hsa.hsa_memory_copy(host_pointer(dst), device_pointer(src), size)
def dGPU_to_dGPU(context, dst, src, size):
_logger.info("dGPU->dGPU")
if size < 0:
raise ValueError("Invalid size given: %s" % size)
hsa.hsa_memory_copy(device_pointer(dst), device_pointer(src), size)
def async_host_to_dGPU(dst_ctx, src_ctx, dst, src, size, stream):
_logger.info("Async CPU->dGPU")
async_copy_dgpu(dst_ctx=dst_ctx, src_ctx=src_ctx,
src=host_pointer(src), dst=device_pointer(dst),
size=size, stream=stream)
def async_dGPU_to_host(dst_ctx, src_ctx, dst, src, size, stream):
_logger.info("Async dGPU->CPU")
async_copy_dgpu(dst_ctx=dst_ctx, src_ctx=src_ctx,
dst=host_pointer(dst), src=device_pointer(src),
size=size, stream=stream)
def async_dGPU_to_dGPU(dst_ctx, src_ctx, dst, src, size, stream):
_logger.info("Async dGPU->dGPU")
async_copy_dgpu(dst_ctx=dst_ctx, src_ctx=src_ctx,
dst=device_pointer(dst), src=device_pointer(src),
size=size, stream=stream)
def async_copy_dgpu(dst_ctx, src_ctx, dst, src, size, stream):
if size < 0:
raise ValueError("Invalid size given: %s" % size)
completion_signal = hsa.create_signal(1)
dependent_signal = stream._get_last_signal()
if dependent_signal is not None:
dsignal = drvapi.hsa_signal_t(dependent_signal._id)
signals = (1, ctypes.byref(dsignal), completion_signal)
else:
signals = (0, None, completion_signal)
hsa.hsa_amd_memory_async_copy(dst, dst_ctx._agent._id,
src, src_ctx._agent._id,
size, *signals)
stream._add_signal(completion_signal)
def dgpu_count():
"""
Returns the number of discrete GPUs present on the current machine.
"""
ngpus = 0
try:
for a in hsa.agents:
if a.is_component and a.device == 'GPU':
ngpus += 1
except:
pass
return ngpus
"""
True if a dGPU is present in the current machine.
"""
dgpu_present = dgpu_count() > 0
import ctypes
import warnings
from numba.core import utils
from numba.roc.hsadrv import enums
from .error import HsaApiError, HsaWarning
_PTR = ctypes.POINTER
# This deals with types which are defined as
# typedef struct { uint64_t handle;};
handle_struct = ctypes.c_uint64
#------------------------------------------------------------------------------
# HSA types from hsa.h, ordered as per header file
hsa_status_t = ctypes.c_int # enum
class hsa_dim3_t(ctypes.Structure):
_fields_ = [
('x', ctypes.c_uint32),
('y', ctypes.c_uint32),
('z', ctypes.c_uint32)
]
hsa_access_permission_t = ctypes.c_int # enum
hsa_endianness_t = ctypes.c_int # enum
hsa_machine_model_t = ctypes.c_int # enum
hsa_profile_t = ctypes.c_int # enum
hsa_system_info_t = ctypes.c_int # enum
hsa_extension_t = ctypes.c_int # enum
hsa_agent_t = handle_struct
hsa_agent_feature_t = ctypes.c_int # enum
hsa_device_type_t = ctypes.c_int # enum
hsa_default_float_rounding_mode_t = ctypes.c_int # enum
hsa_agent_info_t = ctypes.c_int # enum
hsa_exception_policy_t = ctypes.c_int # enum
hsa_signal_t = handle_struct
hsa_signal_value_t = ctypes.c_uint64 if enums.HSA_LARGE_MODEL else ctypes.c_uint32
hsa_signal_condition_t = ctypes.c_int # enum
hsa_wait_state_t = ctypes.c_int # enum
hsa_region_t = handle_struct
hsa_queue_type_t = ctypes.c_int # enum
hsa_queue_feature_t = ctypes.c_int # enum
class hsa_queue_t(ctypes.Structure):
"""In theory, this should be aligned to 64 bytes. In any case, allocation
of this structure is done by the hsa library"""
_fields_ = [
('type', hsa_queue_type_t),
('features', ctypes.c_uint32),
('base_address', ctypes.c_void_p), # if LARGE MODEL
('doorbell_signal', hsa_signal_t),
('size', ctypes.c_uint32),
('reserved1', ctypes.c_uint32),
('id', ctypes.c_uint32),
]
hsa_packet_type_t = ctypes.c_int # enum
hsa_fence_scope_t = ctypes.c_int # enum
hsa_packet_header_t = ctypes.c_int # enum
hsa_packet_header_width_t = ctypes.c_int # enum
hsa_kernel_dispatch_packet_setup_t = ctypes.c_int # enum
hsa_kernel_dispatch_packet_setup_width_t = ctypes.c_int # enum
class hsa_kernel_dispatch_packet_t(ctypes.Structure):
_fields_ = [
('header', ctypes.c_uint16),
('setup', ctypes.c_uint16),
('workgroup_size_x', ctypes.c_uint16),
('workgroup_size_y', ctypes.c_uint16),
('workgroup_size_z', ctypes.c_uint16),
('reserved0', ctypes.c_uint16), # Must be zero
('grid_size_x', ctypes.c_uint32),
('grid_size_y', ctypes.c_uint32),
('grid_size_z', ctypes.c_uint32),
('private_segment_size', ctypes.c_uint32),
('group_segment_size', ctypes.c_uint32),
('kernel_object', ctypes.c_uint64),
# NOTE: Small model not dealt with properly...!
# ifdef HSA_LARGE_MODEL
('kernarg_address', ctypes.c_uint64),
# SMALL Machine has a reserved uint32
('reserved2', ctypes.c_uint64), # Must be zero
('completion_signal', hsa_signal_t),
]
class hsa_agent_dispatch_packet_t(ctypes.Structure):
"""This should be aligned to HSA_PACKET_ALIGN_BYTES (64)"""
_fields_ = [
('header', ctypes.c_uint16),
('type', ctypes.c_uint16),
('reserved0', ctypes.c_uint32),
# NOTE: Small model not dealt with properly...!
('return_address', ctypes.c_void_p),
('arg', ctypes.c_uint64 * 4),
('reserved2', ctypes.c_uint64),
('completion_signal', hsa_signal_t),
]
class hsa_barrier_and_packet_t(ctypes.Structure):
_fields_ = [
('header', ctypes.c_uint16),
('reserved0', ctypes.c_uint16),
('reserved1', ctypes.c_uint32),
('dep_signal0', hsa_signal_t),
('dep_signal1', hsa_signal_t),
('dep_signal2', hsa_signal_t),
('dep_signal3', hsa_signal_t),
('dep_signal4', hsa_signal_t),
('reserved2', ctypes.c_uint64),
('completion_signal', hsa_signal_t),
]
hsa_barrier_or_packet_t = hsa_barrier_and_packet_t
hsa_region_segment_t = ctypes.c_int # enum
hsa_region_global_flag_t = ctypes.c_int # enum
hsa_region_info_t = ctypes.c_int # enum
hsa_symbol_kind_t = ctypes.c_int # enum
hsa_variable_allocation_t = ctypes.c_int # enum
hsa_symbol_linkage_t = ctypes.c_int # enum
hsa_variable_segment_t = ctypes.c_int # enum
hsa_isa_t = handle_struct
hsa_isa_info_t = ctypes.c_int # enum
hsa_code_object_t = handle_struct
hsa_callback_data_t = handle_struct
hsa_code_object_type_t = ctypes.c_int # enum
hsa_code_object_info_t = ctypes.c_int # enum
hsa_code_symbol_t = handle_struct
hsa_code_symbol_info_t = ctypes.c_int # enum
hsa_executable_t = handle_struct
hsa_executable_state_t = ctypes.c_int # enum
hsa_executable_info_t = ctypes.c_int # enum
hsa_executable_symbol_t = handle_struct
hsa_executable_symbol_info_t = ctypes.c_int # enum
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
# HSA types from Brig.h, ordered as per header file
# NOTE: not all of the definitions are needed
BrigVersion32_t = ctypes.c_uint32
MODULE_IDENTIFICATION_LENGTH=8
class BrigModuleHeader(ctypes.Structure):
_fields_ = [
('identification', ctypes.c_char*MODULE_IDENTIFICATION_LENGTH),
('brigMajor', BrigVersion32_t),
('brigMinor', BrigVersion32_t),
('byteCount', ctypes.c_uint64),
('hash', ctypes.c_uint8*64),
('reserved', ctypes.c_uint32),
('sectionCount', ctypes.c_uint32),
('sectionIndex', ctypes.c_uint64),
]
BrigModule_t = _PTR(BrigModuleHeader)
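# Parsing sketch: the header can be lifted straight off a BRIG byte buffer
# with ctypes, e.g. to sanity-check the module identification bytes before
# finalization (`brig_bytes` is an assumed, already-loaded module image).
def _example_read_brig_header(brig_bytes):
    header = BrigModuleHeader.from_buffer_copy(brig_bytes)
    assert header.identification == b'HSA BRIG'   # 8-byte BRIG magic
    return header.byteCount, header.sectionCount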
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
# HSA types from hsa_ext_amd.h, ordered as per header file
hsa_amd_agent_info_t = ctypes.c_int # enum
hsa_amd_region_info_t = ctypes.c_int # enum
hsa_amd_coherency_type_t = ctypes.c_int # enum
class hsa_amd_profiling_dispatch_time_t(ctypes.Structure):
_fields_ = [
('start', ctypes.c_uint64),
('end', ctypes.c_uint64),
]
# typedef bool (*hsa_amd_signal_handler)(hsa_signal_value_t value, void* arg);
# NOTE: a ctypes CFUNCTYPE already models the C function pointer itself,
# so no extra _PTR() wrapping is needed for this typedef.
hsa_amd_signal_handler = ctypes.CFUNCTYPE(ctypes.c_bool,
                                          hsa_signal_value_t,
                                          ctypes.c_void_p)
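# Illustrative sketch (comment only): wrapping a Python function as an async
# signal handler. The wrapped object must be kept alive for as long as the
# runtime may invoke it.
#
#   def _on_signal(value, arg):
#       return False  # False -> do not invoke the handler again
#   handler = hsa_amd_signal_handler(_on_signal)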
hsa_amd_segment_t = ctypes.c_int # enum
hsa_amd_memory_pool_t = handle_struct
hsa_amd_memory_pool_global_flag_t = ctypes.c_int # enum
hsa_amd_memory_pool_info_t = ctypes.c_int # enum
hsa_amd_memory_pool_access_t = ctypes.c_int # enum
hsa_amd_link_info_type_t = ctypes.c_int # enum
hsa_amd_memory_pool_link_info_t = ctypes.c_int # enum
hsa_amd_agent_memory_pool_info_t = ctypes.c_int # enum
class hsa_amd_image_descriptor_t(ctypes.Structure):
_fields_ = [
('version', ctypes.c_uint32),
('deviceID', ctypes.c_uint32),
('data', ctypes.c_uint32*1),
]
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
# HSA types from hsa_ext_finalize.h, ordered as per header file
hsa_ext_module_t = BrigModule_t
hsa_ext_program_t = handle_struct
hsa_ext_program_info_t = ctypes.c_int # enum
hsa_ext_finalizer_call_convention_t = ctypes.c_int # enum
class hsa_ext_control_directives_t(ctypes.Structure):
_fields_ = [
('control_directives_mask', ctypes.c_uint64),
('break_exceptions_mask', ctypes.c_uint16),
('detect_exceptions_mask', ctypes.c_uint16),
('max_dynamic_group_size', ctypes.c_uint32),
('max_flat_grid_size', ctypes.c_uint64),
('max_flat_workgroup_size', ctypes.c_uint32),
('reserved1', ctypes.c_uint32),
('required_grid_size', ctypes.c_uint64*3),
('required_workgroup_size', hsa_dim3_t),
('required_dim', ctypes.c_uint8),
('reserved2', ctypes.c_uint8*75),
]
# Function pointer types that are used in the "hsa_ext_finalizer_1_00_pfn_t"
# struct of pointers below.
HSA_EXT_PROGRAM_CREATE_FPTR = ctypes.CFUNCTYPE(
hsa_status_t, # return value
hsa_machine_model_t, # machine_model
hsa_profile_t, # profile
hsa_default_float_rounding_mode_t, # default_float_rounding_mode
ctypes.c_char_p, # options
_PTR(hsa_ext_program_t)) # program
HSA_EXT_PROGRAM_DESTROY_FPTR = ctypes.CFUNCTYPE(
hsa_status_t, # return value
hsa_ext_program_t) # program
HSA_EXT_PROGRAM_ADD_MODULE_FPTR = ctypes.CFUNCTYPE(
hsa_status_t, # return value
hsa_ext_program_t, # program
hsa_ext_module_t) # module
HSA_EXT_PROGRAM_ITERATE_MODULES_CALLBACK_FUNC = ctypes.CFUNCTYPE(
hsa_status_t, # return
hsa_ext_program_t, # program
hsa_ext_module_t, # module
ctypes.c_void_p) # data
HSA_EXT_PROGRAM_ITERATE_MODULES_FPTR = ctypes.CFUNCTYPE(
hsa_status_t, # return value
hsa_ext_program_t, # program
HSA_EXT_PROGRAM_ITERATE_MODULES_CALLBACK_FUNC, # callback
ctypes.c_void_p) # data
HSA_EXT_PROGRAM_GET_INFO_FPTR = ctypes.CFUNCTYPE(
hsa_status_t, # return value
hsa_ext_program_t, # program
hsa_ext_program_info_t, # attribute
ctypes.c_void_p) # value
HSA_EXT_PROGRAM_FINALIZE_FPTR = ctypes.CFUNCTYPE(
hsa_status_t, # return value
hsa_ext_program_t, # program
hsa_isa_t, # isa
ctypes.c_int32, # call_convention
hsa_ext_control_directives_t, # control_directives
    ctypes.c_char_p,  # options
    hsa_code_object_type_t,  # code_object_type
_PTR(hsa_code_object_t)) # code_object
# this struct holds function pointers
class hsa_ext_finalizer_1_00_pfn_t(ctypes.Structure):
_fields_ = [
('hsa_ext_program_create', HSA_EXT_PROGRAM_CREATE_FPTR),
('hsa_ext_program_destroy', HSA_EXT_PROGRAM_DESTROY_FPTR),
('hsa_ext_program_add_module', HSA_EXT_PROGRAM_ADD_MODULE_FPTR),
('hsa_ext_program_iterate_modules',
HSA_EXT_PROGRAM_ITERATE_MODULES_FPTR),
('hsa_ext_program_get_info', HSA_EXT_PROGRAM_GET_INFO_FPTR),
('hsa_ext_program_finalize', HSA_EXT_PROGRAM_FINALIZE_FPTR)
]
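# Illustrative sketch (comment only): obtaining the finalizer function table
# at runtime, per the NOTE at the end of API_PROTOTYPES below. `hsa` is a
# hypothetical handle to the bound library; `machine_model`, `profile` and
# `rounding_mode` are hypothetical enum values.
#
#   table = hsa_ext_finalizer_1_00_pfn_t()
#   hsa.hsa_system_get_extension_table(enums.HSA_EXTENSION_FINALIZER,
#                                      1, 0, ctypes.byref(table))
#   program = hsa_ext_program_t()
#   table.hsa_ext_program_create(machine_model, profile, rounding_mode,
#                                None, ctypes.byref(program))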
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
# HSA types from hsa_ext_image.h (NOTE: support incomplete)
hsa_ext_image_t = handle_struct
hsa_ext_image_geometry_t = ctypes.c_int # enum
hsa_ext_image_channel_type_t = ctypes.c_int # enum
hsa_ext_image_channel_order_t = ctypes.c_int # enum
class hsa_ext_image_format_t(ctypes.Structure):
_fields_ = [
("channel_type", hsa_ext_image_channel_type_t),
("channel_order", hsa_ext_image_channel_order_t)
]
class hsa_ext_image_descriptor_t(ctypes.Structure):
_fields_ = [
("geometry", hsa_ext_image_geometry_t),
("width", ctypes.c_size_t),
("height", ctypes.c_size_t),
("depth", ctypes.c_size_t),
("array_size", ctypes.c_size_t),
("format", hsa_ext_image_format_t)
]
hsa_ext_image_capability_t = ctypes.c_int # enum
class hsa_ext_image_data_info_t(ctypes.Structure):
_fields_ = [
("size", ctypes.c_size_t),
("alignment", ctypes.c_size_t),
]
class hsa_ext_image_region_t(ctypes.Structure):
    _fields_ = [
        ("offset", hsa_dim3_t),
        ("range", hsa_dim3_t),
    ]
hsa_ext_sampler_t = handle_struct
hsa_ext_sampler_addressing_mode_t = ctypes.c_int # enum
hsa_ext_sampler_coordinate_mode_t = ctypes.c_int # enum
hsa_ext_sampler_filter_mode_t = ctypes.c_int # enum
class hsa_ext_sampler_descriptor_t(ctypes.Structure):
_fields_ = [
("coordinate_mode", hsa_ext_sampler_coordinate_mode_t),
("filter_mode", hsa_ext_sampler_filter_mode_t),
("address_mode", hsa_ext_sampler_addressing_mode_t)
]
#NOTE: Not implemented yet: hsa_ext_images_1_00_pfn_t
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
# callbacks that have no related typedef in the hsa include files
HSA_ITER_AGENT_CALLBACK_FUNC = ctypes.CFUNCTYPE(
hsa_status_t, # return value
hsa_agent_t, # agent
ctypes.py_object) # this is a c_void_p used to wrap a python object
HSA_QUEUE_CALLBACK_FUNC = ctypes.CFUNCTYPE(
None, # return value
hsa_status_t,
_PTR(hsa_queue_t),
ctypes.py_object) # this is a c_void_p used to wrap a python object
HSA_AGENT_ITERATE_REGIONS_CALLBACK_FUNC = ctypes.CFUNCTYPE(
hsa_status_t, # return value
hsa_region_t, # region
ctypes.py_object) # this is a c_void_p used to wrap a python object
# hsa_status_t (*callback)(hsa_code_object_t code_object, hsa_code_symbol_t symbol, void* data),
HSA_CODE_OBJECT_ITERATE_SYMBOLS_CALLBACK = ctypes.CFUNCTYPE(
hsa_status_t, # return value
hsa_code_object_t,
hsa_code_symbol_t,
ctypes.py_object) # this is a c_void_p used to wrap a python object
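# hsa_status_t (*callback)(hsa_executable_t executable,
#                          hsa_executable_symbol_t symbol, void* data)
# NOTE: typedef added so that hsa_executable_iterate_symbols below can
# declare its callback argument; it mirrors the code-object variant above.
HSA_EXECUTABLE_ITERATE_SYMBOLS_CALLBACK = ctypes.CFUNCTYPE(
    hsa_status_t,  # return value
    hsa_executable_t,
    hsa_executable_symbol_t,
    ctypes.c_void_p)  # data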
# hsa_status_t (*alloc_callback)(size_t size, hsa_callback_data_t data, void **address),
HSA_ALLOC_CALLBACK_FUNCTION = ctypes.CFUNCTYPE(
hsa_status_t, # return value
ctypes.c_size_t,
hsa_callback_data_t,
_PTR(ctypes.c_void_p) # this might need to be a ptr to a py_object
)
void_fn_ptr = ctypes.CFUNCTYPE(
None,
ctypes.c_void_p) # this might need to be a ptr to a py_object
# hsa_status_t (*callback)(hsa_amd_memory_pool_t memory_pool, void* data)
HSA_AMD_AGENT_ITERATE_MEMORY_POOLS_CALLBACK = ctypes.CFUNCTYPE(
hsa_status_t,
hsa_amd_memory_pool_t,
ctypes.c_void_p) # this is a c_void_p used to wrap a python object
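# Illustrative sketch (comment only): wrapping a Python function as an
# agent-iteration callback. Declaring the last parameter as py_object lets
# an arbitrary Python object travel through the C void* argument.
#
#   def _visit(agent, pyobj):
#       pyobj.append(agent)
#       return enums.HSA_STATUS_SUCCESS
#   cb = HSA_ITER_AGENT_CALLBACK_FUNC(_visit)
#   agents = []
#   hsa.hsa_iterate_agents(cb, agents)  # `hsa` is the bound library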
#------------------------------------------------------------------------------
# Functions used by API calls returning hsa_status_t to check for errors ######
def _build_reverse_error_warn_maps():
err_map = utils.UniqueDict()
warn_map = utils.UniqueDict()
for name in [name for name in dir(enums) if name.startswith('HSA_')]:
code = getattr(enums, name)
if 'STATUS_ERROR' in name:
err_map[code] = name
elif 'STATUS_INFO' in name:
warn_map[code] = name
else:
pass # should we warn here?
return err_map, warn_map
ERROR_MAP, WARN_MAP = _build_reverse_error_warn_maps()
def _check_error(result, func, arguments):
if result != enums.HSA_STATUS_SUCCESS:
if result >= enums.HSA_STATUS_ERROR:
errname = ERROR_MAP.get(result, "UNKNOWN_HSA_ERROR")
msg = "Call to {0} returned {1}".format(func.__name__, errname)
raise HsaApiError(result, msg)
else:
warnname = WARN_MAP.get(result, "UNKNOWN_HSA_INFO")
msg = "Call to {0} returned {1}".format(func.__name__, warnname)
warnings.warn(msg, HsaWarning)
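# Illustrative sketch (comment only): how an entry of API_PROTOTYPES below
# would typically be attached to the loaded runtime library. `lib` is a
# hypothetical ctypes.CDLL handle.
#
#   lib = ctypes.CDLL('libhsa-runtime64.so')
#   for name, proto in API_PROTOTYPES.items():
#       fn = getattr(lib, name)
#       fn.restype = proto['restype']
#       fn.argtypes = proto['argtypes']
#       if 'errcheck' in proto:
#           fn.errcheck = proto['errcheck']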
# The API prototypes
# These are ordered as per the header files.
API_PROTOTYPES = {
#------------------------------------------------------------------------------
# HSA functions from hsa.h, ordered as per header file.
# hsa_status_t hsa_status_string(
# hsa_status_t status,
# const char **status_string);
'hsa_status_string': {
'restype': hsa_status_t,
'argtypes': [hsa_status_t, _PTR(ctypes.c_char_p)],
'errcheck': _check_error
},
# hsa_status_t hsa_init(void)
'hsa_init': {
'restype': hsa_status_t,
'argtypes': [],
'errcheck': _check_error
},
# hsa_status_t hsa_shut_down(void)
'hsa_shut_down': {
'restype': hsa_status_t,
'argtypes': [],
'errcheck': _check_error
},
# hsa_status_t hsa_system_get_info(hsa_system_info_t, void*)
'hsa_system_get_info': {
'restype': hsa_status_t,
'argtypes': [hsa_system_info_t, ctypes.c_void_p],
'errcheck': _check_error
},
# hsa_status_t HSA_API hsa_system_extension_supported(uint16_t, uint16_t,
# uint16_t, bool *);
'hsa_system_extension_supported': {
'restype': hsa_status_t,
'argtypes': [ctypes.c_uint16, # extension
ctypes.c_uint16, # version_major
ctypes.c_uint16, # version_minor
_PTR(ctypes.c_bool)], # result
'errcheck': _check_error
},
# hsa_status_t hsa_system_get_extension_table(uint16_t, uint16_t,
# uint16_t, void *);
'hsa_system_get_extension_table': {
'restype': hsa_status_t,
'argtypes': [ctypes.c_uint16, # extension
ctypes.c_uint16, # version_major
ctypes.c_uint16, # version_minor
ctypes.c_void_p], # result
'errcheck': _check_error
},
# hsa_status_t hsa_agent_get_info(hsa_agent_t, hsa_agent_info_t, void*)
'hsa_agent_get_info': {
'restype': hsa_status_t,
'argtypes': [hsa_agent_t, hsa_agent_info_t, ctypes.c_void_p],
'errcheck': _check_error
},
# hsa_status_t hsa_iterate_agents(hsa_status_t(*)(hsa_agent_t, void*),
# void*)
'hsa_iterate_agents': {
'restype': hsa_status_t,
'argtypes': [HSA_ITER_AGENT_CALLBACK_FUNC, ctypes.py_object],
'errcheck': _check_error
},
# hsa_status_t hsa_agent_get_exception_policies(hsa_agent_t agent,
# hsa_profile_t profile,
# uint16_t *mask);
'hsa_agent_get_exception_policies': {
'restype': hsa_status_t,
'argtypes': [hsa_agent_t, hsa_profile_t, _PTR(ctypes.c_uint16)],
'errcheck': _check_error
},
# hsa_status_t hsa_agent_extension_supported(uint16_t extension, hsa_agent_t agent,
# uint16_t version_major,
# uint16_t version_minor, bool *result);
'hsa_agent_extension_supported': {
'restype': hsa_status_t,
'argtypes': [ctypes.c_uint16, hsa_agent_t, ctypes.c_uint16, ctypes.c_uint16,
_PTR(ctypes.c_bool)],
'errcheck': _check_error
},
#--------------------------------------------------------------------------
# Signals
#--------------------------------------------------------------------------
# hsa_status_t hsa_signal_create(
# hsa_signal_value_t initial_value,
# uint32_t agent_count,
# const hsa_agent_t *agents,
# hsa_signal_t *signal)
'hsa_signal_create': {
'restype': hsa_status_t,
'argtypes': [hsa_signal_value_t,
ctypes.c_uint32,
_PTR(hsa_agent_t),
_PTR(hsa_signal_t)],
'errcheck': _check_error
},
# hsa_status_t hsa_signal_destroy(
# hsa_signal_t signal)
'hsa_signal_destroy': {
'restype': hsa_status_t,
'argtypes': [hsa_signal_t],
'errcheck': _check_error
},
# hsa_signal_value_t hsa_signal_load_acquire(
# hsa_signal_t signal);
'hsa_signal_load_acquire': {
'restype': hsa_signal_value_t,
'argtypes': [hsa_signal_t],
},
# hsa_signal_value_t hsa_signal_load_relaxed(
# hsa_signal_t signal);
'hsa_signal_load_relaxed': {
'restype': hsa_signal_value_t,
'argtypes': [hsa_signal_t],
},
# void hsa_signal_store_relaxed(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_store_relaxed': {
'restype': None,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# void hsa_signal_store_release(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_store_release': {
'restype': None,
'argtypes': [hsa_signal_t, hsa_signal_value_t],
},
# hsa_signal_value_t hsa_signal_exchange_acq_rel(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_exchange_acq_rel': {
'restype': hsa_signal_value_t,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# hsa_signal_value_t hsa_signal_exchange_acquire(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_exchange_acquire': {
'restype': hsa_signal_value_t,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# hsa_signal_value_t hsa_signal_exchange_relaxed(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_exchange_relaxed': {
'restype': hsa_signal_value_t,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# hsa_signal_value_t hsa_signal_exchange_release(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_exchange_release': {
'restype': hsa_signal_value_t,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# hsa_signal_value_t hsa_signal_cas_acq_rel(
# hsa_signal_t signal,
# hsa_signal_value_t expected,
# hsa_signal_value_t value);
'hsa_signal_cas_acq_rel': {
'restype': hsa_signal_value_t,
'argtypes': [hsa_signal_t, hsa_signal_value_t, hsa_signal_value_t]
},
# hsa_signal_value_t hsa_signal_cas_acquire(
# hsa_signal_t signal,
# hsa_signal_value_t expected,
# hsa_signal_value_t value);
'hsa_signal_cas_acquire': {
'restype': hsa_signal_value_t,
'argtypes': [hsa_signal_t, hsa_signal_value_t, hsa_signal_value_t]
},
# hsa_signal_value_t hsa_signal_cas_relaxed(
# hsa_signal_t signal,
# hsa_signal_value_t expected,
# hsa_signal_value_t value);
'hsa_signal_cas_relaxed': {
'restype': hsa_signal_value_t,
'argtypes': [hsa_signal_t, hsa_signal_value_t, hsa_signal_value_t]
},
# hsa_signal_value_t hsa_signal_cas_release(
# hsa_signal_t signal,
# hsa_signal_value_t expected,
# hsa_signal_value_t value);
'hsa_signal_cas_release': {
'restype': hsa_signal_value_t,
'argtypes': [hsa_signal_t, hsa_signal_value_t, hsa_signal_value_t]
},
# void hsa_signal_add_acq_rel(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_add_acq_rel': {
'restype': None,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# void hsa_signal_add_acquire(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_add_acquire': {
'restype': None,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# void hsa_signal_add_relaxed(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_add_relaxed': {
'restype': None,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# void hsa_signal_add_release(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_add_release': {
'restype': None,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# void hsa_signal_subtract_acq_rel(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_subtract_acq_rel': {
'restype': None,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# void hsa_signal_subtract_acquire(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_subtract_acquire': {
'restype': None,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# void hsa_signal_subtract_relaxed(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_subtract_relaxed': {
'restype': None,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# void hsa_signal_subtract_release(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_subtract_release': {
'restype': None,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# void hsa_signal_and_acq_rel(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_and_acq_rel': {
'restype': None,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# void hsa_signal_and_acquire(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_and_acquire': {
'restype': None,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# void hsa_signal_and_relaxed(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_and_relaxed': {
'restype': None,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# void hsa_signal_and_release(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_and_release': {
'restype': None,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# void hsa_signal_or_acq_rel(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_or_acq_rel': {
'restype': None,
'argtypes': [hsa_signal_t,
hsa_signal_value_t]
},
# void hsa_signal_or_acquire(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_or_acquire': {
'restype': None,
'argtypes': [hsa_signal_t,
hsa_signal_value_t]
},
# void hsa_signal_or_relaxed(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_or_relaxed': {
'restype': None,
'argtypes': [hsa_signal_t,
hsa_signal_value_t]
},
# void hsa_signal_or_release(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_or_release': {
'restype': None,
'argtypes': [hsa_signal_t,
hsa_signal_value_t]
},
# void hsa_signal_xor_acq_rel(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_xor_acq_rel': {
'restype': None,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# void hsa_signal_xor_acquire(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_xor_acquire': {
'restype': None,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# void hsa_signal_xor_relaxed(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_xor_relaxed': {
'restype': None,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# void hsa_signal_xor_release(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_xor_release': {
'restype': None,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# hsa_signal_value_t HSA_API
# hsa_signal_wait_acquire(hsa_signal_t signal,
# hsa_signal_condition_t condition,
# hsa_signal_value_t compare_value,
# uint64_t timeout_hint,
# hsa_wait_state_t wait_state_hint);
'hsa_signal_wait_acquire': {
'restype': hsa_signal_value_t,
'argtypes': [hsa_signal_t,
hsa_signal_condition_t,
hsa_signal_value_t,
ctypes.c_uint64,
hsa_wait_state_t]
},
# hsa_signal_value_t hsa_signal_wait_relaxed(
# hsa_signal_t signal,
# hsa_signal_condition_t condition,
# hsa_signal_value_t compare_value,
# uint64_t timeout_hint,
# hsa_wait_state_t wait_state_hint);
'hsa_signal_wait_relaxed': {
'restype': hsa_signal_value_t,
'argtypes': [hsa_signal_t,
hsa_signal_condition_t,
hsa_signal_value_t,
ctypes.c_uint64,
hsa_wait_state_t],
},
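    # Illustrative usage sketch (comment only): block until a completion
    # signal drops to zero. `hsa` is the bound library and `sig` a
    # hypothetical hsa_signal_t; ctypes.c_uint64(-1).value serves as the
    # "no timeout" hint.
    #
    #   hsa.hsa_signal_wait_acquire(sig, enums.HSA_SIGNAL_CONDITION_EQ, 0,
    #                               ctypes.c_uint64(-1).value,
    #                               enums.HSA_WAIT_STATE_BLOCKED)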
#--------------------------------------------------------------------------
# Queues
#--------------------------------------------------------------------------
# hsa_status_t HSA_API
# hsa_queue_create(hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
# void (*callback)(hsa_status_t status, hsa_queue_t *source,
# void *data),
# void *data, uint32_t private_segment_size,
# uint32_t group_segment_size, hsa_queue_t **queue);
'hsa_queue_create': {
'restype': hsa_status_t,
'argtypes': [hsa_agent_t,
ctypes.c_uint32,
hsa_queue_type_t,
HSA_QUEUE_CALLBACK_FUNC,
ctypes.c_void_p, # data
ctypes.c_uint32, # private segment size
ctypes.c_uint32, # group segment size
_PTR(_PTR(hsa_queue_t))],
'errcheck': _check_error
},
# hsa_status_t
# hsa_soft_queue_create(hsa_region_t region, uint32_t size,
# hsa_queue_type_t type, uint32_t features,
# hsa_signal_t doorbell_signal, hsa_queue_t **queue);
'hsa_soft_queue_create': {
'restype': hsa_status_t,
'argtypes': [hsa_region_t,
ctypes.c_uint32,
hsa_queue_type_t,
ctypes.c_uint32,
hsa_signal_t,
_PTR(_PTR(hsa_queue_t))],
'errcheck': _check_error
},
# hsa_status_t hsa_queue_destroy(
# hsa_queue_t *queue)
'hsa_queue_destroy': {
'restype': hsa_status_t,
'argtypes': [_PTR(hsa_queue_t)],
'errcheck': _check_error
},
# hsa_status_t hsa_queue_inactivate(hsa_queue_t *queue);
'hsa_queue_inactivate': {
'restype': hsa_status_t,
'argtypes': [_PTR(hsa_queue_t)],
'errcheck': _check_error
},
# uint64_t hsa_queue_load_read_index_acquire(hsa_queue_t *queue);
'hsa_queue_load_read_index_acquire': {
'restype': ctypes.c_uint64,
'argtypes': [_PTR(hsa_queue_t)]
},
# uint64_t hsa_queue_load_read_index_relaxed(hsa_queue_t *queue);
'hsa_queue_load_read_index_relaxed': {
'restype': ctypes.c_uint64,
'argtypes': [_PTR(hsa_queue_t)]
},
# uint64_t hsa_queue_load_write_index_acquire(hsa_queue_t *queue);
'hsa_queue_load_write_index_acquire': {
'restype': ctypes.c_uint64,
'argtypes': [_PTR(hsa_queue_t)]
},
# uint64_t hsa_queue_load_write_index_relaxed(hsa_queue_t *queue);
'hsa_queue_load_write_index_relaxed': {
'restype': ctypes.c_uint64,
'argtypes': [_PTR(hsa_queue_t)]
},
# void hsa_queue_store_write_index_relaxed(hsa_queue_t *queue, uint64_t value);
'hsa_queue_store_write_index_relaxed': {
'restype': None,
'argtypes': [_PTR(hsa_queue_t), ctypes.c_uint64]
},
# void hsa_queue_store_write_index_release(hsa_queue_t *queue, uint64_t value);
'hsa_queue_store_write_index_release': {
'restype': None,
'argtypes': [_PTR(hsa_queue_t), ctypes.c_uint64]
},
# uint64_t hsa_queue_cas_write_index_acq_rel(
# hsa_queue_t *queue,
# uint64_t expected,
# uint64_t value);
'hsa_queue_cas_write_index_acq_rel': {
'restype': ctypes.c_uint64,
'argtypes': [_PTR(hsa_queue_t), ctypes.c_uint64, ctypes.c_uint64]
},
# uint64_t hsa_queue_cas_write_index_acquire(
# hsa_queue_t *queue,
# uint64_t expected,
# uint64_t value);
'hsa_queue_cas_write_index_acquire': {
'restype': ctypes.c_uint64,
'argtypes': [_PTR(hsa_queue_t), ctypes.c_uint64, ctypes.c_uint64]
},
# uint64_t hsa_queue_cas_write_index_relaxed(
# hsa_queue_t *queue,
# uint64_t expected,
# uint64_t value);
'hsa_queue_cas_write_index_relaxed': {
'restype': ctypes.c_uint64,
'argtypes': [_PTR(hsa_queue_t), ctypes.c_uint64, ctypes.c_uint64]
},
# uint64_t hsa_queue_cas_write_index_release(
# hsa_queue_t *queue,
# uint64_t expected,
# uint64_t value);
'hsa_queue_cas_write_index_release': {
'restype': ctypes.c_uint64,
'argtypes': [_PTR(hsa_queue_t), ctypes.c_uint64, ctypes.c_uint64]
},
# uint64_t hsa_queue_add_write_index_acq_rel(
# hsa_queue_t *queue,
# uint64_t value);
'hsa_queue_add_write_index_acq_rel': {
'restype': ctypes.c_uint64,
'argtypes': [_PTR(hsa_queue_t), ctypes.c_uint64]
},
# uint64_t hsa_queue_add_write_index_acquire(
# hsa_queue_t *queue,
# uint64_t value);
'hsa_queue_add_write_index_acquire': {
'restype': ctypes.c_uint64,
'argtypes': [_PTR(hsa_queue_t), ctypes.c_uint64]
},
# uint64_t hsa_queue_add_write_index_relaxed(
# hsa_queue_t *queue,
# uint64_t value);
'hsa_queue_add_write_index_relaxed': {
'restype': ctypes.c_uint64,
'argtypes': [_PTR(hsa_queue_t), ctypes.c_uint64]
},
# uint64_t hsa_queue_add_write_index_release(
# hsa_queue_t *queue,
# uint64_t value);
'hsa_queue_add_write_index_release': {
'restype': ctypes.c_uint64,
'argtypes': [_PTR(hsa_queue_t), ctypes.c_uint64]
},
# void hsa_queue_store_read_index_relaxed(
# hsa_queue_t *queue,
# uint64_t value);
'hsa_queue_store_read_index_relaxed': {
'restype': None,
'argtypes': [_PTR(hsa_queue_t), ctypes.c_uint64]
},
# void hsa_queue_store_read_index_release(
# hsa_queue_t *queue,
# uint64_t value);
'hsa_queue_store_read_index_release': {
'restype': None,
'argtypes': [_PTR(hsa_queue_t), ctypes.c_uint64]
},
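    # Illustrative usage sketch (comment only): reserving a packet slot by
    # bumping the write index, then ringing the doorbell once the packet has
    # been written. `queue` is a hypothetical _PTR(hsa_queue_t) value.
    #
    #   index = hsa.hsa_queue_add_write_index_acq_rel(queue, 1)
    #   ... write the AQL packet at slot (index % queue.contents.size) ...
    #   hsa.hsa_signal_store_release(queue.contents.doorbell_signal, index)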
#--------------------------------------------------------------------------
# Memory
#--------------------------------------------------------------------------
# hsa_status_t hsa_region_get_info(
# hsa_region_t region,
# hsa_region_info_t attribute,
# void *value);
'hsa_region_get_info': {
'restype': hsa_status_t,
'argtypes': [hsa_region_t, hsa_region_info_t, ctypes.c_void_p],
'errcheck': _check_error,
},
# hsa_status_t hsa_agent_iterate_regions(
# hsa_agent_t agent,
# hsa_status_t (*callback)(hsa_region_t region, void *data),
# void *data);
'hsa_agent_iterate_regions': {
'restype': hsa_status_t,
'argtypes': [hsa_agent_t,
HSA_AGENT_ITERATE_REGIONS_CALLBACK_FUNC,
ctypes.py_object],
'errcheck': _check_error
},
# hsa_status_t hsa_memory_allocate(
# hsa_region_t region,
# size_t size,
# void **ptr);
'hsa_memory_allocate': {
'restype': hsa_status_t,
'argtypes': [hsa_region_t, ctypes.c_size_t, _PTR(ctypes.c_void_p)],
'errcheck': _check_error
},
# hsa_status_t hsa_memory_free(
# void *ptr);
'hsa_memory_free': {
'restype': hsa_status_t,
'argtypes': [ctypes.c_void_p],
'errcheck': _check_error
},
# hsa_status_t HSA_API hsa_memory_copy(
# void * dst,
# const void * src,
# size_t size);
'hsa_memory_copy': {
'restype': hsa_status_t,
'argtypes': [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t],
'errcheck': _check_error
},
# hsa_status_t HSA_API hsa_memory_assign_agent(void *ptr,
# hsa_agent_t agent,
# hsa_access_permission_t access);
'hsa_memory_assign_agent': {
'restype': hsa_status_t,
'argtypes': [ctypes.c_void_p, hsa_agent_t, hsa_access_permission_t],
'errcheck': _check_error
},
# hsa_status_t hsa_memory_register(
# void *address,
# size_t size);
'hsa_memory_register': {
'restype': hsa_status_t,
'argtypes': [ctypes.c_void_p, ctypes.c_size_t],
'errcheck': _check_error
},
# hsa_status_t hsa_memory_deregister(
# void *address,
# size_t size);
'hsa_memory_deregister': {
'restype': hsa_status_t,
'argtypes': [ctypes.c_void_p, ctypes.c_size_t],
'errcheck': _check_error
},
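    # Illustrative usage sketch (comment only): allocating from a region,
    # copying host data in, and freeing. `region` is a hypothetical
    # hsa_region_t and `host_buf` a ctypes buffer of `nbytes` bytes.
    #
    #   ptr = ctypes.c_void_p()
    #   hsa.hsa_memory_allocate(region, nbytes, ctypes.byref(ptr))
    #   hsa.hsa_memory_copy(ptr, host_buf, nbytes)
    #   hsa.hsa_memory_free(ptr)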
#--------------------------------------------------------------------------
# Code Object functions
#--------------------------------------------------------------------------
# hsa_status_t HSA_API hsa_isa_from_name(const char* name,
# hsa_isa_t* isa);
'hsa_isa_from_name': {
'restype': hsa_status_t,
'argtypes': [ctypes.c_char_p, _PTR(hsa_isa_t)],
'errcheck': _check_error
},
# hsa_status_t HSA_API hsa_isa_get_info(hsa_isa_t isa,
# hsa_isa_info_t attribute,
# uint32_t index,
# void* value);
    'hsa_isa_get_info': {
        'restype': hsa_status_t,
        'argtypes': [hsa_isa_t, hsa_isa_info_t, ctypes.c_uint32,
                     ctypes.c_void_p],
        'errcheck': _check_error
    },
# hsa_status_t HSA_API hsa_isa_compatible(hsa_isa_t code_object_isa,
# hsa_isa_t agent_isa,
# bool* result);
'hsa_isa_compatible': {
'restype': hsa_status_t,
'argtypes': [hsa_isa_t, hsa_isa_t, _PTR(ctypes.c_bool)],
'errcheck': _check_error
},
# hsa_status_t HSA_API hsa_code_object_serialize(
# hsa_code_object_t code_object,
# hsa_status_t (*alloc_callback)(size_t size,
# hsa_callback_data_t data, void **address),
# hsa_callback_data_t callback_data,
# const char *options,
# void **serialized_code_object,
# size_t *serialized_code_object_size);
    'hsa_code_object_serialize': {
        'restype': hsa_status_t,
        'argtypes': [hsa_code_object_t,            # code_object
                     HSA_ALLOC_CALLBACK_FUNCTION,  # alloc_callback
                     hsa_callback_data_t,          # callback_data
                     ctypes.c_char_p,              # options
                     _PTR(ctypes.c_void_p),        # serialized_code_object
                     _PTR(ctypes.c_size_t)],       # serialized_code_object_size
        'errcheck': _check_error
    },
# hsa_status_t HSA_API hsa_code_object_deserialize(
# void *serialized_code_object,
# size_t serialized_code_object_size,
# const char *options,
# hsa_code_object_t *code_object);
'hsa_code_object_deserialize': {
'restype': hsa_status_t,
'argtypes': [ctypes.c_void_p,
ctypes.c_size_t,
ctypes.c_char_p,
_PTR(hsa_code_object_t)],
'errcheck': _check_error
},
# hsa_status_t HSA_API hsa_code_object_destroy(
# hsa_code_object_t code_object);
'hsa_code_object_destroy': {
'restype': hsa_status_t,
'argtypes': [hsa_code_object_t],
'errcheck': _check_error
},
# hsa_status_t HSA_API hsa_code_object_get_info(
# hsa_code_object_t code_object,
# hsa_code_object_info_t attribute,
# void *value);
'hsa_code_object_get_info': {
'restype': hsa_status_t,
'argtypes': [hsa_code_object_t,
hsa_code_object_info_t,
ctypes.c_void_p
],
'errcheck': _check_error
},
# hsa_status_t HSA_API hsa_code_object_get_symbol(
# hsa_code_object_t code_object,
# const char *symbol_name,
# hsa_code_symbol_t *symbol);
'hsa_code_object_get_symbol': {
'restype': hsa_status_t,
'argtypes': [hsa_code_object_t,
ctypes.c_char_p,
_PTR(hsa_code_symbol_t)
],
'errcheck': _check_error
},
# hsa_status_t HSA_API hsa_code_symbol_get_info(
# hsa_code_symbol_t code_symbol,
# hsa_code_symbol_info_t attribute,
# void *value);
'hsa_code_symbol_get_info': {
'restype': hsa_status_t,
'argtypes': [hsa_code_symbol_t,
hsa_code_symbol_info_t,
ctypes.c_void_p
],
'errcheck': _check_error
},
# hsa_status_t HSA_API hsa_code_object_iterate_symbols(
# hsa_code_object_t code_object,
# hsa_status_t (*callback)(hsa_code_object_t code_object, hsa_code_symbol_t symbol, void* data),
# void* data);
'hsa_code_object_iterate_symbols': {
'restype': hsa_status_t,
'argtypes': [hsa_code_object_t,
HSA_CODE_OBJECT_ITERATE_SYMBOLS_CALLBACK,
ctypes.c_void_p
],
'errcheck': _check_error
},
#--------------------------------------------------------------------------
# Executable functions
#--------------------------------------------------------------------------
# hsa_status_t HSA_API hsa_executable_create(
# hsa_profile_t profile,
# hsa_executable_state_t executable_state,
# const char *options,
# hsa_executable_t *executable);
"hsa_executable_create": {
'restype': hsa_status_t,
'argtypes': [hsa_profile_t,
hsa_executable_state_t,
ctypes.c_char_p,
ctypes.POINTER(hsa_executable_t)],
'errcheck': _check_error,
},
# hsa_status_t HSA_API hsa_executable_destroy(
# hsa_executable_t executable);
"hsa_executable_destroy": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
hsa_executable_t,
],
},
# hsa_status_t HSA_API hsa_executable_load_code_object(
# hsa_executable_t executable,
# hsa_agent_t agent,
# hsa_code_object_t code_object,
# const char *options);
"hsa_executable_load_code_object": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
hsa_executable_t,
hsa_agent_t,
hsa_code_object_t,
ctypes.c_char_p,
],
},
# hsa_status_t HSA_API hsa_executable_freeze(
# hsa_executable_t executable,
# const char *options);
"hsa_executable_freeze": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
hsa_executable_t,
ctypes.c_char_p,
],
},
# hsa_status_t HSA_API hsa_executable_get_info(
# hsa_executable_t executable,
# hsa_executable_info_t attribute,
# void *value);
"hsa_executable_get_info": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
hsa_executable_t,
hsa_executable_info_t,
ctypes.c_void_p
],
},
# hsa_status_t HSA_API hsa_executable_global_variable_define(
# hsa_executable_t executable,
# const char *variable_name,
# void *address);
"hsa_executable_global_variable_define": {
'restype': hsa_status_t,
'argtypes': [hsa_executable_t,
ctypes.c_char_p,
ctypes.c_void_p],
'errcheck': _check_error,
},
# hsa_status_t HSA_API hsa_executable_agent_global_variable_define(
# hsa_executable_t executable,
# hsa_agent_t agent,
# const char *variable_name,
# void *address);
"hsa_executable_agent_global_variable_define": {
'restype': hsa_status_t,
'argtypes': [hsa_executable_t,
hsa_agent_t,
ctypes.c_char_p,
ctypes.c_void_p],
'errcheck': _check_error,
},
# hsa_status_t HSA_API hsa_executable_readonly_variable_define(
# hsa_executable_t executable,
# hsa_agent_t agent,
# const char *variable_name,
# void *address);
"hsa_executable_readonly_variable_define": {
'restype': hsa_status_t,
'argtypes': [hsa_executable_t,
hsa_agent_t,
ctypes.c_char_p,
ctypes.c_void_p],
'errcheck': _check_error,
},
# hsa_status_t HSA_API hsa_executable_validate(
# hsa_executable_t executable,
# uint32_t* result);
"hsa_executable_validate": {
'restype': hsa_status_t,
'argtypes': [hsa_executable_t,
_PTR(ctypes.c_uint32)],
'errcheck': _check_error,
},
# hsa_status_t HSA_API hsa_executable_get_symbol(
# hsa_executable_t executable,
# const char *module_name,
# const char *symbol_name,
# hsa_agent_t agent,
# int32_t call_convention,
# hsa_executable_symbol_t *symbol);
"hsa_executable_get_symbol": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
hsa_executable_t,
ctypes.c_char_p, # module_name (must be NULL for program linkage)
ctypes.c_char_p, # symbol_name
hsa_agent_t,
ctypes.c_int32,
ctypes.POINTER(hsa_executable_symbol_t),
],
},
# hsa_status_t HSA_API hsa_executable_symbol_get_info(
# hsa_executable_symbol_t executable_symbol,
# hsa_executable_symbol_info_t attribute,
# void *value);
"hsa_executable_symbol_get_info": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
hsa_executable_symbol_t,
hsa_executable_symbol_info_t,
ctypes.c_void_p,
],
},
    # hsa_status_t HSA_API hsa_executable_iterate_symbols(
    #     hsa_executable_t executable,
    #     hsa_status_t (*callback)(hsa_executable_t executable, hsa_executable_symbol_t symbol, void* data),
    #     void* data);
    "hsa_executable_iterate_symbols": {
        'errcheck': _check_error,
        'restype': hsa_status_t,
        'argtypes': [
            hsa_executable_t,
            HSA_EXECUTABLE_ITERATE_SYMBOLS_CALLBACK,
            ctypes.c_void_p,
        ],
    },
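    # Illustrative usage sketch (comment only): the typical executable
    # lifecycle using the entries above. `profile`, `state`, `agent` and
    # `code_object` are hypothetical values obtained elsewhere.
    #
    #   ex = hsa_executable_t()
    #   hsa.hsa_executable_create(profile, state, None, ctypes.byref(ex))
    #   hsa.hsa_executable_load_code_object(ex, agent, code_object, None)
    #   hsa.hsa_executable_freeze(ex, None)
    #   sym = hsa_executable_symbol_t()
    #   hsa.hsa_executable_get_symbol(ex, None, b"&my_kernel", agent, 0,
    #                                 ctypes.byref(sym))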
#--------------------------------------------------------------------------
# AMD extensions from hsa_ext_amd.h
#--------------------------------------------------------------------------
# hsa_status_t HSA_API hsa_amd_coherency_get_type(hsa_agent_t agent,
# hsa_amd_coherency_type_t* type);
"hsa_amd_coherency_get_type": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
hsa_agent_t,
_PTR(hsa_amd_coherency_type_t),
],
},
# hsa_status_t HSA_API hsa_amd_coherency_set_type(hsa_agent_t agent,
# hsa_amd_coherency_type_t type);
"hsa_amd_coherency_get_type": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
hsa_agent_t,
hsa_amd_coherency_type_t,
],
},
# hsa_status_t HSA_API
# hsa_amd_profiling_set_profiler_enabled(hsa_queue_t* queue, int enable);
"hsa_amd_profiling_set_profiler_enabled": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
_PTR(hsa_queue_t),
ctypes.c_int,
],
},
# hsa_status_t HSA_API hsa_amd_profiling_get_dispatch_time(
# hsa_agent_t agent, hsa_signal_t signal,
# hsa_amd_profiling_dispatch_time_t* time);
"hsa_amd_profiling_get_dispatch_time": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
hsa_agent_t,
hsa_signal_t,
_PTR(hsa_amd_profiling_dispatch_time_t)
],
},
# hsa_status_t HSA_API
# hsa_amd_profiling_convert_tick_to_system_domain(hsa_agent_t agent,
# uint64_t agent_tick,
# uint64_t* system_tick);
"hsa_amd_profiling_convert_tick_to_system_domain": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
ctypes.c_uint64,
_PTR(ctypes.c_uint64)
],
},
# hsa_status_t HSA_API
# hsa_amd_signal_async_handler(hsa_signal_t signal,
# hsa_signal_condition_t cond,
# hsa_signal_value_t value,
# hsa_amd_signal_handler handler, void* arg);
"hsa_amd_signal_async_handler": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
hsa_signal_t,
hsa_signal_condition_t,
hsa_signal_value_t,
hsa_amd_signal_handler,
ctypes.c_void_p,
],
},
    # hsa_status_t HSA_API
    # hsa_amd_async_function(void (*callback)(void* arg), void* arg);
    "hsa_amd_async_function": {
        'errcheck': _check_error,
        'restype': hsa_status_t,
        'argtypes': [
            void_fn_ptr,  # the CFUNCTYPE is itself the function pointer
            ctypes.c_void_p,
        ],
    },
#uint32_t HSA_API
#hsa_amd_signal_wait_any(uint32_t signal_count, hsa_signal_t* signals,
# hsa_signal_condition_t* conds,
# hsa_signal_value_t* values, uint64_t timeout_hint,
# hsa_wait_state_t wait_hint,
# hsa_signal_value_t* satisfying_value);
"hsa_amd_signal_wait_any": {
'errcheck': _check_error,
'restype': ctypes.c_uint32,
'argtypes': [
ctypes.c_uint32,
_PTR(hsa_signal_t),
_PTR(hsa_signal_condition_t),
_PTR(hsa_signal_value_t),
ctypes.c_uint64,
hsa_wait_state_t,
_PTR(hsa_signal_value_t),
],
},
# hsa_status_t HSA_API hsa_amd_image_get_info_max_dim(hsa_agent_t agent,
# hsa_agent_info_t attribute,
# void* value);
"hsa_amd_image_get_info_max_dim": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
hsa_agent_t,
hsa_agent_info_t,
ctypes.c_void_p,
],
},
# hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue,
# uint32_t num_cu_mask_count,
# const uint32_t* cu_mask);
"hsa_amd_queue_cu_set_mask": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
_PTR(hsa_queue_t),
ctypes.c_uint32,
_PTR(ctypes.c_uint32)
],
},
# hsa_status_t HSA_API
# hsa_amd_memory_pool_get_info(hsa_amd_memory_pool_t memory_pool,
# hsa_amd_memory_pool_info_t attribute,
# void* value);
"hsa_amd_memory_pool_get_info": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
hsa_amd_memory_pool_t,
hsa_amd_memory_pool_info_t,
ctypes.c_void_p
],
},
# hsa_status_t HSA_API hsa_amd_agent_iterate_memory_pools(
# hsa_agent_t agent,
# hsa_status_t (*callback)(hsa_amd_memory_pool_t memory_pool, void* data),
# void* data);
"hsa_amd_agent_iterate_memory_pools": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
hsa_agent_t,
HSA_AMD_AGENT_ITERATE_MEMORY_POOLS_CALLBACK,
ctypes.c_void_p
],
},
# hsa_status_t HSA_API hsa_amd_memory_pool_allocate
# (hsa_amd_memory_pool_t memory_pool, size_t size,
# uint32_t flags, void** ptr);
"hsa_amd_memory_pool_allocate": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
hsa_amd_memory_pool_t,
ctypes.c_size_t,
ctypes.c_uint32,
_PTR(ctypes.c_void_p)
],
},
# hsa_status_t HSA_API hsa_amd_memory_pool_free(void* ptr);
"hsa_amd_memory_pool_free": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
ctypes.c_void_p
],
},
# hsa_status_t HSA_API hsa_amd_memory_async_copy(void* dst,
# hsa_agent_t dst_agent, const void* src,
# hsa_agent_t src_agent, size_t size,
# uint32_t num_dep_signals,
# const hsa_signal_t* dep_signals,
# hsa_signal_t completion_signal);
"hsa_amd_memory_async_copy": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
ctypes.c_void_p,
hsa_agent_t,
ctypes.c_void_p,
hsa_agent_t,
ctypes.c_size_t,
ctypes.c_uint32,
_PTR(hsa_signal_t),
hsa_signal_t
],
},
# hsa_status_t HSA_API hsa_amd_agent_memory_pool_get_info(
# hsa_agent_t agent, hsa_amd_memory_pool_t memory_pool,
# hsa_amd_agent_memory_pool_info_t attribute, void* value);
"hsa_amd_agent_memory_pool_get_info": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
hsa_agent_t,
hsa_amd_memory_pool_t,
hsa_amd_agent_memory_pool_info_t,
ctypes.c_void_p
],
},
# hsa_status_t HSA_API
# hsa_amd_agents_allow_access(uint32_t num_agents, const hsa_agent_t* agents,
# const uint32_t* flags, const void* ptr);
"hsa_amd_agents_allow_access": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
ctypes.c_uint32,
_PTR(hsa_agent_t),
_PTR(ctypes.c_uint32),
ctypes.c_void_p
],
},
# hsa_status_t HSA_API
# hsa_amd_memory_pool_can_migrate(hsa_amd_memory_pool_t src_memory_pool,
# hsa_amd_memory_pool_t dst_memory_pool,
# bool* result);
"hsa_amd_memory_pool_can_migrate": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
hsa_amd_memory_pool_t,
hsa_amd_memory_pool_t,
_PTR(ctypes.c_bool)
],
},
# hsa_status_t HSA_API hsa_amd_memory_migrate(const void* ptr,
# hsa_amd_memory_pool_t memory_pool,
# uint32_t flags);
"hsa_amd_memory_migrate": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
ctypes.c_void_p,
hsa_amd_memory_pool_t,
ctypes.c_uint32
],
},
# hsa_status_t HSA_API hsa_amd_memory_lock(void* host_ptr, size_t size,
# hsa_agent_t* agents, int num_agent,
# void** agent_ptr);
"hsa_amd_memory_lock": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
ctypes.c_void_p,
ctypes.c_size_t,
_PTR(hsa_agent_t),
ctypes.c_int,
_PTR(ctypes.c_void_p)
],
},
# hsa_status_t HSA_API hsa_amd_memory_unlock(void* host_ptr);
"hsa_amd_memory_unlock": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
ctypes.c_void_p
],
},
# hsa_status_t HSA_API
# hsa_amd_memory_fill(void* ptr, uint32_t value, size_t count);
"hsa_amd_memory_unlock": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
ctypes.c_void_p
],
},
# hsa_status_t HSA_API hsa_amd_interop_map_buffer(uint32_t num_agents,
# hsa_agent_t* agents,
# int interop_handle,
# uint32_t flags,
# size_t* size,
# void** ptr,
# size_t* metadata_size,
# const void** metadata);
"hsa_amd_interop_map_buffer": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
ctypes.c_uint32,
_PTR(hsa_agent_t),
ctypes.c_int,
ctypes.c_uint32,
_PTR(ctypes.c_size_t),
_PTR(ctypes.c_void_p),
_PTR(ctypes.c_size_t),
_PTR(ctypes.c_void_p),
],
},
# hsa_status_t HSA_API hsa_amd_interop_unmap_buffer(void* ptr);
"hsa_amd_interop_map_buffer": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
_PTR(ctypes.c_void_p),
],
},
# hsa_status_t HSA_API hsa_amd_image_create(
# hsa_agent_t agent,
# const hsa_ext_image_descriptor_t *image_descriptor,
# const hsa_amd_image_descriptor_t *image_layout,
# const void *image_data,
# hsa_access_permission_t access_permission,
# hsa_ext_image_t *image
# );
"hsa_amd_image_create": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
hsa_agent_t,
_PTR(hsa_ext_image_descriptor_t),
_PTR(hsa_amd_image_descriptor_t),
ctypes.c_void_p,
hsa_access_permission_t,
            _PTR(hsa_ext_image_t)  # image (output pointer)
],
},
#--------------------------------------------------------------------------
# Functions from hsa_ext_finalize.h
# NOTE: To access these functions use the hsa_ext_finalizer_1_00_pfn_t
# struct.
#--------------------------------------------------------------------------
}
"""Enum values for HSA
Note that Python namespacing could be used to avoid the C-like
prefixing, but we choose to keep the same names as found in the C
enums, in order to match the documentation.
"""
import ctypes
HSA_LARGE_MODEL = ctypes.sizeof(ctypes.c_void_p) == 8
# hsa_status_t
# The function has been executed successfully.
HSA_STATUS_SUCCESS = 0x0
# A traversal over a list of elements has been interrupted by the
# application before completing.
HSA_STATUS_INFO_BREAK = 0x1
# A generic error has occurred.
HSA_STATUS_ERROR = 0x1000
# One of the actual arguments does not meet a precondition stated in the
# documentation of the corresponding formal argument.
HSA_STATUS_ERROR_INVALID_ARGUMENT = 0x1001
# The requested queue creation is not valid.
HSA_STATUS_ERROR_INVALID_QUEUE_CREATION = 0x1002
# The requested allocation is not valid.
HSA_STATUS_ERROR_INVALID_ALLOCATION = 0x1003
# The agent is invalid.
HSA_STATUS_ERROR_INVALID_AGENT = 0x1004
# The memory region is invalid.
HSA_STATUS_ERROR_INVALID_REGION = 0x1005
# The signal is invalid.
HSA_STATUS_ERROR_INVALID_SIGNAL = 0x1006
# The queue is invalid.
HSA_STATUS_ERROR_INVALID_QUEUE = 0x1007
# The HSA runtime failed to allocate the necessary resources. This error
# may also occur when the HSA runtime needs to spawn threads or create
# internal OS-specific events.
HSA_STATUS_ERROR_OUT_OF_RESOURCES = 0x1008
# The AQL packet is malformed.
HSA_STATUS_ERROR_INVALID_PACKET_FORMAT = 0x1009
# An error has been detected while releasing a resource.
HSA_STATUS_ERROR_RESOURCE_FREE = 0x100A
# An API other than ::hsa_init has been invoked while the reference count
# of the HSA runtime is 0.
HSA_STATUS_ERROR_NOT_INITIALIZED = 0x100B
# The maximum reference count for the object has been reached.
HSA_STATUS_ERROR_REFCOUNT_OVERFLOW = 0x100C
# The arguments passed to a function are not compatible.
HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS = 0x100D
# The index is invalid.
HSA_STATUS_ERROR_INVALID_INDEX = 0x100E
# The instruction set architecture is invalid.
HSA_STATUS_ERROR_INVALID_ISA = 0x100F
# The instruction set architecture name is invalid.
HSA_STATUS_ERROR_INVALID_ISA_NAME = 0x1017
# The code object is invalid.
HSA_STATUS_ERROR_INVALID_CODE_OBJECT = 0x1010
# The executable is invalid.
HSA_STATUS_ERROR_INVALID_EXECUTABLE = 0x1011
# The executable is frozen.
HSA_STATUS_ERROR_FROZEN_EXECUTABLE = 0x1012
# There is no symbol with the given name.
HSA_STATUS_ERROR_INVALID_SYMBOL_NAME = 0x1013
# The variable is already defined.
HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED = 0x1014
# The variable is undefined.
HSA_STATUS_ERROR_VARIABLE_UNDEFINED = 0x1015
# An HSAIL operation resulted in a hardware exception.
HSA_STATUS_ERROR_EXCEPTION = 0x1016
# hsa_packet_type_t
HSA_PACKET_TYPE_VENDOR_SPECIFIC = 0
# The packet has been processed in the past, but has not been reassigned to
# the packet processor. A packet processor must not process a packet of this
# type. All queues support this packet type.
HSA_PACKET_TYPE_INVALID = 1
# Packet used by agents for dispatching jobs to kernel agents. Not all
# queues support packets of this type (see ::hsa_queue_feature_t).
HSA_PACKET_TYPE_KERNEL_DISPATCH = 2
# Packet used by agents to delay processing of subsequent packets, and to
# express complex dependencies between multiple packets. All queues support
# this packet type.
HSA_PACKET_TYPE_BARRIER_AND = 3
# Packet used by agents for dispatching jobs to agents. Not all
# queues support packets of this type (see ::hsa_queue_feature_t).
HSA_PACKET_TYPE_AGENT_DISPATCH = 4
# Packet used by agents to delay processing of subsequent packets, and to
# express complex dependencies between multiple packets. All queues support
# this packet type.
HSA_PACKET_TYPE_BARRIER_OR = 5
# hsa_queue_type_t
HSA_QUEUE_TYPE_MULTI = 0
HSA_QUEUE_TYPE_SINGLE = 1
# hsa_queue_feature_t
HSA_QUEUE_FEATURE_KERNEL_DISPATCH = 1
HSA_QUEUE_FEATURE_AGENT_DISPATCH = 2
# hsa_fence_scope_t
HSA_FENCE_SCOPE_NONE = 0
HSA_FENCE_SCOPE_AGENT = 1
HSA_FENCE_SCOPE_SYSTEM = 2
# hsa_wait_state_t
# The application thread may be rescheduled while waiting on the signal.
HSA_WAIT_STATE_BLOCKED = 0
# The application thread stays active while waiting on a signal.
HSA_WAIT_STATE_ACTIVE = 1
# hsa_signal_condition_t
HSA_SIGNAL_CONDITION_EQ = 0
HSA_SIGNAL_CONDITION_NE = 1
HSA_SIGNAL_CONDITION_LT = 2
HSA_SIGNAL_CONDITION_GTE = 3
# # hsa_dim_t
# HSA_DIM_X = 0
# HSA_DIM_Y = 1
# HSA_DIM_Z = 2
# hsa_extension_t
HSA_EXTENSION_FINALIZER = 0
HSA_EXTENSION_IMAGES = 1
HSA_EXTENSION_AMD_PROFILER = 2
# hsa_agent_feature_t
HSA_AGENT_FEATURE_KERNEL_DISPATCH = 1
HSA_AGENT_FEATURE_AGENT_DISPATCH = 2
# hsa_device_type_t
HSA_DEVICE_TYPE_CPU = 0
HSA_DEVICE_TYPE_GPU = 1
HSA_DEVICE_TYPE_DSP = 2
# hsa_system_info_t
HSA_SYSTEM_INFO_VERSION_MAJOR = 0
HSA_SYSTEM_INFO_VERSION_MINOR = 1
HSA_SYSTEM_INFO_TIMESTAMP = 2
HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY = 3
HSA_SYSTEM_INFO_SIGNAL_MAX_WAIT = 4
HSA_SYSTEM_INFO_ENDIANNESS = 5
HSA_SYSTEM_INFO_MACHINE_MODEL = 6
HSA_SYSTEM_INFO_EXTENSIONS = 7
# hsa_agent_info_t
# Agent name. The type of this attribute is a NUL-terminated char[64]. If
# the name of the agent uses less than 63 characters, the rest of the
# array must be filled with NULs.
HSA_AGENT_INFO_NAME = 0
# Name of vendor. The type of this attribute is a NUL-terminated char[64]. If
# the name of the vendor uses less than 63 characters, the rest of the array
# must be filled with NULs.
HSA_AGENT_INFO_VENDOR_NAME = 1
# Agent capability. The type of this attribute is ::hsa_agent_feature_t.
HSA_AGENT_INFO_FEATURE = 2
# Machine model supported by the agent. The type of this attribute is
# ::hsa_machine_model_t.
HSA_AGENT_INFO_MACHINE_MODEL = 3
# Profile supported by the agent. The type of this attribute is
# ::hsa_profile_t.
HSA_AGENT_INFO_PROFILE = 4
# Default floating-point rounding mode. The type of this attribute is
# ::hsa_default_float_rounding_mode_t, but the value
# ::HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT is not allowed.
HSA_AGENT_INFO_DEFAULT_FLOAT_ROUNDING_MODE = 5
# Default floating-point rounding modes supported by the agent in the Base
# profile. The type of this attribute is a mask of
# ::hsa_default_float_rounding_mode_t. The default floating-point rounding
# mode (::HSA_AGENT_INFO_DEFAULT_FLOAT_ROUNDING_MODE) bit must not be set.
HSA_AGENT_INFO_BASE_PROFILE_DEFAULT_FLOAT_ROUNDING_MODES = 23
# Flag indicating that the f16 HSAIL operation is at least as fast as the
# f32 operation in the current agent. The value of this attribute is
# undefined if the agent is not a kernel agent. The type of this
# attribute is bool.
HSA_AGENT_INFO_FAST_F16_OPERATION = 24
# Number of work-items in a wavefront. Must be a power of 2 in the range
# [1,256]. The value of this attribute is undefined if the agent is not
# a kernel agent. The type of this attribute is uint32_t.
HSA_AGENT_INFO_WAVEFRONT_SIZE = 6
# Maximum number of work-items of each dimension of a work-group. Each
# maximum must be greater than 0. No maximum can exceed the value of
# ::HSA_AGENT_INFO_WORKGROUP_MAX_SIZE. The value of this attribute is
# undefined if the agent is not a kernel agent. The type of this
# attribute is uint16_t[3].
HSA_AGENT_INFO_WORKGROUP_MAX_DIM = 7
# Maximum total number of work-items in a work-group. The value of this
# attribute is undefined if the agent is not a kernel agent. The type
# of this attribute is uint32_t.
HSA_AGENT_INFO_WORKGROUP_MAX_SIZE = 8
# Maximum number of work-items of each dimension of a grid. Each maximum must
# be greater than 0, and must not be smaller than the corresponding value in
# ::HSA_AGENT_INFO_WORKGROUP_MAX_DIM. No maximum can exceed the value of
# ::HSA_AGENT_INFO_GRID_MAX_SIZE. The value of this attribute is undefined if
# the agent is not a kernel agent. The type of this attribute is
# ::hsa_dim3_t.
HSA_AGENT_INFO_GRID_MAX_DIM = 9
# Maximum total number of work-items in a grid. The value of this attribute
# is undefined if the agent is not a kernel agent. The type of this
# attribute is uint32_t.
HSA_AGENT_INFO_GRID_MAX_SIZE = 10
# Maximum number of fbarriers per work-group. Must be at least 32. The value
# of this attribute is undefined if the agent is not a kernel agent. The
# type of this attribute is uint32_t.
HSA_AGENT_INFO_FBARRIER_MAX_SIZE = 11
# Maximum number of queues that can be active (created but not destroyed) at
# one time in the agent. The type of this attribute is uint32_t.
HSA_AGENT_INFO_QUEUES_MAX = 12
# Minimum number of packets that a queue created in the agent
# can hold. Must be a power of 2 greater than 0. Must not exceed
# the value of ::HSA_AGENT_INFO_QUEUE_MAX_SIZE. The type of this
# attribute is uint32_t.
HSA_AGENT_INFO_QUEUE_MIN_SIZE = 13
# Maximum number of packets that a queue created in the agent can
# hold. Must be a power of 2 greater than 0. The type of this attribute
# is uint32_t.
HSA_AGENT_INFO_QUEUE_MAX_SIZE = 14
# Type of a queue created in the agent. The type of this attribute is
# ::hsa_queue_type_t.
HSA_AGENT_INFO_QUEUE_TYPE = 15
# Identifier of the NUMA node associated with the agent. The type of this
# attribute is uint32_t.
HSA_AGENT_INFO_NODE = 16
# Type of hardware device associated with the agent. The type of this
# attribute is ::hsa_device_type_t.
HSA_AGENT_INFO_DEVICE = 17
# Array of data cache sizes (L1..L4). Each size is expressed in bytes. A size
# of 0 for a particular level indicates that there is no cache information
# for that level. The type of this attribute is uint32_t[4].
HSA_AGENT_INFO_CACHE_SIZE = 18
# Instruction set architecture of the agent. The type of this attribute
# is ::hsa_isa_t.
HSA_AGENT_INFO_ISA = 19
# Bit-mask indicating which extensions are supported by the agent. An
# extension with an ID of @p i is supported if the bit at position @p i is
# set. The type of this attribute is uint8_t[128].
HSA_AGENT_INFO_EXTENSIONS = 20
# Major version of the HSA runtime specification supported by the
# agent. The type of this attribute is uint16_t.
HSA_AGENT_INFO_VERSION_MAJOR = 21
# Minor version of the HSA runtime specification supported by the
# agent. The type of this attribute is uint16_t.
HSA_AGENT_INFO_VERSION_MINOR = 22
# hsa_region_segment_t
# Global segment. Used to hold data that is shared by all agents.
HSA_REGION_SEGMENT_GLOBAL = 0
# Read-only segment. Used to hold data that remains constant during the
# execution of a kernel.
HSA_REGION_SEGMENT_READONLY = 1
# Private segment. Used to hold data that is local to a single work-item.
HSA_REGION_SEGMENT_PRIVATE = 2
# Group segment. Used to hold data that is shared by the work-items of a
# work-group.
HSA_REGION_SEGMENT_GROUP = 3
# hsa_region_global_flag_t
# The application can use memory in the region to store kernel arguments, and
# provide the values for the kernarg segment of a kernel dispatch. If this
# flag is set, then ::HSA_REGION_GLOBAL_FLAG_FINE_GRAINED must be set.
HSA_REGION_GLOBAL_FLAG_KERNARG = 1
# Updates to memory in this region are immediately visible to all the
# agents under the terms of the HSA memory model. If this
# flag is set, then ::HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED must not be set.
HSA_REGION_GLOBAL_FLAG_FINE_GRAINED = 2
# Updates to memory in this region can be performed by a single agent at
# a time. If a different agent in the system is allowed to access the
# region, the application must explicitly invoke ::hsa_memory_assign_agent
# in order to transfer ownership to that agent for a particular buffer.
HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED = 4
# hsa_region_info_t
# Segment where memory in the region can be used. The type of this
# attribute is ::hsa_region_segment_t.
HSA_REGION_INFO_SEGMENT = 0
# Flag mask. The value of this attribute is undefined if the value of
# ::HSA_REGION_INFO_SEGMENT is not ::HSA_REGION_SEGMENT_GLOBAL. The type of
# this attribute is uint32_t, a bit-field of ::hsa_region_global_flag_t
# values.
HSA_REGION_INFO_GLOBAL_FLAGS = 1
# Size of this region, in bytes. The type of this attribute is size_t.
HSA_REGION_INFO_SIZE = 2
# Maximum allocation size in this region, in bytes. Must not exceed the value
# of ::HSA_REGION_INFO_SIZE. The type of this attribute is size_t.
#
# If the region is in the global or readonly segments, this is the maximum
# size that the application can pass to ::hsa_memory_allocate. If the region
# is in the group segment, this is the maximum size (per work-group) that can
# be requested for a given kernel dispatch. If the region is in the private
# segment, this is the maximum size (per work-item) that can be requested for a
# specific kernel dispatch.
HSA_REGION_INFO_ALLOC_MAX_SIZE = 4
# Indicates whether memory in this region can be allocated using
# ::hsa_memory_allocate. The type of this attribute is bool.
#
# The value of this flag is always false for regions in the group and private
# segments.
HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED = 5
# Allocation granularity of buffers allocated by ::hsa_memory_allocate in
# this region. The size of a buffer allocated in this region is a multiple of
# the value of this attribute. The value of this attribute is only defined if
# ::HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED is true for this region. The type
# of this attribute is size_t.
HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE = 6
# Alignment of buffers allocated by ::hsa_memory_allocate in this region. The
# value of this attribute is only defined if
# ::HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED is true for this region, and must
# be a power of 2. The type of this attribute is size_t.
HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT = 7
# hsa_profile_t
HSA_PROFILE_BASE = 0
HSA_PROFILE_FULL = 1
# hsa_machine_model_t
HSA_MACHINE_MODEL_SMALL = 0
HSA_MACHINE_MODEL_LARGE = 1
# hsa_executable_symbol_info_t
# The kind of the symbol. The type of this attribute is ::hsa_symbol_kind_t.
HSA_EXECUTABLE_SYMBOL_INFO_TYPE = 0
# The length of the symbol name. The type of this attribute is uint32_t.
HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH = 1
# The name of the symbol. The type of this attribute is character array with
# the length equal to the value of ::HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH
# attribute
HSA_EXECUTABLE_SYMBOL_INFO_NAME = 2
# The length of the module name to which this symbol belongs if this symbol
# has module linkage, otherwise 0 is returned. The type of this attribute is
# uint32_t.
HSA_EXECUTABLE_SYMBOL_INFO_MODULE_NAME_LENGTH = 3
# The module name to which this symbol belongs if this symbol has module
# linkage, otherwise empty string is returned. The type of this attribute is
# character array with the length equal to the value of
# ::HSA_EXECUTABLE_SYMBOL_INFO_MODULE_NAME_LENGTH attribute.
HSA_EXECUTABLE_SYMBOL_INFO_MODULE_NAME = 4
# Agent associated with this symbol. If the symbol is a variable, the
# value of this attribute is only defined if
# ::HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ALLOCATION is
# ::HSA_VARIABLE_ALLOCATION_AGENT. The type of this attribute is hsa_agent_t.
HSA_EXECUTABLE_SYMBOL_INFO_AGENT = 20
# The address of the variable. The value of this attribute is undefined if
# the symbol is not a variable. The type of this attribute is uint64_t.
# If the executable's state is ::HSA_EXECUTABLE_STATE_UNFROZEN, then 0 is
# returned.
HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS = 21
# The linkage kind of the symbol. The type of this attribute is
# ::hsa_symbol_linkage_t.
HSA_EXECUTABLE_SYMBOL_INFO_LINKAGE = 5
# Indicates whether the symbol corresponds to a definition. The type of this
# attribute is bool.
HSA_EXECUTABLE_SYMBOL_INFO_IS_DEFINITION = 17
# The allocation kind of the variable. The value of this attribute is
# undefined if the symbol is not a variable. The type of this attribute is
# ::hsa_variable_allocation_t.
HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ALLOCATION = 6
# The segment kind of the variable. The value of this attribute is undefined
# if the symbol is not a variable. The type of this attribute is
# ::hsa_variable_segment_t.
HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SEGMENT = 7
# Alignment of the variable. The value of this attribute is undefined if
# the symbol is not a variable. The type of this attribute is uint32_t.
HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ALIGNMENT = 8
# Size of the variable. The value of this attribute is undefined if
# the symbol is not a variable. The type of this attribute is uint32_t.
#
# A value of 0 is returned if the variable is an external variable and has an
# unknown dimension.
HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE = 9
# Indicates whether the variable is constant. The value of this attribute is
# undefined if the symbol is not a variable. The type of this attribute is
# bool.
HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_IS_CONST = 10
# Kernel object handle, used in the kernel dispatch packet. The value of this
# attribute is undefined if the symbol is not a kernel. The type of this
# attribute is uint64_t.
#
# If the state of the executable is ::HSA_EXECUTABLE_STATE_UNFROZEN, then 0
# is returned.
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT = 22
# Size of kernarg segment memory that is required to hold the values of the
# kernel arguments, in bytes. The value of this attribute is undefined if the
# symbol is not a kernel. The type of this attribute is uint32_t.
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE = 11
# Alignment (in bytes) of the buffer used to pass arguments to the kernel,
# which is the maximum of 16 and the maximum alignment of any of the kernel
# arguments. The value of this attribute is undefined if the symbol is not a
# kernel. The type of this attribute is uint32_t.
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT = 12
# Size of static group segment memory required by the kernel (per
# work-group), in bytes. The value of this attribute is undefined
# if the symbol is not a kernel. The type of this attribute is uint32_t.
#
# The reported amount does not include any dynamically allocated group
# segment memory that may be requested by the application when a kernel is
# dispatched.
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE = 13
# Size of static private, spill, and arg segment memory required by
# this kernel (per work-item), in bytes. The value of this attribute is
# undefined if the symbol is not a kernel. The type of this attribute is
# uint32_t.
#
# If the value of ::HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK is
# true, the kernel may use more private memory than the reported value, and
# the application must add the dynamic call stack usage to @a
# private_segment_size when populating a kernel dispatch packet.
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE = 14
# Dynamic callstack flag. The value of this attribute is undefined if the
# symbol is not a kernel. The type of this attribute is bool.
#
# If this flag is set (the value is true), the kernel uses a dynamically
# sized call stack. This can happen if recursive calls, calls to indirect
# functions, or the HSAIL alloca instruction are present in the kernel.
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK = 15
# Indirect function object handle. The value of this attribute is undefined
# if the symbol is not an indirect function, or the associated agent does
# not support the Full Profile. The type of this attribute depends on the
# machine model: if machine model is small, then the type is uint32_t, if
# machine model is large, then the type is uint64_t.
#
# If the state of the executable is ::HSA_EXECUTABLE_STATE_UNFROZEN, then 0
# is returned.
HSA_EXECUTABLE_SYMBOL_INFO_INDIRECT_FUNCTION_OBJECT = 23
# Call convention of the indirect function. The value of this attribute is
# undefined if the symbol is not an indirect function, or the associated
# agent does not support the Full Profile. The type of this attribute is
# uint32_t.
HSA_EXECUTABLE_SYMBOL_INFO_INDIRECT_FUNCTION_CALL_CONVENTION = 16
# hsa_default_float_rounding_mode_t
# Use a default floating-point rounding mode specified elsewhere.
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT = 0
# Operations that specify the default floating-point mode are rounded to zero
# by default.
HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO = 1
# Operations that specify the default floating-point mode are rounded to the
# nearest representable number and that ties should be broken by selecting
# the value with an even least significant bit.
HSA_DEFAULT_FLOAT_ROUNDING_MODE_NEAR = 2
# hsa_code_object_type_t
HSA_CODE_OBJECT_TYPE_PROGRAM = 0
# hsa_executable_state_t
# Executable state, which allows the user to load code objects and define
# external variables. Variable addresses, kernel code handles, and
# indirect function code handles are not available in query operations until
# the executable is frozen (zero always returned).
HSA_EXECUTABLE_STATE_UNFROZEN = 0
# Executable state, which allows the user to query variable addresses,
# kernel code handles, and indirect function code handles using query
# operation. Loading new code objects, as well as defining external variables
# is not allowed in this state.
HSA_EXECUTABLE_STATE_FROZEN = 1
# hsa_kernel_dispatch_packet_setup_t
HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS = 0
# hsa_packet_header_t
HSA_PACKET_HEADER_TYPE = 0
HSA_PACKET_HEADER_BARRIER = 8
HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE = 9
HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE = 11
"""Enum values for HSA from the HSA extension header
Note that Python namespacing could be used to avoid the C-like
prefixing, but we choose to keep the same names as found in the C
enums, in order to match the documentation.
"""
# These enums are a direct translation of those found in:
# hsa_ext_amd.h from the ROCR-Runtime. For example:
# https://github.com/RadeonOpenCompute/ROCR-Runtime/blob/master/src/inc/hsa_ext_amd.h
# Comments relating to the values are largely wholesale copied.
import ctypes
#------------------------------------------------------------------------------
#
# Anonymous enum expressing that a memory pool is invalid
#
HSA_STATUS_ERROR_INVALID_MEMORY_POOL = 40
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#
# Agent attributes
#
# Enums of the type hsa_amd_agent_info_t
# Chip identifier. The type of this attribute is uint32_t.
HSA_AMD_AGENT_INFO_CHIP_ID = 0xA000
# Size of a cacheline in bytes. The type of this attribute is uint32_t.
HSA_AMD_AGENT_INFO_CACHELINE_SIZE = 0xA001
# The number of compute units available in the agent. The type of this
# attribute is uint32_t.
HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT = 0xA002
# The maximum clock frequency of the agent in MHz. The type of this
# attribute is uint32_t.
HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY = 0xA003
# Internal driver node identifier. The type of this attribute is uint32_t.
HSA_AMD_AGENT_INFO_DRIVER_NODE_ID = 0xA004
# Max number of watch points on memory address ranges to generate exception
# events when the watched addresses are accessed.
HSA_AMD_AGENT_INFO_MAX_ADDRESS_WATCH_POINTS = 0xA005
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#
# Region attributes
#
# Enums of the type hsa_amd_region_info_t
# Determine if host can access the region. The type of this attribute is bool.
HSA_AMD_REGION_INFO_HOST_ACCESSIBLE = 0xA000
# Base address of the region in flat address space.
HSA_AMD_REGION_INFO_BASE = 0xA001
# Memory Interface width, the return value type is uint32_t.
# This attribute is deprecated. Use HSA_AMD_AGENT_INFO_MEMORY_WIDTH.
HSA_AMD_REGION_INFO_BUS_WIDTH = 0xA002
# Max Memory Clock, the return value type is uint32_t.
# This attribute is deprecated. Use HSA_AMD_AGENT_INFO_MEMORY_MAX_FREQUENCY.
HSA_AMD_REGION_INFO_MAX_CLOCK_FREQUENCY = 0xA003
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#
# Coherency attributes of a fine grained region
#
# Enums of the type hsa_amd_coherency_type_t
# Coherent region.
HSA_AMD_COHERENCY_TYPE_COHERENT = 0
# Non coherent region.
HSA_AMD_COHERENCY_TYPE_NONCOHERENT = 1
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#
# Memory segments associated with a memory pool.
#
# Enums of the type hsa_amd_segment_t
# Global segment. Used to hold data that is shared by all agents.
HSA_AMD_SEGMENT_GLOBAL = 0
# Read-only segment. Used to hold data that remains constant during the
# execution of a kernel.
HSA_AMD_SEGMENT_READONLY = 1
# Private segment. Used to hold data that is local to a single work-item.
HSA_AMD_SEGMENT_PRIVATE = 2
# Group segment. Used to hold data that is shared by the work-items of a
# work-group.
HSA_AMD_SEGMENT_GROUP = 3
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#
# Memory pool global flags.
#
# Enums of the type hsa_amd_memory_pool_global_flag_t.
# The application can use allocations in the memory pool to store kernel
# arguments, and provide the values for the kernarg segment of
# a kernel dispatch.
HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT = 1
# Updates to memory in this pool conform to HSA memory consistency model.
# If this flag is set, then HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED
# must not be set.
HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED = 2
# Writes to memory in this pool can be performed by a single agent at a time.
HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED = 4
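# Example (a minimal sketch): the global flags form a bit-field, so a pool's
# properties are tested with bitwise AND on the uint32 value queried for
# HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS below. ``flags`` is assumed to hold
# that queried value:
#
#     is_kernarg = bool(flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT)
#     fine = bool(flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED)
#     coarse = bool(flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED)
#     assert not (fine and coarse)   # the two grains are mutually exclusive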
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#
# Memory pool features flags.
#
# Enums of the type hsa_amd_memory_pool_info_t.
# Segment where the memory pool resides. The type of this attribute is
# hsa_amd_segment_t.
HSA_AMD_MEMORY_POOL_INFO_SEGMENT = 0
# Flag mask. The value of this attribute is undefined if the value of
# HSA_AMD_MEMORY_POOL_INFO_SEGMENT is not HSA_AMD_SEGMENT_GLOBAL. The type
# of this attribute is uint32_t, a bit-field of
# hsa_amd_memory_pool_global_flag_t values.
HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS = 1
# Size of this pool, in bytes. The type of this attribute is size_t.
HSA_AMD_MEMORY_POOL_INFO_SIZE = 2
# Indicates whether memory in this pool can be allocated using
# hsa_amd_memory_pool_allocate. The type of this attribute is bool.
# The value of this flag is always false for memory pools in the group and
# private segments.
HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED = 5
# Allocation granularity of buffers allocated by hsa_amd_memory_pool_allocate
# in this memory pool. The size of a buffer allocated in this pool is a
# multiple of the value of this attribute. The value of this attribute is
# only defined if HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED is true for
# this pool. The type of this attribute is size_t.
HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE = 6
# Alignment of buffers allocated by hsa_amd_memory_pool_allocate in this
# pool. The value of this attribute is only defined if
# HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED is true for this pool, and
# must be a power of 2. The type of this attribute is size_t.
HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT = 7
# This memory_pool can be made directly accessible by all the agents in the
# system (hsa_amd_agent_memory_pool_get_info returns
# HSA_AMD_MEMORY_POOL_ACCESS_ALLOWED_BY_DEFAULT for all agents). The type of
# this attribute is bool.
HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL = 15
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#
# Type of accesses to a memory pool from a given agent.
#
# Enums of the type hsa_amd_memory_pool_access_t
# The agent cannot directly access any buffer in the memory pool.
HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED = 0
# The agent can directly access a buffer located in the pool; the application
# does not need to invoke hsa_amd_agents_allow_access.
HSA_AMD_MEMORY_POOL_ACCESS_ALLOWED_BY_DEFAULT = 1
# The agent can directly access a buffer located in the pool, but only if the
# application has previously requested access to that buffer using
# hsa_amd_agents_allow_access.
HSA_AMD_MEMORY_POOL_ACCESS_DISALLOWED_BY_DEFAULT = 2
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#
# Properties of the relationship between an agent and a memory pool.
#
# Enums of the type hsa_amd_link_info_type_t
# Hyper-transport bus type.
HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT = 0
# QPI bus type.
HSA_AMD_LINK_INFO_TYPE_QPI = 1
# PCIe bus type.
HSA_AMD_LINK_INFO_TYPE_PCIE = 2
# Infiniband bus type.
HSA_AMD_LINK_INFO_TYPE_INFINBAND = 3
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#
# Access to buffers located in the memory pool. The type of this attribute
# is hsa_amd_memory_pool_access_t.
#
# Enums of type hsa_amd_agent_memory_pool_info_t.
# An agent can always directly access buffers currently located in a memory
# pool that is associated (the memory_pool is one of the values returned by
# hsa_amd_agent_iterate_memory_pools on the agent) with that agent. If the
# buffer is currently located in a memory pool that is not associated with
# the agent, and the value returned by this function for the given
# combination of agent and memory pool is not
# HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED, the application still needs to
# invoke hsa_amd_agents_allow_access in order to gain direct access to the
# buffer.
# If the given agent can directly access buffers in the pool, the result is
# not HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED. If the memory pool is
# associated with the agent, or it is of fine-grained type, the result must
# not be
# HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED. If the memory pool is not
# associated with the agent, and does not reside in the global segment, the
# result must be HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED.
HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS = 0
# Number of links to hop when accessing the memory pool from the specified
# agent. The type of this attribute is uint32_t.
HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS = 1
# Details of each link hop when accessing the memory pool starting from the
# specified agent. The type of this attribute is an array of
# HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS elements, each of type
# hsa_amd_memory_pool_link_info_t.
HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO = 2
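# Example (sketch): these attributes are read through the C entry point
# hsa_amd_agent_memory_pool_get_info declared in hsa_ext_amd.h; ``hsa``,
# ``agent``, ``pool`` and ``buf`` below assume a ctypes binding along the
# lines of the one used elsewhere in this package:
#
#     access = ctypes.c_uint32()
#     hsa.hsa_amd_agent_memory_pool_get_info(
#         agent, pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS,
#         ctypes.byref(access))
#     if access.value == HSA_AMD_MEMORY_POOL_ACCESS_DISALLOWED_BY_DEFAULT:
#         hsa.hsa_amd_agents_allow_access(1, ctypes.byref(agent), None, buf)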
#------------------------------------------------------------------------------
class HsaDriverError(Exception):
pass
class HsaSupportError(ImportError):
pass
class HsaApiError(HsaDriverError):
def __init__(self, code, msg):
self.code = code
super(HsaApiError, self).__init__(msg)
class HsaWarning(UserWarning):
pass
class HsaKernelLaunchError(HsaDriverError):
pass
class HsaContextMismatchError(HsaDriverError):
def __init__(self, expect, got):
fmt = ("device array is associated with a different "
"context: expect {0} but got {1}")
msg = fmt.format(expect, got)
super(HsaContextMismatchError, self).__init__(msg)
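# Example (sketch): HsaApiError retains the raw HSA status code, so callers
# can branch on a specific status, e.g. the invalid-memory-pool status (40)
# from the extension enums. ``do_pool_allocation`` is a hypothetical call:
#
#     try:
#         do_pool_allocation()
#     except HsaApiError as e:
#         if e.code == 40:   # HSA_STATUS_ERROR_INVALID_MEMORY_POOL
#             raise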
import operator
from functools import reduce
from llvmlite.llvmpy.core import Type
import llvmlite.llvmpy.core as lc
import llvmlite.binding as ll
from llvmlite import ir
from numba import roc
from numba.core.imputils import Registry
from numba.core import types, cgutils
from numba.core.itanium_mangler import mangle_c, mangle, mangle_type
from numba.core.typing.npydecl import parse_dtype
from numba.roc import target
from numba.roc import stubs
from numba.roc import hlc
from numba.roc import enums
registry = Registry()
lower = registry.lower
_void_value = lc.Constant.null(lc.Type.pointer(lc.Type.int(8)))
# -----------------------------------------------------------------------------
def _declare_function(context, builder, name, sig, cargs,
mangler=mangle_c):
"""Insert declaration for a opencl builtin function.
Uses the Itanium mangler.
Args
----
context: target context
builder: llvm builder
name: str
symbol name
sig: signature
function signature of the symbol being declared
cargs: sequence of str
C type names for the arguments
mangler: a mangler function
function to use to mangle the symbol
"""
mod = builder.module
if sig.return_type == types.void:
llretty = lc.Type.void()
else:
llretty = context.get_value_type(sig.return_type)
llargs = [context.get_value_type(t) for t in sig.args]
fnty = Type.function(llretty, llargs)
mangled = mangler(name, cargs)
fn = mod.get_or_insert_function(fnty, mangled)
fn.calling_convention = target.CC_SPIR_FUNC
return fn
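# Example (sketch): for the builtins declared below, the Itanium mangler is
# expected to turn the C-style name and argument types into the SPIR symbol
# that is resolved against the device library, e.g.
#
#     mangle_c('get_global_id', ['unsigned int'])   # -> '_Z13get_global_idj'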
@lower(stubs.get_global_id, types.uint32)
def get_global_id_impl(context, builder, sig, args):
[dim] = args
get_global_id = _declare_function(context, builder, 'get_global_id', sig,
['unsigned int'])
res = builder.call(get_global_id, [dim])
return context.cast(builder, res, types.uintp, types.intp)
@lower(stubs.get_local_id, types.uint32)
def get_local_id_impl(context, builder, sig, args):
[dim] = args
get_local_id = _declare_function(context, builder, 'get_local_id', sig,
['unsigned int'])
res = builder.call(get_local_id, [dim])
return context.cast(builder, res, types.uintp, types.intp)
@lower(stubs.get_group_id, types.uint32)
def get_group_id_impl(context, builder, sig, args):
[dim] = args
get_group_id = _declare_function(context, builder, 'get_group_id', sig,
['unsigned int'])
res = builder.call(get_group_id, [dim])
return context.cast(builder, res, types.uintp, types.intp)
@lower(stubs.get_num_groups, types.uint32)
def get_num_groups_impl(context, builder, sig, args):
[dim] = args
get_num_groups = _declare_function(context, builder, 'get_num_groups', sig,
['unsigned int'])
res = builder.call(get_num_groups, [dim])
return context.cast(builder, res, types.uintp, types.intp)
@lower(stubs.get_work_dim)
def get_work_dim_impl(context, builder, sig, args):
get_work_dim = _declare_function(context, builder, 'get_work_dim', sig,
["void"])
res = builder.call(get_work_dim, [])
return res
@lower(stubs.get_global_size, types.uint32)
def get_global_size_impl(context, builder, sig, args):
[dim] = args
get_global_size = _declare_function(context, builder, 'get_global_size',
sig, ['unsigned int'])
res = builder.call(get_global_size, [dim])
return context.cast(builder, res, types.uintp, types.intp)
@lower(stubs.get_local_size, types.uint32)
def get_local_size_impl(context, builder, sig, args):
[dim] = args
get_local_size = _declare_function(context, builder, 'get_local_size',
sig, ['unsigned int'])
res = builder.call(get_local_size, [dim])
return context.cast(builder, res, types.uintp, types.intp)
@lower(stubs.barrier, types.uint32)
def barrier_one_arg_impl(context, builder, sig, args):
[flags] = args
barrier = _declare_function(context, builder, 'barrier', sig,
['unsigned int'])
builder.call(barrier, [flags])
return _void_value
@lower(stubs.barrier)
def barrier_no_arg_impl(context, builder, sig, args):
assert not args
sig = types.void(types.uint32)
barrier = _declare_function(context, builder, 'barrier', sig,
['unsigned int'])
flags = context.get_constant(types.uint32, enums.CLK_GLOBAL_MEM_FENCE)
builder.call(barrier, [flags])
return _void_value
@lower(stubs.mem_fence, types.uint32)
def mem_fence_impl(context, builder, sig, args):
[flags] = args
mem_fence = _declare_function(context, builder, 'mem_fence', sig,
['unsigned int'])
builder.call(mem_fence, [flags])
return _void_value
@lower(stubs.wavebarrier)
def wavebarrier_impl(context, builder, sig, args):
assert not args
fnty = Type.function(Type.void(), [])
fn = builder.module.declare_intrinsic('llvm.amdgcn.wave.barrier', fnty=fnty)
builder.call(fn, [])
return _void_value
@lower(stubs.activelanepermute_wavewidth,
types.Any, types.uint32, types.Any, types.bool_)
def activelanepermute_wavewidth_impl(context, builder, sig, args):
[src, laneid, identity, use_ident] = args
assert sig.args[0] == sig.args[2]
elem_type = sig.args[0]
bitwidth = elem_type.bitwidth
intbitwidth = Type.int(bitwidth)
i32 = Type.int(32)
i1 = Type.int(1)
name = "__hsail_activelanepermute_wavewidth_b{0}".format(bitwidth)
fnty = Type.function(intbitwidth, [intbitwidth, i32, intbitwidth, i1])
fn = builder.module.get_or_insert_function(fnty, name=name)
fn.calling_convention = target.CC_SPIR_FUNC
def cast(val):
return builder.bitcast(val, intbitwidth)
result = builder.call(fn, [cast(src), laneid, cast(identity), use_ident])
return builder.bitcast(result, context.get_value_type(elem_type))
def _gen_ds_permute(intrinsic_name):
def _impl(context, builder, sig, args):
"""
args are (index, src)
"""
assert sig.return_type == sig.args[1]
idx, src = args
i32 = Type.int(32)
fnty = Type.function(i32, [i32, i32])
fn = builder.module.declare_intrinsic(intrinsic_name, fnty=fnty)
        # The args are byte addressable; VGPRs are 4 bytes wide, so multiply
        # idx by 4. idx might be an int64; truncating to int32 is safe since
        # wavefront_size is never likely to overflow an int32.
idx = builder.trunc(idx, i32)
four = lc.Constant.int(i32, 4)
idx = builder.mul(idx, four)
# bit cast is so float32 works as packed i32, the return casts back
result = builder.call(fn, (idx, builder.bitcast(src, i32)))
return builder.bitcast(result, context.get_value_type(sig.return_type))
return _impl
lower(stubs.ds_permute, types.Any, types.Any)(_gen_ds_permute('llvm.amdgcn.ds.permute'))
lower(stubs.ds_bpermute, types.Any, types.Any)(_gen_ds_permute('llvm.amdgcn.ds.bpermute'))
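# Example (a minimal sketch, not part of this module): a kernel pulling each
# work-item's neighbour value via the backwards permute, assuming a
# wavefront of 64 lanes; the launch configuration is illustrative only:
#
#     @roc.jit
#     def rotate(io):
#         i = roc.get_global_id(0)
#         io[i] = roc.ds_bpermute((i + 1) % 64, io[i])
#
#     rotate[1, 64](arr)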
@lower(stubs.atomic.add, types.Array, types.intp, types.Any)
@lower(stubs.atomic.add, types.Array, types.UniTuple, types.Any)
@lower(stubs.atomic.add, types.Array, types.Tuple, types.Any)
def hsail_atomic_add_tuple(context, builder, sig, args):
aryty, indty, valty = sig.args
ary, inds, val = args
dtype = aryty.dtype
if indty == types.intp:
indices = [inds] # just a single integer
indty = [indty]
else:
indices = cgutils.unpack_tuple(builder, inds, count=len(indty))
indices = [context.cast(builder, i, t, types.intp)
for t, i in zip(indty, indices)]
if dtype != valty:
raise TypeError("expecting %s but got %s" % (dtype, valty))
if aryty.ndim != len(indty):
raise TypeError("indexing %d-D array with %d-D index" %
(aryty.ndim, len(indty)))
lary = context.make_array(aryty)(context, builder, ary)
ptr = cgutils.get_item_pointer(context, builder, aryty, lary, indices)
return builder.atomic_rmw("add", ptr, val, ordering='monotonic')
@lower(roc.shared.array, types.IntegerLiteral, types.Any)
def hsail_smem_alloc_array_integer(context, builder, sig, args):
length = sig.args[0].literal_value
dtype = parse_dtype(sig.args[1])
return _generic_array(context, builder, shape=(length,), dtype=dtype,
symbol_name='_hsapy_smem',
addrspace=target.SPIR_LOCAL_ADDRSPACE)
@lower(roc.shared.array, types.Tuple, types.Any)
@lower(roc.shared.array, types.UniTuple, types.Any)
def hsail_smem_alloc_array_tuple(context, builder, sig, args):
    shape = [s.literal_value for s in sig.args[0]]
dtype = parse_dtype(sig.args[1])
return _generic_array(context, builder, shape=shape, dtype=dtype,
symbol_name='_hsapy_smem',
addrspace=target.SPIR_LOCAL_ADDRSPACE)
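# Example (sketch): roc.shared.array allocates work-group local storage; a
# typical pattern stages data through it around a barrier. The work-group
# size of 64 and the float32 dtype are illustrative assumptions:
#
#     from numba import float32
#
#     @roc.jit
#     def reverse_block(arr):
#         tid = roc.get_local_id(0)
#         tmp = roc.shared.array(64, dtype=float32)
#         tmp[tid] = arr[tid]
#         roc.barrier()
#         arr[tid] = tmp[63 - tid]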
def _generic_array(context, builder, shape, dtype, symbol_name, addrspace):
elemcount = reduce(operator.mul, shape, 1)
lldtype = context.get_data_type(dtype)
laryty = Type.array(lldtype, elemcount)
if addrspace == target.SPIR_LOCAL_ADDRSPACE:
lmod = builder.module
# Create global variable in the requested address-space
gvmem = lmod.add_global_variable(laryty, symbol_name, addrspace)
if elemcount <= 0:
raise ValueError("array length <= 0")
else:
gvmem.linkage = lc.LINKAGE_INTERNAL
if dtype not in types.number_domain:
raise TypeError("unsupported type: %s" % dtype)
# Convert to generic address-space
dataptr = context.addrspacecast(builder, gvmem,
target.SPIR_GENERIC_ADDRSPACE)
else:
raise NotImplementedError("addrspace {addrspace}".format(**locals()))
return _make_array(context, builder, dataptr, dtype, shape)
def _make_array(context, builder, dataptr, dtype, shape, layout='C'):
ndim = len(shape)
# Create array object
aryty = types.Array(dtype=dtype, ndim=ndim, layout='C')
ary = context.make_array(aryty)(context, builder)
targetdata = _get_target_data(context)
lldtype = context.get_data_type(dtype)
itemsize = lldtype.get_abi_size(targetdata)
# Compute strides
rstrides = [itemsize]
    for lastsize in reversed(shape[1:]):
        rstrides.append(lastsize * rstrides[-1])
    strides = list(reversed(rstrides))
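    # Worked example: for a C-contiguous (3, 4) float32 array the loop above
    # yields rstrides = [4, 16] and hence strides = (16, 4): moving one row
    # advances 4 elements * 4 bytes = 16 bytes, moving one column 4 bytes.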
kshape = [context.get_constant(types.intp, s) for s in shape]
kstrides = [context.get_constant(types.intp, s) for s in strides]
context.populate_array(ary,
data=builder.bitcast(dataptr, ary.data.type),
shape=kshape,
strides=kstrides,
itemsize=context.get_constant(types.intp, itemsize),
meminfo=None)
return ary._getvalue()
def _get_target_data(context):
return ll.create_target_data(hlc.DATALAYOUT[context.address_size])
#### Additional initialization code ######
def _initialize_ufunc():
from numba.np.ufunc import Vectorize
def init_vectorize():
from numba.roc.vectorizers import HsaVectorize
return HsaVectorize
Vectorize.target_registry.ondemand['roc'] = init_vectorize
def _initialize_gufunc():
from numba.np.ufunc import GUVectorize
def init_guvectorize():
from numba.roc.vectorizers import HsaGUFuncVectorize
return HsaGUFuncVectorize
GUVectorize.target_registry.ondemand['roc'] = init_guvectorize
_initialize_ufunc()
_initialize_gufunc()
import math
from numba.core import types, utils
from numba.core.typing.templates import (AttributeTemplate, ConcreteTemplate,
signature, Registry)
registry = Registry()
builtin_attr = registry.register_attr
infer_global = registry.register_global
@builtin_attr
class MathModuleAttribute(AttributeTemplate):
key = types.Module(math)
def resolve_fabs(self, mod):
return types.Function(Math_fabs)
def resolve_exp(self, mod):
return types.Function(Math_exp)
def resolve_expm1(self, mod):
return types.Function(Math_expm1)
def resolve_sqrt(self, mod):
return types.Function(Math_sqrt)
def resolve_log(self, mod):
return types.Function(Math_log)
def resolve_log1p(self, mod):
return types.Function(Math_log1p)
def resolve_log10(self, mod):
return types.Function(Math_log10)
def resolve_sin(self, mod):
return types.Function(Math_sin)
def resolve_cos(self, mod):
return types.Function(Math_cos)
def resolve_tan(self, mod):
return types.Function(Math_tan)
def resolve_sinh(self, mod):
return types.Function(Math_sinh)
def resolve_cosh(self, mod):
return types.Function(Math_cosh)
def resolve_tanh(self, mod):
return types.Function(Math_tanh)
def resolve_asin(self, mod):
return types.Function(Math_asin)
def resolve_acos(self, mod):
return types.Function(Math_acos)
def resolve_atan(self, mod):
return types.Function(Math_atan)
def resolve_atan2(self, mod):
return types.Function(Math_atan2)
def resolve_asinh(self, mod):
return types.Function(Math_asinh)
def resolve_acosh(self, mod):
return types.Function(Math_acosh)
def resolve_atanh(self, mod):
return types.Function(Math_atanh)
def resolve_pi(self, mod):
return types.float64
def resolve_e(self, mod):
return types.float64
def resolve_floor(self, mod):
return types.Function(Math_floor)
def resolve_ceil(self, mod):
return types.Function(Math_ceil)
def resolve_trunc(self, mod):
return types.Function(Math_trunc)
def resolve_isnan(self, mod):
return types.Function(Math_isnan)
def resolve_isinf(self, mod):
return types.Function(Math_isinf)
def resolve_degrees(self, mod):
return types.Function(Math_degrees)
def resolve_radians(self, mod):
return types.Function(Math_radians)
# def resolve_hypot(self, mod):
# return types.Function(Math_hypot)
def resolve_copysign(self, mod):
return types.Function(Math_copysign)
def resolve_fmod(self, mod):
return types.Function(Math_fmod)
def resolve_pow(self, mod):
return types.Function(Math_pow)
def resolve_erf(self, mod):
return types.Function(Math_erf)
def resolve_erfc(self, mod):
return types.Function(Math_erfc)
def resolve_gamma(self, mod):
return types.Function(Math_gamma)
def resolve_lgamma(self, mod):
return types.Function(Math_lgamma)
class Math_unary(ConcreteTemplate):
cases = [
signature(types.float64, types.int64),
signature(types.float64, types.uint64),
signature(types.float32, types.float32),
signature(types.float64, types.float64),
]
class Math_fabs(Math_unary):
key = math.fabs
class Math_exp(Math_unary):
key = math.exp
class Math_expm1(Math_unary):
key = math.expm1
class Math_sqrt(Math_unary):
key = math.sqrt
class Math_log(Math_unary):
key = math.log
class Math_log1p(Math_unary):
key = math.log1p
class Math_log10(Math_unary):
key = math.log10
class Math_sin(Math_unary):
key = math.sin
class Math_cos(Math_unary):
key = math.cos
class Math_tan(Math_unary):
key = math.tan
class Math_sinh(Math_unary):
key = math.sinh
class Math_cosh(Math_unary):
key = math.cosh
class Math_tanh(Math_unary):
key = math.tanh
class Math_asin(Math_unary):
key = math.asin
class Math_acos(Math_unary):
key = math.acos
class Math_atan(Math_unary):
key = math.atan
class Math_atan2(ConcreteTemplate):
key = math.atan2
cases = [
signature(types.float64, types.int64, types.int64),
signature(types.float64, types.uint64, types.uint64),
signature(types.float32, types.float32, types.float32),
signature(types.float64, types.float64, types.float64),
]
class Math_asinh(Math_unary):
key = math.asinh
class Math_acosh(Math_unary):
key = math.acosh
class Math_atanh(Math_unary):
key = math.atanh
class Math_floor(Math_unary):
key = math.floor
class Math_ceil(Math_unary):
key = math.ceil
class Math_trunc(Math_unary):
key = math.trunc
class Math_radians(Math_unary):
key = math.radians
class Math_degrees(Math_unary):
key = math.degrees
# class Math_hypot(ConcreteTemplate):
# key = math.hypot
# cases = [
# signature(types.float64, types.int64, types.int64),
# signature(types.float64, types.uint64, types.uint64),
# signature(types.float32, types.float32, types.float32),
# signature(types.float64, types.float64, types.float64),
# ]
class Math_erf(Math_unary):
key = math.erf
class Math_erfc(Math_unary):
key = math.erfc
class Math_gamma(Math_unary):
key = math.gamma
class Math_lgamma(Math_unary):
key = math.lgamma
class Math_binary(ConcreteTemplate):
cases = [
signature(types.float32, types.float32, types.float32),
signature(types.float64, types.float64, types.float64),
]
class Math_copysign(Math_binary):
key = math.copysign
class Math_fmod(Math_binary):
key = math.fmod
class Math_pow(ConcreteTemplate):
key = math.pow
cases = [
signature(types.float32, types.float32, types.float32),
signature(types.float64, types.float64, types.float64),
signature(types.float32, types.float32, types.int32),
signature(types.float64, types.float64, types.int32),
]
class Math_isnan(ConcreteTemplate):
key = math.isnan
cases = [
signature(types.boolean, types.int64),
signature(types.boolean, types.uint64),
signature(types.boolean, types.float32),
signature(types.boolean, types.float64),
]
class Math_isinf(ConcreteTemplate):
key = math.isinf
cases = [
signature(types.boolean, types.int64),
signature(types.boolean, types.uint64),
signature(types.boolean, types.float32),
signature(types.boolean, types.float64),
]
infer_global(math, types.Module(math))
infer_global(math.fabs, types.Function(Math_fabs))
infer_global(math.exp, types.Function(Math_exp))
infer_global(math.expm1, types.Function(Math_expm1))
infer_global(math.sqrt, types.Function(Math_sqrt))
infer_global(math.log, types.Function(Math_log))
infer_global(math.log1p, types.Function(Math_log1p))
infer_global(math.log10, types.Function(Math_log10))
infer_global(math.sin, types.Function(Math_sin))
infer_global(math.cos, types.Function(Math_cos))
infer_global(math.tan, types.Function(Math_tan))
infer_global(math.sinh, types.Function(Math_sinh))
infer_global(math.cosh, types.Function(Math_cosh))
infer_global(math.tanh, types.Function(Math_tanh))
infer_global(math.asin, types.Function(Math_asin))
infer_global(math.acos, types.Function(Math_acos))
infer_global(math.atan, types.Function(Math_atan))
infer_global(math.atan2, types.Function(Math_atan2))
infer_global(math.asinh, types.Function(Math_asinh))
infer_global(math.acosh, types.Function(Math_acosh))
infer_global(math.atanh, types.Function(Math_atanh))
# infer_global(math.hypot, types.Function(Math_hypot))
infer_global(math.floor, types.Function(Math_floor))
infer_global(math.ceil, types.Function(Math_ceil))
infer_global(math.trunc, types.Function(Math_trunc))
infer_global(math.isnan, types.Function(Math_isnan))
infer_global(math.isinf, types.Function(Math_isinf))
infer_global(math.degrees, types.Function(Math_degrees))
infer_global(math.radians, types.Function(Math_radians))
infer_global(math.copysign, types.Function(Math_copysign))
infer_global(math.fmod, types.Function(Math_fmod))
infer_global(math.pow, types.Function(Math_pow))
infer_global(math.erf, types.Function(Math_erf))
infer_global(math.erfc, types.Function(Math_erfc))
infer_global(math.gamma, types.Function(Math_gamma))
infer_global(math.lgamma, types.Function(Math_lgamma))
import math
import warnings
from numba.core.imputils import Registry
from numba.core import types
from numba.core.itanium_mangler import mangle
from .hsaimpl import _declare_function
registry = Registry()
lower = registry.lower
# -----------------------------------------------------------------------------
_unary_b_f = types.int32(types.float32)
_unary_b_d = types.int32(types.float64)
_unary_f_f = types.float32(types.float32)
_unary_d_d = types.float64(types.float64)
_binary_f_ff = types.float32(types.float32, types.float32)
_binary_d_dd = types.float64(types.float64, types.float64)
function_descriptors = {
'isnan': (_unary_b_f, _unary_b_d),
'isinf': (_unary_b_f, _unary_b_d),
'ceil': (_unary_f_f, _unary_d_d),
'floor': (_unary_f_f, _unary_d_d),
'fabs': (_unary_f_f, _unary_d_d),
'sqrt': (_unary_f_f, _unary_d_d),
'exp': (_unary_f_f, _unary_d_d),
'expm1': (_unary_f_f, _unary_d_d),
'log': (_unary_f_f, _unary_d_d),
'log10': (_unary_f_f, _unary_d_d),
'log1p': (_unary_f_f, _unary_d_d),
'sin': (_unary_f_f, _unary_d_d),
'cos': (_unary_f_f, _unary_d_d),
'tan': (_unary_f_f, _unary_d_d),
'asin': (_unary_f_f, _unary_d_d),
'acos': (_unary_f_f, _unary_d_d),
'atan': (_unary_f_f, _unary_d_d),
'sinh': (_unary_f_f, _unary_d_d),
'cosh': (_unary_f_f, _unary_d_d),
'tanh': (_unary_f_f, _unary_d_d),
'asinh': (_unary_f_f, _unary_d_d),
'acosh': (_unary_f_f, _unary_d_d),
'atanh': (_unary_f_f, _unary_d_d),
'copysign': (_binary_f_ff, _binary_d_dd),
'atan2': (_binary_f_ff, _binary_d_dd),
'pow': (_binary_f_ff, _binary_d_dd),
'fmod': (_binary_f_ff, _binary_d_dd),
'erf': (_unary_f_f, _unary_d_d),
'erfc': (_unary_f_f, _unary_d_d),
'gamma': (_unary_f_f, _unary_d_d),
'lgamma': (_unary_f_f, _unary_d_d),
# unsupported functions listed in the math module documentation:
# frexp, ldexp, trunc, modf, factorial, fsum
}
# Some functions are named differently by the underlying math library
# than in the Python math module.
_lib_counterpart = {
'gamma': 'tgamma'
}
def _mk_fn_decl(name, decl_sig):
sym = _lib_counterpart.get(name, name)
def core(context, builder, sig, args):
fn = _declare_function(context, builder, sym, decl_sig, decl_sig.args,
mangler=mangle)
res = builder.call(fn, args)
return context.cast(builder, res, decl_sig.return_type, sig.return_type)
core.__name__ = name
return core
_supported = ['sin', 'cos', 'tan', 'asin', 'acos', 'atan', 'atan2', 'sinh',
'cosh', 'tanh', 'asinh', 'acosh', 'atanh', 'isnan', 'isinf',
'ceil', 'floor', 'fabs', 'sqrt', 'exp', 'expm1', 'log',
'log10', 'log1p', 'copysign', 'pow', 'fmod', 'erf', 'erfc',
'gamma', 'lgamma',
]
for name in _supported:
sigs = function_descriptors.get(name)
if sigs is None:
warnings.warn("HSA - failed to register '{0}'".format(name))
continue
try:
# only symbols present in the math module
key = getattr(math, name)
except AttributeError:
continue
for sig in sigs:
fn = _mk_fn_decl(name, sig)
lower(key, *sig.args)(fn)
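# Example (sketch): with the registrations above in place, the supported
# math functions can be used directly in device code and lower to the
# OpenCL library symbols, e.g.
#
#     import math
#
#     @roc.jit
#     def apply_sigmoid(x, out):
#         i = roc.get_global_id(0)
#         if i < x.size:
#             out[i] = 1.0 / (1.0 + math.exp(-x[i]))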
from .service import Service
from .threadlocal import TLStack
"""
Implement background services for the application.
This is implemented as a cooperative concurrent task.
"""
import functools
class Service(object):
def __init__(self, name="unnamed", arg=None):
self.name = name
self.enabled = True
self.arg = arg
self._task = self.process(self.arg)
next(self._task)
def service(self):
"""
        Request servicing from the task.
        Servicing is skipped if it is disabled through the "enabled"
        attribute. While the task is executing, the service is disabled to
        avoid recursion.
"""
if self.enabled:
enable = self.enabled
try:
# Prevent recursion
self.enabled = False
next(self._task)
finally:
self.enabled = enable
def process(self, arg):
"""
        Override this to implement the service task.
This must be a generator.
Use `yield` to return control.
"""
raise NotImplementedError
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
self.service()
def after(self, fn):
"""
A decorator for a function. Service is triggered on return.
"""
@functools.wraps(fn)
def wrap(*args, **kws):
with self:
return fn(*args, **kws)
return wrap
# -----------------------------------------------------------------------------
# The rest are for testing
class HelloService(Service):
def process(self, arg):
count = 0
yield
while True:
print("Hello", count)
count += 1
yield
def test():
serv = HelloService("my.hello")
print("1")
serv.service()
print("2")
serv.service()
with serv:
print("3")
@serv.after
def nested():
print("4")
nested()
if __name__ == '__main__':
test()
"""
Implements:
- Threadlocal stack
"""
import threading
class TLStack(object):
def __init__(self):
self.local = threading.local()
@property
def stack(self):
try:
# Retrieve thread local stack
return self.local.stack
except AttributeError:
# Initialize stack for the thread
self.local.stack = []
return self.local.stack
def push(self, item):
self.stack.append(item)
def pop(self):
return self.stack.pop()
@property
def top(self):
return self.stack[-1]
@property
def is_empty(self):
return not self.stack
def __bool__(self):
return not self.is_empty
def __nonzero__(self):
return self.__bool__()
def __len__(self):
return len(self.stack)
def clear(self):
self.__init__()
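# Example (sketch): each thread observes its own independent stack, so
# pushes in one thread never leak into another:
#
#     tls = TLStack()
#     tls.push('ctx')
#     assert tls.top == 'ctx' and len(tls) == 1
#     tls.pop()
#     assert tls.is_empty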
from numba.core import types, typing, ir
_stub_error = NotImplementedError("This is a stub.")
def get_global_id(*args, **kargs):
"""
OpenCL get_global_id()
"""
raise _stub_error
def get_local_id(*args, **kargs):
"""
OpenCL get_local_id()
"""
raise _stub_error
def get_global_size(*args, **kargs):
"""
OpenCL get_global_size()
"""
raise _stub_error
def get_local_size(*args, **kargs):
"""
OpenCL get_local_size()
"""
raise _stub_error
def get_group_id(*args, **kargs):
"""
OpenCL get_group_id()
"""
raise _stub_error
def get_num_groups(*args, **kargs):
"""
OpenCL get_num_groups()
"""
raise _stub_error
def get_work_dim(*args, **kargs):
"""
OpenCL get_work_dim()
"""
raise _stub_error
def barrier(*args, **kargs):
"""
OpenCL barrier()
Example:
# workgroup barrier + local memory fence
hsa.barrier(hsa.CLK_LOCAL_MEM_FENCE)
# workgroup barrier + global memory fence
hsa.barrier(hsa.CLK_GLOBAL_MEM_FENCE)
# workgroup barrier + global memory fence
hsa.barrier()
"""
raise _stub_error
def mem_fence(*args, **kargs):
"""
OpenCL mem_fence()
Example:
# local memory fence
hsa.mem_fence(hsa.CLK_LOCAL_MEM_FENCE)
# global memory fence
hsa.mem_fence(hsa.CLK_GLOBAL_MEM_FENCE)
"""
raise _stub_error
def wavebarrier():
"""
HSAIL wavebarrier
"""
raise _stub_error
def activelanepermute_wavewidth(src, laneid, identity, useidentity):
"""
HSAIL activelanepermute_wavewidth_*
"""
raise _stub_error
def ds_permute(src_lane, dest_lane):
"""
AMDGCN Data Share intrinsic forwards permute (push semantics)
"""
raise _stub_error
def ds_bpermute(src_lane, dest_lane):
"""
AMDGCN Data Share intrinsic backwards permute (pull semantics)
"""
raise _stub_error
class Stub(object):
"""A stub object to represent special objects which is meaningless
outside the context of HSA-python.
"""
    _description_ = '<hsa special value>'
__slots__ = () # don't allocate __dict__
def __new__(cls):
raise NotImplementedError("%s is not instantiable" % cls)
def __repr__(self):
return self._description_
class shared(Stub):
"""shared namespace
"""
_description_ = '<shared>'
def array(shape, dtype):
"""shared.array(shape, dtype)
Allocate a shared memory array.
"""
#-------------------------------------------------------------------------------
# atomic
class atomic(Stub):
"""atomic namespace
"""
_description_ = '<atomic>'
class add(Stub):
"""add(ary, idx, val)
Perform atomic ary[idx] += val
"""
import re
from llvmlite.llvmpy import core as lc
from llvmlite import ir as llvmir
from llvmlite import binding as ll
from numba.core import typing, types, utils, datamodel, cgutils
from numba.core.utils import cached_property
from numba.core.base import BaseContext
from numba.core.callconv import MinimalCallConv
from numba.roc import codegen
from .hlc import DATALAYOUT
CC_SPIR_KERNEL = "spir_kernel"
CC_SPIR_FUNC = ""
# -----------------------------------------------------------------------------
# Typing
class HSATypingContext(typing.BaseContext):
def load_additional_registries(self):
from . import hsadecl, mathdecl
self.install_registry(hsadecl.registry)
self.install_registry(mathdecl.registry)
# -----------------------------------------------------------------------------
# Implementation
VALID_CHARS = re.compile(r'[^a-z0-9]', re.I)
# Address spaces
SPIR_GENERIC_ADDRSPACE = 0
SPIR_GLOBAL_ADDRSPACE = 1
SPIR_REGION_ADDRSPACE = 2
SPIR_CONSTANT_ADDRSPACE = 4
SPIR_LOCAL_ADDRSPACE = 3
SPIR_PRIVATE_ADDRSPACE = 5
SPIR_CONSTANT_32BIT_ADDRSPACE = 6
SPIR_VERSION = (2, 0)
class GenericPointerModel(datamodel.PrimitiveModel):
def __init__(self, dmm, fe_type):
adrsp = SPIR_GENERIC_ADDRSPACE
be_type = dmm.lookup(fe_type.dtype).get_data_type().as_pointer(adrsp)
super(GenericPointerModel, self).__init__(dmm, fe_type, be_type)
def _init_data_model_manager():
dmm = datamodel.default_manager.copy()
dmm.register(types.CPointer, GenericPointerModel)
return dmm
hsa_data_model_manager = _init_data_model_manager()
class HSATargetContext(BaseContext):
implement_powi_as_math_call = True
generic_addrspace = SPIR_GENERIC_ADDRSPACE
def init(self):
self._internal_codegen = codegen.JITHSACodegen("numba.hsa.jit")
self._target_data = \
ll.create_target_data(DATALAYOUT[utils.MACHINE_BITS])
# Override data model manager
self.data_model_manager = hsa_data_model_manager
def load_additional_registries(self):
from . import hsaimpl, mathimpl
self.insert_func_defn(hsaimpl.registry.functions)
self.insert_func_defn(mathimpl.registry.functions)
@cached_property
def call_conv(self):
return HSACallConv(self)
def codegen(self):
return self._internal_codegen
@property
def target_data(self):
return self._target_data
def mangler(self, name, argtypes):
def repl(m):
ch = m.group(0)
return "_%X_" % ord(ch)
qualified = name + '.' + '.'.join(str(a) for a in argtypes)
mangled = VALID_CHARS.sub(repl, qualified)
return 'hsapy_devfn_' + mangled
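    # Example: for name 'foo.bar' and argtypes (int64,), ``qualified`` is
    # 'foo.bar.int64'; each character outside [a-zA-Z0-9] (here the dots,
    # 0x2E) becomes '_2E_', giving 'hsapy_devfn_foo_2E_bar_2E_int64'.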
def prepare_hsa_kernel(self, func, argtypes):
module = func.module
func.linkage = 'linkonce_odr'
module.data_layout = DATALAYOUT[self.address_size]
wrapper = self.generate_kernel_wrapper(func, argtypes)
return wrapper
def mark_hsa_device(self, func):
# Adapt to SPIR
# module = func.module
func.calling_convention = CC_SPIR_FUNC
func.linkage = 'linkonce_odr'
return func
def generate_kernel_wrapper(self, func, argtypes):
module = func.module
arginfo = self.get_arg_packer(argtypes)
def sub_gen_with_global(lty):
if isinstance(lty, llvmir.PointerType):
return (lty.pointee.as_pointer(SPIR_GLOBAL_ADDRSPACE),
lty.addrspace)
return lty, None
if len(arginfo.argument_types) > 0:
llargtys, changed = zip(*map(sub_gen_with_global,
arginfo.argument_types))
else:
llargtys = changed = ()
wrapperfnty = lc.Type.function(lc.Type.void(), llargtys)
wrapper_module = self.create_module("hsa.kernel.wrapper")
wrappername = 'hsaPy_{name}'.format(name=func.name)
argtys = list(arginfo.argument_types)
fnty = lc.Type.function(lc.Type.int(),
[self.call_conv.get_return_type(
types.pyobject)] + argtys)
func = wrapper_module.add_function(fnty, name=func.name)
func.calling_convention = CC_SPIR_FUNC
wrapper = wrapper_module.add_function(wrapperfnty, name=wrappername)
builder = lc.Builder(wrapper.append_basic_block(''))
# Adjust address space of each kernel argument
fixed_args = []
for av, adrsp in zip(wrapper.args, changed):
if adrsp is not None:
casted = self.addrspacecast(builder, av, adrsp)
fixed_args.append(casted)
else:
fixed_args.append(av)
callargs = arginfo.from_arguments(builder, fixed_args)
# XXX handle error status
status, _ = self.call_conv.call_function(builder, func, types.void,
argtypes, callargs)
builder.ret_void()
set_hsa_kernel(wrapper)
# Link
module.link_in(ll.parse_assembly(str(wrapper_module)))
        # Enable inlining, which is essential because addrspacecast 1->0 is
        # illegal; inlining will optimize the addrspacecast out.
func.linkage = 'internal'
wrapper = module.get_function(wrapper.name)
module.get_function(func.name).linkage = 'internal'
return wrapper
def declare_function(self, module, fndesc):
ret = super(HSATargetContext, self).declare_function(module, fndesc)
# XXX: Refactor fndesc instead of this special case
if fndesc.llvm_func_name.startswith('hsapy_devfn'):
ret.calling_convention = CC_SPIR_FUNC
return ret
def make_constant_array(self, builder, typ, ary):
"""
Return dummy value.
"""
#
# a = self.make_array(typ)(self, builder)
# return a._getvalue()
raise NotImplementedError
def addrspacecast(self, builder, src, addrspace):
"""
Handle addrspacecast
"""
ptras = llvmir.PointerType(src.type.pointee, addrspace=addrspace)
return builder.addrspacecast(src, ptras)
def set_hsa_kernel(fn):
"""
Ensure `fn` is usable as a SPIR kernel.
- Fix calling convention
- Add metadata
"""
mod = fn.module
# Set nounwind
# fn.add_attribute(lc.ATTR_NO_UNWIND)
# Set SPIR kernel calling convention
fn.calling_convention = CC_SPIR_KERNEL
# Mark kernels
ocl_kernels = mod.get_or_insert_named_metadata("opencl.kernels")
ocl_kernels.add(lc.MetaData.get(mod, [fn,
gen_arg_addrspace_md(fn),
gen_arg_access_qual_md(fn),
gen_arg_type(fn),
gen_arg_type_qual(fn),
gen_arg_base_type(fn)]))
# SPIR version 2.0
make_constant = lambda x: lc.Constant.int(lc.Type.int(), x)
spir_version_constant = [make_constant(x) for x in SPIR_VERSION]
spir_version = mod.get_or_insert_named_metadata("opencl.spir.version")
if not spir_version.operands:
spir_version.add(lc.MetaData.get(mod, spir_version_constant))
ocl_version = mod.get_or_insert_named_metadata("opencl.ocl.version")
if not ocl_version.operands:
ocl_version.add(lc.MetaData.get(mod, spir_version_constant))
## The following metadata does not seem to be necessary
# Other metadata
# empty_md = lc.MetaData.get(mod, ())
# others = ["opencl.used.extensions",
# "opencl.used.optional.core.features",
# "opencl.compiler.options"]cat
#
# for name in others:
# nmd = mod.get_or_insert_named_metadata(name)
# if not nmd.operands:
# nmd.add(empty_md)
def gen_arg_addrspace_md(fn):
"""
Generate kernel_arg_addr_space metadata
"""
mod = fn.module
fnty = fn.type.pointee
codes = []
for a in fnty.args:
if cgutils.is_pointer(a):
codes.append(SPIR_GLOBAL_ADDRSPACE)
else:
codes.append(SPIR_PRIVATE_ADDRSPACE)
consts = [lc.Constant.int(lc.Type.int(), x) for x in codes]
name = lc.MetaDataString.get(mod, "kernel_arg_addr_space")
return lc.MetaData.get(mod, [name] + consts)
def gen_arg_access_qual_md(fn):
"""
Generate kernel_arg_access_qual metadata
"""
mod = fn.module
consts = [lc.MetaDataString.get(mod, "none")] * len(fn.args)
name = lc.MetaDataString.get(mod, "kernel_arg_access_qual")
return lc.MetaData.get(mod, [name] + consts)
def gen_arg_type(fn):
"""
Generate kernel_arg_type metadata
"""
mod = fn.module
fnty = fn.type.pointee
consts = [lc.MetaDataString.get(mod, str(a)) for a in fnty.args]
name = lc.MetaDataString.get(mod, "kernel_arg_type")
return lc.MetaData.get(mod, [name] + consts)
def gen_arg_type_qual(fn):
"""
Generate kernel_arg_type_qual metadata
"""
mod = fn.module
fnty = fn.type.pointee
consts = [lc.MetaDataString.get(mod, "") for _ in fnty.args]
name = lc.MetaDataString.get(mod, "kernel_arg_type_qual")
return lc.MetaData.get(mod, [name] + consts)
def gen_arg_base_type(fn):
"""
Generate kernel_arg_base_type metadata
"""
mod = fn.module
fnty = fn.type.pointee
consts = [lc.MetaDataString.get(mod, str(a)) for a in fnty.args]
name = lc.MetaDataString.get(mod, "kernel_arg_base_type")
return lc.MetaData.get(mod, [name] + consts)
class HSACallConv(MinimalCallConv):
def call_function(self, builder, callee, resty, argtys, args, env=None):
"""
Call the Numba-compiled *callee*.
"""
assert env is None
retty = callee.args[0].type.pointee
retvaltmp = cgutils.alloca_once(builder, retty)
# initialize return value
builder.store(cgutils.get_null_value(retty), retvaltmp)
arginfo = self.context.get_arg_packer(argtys)
args = arginfo.as_arguments(builder, args)
realargs = [retvaltmp] + list(args)
code = builder.call(callee, realargs)
status = self._get_return_status(builder, code)
retval = builder.load(retvaltmp)
out = self.context.get_returned_value(builder, resty, retval)
return status, out
from numba.testing import SerialSuite
from numba.testing import load_testsuite
from numba import roc
from os.path import dirname, join
def load_tests(loader, tests, pattern):
suite = SerialSuite()
this_dir = dirname(__file__)
if roc.is_available():
suite.addTests(load_testsuite(loader, join(this_dir, 'hsadrv')))
suite.addTests(load_testsuite(loader, join(this_dir, 'hsapy')))
else:
print("skipped HSA tests")
return suite
from numba.testing import SerialSuite
from numba.testing import load_testsuite
import os
def load_tests(loader, tests, pattern):
return SerialSuite(load_testsuite(loader, os.path.dirname(__file__)))
import numpy as np
from numba import roc
import unittest
from numba.roc.hsadrv.driver import dgpu_present
@unittest.skipUnless(dgpu_present, 'test only on dGPU system')
class TestAsync(unittest.TestCase):
def test_coarsegrain_array(self):
arr = roc.coarsegrain_array(shape=1024, dtype=np.float32)
self.assertEqual(arr.size, 1024)
arr[:] = expect = np.arange(arr.size)
np.testing.assert_allclose(arr, expect)
def test_async_copy_to_device(self):
arr = np.arange(1024)
devarr = roc.to_device(arr)
# allocate pinned array equivalent
hostarr = roc.coarsegrain_array(shape=arr.shape, dtype=arr.dtype)
hostarr[:] = arr + 100
stream = roc.stream()
ct = len(stream._signals)
devarr.copy_to_device(hostarr, stream=stream)
self.assertEqual(ct + 1, len(stream._signals),
"no new async signal")
# implicit synchronization
got = devarr.copy_to_host()
self.assertEqual(0, len(stream._signals),
"missing implicit synchronization")
np.testing.assert_equal(hostarr, got)
def test_async_copy_to_device_and_back(self):
arr = np.arange(1024)
hostarr = roc.coarsegrain_array(shape=arr.shape, dtype=arr.dtype)
gotarr = roc.coarsegrain_array(shape=arr.shape, dtype=arr.dtype)
stream = roc.stream()
ct = len(stream._signals)
devarr = roc.to_device(hostarr, stream=stream)
self.assertEqual(ct + 1, len(stream._signals))
devarr.copy_to_host(gotarr, stream=stream)
self.assertEqual(ct + 2, len(stream._signals))
stream.synchronize()
self.assertEqual(0, len(stream._signals))
np.testing.assert_equal(hostarr, gotarr)
if __name__ == '__main__':
unittest.main()
import ctypes
import os
import threading
import numpy as np
import unittest
from numba.roc.hsadrv.driver import hsa, Queue, Program, Executable,\
BrigModule, Context, dgpu_present
from numba.roc.hsadrv.driver import hsa as roc
import numba.roc.api as hsaapi
from numba import float32, float64, vectorize
from numba.roc.hsadrv import drvapi
from numba.roc.hsadrv import enums
from numba.roc.hsadrv import enums_ext
from numba.core import config
try:
import queue
except ImportError:
import Queue as queue
class TestLowLevelApi(unittest.TestCase):
"""This test checks that all the functions defined in drvapi
bind properly using ctypes."""
def test_functions_available(self):
missing_functions = []
for fname in drvapi.API_PROTOTYPES.keys():
try:
getattr(hsa, fname)
except Exception as e:
missing_functions.append("'{0}': {1}".format(fname, str(e)))
self.assertEqual(len(missing_functions), 0,
msg='\n'.join(missing_functions))
class TestAgents(unittest.TestCase):
def test_agents_init(self):
self.assertGreater(len(roc.agents), 0)
def test_agents_create_queue_single(self):
for agent in roc.agents:
if agent.is_component:
queue = agent.create_queue_single(2 ** 5)
self.assertIsInstance(queue, Queue)
def test_agents_create_queue_multi(self):
for agent in roc.agents:
if agent.is_component:
queue = agent.create_queue_multi(2 ** 5)
self.assertIsInstance(queue, Queue)
def test_agent_wavebits(self):
for agent in roc.agents:
if agent.is_component:
if agent.name.decode() in ['gfx803', 'gfx900']:
self.assertEqual(agent.wavebits, 6)
class _TestBase(unittest.TestCase):
def setUp(self):
self.gpu = [a for a in roc.agents if a.is_component][0]
self.cpu = [a for a in roc.agents if not a.is_component][0]
self.queue = self.gpu.create_queue_multi(self.gpu.queue_max_size)
def tearDown(self):
del self.queue
del self.gpu
del self.cpu
def get_brig_file():
    path = '/opt/rocm/hsa/sample/vector_copy_full.brig'
assert os.path.isfile(path)
return path
def _check_example_file():
try:
get_brig_file()
except Exception:
return False
return True
has_brig_example = _check_example_file()
@unittest.skipUnless(has_brig_example, "Brig example not found")
class TestBrigModule(unittest.TestCase):
def test_from_file(self):
brig_file = get_brig_file()
brig_module = BrigModule.from_file(brig_file)
self.assertGreater(len(brig_module), 0)
@unittest.skipUnless(has_brig_example, "Brig example not found")
class TestProgram(_TestBase):
def test_create_program(self):
brig_file = get_brig_file()
symbol = '&__vector_copy_kernel'
brig_module = BrigModule.from_file(brig_file)
program = Program()
program.add_module(brig_module)
code = program.finalize(self.gpu.isa)
ex = Executable()
ex.load(self.gpu, code)
ex.freeze()
sym = ex.get_symbol(self.gpu, symbol)
self.assertGreater(sym.kernarg_segment_size, 0)
class TestMemory(_TestBase):
def test_region_list(self):
self.assertGreater(len(self.gpu.regions.globals), 0)
self.assertGreater(len(self.gpu.regions.groups), 0)
        # The following may be empty
# print(self.gpu.regions.privates)
# print(self.gpu.regions.readonlys)
def test_register(self):
src = np.random.random(1024).astype(np.float32)
roc.hsa_memory_register(src.ctypes.data, src.nbytes)
roc.hsa_memory_deregister(src.ctypes.data, src.nbytes)
def test_allocate(self):
regions = self.gpu.regions
# More than one region
self.assertGreater(len(regions), 0)
# Find kernel argument regions
kernarg_regions = list()
for r in regions:
if r.supports(enums.HSA_REGION_GLOBAL_FLAG_KERNARG):
kernarg_regions.append(r)
self.assertGreater(len(kernarg_regions), 0)
# Test allocating at the kernel argument region
kernarg_region = kernarg_regions[0]
nelem = 10
ptr = kernarg_region.allocate(ctypes.sizeof(ctypes.c_float) * nelem)
self.assertNotEqual(ctypes.addressof(ptr), 0,
"pointer must not be NULL")
# Test writing to it
src = np.random.random(nelem).astype(np.float32)
ctypes.memmove(ptr, src.ctypes.data, src.nbytes)
ref = (ctypes.c_float * nelem).from_address(ptr.value)
for i in range(src.size):
self.assertEqual(ref[i], src[i])
roc.hsa_memory_free(ptr)
@unittest.skipUnless(dgpu_present, "dGPU only")
def test_coarse_grained_allocate(self):
"""
Tests the coarse grained allocation works on a dGPU.
It performs a data copying round trip via:
memory
|
HSA cpu memory
|
HSA dGPU host accessible memory <---|
| |
HSA dGPU memory --------------------|
"""
gpu_regions = self.gpu.regions
gpu_only_coarse_regions = list()
gpu_host_accessible_coarse_regions = list()
for r in gpu_regions:
if r.supports(enums.HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED):
if r.host_accessible:
gpu_host_accessible_coarse_regions.append(r)
else:
gpu_only_coarse_regions.append(r)
# check we have 1+ coarse gpu region(s) of each type
self.assertGreater(len(gpu_only_coarse_regions), 0)
self.assertGreater(len(gpu_host_accessible_coarse_regions), 0)
cpu_regions = self.cpu.regions
cpu_coarse_regions = list()
for r in cpu_regions:
if r.supports(enums.HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED):
cpu_coarse_regions.append(r)
# check we have 1+ coarse cpu region(s)
self.assertGreater(len(cpu_coarse_regions), 0)
# ten elements of data used
nelem = 10
# allocation
cpu_region = cpu_coarse_regions[0]
cpu_ptr = cpu_region.allocate(ctypes.sizeof(ctypes.c_float) * nelem)
self.assertNotEqual(ctypes.addressof(cpu_ptr), 0,
"pointer must not be NULL")
gpu_only_region = gpu_only_coarse_regions[0]
gpu_only_ptr = gpu_only_region.allocate(ctypes.sizeof(ctypes.c_float) *
nelem)
self.assertNotEqual(ctypes.addressof(gpu_only_ptr), 0,
"pointer must not be NULL")
gpu_host_accessible_region = gpu_host_accessible_coarse_regions[0]
gpu_host_accessible_ptr = gpu_host_accessible_region.allocate(
ctypes.sizeof(ctypes.c_float) * nelem)
self.assertNotEqual(ctypes.addressof(gpu_host_accessible_ptr), 0,
"pointer must not be NULL")
# Test writing to allocated area
src = np.random.random(nelem).astype(np.float32)
roc.hsa_memory_copy(cpu_ptr, src.ctypes.data, src.nbytes)
roc.hsa_memory_copy(gpu_host_accessible_ptr, cpu_ptr, src.nbytes)
roc.hsa_memory_copy(gpu_only_ptr, gpu_host_accessible_ptr, src.nbytes)
# check write is correct
cpu_ref = (ctypes.c_float * nelem).from_address(cpu_ptr.value)
for i in range(src.size):
self.assertEqual(cpu_ref[i], src[i])
        gpu_ha_ref = (ctypes.c_float * nelem).from_address(
            gpu_host_accessible_ptr.value)
for i in range(src.size):
self.assertEqual(gpu_ha_ref[i], src[i])
# zero out host accessible GPU memory and CPU memory
z0 = np.zeros(nelem).astype(np.float32)
roc.hsa_memory_copy(cpu_ptr, z0.ctypes.data, z0.nbytes)
roc.hsa_memory_copy(gpu_host_accessible_ptr, cpu_ptr, z0.nbytes)
# check zeroing is correct
for i in range(z0.size):
self.assertEqual(cpu_ref[i], z0[i])
for i in range(z0.size):
self.assertEqual(gpu_ha_ref[i], z0[i])
# copy back the data from the GPU
roc.hsa_memory_copy(gpu_host_accessible_ptr, gpu_only_ptr, src.nbytes)
# check the copy back is ok
for i in range(src.size):
self.assertEqual(gpu_ha_ref[i], src[i])
# free
roc.hsa_memory_free(cpu_ptr)
roc.hsa_memory_free(gpu_only_ptr)
roc.hsa_memory_free(gpu_host_accessible_ptr)
@unittest.skipUnless(has_brig_example, "Brig example not found")
@unittest.skipUnless(dgpu_present, "dGPU only")
@unittest.skip("Permanently skip? HSA spec violation causes corruption")
def test_coarse_grained_kernel_execution(self):
"""
This tests the execution of a kernel on a dGPU using coarse memory
regions for the buffers.
        NOTE: this code violates the HSA spec in that it uses a coarse region
        for kernargs; this is a performance hack.
"""
        from numba.roc.hsadrv.driver import (BrigModule, Program, hsa,
                                             Executable)
# get a brig file
brig_file = get_brig_file()
brig_module = BrigModule.from_file(brig_file)
self.assertGreater(len(brig_module), 0)
# use existing GPU regions for computation space
gpu_regions = self.gpu.regions
gpu_only_coarse_regions = list()
gpu_host_accessible_coarse_regions = list()
for r in gpu_regions:
if r.supports(enums.HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED):
if r.host_accessible:
gpu_host_accessible_coarse_regions.append(r)
else:
gpu_only_coarse_regions.append(r)
# check we have 1+ coarse gpu region(s) of each type
self.assertGreater(len(gpu_only_coarse_regions), 0)
self.assertGreater(len(gpu_host_accessible_coarse_regions), 0)
# Compilation phase:
        # FIXME: this is dubious; it assumes the launching agent is indexed
        # first
agent = roc.components[0]
prog = Program()
prog.add_module(brig_module)
# get kernel and load
code = prog.finalize(agent.isa)
ex = Executable()
ex.load(agent, code)
ex.freeze()
# extract symbols
sym = ex.get_symbol(agent, "&__vector_copy_kernel")
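        # kernel_object is the code handle written into the dispatch packet;
        # kernarg_segment_size is the argument buffer size the kernel expects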
self.assertNotEqual(sym.kernel_object, 0)
self.assertGreater(sym.kernarg_segment_size, 0)
# attempt kernel execution
# Do memory allocations
# allocate and initialise memory
nelem = 1024 * 1024
src = np.random.random(nelem).astype(np.float32)
z0 = np.zeros_like(src)
# alloc host accessible memory
nbytes = ctypes.sizeof(ctypes.c_float) * nelem
gpu_host_accessible_region = gpu_host_accessible_coarse_regions[0]
host_in_ptr = gpu_host_accessible_region.allocate(nbytes)
self.assertNotEqual(host_in_ptr.value, None,
"pointer must not be NULL")
host_out_ptr = gpu_host_accessible_region.allocate(nbytes)
self.assertNotEqual(host_out_ptr.value, None,
"pointer must not be NULL")
# init mem with data
roc.hsa_memory_copy(host_in_ptr, src.ctypes.data, src.nbytes)
roc.hsa_memory_copy(host_out_ptr, z0.ctypes.data, z0.nbytes)
# alloc gpu only memory
gpu_only_region = gpu_only_coarse_regions[0]
gpu_in_ptr = gpu_only_region.allocate(nbytes)
self.assertNotEqual(gpu_in_ptr.value, None, "pointer must not be NULL")
gpu_out_ptr = gpu_only_region.allocate(nbytes)
self.assertNotEqual(gpu_out_ptr.value, None,
"pointer must not be NULL")
# copy memory from host accessible location to gpu only
roc.hsa_memory_copy(gpu_in_ptr, host_in_ptr, src.nbytes)
# Do kernargs
# Find a coarse region (for better performance on dGPU) in which
# to place kernargs. NOTE: This violates the HSA spec
kernarg_regions = list()
for r in gpu_host_accessible_coarse_regions:
# NOTE: VIOLATION
if r.supports(enums.HSA_REGION_GLOBAL_FLAG_KERNARG):
kernarg_regions.append(r)
self.assertGreater(len(kernarg_regions), 0)
# use first region for args
kernarg_region = kernarg_regions[0]
kernarg_ptr = kernarg_region.allocate(
2 * ctypes.sizeof(ctypes.c_void_p))
        self.assertNotEqual(kernarg_ptr.value, None,
                            "pointer must not be NULL")
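        # the vector copy kernel takes two pointer arguments (src, dst),
        # hence two pointer-sized slots in the kernarg buffer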
# wire in gpu memory
argref = (2 * ctypes.c_size_t).from_address(kernarg_ptr.value)
argref[0] = gpu_in_ptr.value
argref[1] = gpu_out_ptr.value
        # signal (created here but unused; see the dispatch call below)
        sig = roc.create_signal(1)
# create queue and dispatch job
queue = agent.create_queue_single(32)
        queue.dispatch(sym, kernarg_ptr, workgroup_size=(256, 1, 1),
                       grid_size=(nelem, 1, 1), signal=None)
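        # with signal=None, dispatch() appears to run synchronously: the
        # driver creates its own completion signal and waits on it before
        # returning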
# copy result back to host accessible memory to check
roc.hsa_memory_copy(host_out_ptr, gpu_out_ptr, src.nbytes)
# check the data is recovered
ref = (nelem * ctypes.c_float).from_address(host_out_ptr.value)
np.testing.assert_equal(ref, src)
# free
roc.hsa_memory_free(host_in_ptr)
roc.hsa_memory_free(host_out_ptr)
roc.hsa_memory_free(gpu_in_ptr)
roc.hsa_memory_free(gpu_out_ptr)
class TestContext(_TestBase):
"""Tests the Context class behaviour is correct."""
def test_memalloc(self):
"""
        Tests Context.memalloc() for a given, in the parlance of HSA,
`component`. Testing includes specialisations for the supported
components of dGPUs and APUs.
"""
n = 10 # things to alloc
nbytes = ctypes.sizeof(ctypes.c_double) * n
# run if a dGPU is present
if dgpu_present:
# find a host accessible region
dGPU_agent = self.gpu
CPU_agent = self.cpu
gpu_ctx = Context(dGPU_agent)
gpu_only_mem = gpu_ctx.memalloc(nbytes, hostAccessible=False)
ha_mem = gpu_ctx.memalloc(nbytes, hostAccessible=True)
# on dGPU systems, all host mem is host accessible
cpu_ctx = Context(CPU_agent)
cpu_mem = cpu_ctx.memalloc(nbytes, hostAccessible=True)
# Test writing to allocated area
src = np.random.random(n).astype(np.float64)
roc.hsa_memory_copy(cpu_mem.device_pointer, src.ctypes.data, src.nbytes)
roc.hsa_memory_copy(ha_mem.device_pointer, cpu_mem.device_pointer, src.nbytes)
roc.hsa_memory_copy(gpu_only_mem.device_pointer, ha_mem.device_pointer, src.nbytes)
# clear
z0 = np.zeros_like(src)
roc.hsa_memory_copy(ha_mem.device_pointer, z0.ctypes.data, z0.nbytes)
ref = (n * ctypes.c_double).from_address(ha_mem.device_pointer.value)
for k in range(n):
self.assertEqual(ref[k], 0)
# copy back from dGPU
roc.hsa_memory_copy(ha_mem.device_pointer, gpu_only_mem.device_pointer, src.nbytes)
for k in range(n):
self.assertEqual(ref[k], src[k])
        else:  # TODO: write APU variant
            pass
def check_mempools(self, agent, has_fine_grain=True):
# get allocation-allowed pools
mp_alloc_list = [mp for mp in agent.mempools if mp.alloc_allowed]
mpdct = {'global': [], 'readonly': [], 'private': [], 'group': []}
for mp in mp_alloc_list:
mpdct[mp.kind].append(mp)
# only globals are allocation-allowed
if has_fine_grain:
self.assertEqual(len(mpdct['global']), 2)
else:
self.assertEqual(len(mpdct['global']), 1)
self.assertEqual(len(mpdct['readonly']), 0)
self.assertEqual(len(mpdct['private']), 0)
self.assertEqual(len(mpdct['group']), 0)
self.assertEqual(len(agent.mempools.globals), len(mpdct['global']))
# the global-pools are coarse-grain and fine-grain pools
glbs = mpdct['global']
coarsegrain = None
finegrain = None
for gmp in glbs:
if gmp.supports(enums_ext.HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED):
coarsegrain = gmp
if gmp.supports(enums_ext.HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED):
finegrain = gmp
self.assertIsNotNone(coarsegrain)
if has_fine_grain:
self.assertIsNotNone(finegrain)
else:
self.assertIsNone(finegrain)
self.assertIsNot(coarsegrain, finegrain)
def test_cpu_mempool_property(self):
self.check_mempools(self.cpu)
@unittest.skipUnless(dgpu_present, "dGPU only")
def test_gpu_mempool_property(self):
self.check_mempools(self.gpu, has_fine_grain=False)
@unittest.skipUnless(dgpu_present, "dGPU only")
def test_mempool(self):
n = 10 # things to alloc
nbytes = ctypes.sizeof(ctypes.c_double) * n
dGPU_agent = self.gpu
CPU_agent = self.cpu
# allocate a GPU memory pool
gpu_ctx = Context(dGPU_agent)
gpu_only_mem = gpu_ctx.mempoolalloc(nbytes)
# allocate a CPU memory pool, allow the GPU access to it
cpu_ctx = Context(CPU_agent)
cpu_mem = cpu_ctx.mempoolalloc(nbytes, allow_access_to=[gpu_ctx.agent])
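        # allow_access_to grants the listed agents access to the allocation
        # (presumably via hsa_amd_agents_allow_access)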
        # Test writing to allocated area
src = np.random.random(n).astype(np.float64)
roc.hsa_memory_copy(cpu_mem.device_pointer, src.ctypes.data, src.nbytes)
roc.hsa_memory_copy(gpu_only_mem.device_pointer, cpu_mem.device_pointer, src.nbytes)
# clear
z0 = np.zeros_like(src)
roc.hsa_memory_copy(cpu_mem.device_pointer, z0.ctypes.data, z0.nbytes)
ref = (n * ctypes.c_double).from_address(cpu_mem.device_pointer.value)
for k in range(n):
self.assertEqual(ref[k], 0)
# copy back from dGPU
roc.hsa_memory_copy(cpu_mem.device_pointer, gpu_only_mem.device_pointer, src.nbytes)
for k in range(n):
self.assertEqual(ref[k], src[k])
def check_mempool_with_flags(self, finegrain):
dGPU_agent = self.gpu
gpu_ctx = Context(dGPU_agent)
CPU_agent = self.cpu
cpu_ctx = Context(CPU_agent)
        # get a mempool allocation with the requested flags
        cpu_ctx.mempoolalloc(1024, allow_access_to=[gpu_ctx._agent],
                             finegrain=finegrain)
@unittest.skipUnless(dgpu_present, 'dGPU only')
def test_mempool_finegrained(self):
self.check_mempool_with_flags(finegrain=True)
@unittest.skipUnless(dgpu_present, 'dGPU only')
def test_mempool_coarsegrained(self):
self.check_mempool_with_flags(finegrain=False)
@unittest.skipUnless(dgpu_present, 'dGPU only')
def test_mempool_amd_example(self):
dGPU_agent = self.gpu
gpu_ctx = Context(dGPU_agent)
CPU_agent = self.cpu
cpu_ctx = Context(CPU_agent)
kNumInt = 1024
kSize = kNumInt * ctypes.sizeof(ctypes.c_int)
dependent_signal = roc.create_signal(0)
completion_signal = roc.create_signal(0)
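        # hsa_amd_memory_async_copy semantics: a copy starts only once all of
        # its dependent signals read 0, and the runtime decrements the copy's
        # completion signal when the transfer finishes; both signals are
        # stored to 1 before their copies are enqueued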
        # allocate host src and dst, allow GPU access
flags = dict(allow_access_to=[gpu_ctx.agent], finegrain=False)
host_src = cpu_ctx.mempoolalloc(kSize, **flags)
host_dst = cpu_ctx.mempoolalloc(kSize, **flags)
        # the AMD sample this test is based on loops `i` over the available
        # GPUs; here a single iteration is used
        i = 0
# get gpu local pool
local_memory = gpu_ctx.mempoolalloc(kSize)
host_src_view = (kNumInt * ctypes.c_int).from_address(host_src.device_pointer.value)
host_dst_view = (kNumInt * ctypes.c_int).from_address(host_dst.device_pointer.value)
host_src_view[:] = i + 2016 + np.arange(0, kNumInt, dtype=np.int32)
host_dst_view[:] = np.zeros(kNumInt, dtype=np.int32)
# print("GPU: %s"%gpu_ctx._agent.name)
# print("CPU: %s"%cpu_ctx._agent.name)
        roc.hsa_signal_store_relaxed(completion_signal, 1)
q = queue.Queue()
class validatorThread(threading.Thread):
def run(self):
val = roc.hsa_signal_wait_acquire(
completion_signal,
enums.HSA_SIGNAL_CONDITION_EQ,
0,
ctypes.c_uint64(-1),
enums.HSA_WAIT_STATE_ACTIVE)
q.put(val) # wait_res
        # this could instead be a method call on the signal object itself:
        # dependent_signal.store_relaxed(1)
        roc.hsa_signal_store_relaxed(dependent_signal, 1)
h2l_start = threading.Semaphore(value=0)
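        # the semaphore ensures the l2h thread enqueues its dependent copy
        # before the h2l thread submits the copy that satisfies the dependency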
class l2hThread(threading.Thread):
def run(self):
dep_signal = drvapi.hsa_signal_t(dependent_signal._id)
roc.hsa_amd_memory_async_copy(host_dst.device_pointer.value,
cpu_ctx._agent._id,
local_memory.device_pointer.value,
gpu_ctx._agent._id, kSize, 1,
ctypes.byref(dep_signal),
completion_signal)
h2l_start.release() # signal h2l to start
class h2lThread(threading.Thread):
def run(self):
h2l_start.acquire() # to wait until l2h thread has started
roc.hsa_amd_memory_async_copy(local_memory.device_pointer.value,
gpu_ctx._agent._id,
host_src.device_pointer.value,
cpu_ctx._agent._id, kSize, 0,
None,
dependent_signal)
        timeout = 10  # seconds
        # init thread instances
validator = validatorThread()
l2h = l2hThread()
h2l = h2lThread()
# run them
validator.start()
l2h.start()
h2l.start()
# join
l2h.join(timeout)
h2l.join(timeout)
validator.join(timeout)
# verify
wait_res = q.get()
self.assertEqual(wait_res, 0)
np.testing.assert_allclose(host_dst_view, host_src_view)
@unittest.skipUnless(dgpu_present, "dGPU only")
def test_to_device_to_host(self):
"""
Tests .to_device() and .copy_to_host()
"""
n = 10
data = np.zeros(n)
output = np.zeros(n)
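        # a roc-targeted vectorize ufunc; calling it with device arrays
        # should keep the computation on the dGPU without extra host copies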
@vectorize("float64(float64)", target='roc')
def func(x):
return x + 1
        in_device = hsaapi.to_device(data)
        out_device = hsaapi.to_device(output)
        func(in_device, out=out_device)
host_output = out_device.copy_to_host()
np.testing.assert_equal(np.ones(n), host_output)
if __name__ == '__main__':
unittest.main()