Commit 5c70ef66 authored by dugupeiwen

update roc

parent 1fb0017a
@@ -144,6 +144,9 @@ class CUDA(GPU):
"""Mark the target as CUDA.
"""
class ROCm(GPU):
"""Mark the target as ROCm.
"""
class NPyUfunc(Target):
"""Mark the target as a ufunc
@@ -157,6 +160,7 @@ target_registry['GPU'] = GPU
target_registry['gpu'] = GPU
target_registry['CUDA'] = CUDA
target_registry['cuda'] = CUDA
target_registry['ROCm'] = ROCm
target_registry['npyufunc'] = NPyUfunc
dispatcher_registry = DelayedRegistry(key_type=Target)
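For reference, the new registry entry makes the marker class resolvable by name; a minimal sketch using only names from this hunk:

```python
# Minimal sketch: resolve the newly registered ROCm marker by name.
target = target_registry['ROCm']
assert issubclass(target, GPU)  # ROCm is the GPU subclass added above
```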
@@ -13,6 +13,8 @@ import numpy as np
import llvmlite.binding as llvmbind
from llvmlite import __version__ as llvmlite_version
from numba import cuda as cu, __version__ as version_number
from numba import roc
from numba.roc.hlc import hlc, libhlc
from numba.cuda import cudadrv
from numba.cuda.cudadrv.driver import driver as cudriver
from numba.cuda.cudadrv.runtime import runtime as curuntime
@@ -70,6 +72,10 @@ _numpy_supported_simd_features = 'NumPy Supported SIMD features'
_numpy_supported_simd_dispatch = 'NumPy Supported SIMD dispatch'
_numpy_supported_simd_baseline = 'NumPy Supported SIMD baseline'
_numpy_AVX512_SKX_detected = 'NumPy AVX512_SKX detected'
# ROC information
_roc_available, _roc_toolchains = 'ROC Available', 'ROC Toolchains'
_hsa_agents_count, _hsa_agents = 'HSA Agents Count', 'HSA Agents'
_hsa_gpus_count, _hsa_gpus = 'HSA Discrete GPUs Count', 'HSA Discrete GPUs'
# SVML info
_svml_state, _svml_loaded = 'SVML State', 'SVML Lib Loaded'
_llvm_svml_patched = 'LLVM SVML Patched'
@@ -309,6 +315,7 @@ def get_sysinfo():
_numba_version: version_number,
_llvm_version: '.'.join(str(i) for i in llvmbind.llvm_version_info),
_llvmlite_version: llvmlite_version,
_roc_available: roc.is_available(),
_psutil: _psutil_import,
}
@@ -417,6 +424,61 @@ def get_sysinfo():
sys_info[_numpy_AVX512_SKX_detected] = \
__cpu_features__.get("AVX512_SKX", False)
# ROC information
# If no ROC, try to report why
if not sys_info[_roc_available]:
from numba.roc.hsadrv.driver import hsa
try:
hsa.is_available  # property access triggers driver initialisation
except Exception as e:
msg = str(e)
else:
msg = 'No ROC toolchains found.'
_warning_log.append(f"Warning (roc): Error initialising ROC: {msg}")
toolchains = []
try:
libhlc.HLC()
toolchains.append('librocmlite library')
except Exception:
pass
try:
hlc.CmdLine().check_tooling()
toolchains.append('ROC command line tools')
except Exception:
pass
sys_info[_roc_toolchains] = toolchains
try:
# ROC might not be available due to a lack of toolchain, but HSA
# agents may still be listed
from numba.roc.hsadrv.driver import hsa, dgpu_count
def decode(x):
return x.decode('utf-8') if isinstance(x, bytes) else x
sys_info[_hsa_agents_count] = len(hsa.agents)
agents = []
for i, agent in enumerate(hsa.agents):
agents.append({
'Agent id': i,
'Vendor': decode(agent.vendor_name),
'Name': decode(agent.name),
'Type': agent.device,
})
sys_info[_hsa_agents] = agents
_dgpus = []
for a in hsa.agents:
if a.is_component and a.device == 'GPU':
_dgpus.append(decode(a.name))
sys_info[_hsa_gpus_count] = dgpu_count()
sys_info[_hsa_gpus] = ', '.join(_dgpus)
except Exception as e:
_warning_log.append(
"Warning (roc): No HSA Agents found, "
f"encountered exception when searching: {e}")
# SVML information
# Replicate some SVML detection logic from numba.__init__ here.
# If SVML load fails in numba.__init__ the splitting of the logic
@@ -625,6 +687,15 @@ def display_sysinfo(info=None, sep_pos=45):
("NumPy AVX512_SKX support detected",
info.get(_numpy_AVX512_SKX_detected, '?')),
("",),
("__ROC information__",),
("ROC Available", info.get(_roc_available, '?')),
("ROC Toolchains", info.get(_roc_toolchains, []) or 'None'),
("HSA Agents Count", info.get(_hsa_agents_count, 0)),
("HSA Agents:",),
(DisplaySeqMaps(info.get(_hsa_agents, {})) or ('None',)),
('HSA Discrete GPUs Count', info.get(_hsa_gpus_count, 0)),
('HSA Discrete GPUs', info.get(_hsa_gpus, 'None')),
("",),
("__SVML Information__",),
("SVML State, config.USING_SVML", info.get(_svml_state, '?')),
("SVML Library Loaded", info.get(_svml_loaded, '?')),
@@ -27,6 +27,16 @@ def _init():
Vectorize.target_registry.ondemand['cuda'] = init_cuda_vectorize
GUVectorize.target_registry.ondemand['cuda'] = init_cuda_guvectorize
def init_roc_vectorize():
from numba.roc.vectorizers import HsaVectorize
return HsaVectorize
def init_roc_guvectorize():
from numba.roc.vectorizers import HsaGUFuncVectorize
return HsaGUFuncVectorize
Vectorize.target_registry.ondemand['roc'] = init_roc_vectorize
GUVectorize.target_registry.ondemand['roc'] = init_roc_guvectorize
_init()
del _init
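With the 'roc' target registered above, vectorized ufuncs can dispatch to the HSA backend; a usage sketch, assuming a working ROC toolchain and HSA runtime:

```python
import numpy as np
from numba import vectorize

# The 'roc' target resolves to HsaVectorize via the ondemand registry.
@vectorize(['float32(float32, float32)'], target='roc')
def add(a, b):
    return a + b

a = np.arange(16, dtype=np.float32)
print(add(a, a))
```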
Setup
-----
`libhsakmt.so.1`, `libhsa-runtime64.so`, `libhsa-runtime-ext64.so` must be in
the `LD_LIBRARY_PATH`.
The standard location for these libraries is `/opt/hsa/lib`, so users can
simply run `export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hsa/lib`.
Run Tests
---------
The HSA test suite can be executed from the base of the source tree with:
```bash
python runtests.py numba.hsa.tests -vb
```
The test suite can also be executed inside the python interpreter with:
```python
import numba.hsa
numba.hsa.test("-vb")
```
Or directly from the terminal with:
```bash
python -c 'import numba.hsa; numba.hsa.test("-vb")'
```
Note that the "-vb" flags are optional. The "-v" flag enables verbose mode,
which prints the name of each test. The "-b" flag enables capturing of the
stdout messages printed from within the tests.
"""
Module that deals with HSA in a high-level way
"""
import os
import numba.testing
from .api import *
from .stubs import atomic
def is_available():
"""Returns a boolean to indicate the availability of a HSA runtime.
This will force initialization of the driver if it hasn't been
initialized. It also checks that a toolchain is present.
"""
from .hsadrv.driver import hsa
from .hlc import hlc, libhlc
has_a_toolchain = False
try:
libhlc.HLC()
has_a_toolchain = True
except Exception:
try:
hlc.CmdLine().check_tooling()
has_a_toolchain = True
except Exception:
pass
return hsa.is_available and has_a_toolchain
if is_available():
from .hsadrv.driver import hsa
agents = list(hsa.agents)
else:
agents = []
def test(*args, **kwargs):
if not is_available():
raise RuntimeError("HSA is not detected")
return numba.testing.test("numba.hsa.tests", *args, **kwargs)
import numpy as np
from numba import mviewbuf
from numba.roc.hsadrv.devices import get_context
from .stubs import (
get_global_id,
get_global_size,
get_local_id,
get_local_size,
get_group_id,
get_work_dim,
get_num_groups,
barrier,
mem_fence,
shared,
wavebarrier,
activelanepermute_wavewidth,
ds_permute,
ds_bpermute,
)
from .decorators import (
jit,
)
from .enums import (
CLK_LOCAL_MEM_FENCE,
CLK_GLOBAL_MEM_FENCE
)
from .hsadrv.driver import hsa as _hsadrv
from .hsadrv import devicearray
class _AutoDeregister(object):
def __init__(self, args):
self.args = args
def __enter__(self):
pass
def __exit__(self, exc_type, exc_val, exc_tb):
deregister(*self.args)
def register(*args):
"""Register data into the HSA system
Returns a contextmanager for use in with-context for auto deregistration.
Use in context:
with hsa.register(array):
do_work_on_HSA(array)
"""
for data in args:
if isinstance(data, np.ndarray):
_hsadrv.hsa_memory_register(data.ctypes.data, data.nbytes)
else:
raise TypeError(type(data))
return _AutoDeregister(args)
def deregister(*args):
"""Deregister data from the HSA system
"""
for data in args:
if isinstance(data, np.ndarray):
_hsadrv.hsa_memory_deregister(data.ctypes.data, data.nbytes)
else:
raise TypeError(type(data))
def device_array(shape, dtype=np.float_, strides=None, order='C'):
"""device_array(shape, dtype=np.float_, strides=None, order='C')
Allocate an empty device ndarray. Similar to :meth:`numpy.empty`.
"""
shape, strides, dtype = _prepare_shape_strides_dtype(shape, strides, dtype,
order)
return devicearray.DeviceNDArray(shape=shape, strides=strides, dtype=dtype)
def device_array_like(ary):
"""Call roc.devicearray() with information from the array.
"""
return device_array(shape=ary.shape, dtype=ary.dtype, strides=ary.strides)
def to_device(obj, stream=None, context=None, copy=True, to=None):
"""to_device(obj, context, copy=True, to=None)
Allocate and transfer a numpy ndarray or structured scalar to the device.
To copy host->device a numpy array::
ary = numpy.arange(10)
d_ary = roc.to_device(ary)
The resulting ``d_ary`` is a ``DeviceNDArray``.
To copy device->host::
hary = d_ary.copy_to_host()
To copy device->host to an existing array::
ary = numpy.empty(shape=d_ary.shape, dtype=d_ary.dtype)
d_ary.copy_to_host(ary)
"""
context = context or get_context()
if to is None:
to = devicearray.from_array_like(obj)
if copy:
to.copy_to_device(obj, stream=stream, context=context)
return to
def stream():
return _hsadrv.create_stream()
def _fill_stride_by_order(shape, dtype, order):
nd = len(shape)
strides = [0] * nd
if order == 'C':
strides[-1] = dtype.itemsize
for d in reversed(range(nd - 1)):
strides[d] = strides[d + 1] * shape[d + 1]
elif order == 'F':
strides[0] = dtype.itemsize
for d in range(1, nd):
strides[d] = strides[d - 1] * shape[d - 1]
else:
raise ValueError("order must be either 'C' or 'F'")
return tuple(strides)
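As a quick check of the stride rule above (illustrative, not part of the module):

```python
import numpy as np

# For shape (2, 3) and float64: C order gives (24, 8) byte strides,
# F order gives (8, 16), matching numpy's own layouts.
assert _fill_stride_by_order((2, 3), np.dtype(np.float64), 'C') == (24, 8)
assert _fill_stride_by_order((2, 3), np.dtype(np.float64), 'F') == (8, 16)
```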
def _prepare_shape_strides_dtype(shape, strides, dtype, order):
dtype = np.dtype(dtype)
if isinstance(shape, int):
shape = (shape,)
if isinstance(strides, int):
strides = (strides,)
else:
if shape == ():
shape = (1,)
strides = strides or _fill_stride_by_order(shape, dtype, order)
return shape, strides, dtype
def _memory_size_from_info(shape, strides, itemsize):
"""Get the byte size of a contiguous memory buffer given the shape, strides
and itemsize.
"""
assert len(shape) == len(strides), "# dim mismatch"
ndim = len(shape)
s, e = mviewbuf.memoryview_get_extents_info(shape, strides, ndim, itemsize)
return e - s
def _host_array(finegrain, shape, dtype, strides, order):
from .hsadrv import devices
shape, strides, dtype = _prepare_shape_strides_dtype(shape, strides, dtype,
order)
bytesize = _memory_size_from_info(shape, strides, dtype.itemsize)
# TODO does allowing access by all dGPUs really work in a multi-GPU system?
agents = [c._agent for c in devices.get_all_contexts()]
buf = devices.get_cpu_context().memhostalloc(bytesize, finegrain=finegrain,
allow_access_to=agents)
arr = np.ndarray(shape=shape, strides=strides, dtype=dtype, order=order,
buffer=buf)
return arr.view(type=devicearray.HostArray)
def coarsegrain_array(shape, dtype=np.float_, strides=None, order='C'):
"""coarsegrain_array(shape, dtype=np.float_, strides=None, order='C')
Similar to np.empty().
"""
return _host_array(finegrain=False, shape=shape, dtype=dtype,
strides=strides, order=order)
def finegrain_array(shape, dtype=np.float_, strides=None, order='C'):
"""finegrain_array(shape, dtype=np.float_, strides=None, order='C')
Similar to np.empty().
"""
return _host_array(finegrain=True, shape=shape, dtype=dtype,
strides=strides, order=order)
from llvmlite import binding as ll
from llvmlite.llvmpy import core as lc
from numba.core import utils
from numba.core.codegen import BaseCPUCodegen, CodeLibrary
from .hlc import DATALAYOUT, TRIPLE, hlc
class HSACodeLibrary(CodeLibrary):
def _optimize_functions(self, ll_module):
pass
def _optimize_final_module(self):
pass
def _finalize_specific(self):
pass
def get_asm_str(self):
"""
Get the human-readable assembly.
"""
m = hlc.Module()
m.load_llvm(str(self._final_module))
out = m.finalize()
return str(out.hsail)
class JITHSACodegen(BaseCPUCodegen):
_library_class = HSACodeLibrary
def _init(self, llvm_module):
assert list(llvm_module.global_variables) == [], "Module isn't empty"
self._data_layout = DATALAYOUT[utils.MACHINE_BITS]
self._target_data = ll.create_target_data(self._data_layout)
def _create_empty_module(self, name):
ir_module = lc.Module(name)
ir_module.triple = TRIPLE
return ir_module
def _module_pass_manager(self):
raise NotImplementedError
def _function_pass_manager(self, llvm_module):
raise NotImplementedError
def _add_module(self, module):
pass
import copy
from collections import namedtuple
import ctypes
import re
import numpy as np
from numba.core.typing.templates import ConcreteTemplate
from numba.core import types, config, compiler
from .hlc import hlc
from .hsadrv import devices, driver, enums, drvapi
from .hsadrv.error import HsaKernelLaunchError
from numba.roc import gcn_occupancy
from numba.roc.hsadrv.driver import hsa, dgpu_present
from .hsadrv import devicearray
from numba.core.typing.templates import AbstractTemplate
from numba.core.compiler_lock import global_compiler_lock
@global_compiler_lock
def compile_hsa(pyfunc, return_type, args, debug):
# First compilation will trigger the initialization of the HSA backend.
from .descriptor import HSATargetDesc
typingctx = HSATargetDesc.typingctx
targetctx = HSATargetDesc.targetctx
# TODO handle debug flag
flags = compiler.Flags()
# Do not compile (generate native code), just lower (to LLVM)
flags.set('no_compile')
flags.set('no_cpython_wrapper')
flags.set('no_cfunc_wrapper')
flags.unset('nrt')
# Run compilation pipeline
cres = compiler.compile_extra(typingctx=typingctx,
targetctx=targetctx,
func=pyfunc,
args=args,
return_type=return_type,
flags=flags,
locals={})
# Link dependent libraries
# targetctx.link_dependencies(cres.llvm_module, cres.target_context.linking)
library = cres.library
library.finalize()
return cres
def compile_kernel(pyfunc, args, debug=False):
cres = compile_hsa(pyfunc, types.void, args, debug=debug)
func = cres.library.get_function(cres.fndesc.llvm_func_name)
kernel = cres.target_context.prepare_hsa_kernel(func, cres.signature.args)
hsakern = HSAKernel(llvm_module=kernel.module,
name=kernel.name,
argtypes=cres.signature.args)
return hsakern
def compile_device(pyfunc, return_type, args, debug=False):
cres = compile_hsa(pyfunc, return_type, args, debug=debug)
func = cres.library.get_function(cres.fndesc.llvm_func_name)
cres.target_context.mark_hsa_device(func)
devfn = DeviceFunction(cres)
class device_function_template(ConcreteTemplate):
key = devfn
cases = [cres.signature]
cres.typing_context.insert_user_function(devfn, device_function_template)
libs = [cres.library]
cres.target_context.insert_user_function(devfn, cres.fndesc, libs)
return devfn
def compile_device_template(pyfunc):
"""Compile a DeviceFunctionTemplate
"""
from .descriptor import HSATargetDesc
dft = DeviceFunctionTemplate(pyfunc)
class device_function_template(AbstractTemplate):
key = dft
def generic(self, args, kws):
assert not kws
return dft.compile(args)
typingctx = HSATargetDesc.typingctx
typingctx.insert_user_function(dft, device_function_template)
return dft
class DeviceFunctionTemplate(object):
"""Unmaterialized device function
"""
def __init__(self, pyfunc, debug=False):
self.py_func = pyfunc
self.debug = debug
# self.inline = inline
self._compileinfos = {}
def compile(self, args):
"""Compile the function for the given argument types.
Each signature is compiled once by caching the compiled function inside
this object.
"""
if args not in self._compileinfos:
cres = compile_hsa(self.py_func, None, args, debug=self.debug)
func = cres.library.get_function(cres.fndesc.llvm_func_name)
cres.target_context.mark_hsa_device(func)
first_definition = not self._compileinfos
self._compileinfos[args] = cres
libs = [cres.library]
if first_definition:
# First definition
cres.target_context.insert_user_function(self, cres.fndesc,
libs)
else:
cres.target_context.add_user_function(self, cres.fndesc, libs)
else:
cres = self._compileinfos[args]
return cres.signature
class DeviceFunction(object):
def __init__(self, cres):
self.cres = cres
def _ensure_list(val):
if not isinstance(val, (tuple, list)):
return [val]
else:
return list(val)
def _ensure_size_or_append(val, size):
n = len(val)
for _ in range(n, size):
val.append(1)
class HSAKernelBase(object):
"""Define interface for configurable kernels
"""
def __init__(self):
self.global_size = (1,)
self.local_size = (1,)
self.stream = None
def copy(self):
return copy.copy(self)
def configure(self, global_size, local_size=None, stream=None):
"""Configure the OpenCL kernel
local_size can be None
"""
global_size = _ensure_list(global_size)
if local_size is not None:
local_size = _ensure_list(local_size)
size = max(len(global_size), len(local_size))
_ensure_size_or_append(global_size, size)
_ensure_size_or_append(local_size, size)
clone = self.copy()
clone.global_size = tuple(global_size)
clone.local_size = tuple(local_size) if local_size else None
clone.stream = stream
return clone
def forall(self, nelem, local_size=64, stream=None):
"""Simplified configuration for 1D kernel launch
"""
return self.configure(nelem, min(nelem, local_size), stream=stream)
def __getitem__(self, args):
"""Mimick CUDA python's square-bracket notation for configuration.
This assumes a the argument to be:
`griddim, blockdim, stream`
The blockdim maps directly to local_size.
The actual global_size is computed by multiplying the local_size to
griddim.
"""
griddim = _ensure_list(args[0])
blockdim = _ensure_list(args[1])
size = max(len(griddim), len(blockdim))
_ensure_size_or_append(griddim, size)
_ensure_size_or_append(blockdim, size)
# Compute global_size
gs = [g * l for g, l in zip(griddim, blockdim)]
return self.configure(gs, blockdim, *args[2:])
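A quick check of the bracket arithmetic, using the base class directly (no device is needed just to configure):

```python
# kern[4, 64] means griddim=4, blockdim=64, so global_size = 4 * 64.
kern = HSAKernelBase()
configured = kern[4, 64]
assert configured.global_size == (256,)
assert configured.local_size == (64,)
```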
_CacheEntry = namedtuple("_CacheEntry", ['symbol', 'executable',
'kernarg_region'])
class _CachedProgram(object):
def __init__(self, entry_name, binary):
self._entry_name = entry_name
self._binary = binary
# key: hsa context
self._cache = {}
def get(self):
ctx = devices.get_context()
result = self._cache.get(ctx)
# The program does not exist as GCN yet.
if result is None:
# generate GCN
symbol = '{0}'.format(self._entry_name)
agent = ctx.agent
ba = bytearray(self._binary)
bblob = ctypes.c_byte * len(self._binary)
bas = bblob.from_buffer(ba)
code_ptr = drvapi.hsa_code_object_t()
driver.hsa.hsa_code_object_deserialize(
ctypes.addressof(bas),
len(self._binary),
None,
ctypes.byref(code_ptr)
)
code = driver.CodeObject(code_ptr)
ex = driver.Executable()
ex.load(agent, code)
ex.freeze()
symobj = ex.get_symbol(agent, symbol)
regions = agent.regions.globals
kernarg_region = None
for reg in regions:
if reg.host_accessible:
if reg.supports(enums.HSA_REGION_GLOBAL_FLAG_KERNARG):
kernarg_region = reg
break
assert kernarg_region is not None
# Cache the GCN program
result = _CacheEntry(symbol=symobj, executable=ex,
kernarg_region=kernarg_region)
self._cache[ctx] = result
return ctx, result
class HSAKernel(HSAKernelBase):
"""
A HSA kernel object
"""
def __init__(self, llvm_module, name, argtypes):
super(HSAKernel, self).__init__()
self._llvm_module = llvm_module
self.assembly, self.binary = self._generateGCN()
self.entry_name = name
self.argument_types = tuple(argtypes)
self._argloc = []
# cached program
self._cacheprog = _CachedProgram(entry_name=self.entry_name,
binary=self.binary)
self._parse_kernel_resource()
def _parse_kernel_resource(self):
"""
Temporary workaround for register limit
"""
m = re.search(r"\bwavefront_sgpr_count\s*=\s*(\d+)", self.assembly)
self._wavefront_sgpr_count = int(m.group(1))
m = re.search(r"\bworkitem_vgpr_count\s*=\s*(\d+)", self.assembly)
self._workitem_vgpr_count = int(m.group(1))
def _sentry_resource_limit(self):
# only check resource factors if either sgpr or vgpr is non-zero
#if (self._wavefront_sgpr_count > 0 or self._workitem_vgpr_count > 0):
group_size = np.prod(self.local_size)
limits = gcn_occupancy.get_limiting_factors(
group_size=group_size,
vgpr_per_workitem=self._workitem_vgpr_count,
sgpr_per_wave=self._wavefront_sgpr_count)
if limits.reasons:
fmt = 'insufficient resources to launch kernel due to:\n{}'
msg = fmt.format('\n'.join(limits.suggestions))
raise HsaKernelLaunchError(msg)
def _generateGCN(self):
hlcmod = hlc.Module()
hlcmod.load_llvm(str(self._llvm_module))
return hlcmod.generateGCN()
def bind(self):
"""
Bind kernel to device
"""
ctx, entry = self._cacheprog.get()
if entry.symbol.kernarg_segment_size > 0:
sz = ctypes.sizeof(ctypes.c_byte) *\
entry.symbol.kernarg_segment_size
kernargs = entry.kernarg_region.allocate(sz)
else:
kernargs = None
return ctx, entry.symbol, kernargs, entry.kernarg_region
def __call__(self, *args):
self._sentry_resource_limit()
ctx, symbol, kernargs, kernarg_region = self.bind()
# Unpack pyobject values into ctypes scalar values
expanded_values = []
# contains lambdas to execute on return
retr = []
for ty, val in zip(self.argument_types, args):
_unpack_argument(ty, val, expanded_values, retr)
# Insert kernel arguments
base = 0
for av in expanded_values:
# Adjust for alignment
align = ctypes.sizeof(av)
pad = _calc_padding_for_alignment(align, base)
base += pad
# Move to offset
offseted = kernargs.value + base
asptr = ctypes.cast(offseted, ctypes.POINTER(type(av)))
# Assign value
asptr[0] = av
# Increment offset
base += align
# Actual Kernel launch
qq = ctx.default_queue
if self.stream is None:
hsa.implicit_sync()
# Dispatch
signal = None
if self.stream is not None:
signal = hsa.create_signal(1)
qq.insert_barrier(self.stream._get_last_signal())
qq.dispatch(symbol, kernargs, workgroup_size=self.local_size,
grid_size=self.global_size, signal=signal)
if self.stream is not None:
self.stream._add_signal(signal)
# retrieve auto converted arrays
for wb in retr:
wb()
# Free kernel region
if kernargs is not None:
if self.stream is None:
kernarg_region.free(kernargs)
else:
self.stream._add_callback(lambda: kernarg_region.free(kernargs))
def _unpack_argument(ty, val, kernelargs, retr):
"""
Convert arguments to ctypes and append to kernelargs
"""
if isinstance(ty, types.Array):
c_intp = ctypes.c_ssize_t
# if a dgpu is present, move the data to the device.
if dgpu_present:
devary, conv = devicearray.auto_device(val, devices.get_context())
if conv:
retr.append(lambda: devary.copy_to_host(val))
data = devary.device_ctypes_pointer
else:
data = ctypes.c_void_p(val.ctypes.data)
meminfo = parent = ctypes.c_void_p(0)
nitems = c_intp(val.size)
itemsize = c_intp(val.dtype.itemsize)
kernelargs.append(meminfo)
kernelargs.append(parent)
kernelargs.append(nitems)
kernelargs.append(itemsize)
kernelargs.append(data)
for ax in range(val.ndim):
kernelargs.append(c_intp(val.shape[ax]))
for ax in range(val.ndim):
kernelargs.append(c_intp(val.strides[ax]))
elif isinstance(ty, types.Integer):
cval = getattr(ctypes, "c_%s" % ty)(val)
kernelargs.append(cval)
elif ty == types.float64:
cval = ctypes.c_double(val)
kernelargs.append(cval)
elif ty == types.float32:
cval = ctypes.c_float(val)
kernelargs.append(cval)
elif ty == types.boolean:
cval = ctypes.c_uint8(int(val))
kernelargs.append(cval)
elif ty == types.complex64:
kernelargs.append(ctypes.c_float(val.real))
kernelargs.append(ctypes.c_float(val.imag))
elif ty == types.complex128:
kernelargs.append(ctypes.c_double(val.real))
kernelargs.append(ctypes.c_double(val.imag))
else:
raise NotImplementedError(ty, val)
def _calc_padding_for_alignment(align, base):
"""
Returns byte padding required to move the base pointer into proper alignment
"""
rmdr = int(base) % align
if rmdr == 0:
return 0
else:
return align - rmdr
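For example (illustrative check of the padding rule):

```python
# A base offset of 5 needs 3 bytes of padding to reach 4-byte
# alignment; an offset of 16 is already 8-byte aligned.
assert _calc_padding_for_alignment(4, 5) == 3
assert _calc_padding_for_alignment(8, 16) == 0
```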
class AutoJitHSAKernel(HSAKernelBase):
def __init__(self, func):
super(AutoJitHSAKernel, self).__init__()
self.py_func = func
self.definitions = {}
from .descriptor import HSATargetDesc
self.typingctx = HSATargetDesc.typingctx
def __call__(self, *args):
kernel = self.specialize(*args)
cfg = kernel.configure(self.global_size, self.local_size, self.stream)
cfg(*args)
def specialize(self, *args):
argtypes = tuple([self.typingctx.resolve_argument_type(a)
for a in args])
kernel = self.definitions.get(argtypes)
if kernel is None:
kernel = compile_kernel(self.py_func, argtypes)
self.definitions[argtypes] = kernel
return kernel
from numba.core import types, sigutils
from .compiler import (compile_kernel, compile_device, AutoJitHSAKernel,
compile_device_template)
def jit(signature=None, device=False):
"""JIT compile a python function conforming to
the HSA-Python
"""
if signature is None:
return autojit(device=device)
elif not sigutils.is_signature(signature):
func = signature
return autojit(device=device)(func)
else:
if device:
return _device_jit(signature)
else:
return _kernel_jit(signature)
def autojit(device=False):
if device:
return _device_autojit
else:
return _kernel_autojit
def _device_jit(signature):
argtypes, restype = sigutils.normalize_signature(signature)
def _wrapped(pyfunc):
return compile_device(pyfunc, restype, argtypes)
return _wrapped
def _kernel_jit(signature):
argtypes, restype = sigutils.normalize_signature(signature)
if restype is not None and restype != types.void:
msg = "HSA kernel must have void return type but got {restype}"
raise TypeError(msg.format(restype=restype))
def _wrapped(pyfunc):
return compile_kernel(pyfunc, argtypes)
return _wrapped
def _device_autojit(pyfunc):
return compile_device_template(pyfunc)
def _kernel_autojit(pyfunc):
return AutoJitHSAKernel(pyfunc)
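A usage sketch for the decorators above, assuming a working ROC stack (toolchain plus HSA runtime):

```python
import numpy as np
from numba import roc

@roc.jit(device=True)
def add(a, b):
    return a + b

@roc.jit
def kernel(dst, a, b):
    i = roc.get_global_id(0)
    if i < dst.size:
        dst[i] = add(a[i], b[i])

a = np.arange(256, dtype=np.float32)
out = np.zeros_like(a)
kernel[4, 64](out, a, a)  # griddim=4, blockdim=64 -> 256 work items
```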
from numba.core.descriptors import TargetDescriptor
from numba.core.options import TargetOptions
from .target import HSATargetContext, HSATypingContext
class HSATargetOptions(TargetOptions):
OPTIONS = {}
class HSATargetDesc(TargetDescriptor):
options = HSATargetOptions
typingctx = HSATypingContext()
targetctx = HSATargetContext(typingctx)
import numpy as np
from numba.np.ufunc.deviceufunc import (UFuncMechanism, GenerializedUFunc,
GUFuncCallSteps)
from numba.roc.hsadrv.driver import dgpu_present
import numba.roc.hsadrv.devicearray as devicearray
import numba.roc.api as api
class HsaUFuncDispatcher(object):
"""
Invoke the HSA ufunc specialization for the given inputs.
"""
def __init__(self, types_to_retty_kernels):
self.functions = types_to_retty_kernels
def __call__(self, *args, **kws):
"""
*args: numpy arrays
**kws:
stream -- hsa stream; when defined, asynchronous mode is used.
out -- output array. Can be a numpy array or DeviceArrayBase
depending on the input arguments. Type must match
the input arguments.
"""
return HsaUFuncMechanism.call(self.functions, args, kws)
def reduce(self, arg, stream=0):
raise NotImplementedError
class HsaUFuncMechanism(UFuncMechanism):
"""
Provide HSA specialization
"""
DEFAULT_STREAM = 0
ARRAY_ORDER = 'A'
def is_device_array(self, obj):
if dgpu_present:
return devicearray.is_hsa_ndarray(obj)
else:
return isinstance(obj, np.ndarray)
def is_host_array(self, obj):
if dgpu_present:
return False
else:
return isinstance(obj, np.ndarray)
def to_device(self, hostary, stream):
if dgpu_present:
return api.to_device(hostary)
else:
return hostary
def launch(self, func, count, stream, args):
# ILP must match vectorize kernel source
ilp = 4
# Use more wavefronts to help hide latency
tpb = 64 * 2
count = (count + (ilp - 1)) // ilp
blockcount = (count + (tpb - 1)) // tpb
func[blockcount, tpb](*args)
def device_array(self, shape, dtype, stream):
if dgpu_present:
return api.device_array(shape=shape, dtype=dtype)
else:
return np.empty(shape=shape, dtype=dtype)
def broadcast_device(self, ary, shape):
if dgpu_present:
raise NotImplementedError('device broadcast_device NIY')
else:
ax_differs = [ax for ax in range(len(shape))
if ax >= ary.ndim
or ary.shape[ax] != shape[ax]]
missingdim = len(shape) - len(ary.shape)
strides = [0] * missingdim + list(ary.strides)
for ax in ax_differs:
strides[ax] = 0
return np.ndarray(shape=shape, strides=strides,
dtype=ary.dtype, buffer=ary)
class _HsaGUFuncCallSteps(GUFuncCallSteps):
__slots__ = ()
def is_device_array(self, obj):
if dgpu_present:
return devicearray.is_hsa_ndarray(obj)
else:
return True
def to_device(self, hostary):
if dgpu_present:
return api.to_device(hostary)
else:
return hostary
def to_host(self, devary, hostary):
if dgpu_present:
out = devary.copy_to_host(hostary)
return out
else:
pass
def device_array(self, shape, dtype):
if dgpu_present:
return api.device_array(shape=shape, dtype=dtype)
else:
return np.empty(shape=shape, dtype=dtype)
def launch_kernel(self, kernel, nelem, args):
kernel.configure(nelem, min(nelem, 64))(*args)
class HSAGenerializedUFunc(GenerializedUFunc):
@property
def _call_steps(self):
return _HsaGUFuncCallSteps
def _broadcast_scalar_input(self, ary, shape):
if dgpu_present:
return devicearray.DeviceNDArray(shape=shape,
strides=(0,),
dtype=ary.dtype,
dgpu_data=ary.dgpu_data)
else:
return np.lib.stride_tricks.as_strided(ary, shape=(shape,),
strides=(0,))
def _broadcast_add_axis(self, ary, newshape):
newax = len(newshape) - len(ary.shape)
# Add 0 strides for missing dimension
newstrides = (0,) * newax + ary.strides
if dgpu_present:
return devicearray.DeviceNDArray(shape=newshape,
strides=newstrides,
dtype=ary.dtype,
dgpu_data=ary.dgpu_data)
else:
raise NotImplementedError
CLK_LOCAL_MEM_FENCE = 0
CLK_GLOBAL_MEM_FENCE = 1
import math
from collections import namedtuple
# GCN architecture specific info
simd_per_cu = 4
wave_size = 64
vector_register_file_size = 64 * 2**10 # 64 kB
byte_per_VGPR = 4
vgpr_per_simd = vector_register_file_size // byte_per_VGPR
sgpr_per_simd = 512
max_wave_count = 10
max_inflight_wave_per_cu = max_wave_count * simd_per_cu
# XXX due to limit in AMDGPU backend
max_group_size = 256
_limits = namedtuple('_limits', ['allowed_wave_due_to_sgpr',
'allowed_wave_due_to_vgpr',
'allowed_wave',
'allowed_vgpr_per_workitem',
'occupancy',
'reasons',
'suggestions'])
def get_limiting_factors(group_size, vgpr_per_workitem, sgpr_per_wave):
def _ceil(x):
return int(math.ceil(x))
# these might be zero, for resource limit treat as 1
vgpr_per_workitem = vgpr_per_workitem if vgpr_per_workitem > 0 else 1
sgpr_per_wave = sgpr_per_wave if sgpr_per_wave > 0 else 1
workitem_per_simd = group_size / simd_per_cu
required_wave_count_per_simd = _ceil(workitem_per_simd / wave_size)
required_vgpr_per_wave = vgpr_per_workitem * wave_size
# limiting factor
allowed_wave_due_to_sgpr = sgpr_per_simd // sgpr_per_wave
allowed_wave_due_to_vgpr = vgpr_per_simd // required_vgpr_per_wave
allowed_wave = min(allowed_wave_due_to_sgpr, max_wave_count, allowed_wave_due_to_vgpr)
allowed_vgpr_per_workitem = _ceil(vgpr_per_simd / required_wave_count_per_simd / wave_size)
# reasons
reasons = set()
if allowed_wave_due_to_sgpr < required_wave_count_per_simd:
reasons.add('allowed_wave_due_to_sgpr')
if allowed_wave_due_to_vgpr < required_wave_count_per_simd:
reasons.add('allowed_wave_due_to_vgpr')
if allowed_wave < required_wave_count_per_simd:
reasons.add('allowed_wave')
if group_size > max_group_size:
reasons.add('group_size')
suggestions = [_suggestions[r] for r in sorted(reasons)]
# occupancy
inflight_wave_per_cu = (0 if reasons else
required_wave_count_per_simd * simd_per_cu)
occupancy = inflight_wave_per_cu / max_inflight_wave_per_cu
return _limits(allowed_wave_due_to_sgpr=allowed_wave_due_to_sgpr,
allowed_wave_due_to_vgpr=allowed_wave_due_to_vgpr,
allowed_wave=allowed_wave,
allowed_vgpr_per_workitem=allowed_vgpr_per_workitem,
occupancy=occupancy,
reasons=reasons,
suggestions=suggestions)
_suggestions = {}
_suggestions['allowed_wave_due_to_sgpr'] = (
"* Cannot allocate enough sGPRs for all resident wavefronts."
)
_suggestions['allowed_wave_due_to_vgpr'] = (
"* Cannot allocate enough vGPRs for all resident wavefronts."
)
_suggestions['allowed_wave'] = (
"* Launch requires too many wavefronts. Try reducing group-size."
)
_suggestions['group_size'] = (
"* Exceeds max group size (256)."
)
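A worked example of the model above: a 256-work-item group using 32 vGPRs per work item and 40 sGPRs per wave fits within every limit, so no reasons are reported and occupancy is 4/40:

```python
limits = get_limiting_factors(group_size=256,
                              vgpr_per_workitem=32,
                              sgpr_per_wave=40)
assert not limits.reasons        # nothing limits the launch
assert limits.occupancy == 0.1   # 1 wave/SIMD * 4 SIMDs / 40 max
```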
import os
# 32-bit private, local, and region pointers. 64-bit global, constant and flat.
# See:
# https://github.com/RadeonOpenCompute/llvm/blob/b20b796f65ab6ac12fac4ea32e1d89e1861dee6a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp#L270-L275
# Alloc goes into addrspace(5) (private)
DATALAYOUT = {
64: ("e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
"-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
"-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"),
}
TRIPLE = "amdgcn--amdhsa"
# Use libHLC by default; set env-var NUMBA_USE_LIBHLC to '0', 'no' or
# 'false' to use the command-line HLC instead.
if os.environ.get('NUMBA_USE_LIBHLC', '').lower() not in ['0', 'no', 'false']:
from numba.roc.hlc import libhlc as hlc
"""
Shared code for the low level compiler tooling
"""
from abc import abstractmethod, ABCMeta
import re
# These are for parsing labels and metadata
_re_labelname = re.compile(r"\n\.([0-9a-z_\.]+):", re.I) # label: .<stuff>:
_re_regname = re.compile(r"%\.([0-9a-z_]+)", re.I) # register: %.<stuff>
_re_metadata_def = re.compile(r"\!\d+\s*=")
_re_metadata_correct_usage = re.compile(r"metadata\s*\![{'\"]")
_re_metadata_ref = re.compile(r"\!\d+")
# These are for parsing alloca instructions
_re_alloca_quoted = re.compile(r'(.*)"(\$.*)".*')
_re_alloca_parts = re.compile(r'(.*)=(.*alloca(.*))')
def add_metadata_type(ir):
"""
Rewrite metadata since llvm3.6 dropped the "metadata" type prefix.
"""
buf = []
for line in ir.splitlines():
# If the line is a metadata definition
if _re_metadata_def.match(line):
# and does not already contain correct usage (maybe already fixed)
if None is _re_metadata_correct_usage.search(line):
line = line.replace('!{', 'metadata !{')
line = line.replace('!"', 'metadata !"')
def sub_metadata(m):
return "metadata {0}".format(m.group(0))
line = _re_metadata_ref.sub(sub_metadata, line)
line = line.lstrip('metadata ')
buf.append(line)
return '\n'.join(buf)
def rename_register(llvmir):
"""
HLC does not like variables with a '.' prefix.
"""
def repl(mat):
return '%_dot_.{0}'.format(mat.group(1))
return _re_regname.sub(repl, llvmir)
def rename_label(llvmir):
"""
HLC does not like labels with a '.' prefix.
"""
def repl(mat):
# Re-insert the newline consumed by the label regex.
return '\n_dot_.{0}:'.format(mat.group(1))
return _re_labelname.sub(repl, llvmir)
def adapt_llvm_version(llvmir):
"""
Adapt the LLVM IR to match the syntax required by HLC.
"""
llvmir = rename_register(llvmir)
llvmir = rename_label(llvmir)
# return add_metadata_type(llvmir)
return llvmir
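A small demonstration of the register-renaming rule (illustrative only):

```python
# '%.'-prefixed names are rewritten with a '_dot_' marker.
print(rename_register('%.val = add i32 %.val, 1'))
# -> %_dot_.val = add i32 %_dot_.val, 1
```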
def alloca_addrspace_correction(llvmir):
"""
Rewrite llvmir such that allocas go into addrspace(5) and are then
addrspacecast back to addrspace(0). Alloca into addrspace(5) is a
requirement of the data layout specification.
"""
lines = llvmir.splitlines()
mangle = '__tmp'
new_ir = []
for l in lines:
# pluck lines containing alloca
if 'alloca' in l:
assignee, alloca_match, ptrty = _re_alloca_parts.match(l).groups()
q_match = _re_alloca_quoted.match(assignee)
if q_match:
start, var = q_match.groups()
var = var.strip()
name_fmt = '%s"%s"'
old_name = name_fmt % (start, var)
new_name = name_fmt % (start, var + mangle)
else:
old_name = assignee.strip()
new_name = old_name + mangle
allocaline = "%s = %s, addrspace(5)" % (new_name, alloca_match)
castline_fmt = ("%s = addrspacecast %s addrspace(5)* "
"%s to %s addrspace(0)*")
castline = castline_fmt % (old_name, ptrty, new_name, ptrty)
new_ir.append(allocaline)
new_ir.append(castline)
else:
new_ir.append(l)
return '\n'.join(new_ir)
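An illustrative run of the rewrite: each alloca line becomes a pair of lines, an alloca into addrspace(5) under a mangled name plus an addrspacecast back to the original name (output is schematic; whitespace is not byte-exact):

```python
print(alloca_addrspace_correction('%x = alloca i32'))
# %x__tmp = alloca i32, addrspace(5)
# %x = addrspacecast i32 addrspace(5)* %x__tmp to i32 addrspace(0)*
```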
class _AMDGCNModule(metaclass=ABCMeta):
"""
The AMDGCN LLVM module contract
"""
@abstractmethod
def load_llvm(self, llvmir):
pass
@abstractmethod
def link_builtins(self, main):
pass
@abstractmethod
def generateGCN(self, llvmir):
pass
class AMDGCNModule(object):
"""
The AMDGCN LLVM module contract
"""
bitcodes = [
"opencl.amdgcn.bc",
"ocml.amdgcn.bc",
"ockl.amdgcn.bc",
"oclc_correctly_rounded_sqrt_off.amdgcn.bc",
"oclc_daz_opt_off.amdgcn.bc",
"oclc_finite_only_off.amdgcn.bc",
"oclc_isa_version_803.amdgcn.bc",
"oclc_unsafe_math_off.amdgcn.bc",
"irif.amdgcn.bc"
]
def __init__(self):
self._finalized = False
def _preprocess(self, llvmir):
version_adapted = adapt_llvm_version(llvmir)
alloca_fixed = alloca_addrspace_correction(version_adapted)
return alloca_fixed
def load_llvm(self, llvmir):
pass
def link_builtins(self, main):
pass
def generateGCN(self):
pass
import sys
import os
# where ROCM bitcode is installed
DEFAULT_ROCM_BC_PATH = '/opt/rocm/opencl/lib/x86_64/bitcode/'
ROCM_BC_PATH = os.environ.get("NUMBA_ROCM_BC_PATH", DEFAULT_ROCM_BC_PATH)
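An illustrative override (the path shown is an example only); it must happen before this module is first imported, since the value is read at import time:

```python
import os

os.environ["NUMBA_ROCM_BC_PATH"] = "/opt/rocm/opencl/lib/x86_64/bitcode/"
import numba.roc  # ROCM_BC_PATH then reflects the override
```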
# A temporary wrapper to connect to the HLC LLVM binaries.
# Currently, it connects to the command-line interface.
import sys
from subprocess import check_call, check_output
import subprocess
import tempfile
import os
import re
from collections import namedtuple
from numba.roc.hsadrv import devices
from .common import AMDGCNModule
from .config import ROCM_BC_PATH
from numba.roc.hlc import TRIPLE
from datetime import datetime
from contextlib import contextmanager
from numba.core import utils, config
from numba.roc.hsadrv.error import HsaSupportError
_real_check_call = check_call
NOISY_CMDLINE = False
@contextmanager
def error_pipe():
if NOISY_CMDLINE:
yield subprocess.STDOUT
else:
yield subprocess.DEVNULL
def check_call(*args, **kwargs):
# Timestamp each invocation so that it can be correlated with
# messages in the system and kernel logs.
try:
with error_pipe() as stderr:
if NOISY_CMDLINE:
print(datetime.now().strftime("%b %d %H:%M:%S"),
file=sys.stdout)
print('CMD: ' + ';'.join(args), file=sys.stdout)
ret = _real_check_call(*args, stderr=stderr, **kwargs)
except subprocess.CalledProcessError as e:
print(e)
raise
return ret
class CmdLine(object):
def _initialize(self):
if not self.initialized:
dev_ctx = devices.get_context()
target_cpu = dev_ctx.agent.name.decode('UTF-8')
self.target_cpu = "-mcpu %s" % target_cpu
self.CMD_OPT = ' '.join([
self.opt,
"-O3",
self.triple_flag,
self.target_cpu,
"-disable-simplify-libcalls",
"-verify",
"-S",
"-o {fout}",
"{fin}"])
self.CMD_VERIFY = ' '.join([
self.opt,
"-verify",
self.triple_flag,
self.target_cpu,
"-S",
"-o {fout}",
"{fin}"])
self.CMD_GEN_HSAIL = ' '.join([self.llc,
"-O2",
self.triple_flag,
self.target_cpu,
"-filetype=asm",
"-o {fout}",
"{fin}"])
self.CMD_GEN_BRIG = ' '.join([self.llc,
"-O2",
self.triple_flag,
self.target_cpu,
"-filetype=obj",
"-o {fout}",
"{fin}"])
self.CMD_LINK_BUILTINS = ' '.join([
self.llvm_link,
"-S",
"-o {fout}",
"{fin}",
"{lib}"])
self.CMD_LINK_LIBS = ' '.join([self.llvm_link,
"-S",
"-o {fout}",
"{fin}"])
self.CMD_LINK_BRIG = ' '.join([self.ld_lld,
"-shared",
"-o {fout}",
"{fin}"])
def __init__(self):
self._binary_path = os.environ.get('HSAILBIN', None)
def _setup_path(tool):
if self._binary_path is not None:
return os.path.join(self._binary_path, tool)
else:
binpath = os.path.join(sys.prefix, 'bin', tool)
return binpath
self._triple = TRIPLE
self.opt = _setup_path("opt")
self.llc = _setup_path("llc")
self.llvm_link = _setup_path("llvm-link")
self.ld_lld = _setup_path("ld.lld")
self.triple_flag = "-mtriple %s" % self._triple
self.initialized = False
def check_tooling(self):
# Make sure llc can actually target amdgcn. Ideally all tooling would
# be checked, but most of it prints nothing useful, so compilation for
# AMDGCN would have to be tested instead. This is a smoke-test-like
# check.
try:
if not os.path.isfile(self.llc):
raise HsaSupportError('llc not found')
output = check_output([self.llc, '--version'],
universal_newlines=True)
olines = [x.strip() for x in output.splitlines()]
tgtidx = olines.index('Registered Targets:')
targets = olines[tgtidx + 1:]
for tgt in targets:
if 'amdgcn' in tgt:
break
else:
msg = 'Command line tooling does not support "amdgcn" target'
raise HsaSupportError(msg)
except Exception:
raise
def verify(self, ipath, opath):
if not self.initialized:
self._initialize()
check_call(self.CMD_VERIFY.format(fout=opath, fin=ipath), shell=True)
def optimize(self, ipath, opath):
if not self.initialized:
self._initialize()
check_call(self.CMD_OPT.format(fout=opath, fin=ipath), shell=True)
def generate_hsail(self, ipath, opath):
if not self.initialized:
self._initialize()
check_call(self.CMD_GEN_HSAIL.format(fout=opath, fin=ipath), shell=True)
def generate_brig(self, ipath, opath):
if not self.initialized:
self._initialize()
check_call(self.CMD_GEN_BRIG.format(fout=opath, fin=ipath), shell=True)
def link_libs(self, ipath, libpaths, opath):
if not self.initialized:
self._initialize()
cmdline = self.CMD_LINK_LIBS.format(fout=opath, fin=ipath)
cmdline += ' '.join(["{0}".format(lib) for lib in libpaths])
check_call(cmdline, shell=True)
def link_brig(self, ipath, opath):
if not self.initialized:
self._initialize()
check_call(self.CMD_LINK_BRIG.format(fout=opath, fin=ipath), shell=True)
class Module(AMDGCNModule):
def __init__(self):
"""
Setup
"""
self._tmpdir = tempfile.mkdtemp()
self._tempfiles = []
self._linkfiles = []
self._cmd = CmdLine()
AMDGCNModule.__init__(self)
def __del__(self):
# NOTE: cleanup is intentionally skipped here; temporary files are
# left behind, which makes debugging the toolchain invocations easier.
return
self.close()
def close(self):
# Remove all temporary files
for afile in self._tempfiles:
os.unlink(afile)
# Remove directory
os.rmdir(self._tmpdir)
def _create_temp_file(self, name, mode='wb'):
path = self._track_temp_file(name)
fobj = open(path, mode=mode)
return fobj, path
def _track_temp_file(self, name):
path = os.path.join(self._tmpdir,
"{0}-{1}".format(len(self._tempfiles), name))
self._tempfiles.append(path)
return path
def load_llvm(self, llvmir):
"""
Load LLVM with HSAIL SPIR spec
"""
# Preprocess LLVM IR
llvmir = self._preprocess(llvmir)
# Create temp file to store the input file
tmp_llvm_ir, fin = self._create_temp_file("dump-llvm-ir")
with tmp_llvm_ir:
tmp_llvm_ir.write(llvmir.encode('ascii'))
# Create temp file for optimization
fout = self._track_temp_file("verified-llvm-ir")
self._cmd.verify(ipath=fin, opath=fout)
if config.DUMP_OPTIMIZED:
with open(fout, 'rb') as fin_opt:
print(fin_opt.read().decode('ascii'))
self._linkfiles.append(fout)
def link_builtins(self, ipath, opath):
# progressively link in all the bitcodes
for bc in self.bitcodes:
if bc != self.bitcodes[-1]:
tmp_opath = opath + bc.replace('/', '_').replace('.','_')
else:
tmp_opath = opath
lib = os.path.join(ROCM_BC_PATH, bc)
cmd = self._cmd.CMD_LINK_BUILTINS.format(fout=tmp_opath, fin=ipath, lib=lib)
check_call(cmd, shell=True)
ipath = tmp_opath
def generateGCN(self):
"""
Generate GCN from a module and also return the HSAIL code.
"""
assert not self._finalized, "Module already has GCN generated"
# Link dependencies libraries
llvmfile = self._linkfiles[0]
pre_builtin_path = self._track_temp_file("link-dep")
libpaths = self._linkfiles[1:]
self._cmd.link_libs(ipath=llvmfile, libpaths=libpaths,
opath=pre_builtin_path)
# Link library with the builtin modules
linked_path = self._track_temp_file("linked-path")
self.link_builtins(ipath=pre_builtin_path, opath=linked_path)
# Optimize
opt_path = self._track_temp_file("optimized-llvm-ir")
self._cmd.optimize(ipath=linked_path, opath=opt_path)
if config.DUMP_OPTIMIZED:
with open(opt_path, 'rb') as fin:
print(fin.read().decode('ascii'))
# Compile the llvm to HSAIL
hsail_path = self._track_temp_file("create-hsail")
self._cmd.generate_hsail(ipath=opt_path, opath=hsail_path)
# Compile the llvm to BRIG
brig_path = self._track_temp_file("create-brig")
self._cmd.generate_brig(ipath=opt_path, opath=brig_path)
# link
end_brig_path = self._track_temp_file("linked-brig")
self._cmd.link_brig(ipath = brig_path, opath=end_brig_path)
self._finalized = True
# Read HSAIL
with open(hsail_path, 'rb') as fin:
hsail = fin.read().decode('ascii')
# Read BRIG
with open(end_brig_path, 'rb') as fin:
brig = fin.read()
if config.DUMP_ASSEMBLY:
print(hsail)
return namedtuple('FinalizerResult', ['hsail', 'brig'])(hsail, brig)
import os
import sys
from collections import namedtuple
from ctypes import (c_size_t, byref, c_char_p, c_void_p, Structure, CDLL,
POINTER, create_string_buffer, c_int, addressof,
c_byte)
import tempfile
import os
import re
import weakref
from numba.roc.hsadrv import devices
from .common import AMDGCNModule
from numba.roc.hlc.hlc import CmdLine
from numba.core import config
# the CLI tooling is needed for the linking phase at present
cli = CmdLine()
class OpaqueModuleRef(Structure):
pass
moduleref_ptr = POINTER(OpaqueModuleRef)
def set_option(*opt):
"""
Use this to set debug flags for libHLC, using the same options
available to LLVM, e.g. -debug-pass=Structure.
"""
inp = [create_string_buffer(x.encode('ascii')) for x in (('libhlc',) + opt)]
argc = len(inp)
argv = (c_char_p * argc)()
for i in range(argc):
argv[i] = addressof(inp[i])
hlc.ROC_SetCommandLineOption(argc, byref(argv))
class Error(Exception):
pass
class HLC(object):
"""
LibHLC wrapper interface
"""
hlc = None
def __init__(self):
# Lazily load the libHLC library
bitcode_path = os.path.join(sys.prefix, 'share', 'rocmtools')
assert os.path.exists(bitcode_path) and os.path.isdir(bitcode_path)
self.bitcode_path = bitcode_path
dev_ctx = devices.get_context()
target_cpu = dev_ctx.agent.name
self.target_cpu = target_cpu
if self.hlc is None:
try:
hlc = CDLL(os.path.join(sys.prefix, 'lib', 'librocmlite.so'))
except OSError:
raise ImportError("librocmlite.so cannot be found. Please "
"install the roctools package by: "
"conda install -c numba roctools")
else:
hlc.ROC_ParseModule.restype = moduleref_ptr
hlc.ROC_ParseBitcode.restype = moduleref_ptr
hlc.ROC_ModuleEmitBRIG.restype = c_size_t
hlc.ROC_Initialize()
weakref.finalize(hlc, hlc.ROC_Finalize)
hlc.ROC_SetCommandLineOption.argtypes = [
c_int,
c_void_p,
]
type(self).hlc = hlc
def parse_assembly(self, ir):
if isinstance(ir, str):
ir = ir.encode("latin1")
buf = create_string_buffer(ir)
mod = self.hlc.ROC_ParseModule(buf)
if not mod:
raise Error("Failed to parse assembly")
return mod
def parse_bitcode(self, bitcode):
buf = create_string_buffer(bitcode, len(bitcode))
mod = self.hlc.ROC_ParseBitcode(buf, c_size_t(len(bitcode)))
if not mod:
raise Error("Failed to parse bitcode")
return mod
def optimize(self, mod, opt=3, size=0, verify=1):
if not self.hlc.ROC_ModuleOptimize(mod, int(opt), int(size),
int(verify), c_char_p(self.target_cpu)):
raise Error("Failed to optimize module")
def link(self, dst, src):
if not self.hlc.ROC_ModuleLinkIn(dst, src):
raise Error("Failed to link modules")
def to_hsail(self, mod, opt=2):
buf = c_char_p(0)
if not self.hlc.ROC_ModuleEmitHSAIL(mod, int(opt),
c_char_p(self.target_cpu), byref(buf)):
raise Error("Failed to emit HSAIL")
ret = buf.value.decode("latin1")
self.hlc.ROC_DisposeString(buf)
return ret
def _link_brig(self, upbrig_loc, patchedbrig_loc):
cli.link_brig(upbrig_loc, patchedbrig_loc)
def to_brig(self, mod, opt=2):
bufptr = c_void_p(0)
size = self.hlc.ROC_ModuleEmitBRIG(mod, int(opt),
c_char_p(self.target_cpu), byref(bufptr))
if not size:
raise Error("Failed to emit BRIG")
buf = (c_byte * size).from_address(bufptr.value)
ret = bytes(buf)
self.hlc.ROC_DisposeString(buf)
# Now we have an ELF; this needs patching with ld.lld, which doesn't
# have an API. So we write out `ret` to a temporary file, then call
# the ld.lld ELF linker main() on it to generate a patched ELF
# temporary file output, which we read back in.
# tmpdir, not using a ctx manager as debugging is easier without
tmpdir = tempfile.mkdtemp()
tmp_files = []
# write out unpatched BRIG
upbrig_file = "unpatched.brig"
upbrig_loc = os.path.join(tmpdir, upbrig_file)
with open(upbrig_loc, "wb") as up_brig_fobj:
up_brig_fobj.write(ret)
tmp_files.append(upbrig_loc)
# record the location of the patched ELF
patchedbrig_file = "patched.brig"
patchedbrig_loc = os.path.join(tmpdir, patchedbrig_file)
# call out to ld.lld to patch
self._link_brig(upbrig_loc, patchedbrig_loc)
# read back in brig temporary.
with open(patchedbrig_loc, "rb") as p_brig_fobj:
patchedBrig = p_brig_fobj.read()
tmp_files.append(patchedbrig_loc)
# Remove all temporary files
for afile in tmp_files:
os.unlink(afile)
# Remove directory
os.rmdir(tmpdir)
return patchedBrig
def to_string(self, mod):
buf = c_char_p(0)
self.hlc.ROC_ModulePrint(mod, byref(buf))
ret = buf.value.decode("latin1")
self.hlc.ROC_DisposeString(buf)
return ret
def destroy_module(self, mod):
self.hlc.ROC_ModuleDestroy(mod)
class Module(AMDGCNModule):
def __init__(self):
self._llvm_modules = []
self._hlc = HLC()
AMDGCNModule.__init__(self)
def load_llvm(self, llvmir):
"""
Load LLVM with HSAIL SPIR spec
"""
# Preprocess LLVM IR
# because HLC does not handle dots in LLVM variable names
llvmir = self._preprocess(llvmir)
mod = self._hlc.parse_assembly(llvmir)
if config.DUMP_OPTIMIZED:
print(self._hlc.to_string(mod))
self._llvm_modules.append(mod)
def link_builtins(self, main):
for bc in self.bitcodes:
bc_path = os.path.join(self._hlc.bitcode_path, bc)
with open(bc_path, 'rb') as builtin:
buf = builtin.read()
mod = self._hlc.parse_bitcode(buf)
self._hlc.link(main, mod)
def generateGCN(self):
"""
Finalize module and return the HSAIL code
"""
assert not self._finalized, "Module finalized already"
# Link dependencies
main = self._llvm_modules[0]
for dep in self._llvm_modules[1:]:
self._hlc.link(main, dep)
# link bitcode
self.link_builtins(main)
# Optimize
self._hlc.optimize(main)
if config.DUMP_OPTIMIZED:
print(self._hlc.to_string(main))
# create HSAIL
hsail = self._hlc.to_hsail(main)
# Finalize the llvm to BRIG
brig = self._hlc.to_brig(main)
self._finalized = True
# Clean up main; other modules are destroyed at linking
self._hlc.destroy_module(main)
if config.DUMP_ASSEMBLY:
print(hsail)
return namedtuple('FinalizerResult', ['hsail', 'brig'])(hsail, brig)
from numba.core import types
from numba.core.typing.npydecl import parse_dtype, parse_shape
from numba.core.typing.templates import (AttributeTemplate, AbstractTemplate,
CallableTemplate, ConcreteTemplate,
signature, Registry)
from numba import roc
registry = Registry()
intrinsic = registry.register
intrinsic_attr = registry.register_attr
intrinsic_global = registry.register_global
# =============================== NOTE ===============================
# Even though the following functions return size_t in the OpenCL
# standard, it should be rare (and unrealistic) to have 2**63 work
# items. We choose intp (signed 64-bit in the large model) because of
# the potential loss of precision in coerce(intp, uintp), which
# results in double.
@intrinsic
class Hsa_get_global_id(ConcreteTemplate):
key = roc.get_global_id
cases = [signature(types.intp, types.uint32)]
@intrinsic
class Hsa_get_local_id(ConcreteTemplate):
key = roc.get_local_id
cases = [signature(types.intp, types.uint32)]
@intrinsic
class Hsa_get_group_id(ConcreteTemplate):
key = roc.get_group_id
cases = [signature(types.intp, types.uint32)]
@intrinsic
class Hsa_get_num_groups(ConcreteTemplate):
key = roc.get_num_groups
cases = [signature(types.intp, types.uint32)]
@intrinsic
class Hsa_get_work_dim(ConcreteTemplate):
key = roc.get_work_dim
cases = [signature(types.uint32)]
@intrinsic
class Hsa_get_global_size(ConcreteTemplate):
key = roc.get_global_size
cases = [signature(types.intp, types.uint32)]
@intrinsic
class Hsa_get_local_size(ConcreteTemplate):
key = roc.get_local_size
cases = [signature(types.intp, types.uint32)]
@intrinsic
class Hsa_barrier(ConcreteTemplate):
key = roc.barrier
cases = [signature(types.void, types.uint32),
signature(types.void)]
@intrinsic
class Hsa_mem_fence(ConcreteTemplate):
key = roc.mem_fence
cases = [signature(types.void, types.uint32)]
@intrinsic
class Hsa_wavebarrier(ConcreteTemplate):
key = roc.wavebarrier
cases = [signature(types.void)]
@intrinsic
class Hsa_activelanepermute_wavewidth(ConcreteTemplate):
key = roc.activelanepermute_wavewidth
# parameter: src, laneid, identity, useidentity
cases = [signature(ty, ty, types.uint32, ty, types.bool_)
for ty in (types.integer_domain|types.real_domain)]
class _Hsa_ds_permuting(ConcreteTemplate):
# parameter: index, source
cases = [signature(types.int32, types.int32, types.int32),
signature(types.int32, types.int64, types.int32),
signature(types.float32, types.int32, types.float32),
signature(types.float32, types.int64, types.float32)]
unsafe_casting = False
@intrinsic
class Hsa_ds_permute(_Hsa_ds_permuting):
key = roc.ds_permute
@intrinsic
class Hsa_ds_bpermute(_Hsa_ds_permuting):
key = roc.ds_bpermute
# hsa.shared submodule -------------------------------------------------------
@intrinsic
class Hsa_shared_array(CallableTemplate):
key = roc.shared.array
def generic(self):
def typer(shape, dtype):
# Only integer literals and tuples of integer literals are valid
# shapes
if isinstance(shape, types.Integer):
if not isinstance(shape, types.IntegerLiteral):
return None
elif isinstance(shape, (types.Tuple, types.UniTuple)):
if any([not isinstance(s, types.IntegerLiteral) for s in shape]):
return None
else:
return None
ndim = parse_shape(shape)
nb_dtype = parse_dtype(dtype)
if nb_dtype is not None and ndim is not None:
return types.Array(dtype=nb_dtype, ndim=ndim, layout='C')
return typer
@intrinsic_attr
class HsaSharedTemplate(AttributeTemplate):
key = types.Module(roc.shared)
def resolve_array(self, mod):
return types.Function(Hsa_shared_array)
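A kernel sketch exercising the typing rule above (the shape must be an integer literal or a tuple of integer literals); assumes a working ROC stack:

```python
from numba import roc, float32

@roc.jit
def reverse64(arr):
    tmp = roc.shared.array(64, dtype=float32)  # literal shape: accepted
    i = roc.get_local_id(0)
    tmp[i] = arr[i]
    roc.barrier()
    arr[i] = tmp[63 - i]
```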
# hsa.atomic submodule -------------------------------------------------------
@intrinsic
class Hsa_atomic_add(AbstractTemplate):
key = roc.atomic.add
def generic(self, args, kws):
assert not kws
ary, idx, val = args
if ary.ndim == 1:
return signature(ary.dtype, ary, types.intp, ary.dtype)
elif ary.ndim > 1:
return signature(ary.dtype, ary, idx, ary.dtype)
@intrinsic_attr
class HsaAtomicTemplate(AttributeTemplate):
key = types.Module(roc.atomic)
def resolve_add(self, mod):
return types.Function(Hsa_atomic_add)
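A sketch of the 1-D case accepted by the template above (again assuming a working ROC stack):

```python
from numba import roc

@roc.jit
def count(bins, idx):
    i = roc.get_global_id(0)
    roc.atomic.add(bins, idx[i], 1)  # signature: (ary, intp, ary.dtype)
```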
# hsa module -----------------------------------------------------------------
@intrinsic_attr
class HsaModuleTemplate(AttributeTemplate):
key = types.Module(roc)
def resolve_get_global_id(self, mod):
return types.Function(Hsa_get_global_id)
def resolve_get_local_id(self, mod):
return types.Function(Hsa_get_local_id)
def resolve_get_global_size(self, mod):
return types.Function(Hsa_get_global_size)
def resolve_get_local_size(self, mod):
return types.Function(Hsa_get_local_size)
def resolve_get_num_groups(self, mod):
return types.Function(Hsa_get_num_groups)
def resolve_get_work_dim(self, mod):
return types.Function(Hsa_get_work_dim)
def resolve_get_group_id(self, mod):
return types.Function(Hsa_get_group_id)
def resolve_barrier(self, mod):
return types.Function(Hsa_barrier)
def resolve_mem_fence(self, mod):
return types.Function(Hsa_mem_fence)
def resolve_wavebarrier(self, mod):
return types.Function(Hsa_wavebarrier)
def resolve_activelanepermute_wavewidth(self, mod):
return types.Function(Hsa_activelanepermute_wavewidth)
def resolve_ds_permute(self, mod):
return types.Function(Hsa_ds_permute)
def resolve_ds_bpermute(self, mod):
return types.Function(Hsa_ds_bpermute)
def resolve_shared(self, mod):
return types.Module(roc.shared)
def resolve_atomic(self, mod):
return types.Module(roc.atomic)
# intrinsic
intrinsic_global(roc, types.Module(roc))
"""HSA driver
This submodule contains low level bindings to HSA
"""