Commit 5c70ef66 authored by dugupeiwen

update roc

parent 1fb0017a
@@ -144,6 +144,9 @@ class CUDA(GPU):
"""Mark the target as CUDA.
"""
class ROCm(GPU):
"""Mark the target as ROCm.
"""
class NPyUfunc(Target):
"""Mark the target as a ufunc
@@ -157,6 +160,7 @@ target_registry['GPU'] = GPU
target_registry['gpu'] = GPU
target_registry['CUDA'] = CUDA
target_registry['cuda'] = CUDA
target_registry['ROCm'] = ROCm
target_registry['npyufunc'] = NPyUfunc
dispatcher_registry = DelayedRegistry(key_type=Target)
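For reference, the new registry entry makes the marker class resolvable by name; a minimal sketch using only names from this hunk:

```python
# Minimal sketch: resolve the newly registered ROCm marker by name.
target = target_registry['ROCm']
assert issubclass(target, GPU)  # ROCm is the GPU subclass added above
```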
@@ -13,6 +13,8 @@ import numpy as np
import llvmlite.binding as llvmbind
from llvmlite import __version__ as llvmlite_version
from numba import cuda as cu, __version__ as version_number
from numba import roc
from numba.roc.hlc import hlc, libhlc
from numba.cuda import cudadrv
from numba.cuda.cudadrv.driver import driver as cudriver
from numba.cuda.cudadrv.runtime import runtime as curuntime
@@ -70,6 +72,10 @@ _numpy_supported_simd_features = 'NumPy Supported SIMD features'
_numpy_supported_simd_dispatch = 'NumPy Supported SIMD dispatch'
_numpy_supported_simd_baseline = 'NumPy Supported SIMD baseline'
_numpy_AVX512_SKX_detected = 'NumPy AVX512_SKX detected'
# ROC information
_roc_available, _roc_toolchains = 'ROC Available', 'ROC Toolchains'
_hsa_agents_count, _hsa_agents = 'HSA Agents Count', 'HSA Agents'
_hsa_gpus_count, _hsa_gpus = 'HSA Discrete GPUs Count', 'HSA Discrete GPUs'
# SVML info
_svml_state, _svml_loaded = 'SVML State', 'SVML Lib Loaded'
_llvm_svml_patched = 'LLVM SVML Patched'
@@ -309,6 +315,7 @@ def get_sysinfo():
_numba_version: version_number,
_llvm_version: '.'.join(str(i) for i in llvmbind.llvm_version_info),
_llvmlite_version: llvmlite_version,
_roc_available: roc.is_available(),
_psutil: _psutil_import,
}
@@ -417,6 +424,61 @@ def get_sysinfo():
sys_info[_numpy_AVX512_SKX_detected] = \
__cpu_features__.get("AVX512_SKX", False)
# ROC information
# If no ROC, try to report why
if not sys_info[_roc_available]:
from numba.roc.hsadrv.driver import hsa
try:
hsa.is_available  # property access triggers driver initialisation
except Exception as e:
msg = str(e)
else:
msg = 'No ROC toolchains found.'
_warning_log.append(f"Warning (roc): Error initialising ROC: {msg}")
toolchains = []
try:
libhlc.HLC()
toolchains.append('librocmlite library')
except Exception:
pass
try:
hlc.CmdLine().check_tooling()
toolchains.append('ROC command line tools')
except Exception:
pass
sys_info[_roc_toolchains] = toolchains
try:
# ROC might not be available due to a lack of toolchain, but HSA
# agents may still be listed
from numba.roc.hsadrv.driver import hsa, dgpu_count
def decode(x):
return x.decode('utf-8') if isinstance(x, bytes) else x
sys_info[_hsa_agents_count] = len(hsa.agents)
agents = []
for i, agent in enumerate(hsa.agents):
agents.append({
'Agent id': i,
'Vendor': decode(agent.vendor_name),
'Name': decode(agent.name),
'Type': agent.device,
})
sys_info[_hsa_agents] = agents
_dgpus = []
for a in hsa.agents:
if a.is_component and a.device == 'GPU':
_dgpus.append(decode(a.name))
sys_info[_hsa_gpus_count] = dgpu_count()
sys_info[_hsa_gpus] = ', '.join(_dgpus)
except Exception as e:
_warning_log.append(
"Warning (roc): No HSA Agents found, "
f"encountered exception when searching: {e}")
# SVML information
# Replicate some SVML detection logic from numba.__init__ here.
# If SVML load fails in numba.__init__ the splitting of the logic
@@ -625,6 +687,15 @@ def display_sysinfo(info=None, sep_pos=45):
("NumPy AVX512_SKX support detected",
info.get(_numpy_AVX512_SKX_detected, '?')),
("",),
("__ROC information__",),
("ROC Available", info.get(_roc_available, '?')),
("ROC Toolchains", info.get(_roc_toolchains, []) or 'None'),
("HSA Agents Count", info.get(_hsa_agents_count, 0)),
("HSA Agents:",),
(DisplaySeqMaps(info.get(_hsa_agents, {})) or ('None',)),
('HSA Discrete GPUs Count', info.get(_hsa_gpus_count, 0)),
('HSA Discrete GPUs', info.get(_hsa_gpus, 'None')),
("",),
("__SVML Information__",),
("SVML State, config.USING_SVML", info.get(_svml_state, '?')),
("SVML Library Loaded", info.get(_svml_loaded, '?')),
@@ -27,6 +27,16 @@ def _init():
Vectorize.target_registry.ondemand['cuda'] = init_cuda_vectorize
GUVectorize.target_registry.ondemand['cuda'] = init_cuda_guvectorize
def init_roc_vectorize():
from numba.roc.vectorizers import HsaVectorize
return HsaVectorize
def init_roc_guvectorize():
from numba.roc.vectorizers import HsaGUFuncVectorize
return HsaGUFuncVectorize
Vectorize.target_registry.ondemand['roc'] = init_roc_vectorize
GUVectorize.target_registry.ondemand['roc'] = init_roc_guvectorize
_init()
del _init
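With the 'roc' target registered above, vectorized ufuncs can dispatch to the HSA backend; a usage sketch, assuming a working ROC toolchain and HSA runtime:

```python
import numpy as np
from numba import vectorize

# The 'roc' target resolves to HsaVectorize via the ondemand registry.
@vectorize(['float32(float32, float32)'], target='roc')
def add(a, b):
    return a + b

a = np.arange(16, dtype=np.float32)
print(add(a, a))
```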
Setup
-----
`libhsakmt.so.1`, `libhsa-runtime64.so`, `libhsa-runtime-ext64.so` must be in
the `LD_LIBRARY_PATH`.
The standard location for these libraries is `/opt/hsa/lib`, so users can
simply run `export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hsa/lib`.
Run Tests
---------
The HSA test suite can be executed from the base of the source tree with:
```bash
python runtests.py numba.hsa.tests -vb
```
The test suite can also be executed inside the python interpreter with:
```python
import numba.hsa
numba.hsa.test("-vb")
```
Or directly from the terminal with:
```bash
python -c 'import numba.hsa; numba.hsa.test("-vb")'
```
Note that the "-vb" flags are optional. The "-v" flag enables verbose mode,
which prints the name of each test. The "-b" flag enables capturing of the
stdout messages printed from within the tests.
"""
Module that deals with HSA in a high-level way
"""
import os
import numba.testing
from .api import *
from .stubs import atomic
def is_available():
"""Returns a boolean to indicate the availability of a HSA runtime.
This will force initialization of the driver if it hasn't been
initialized. It also checks that a toolchain is present.
"""
from .hsadrv.driver import hsa
from .hlc import hlc, libhlc
has_a_toolchain = False
try:
libhlc.HLC()
has_a_toolchain = True
except Exception:
try:
hlc.CmdLine().check_tooling()
has_a_toolchain = True
except Exception:
pass
return hsa.is_available and has_a_toolchain
if is_available():
from .hsadrv.driver import hsa
agents = list(hsa.agents)
else:
agents = []
def test(*args, **kwargs):
if not is_available():
raise RuntimeError("HSA is not detected")
return numba.testing.test("numba.hsa.tests", *args, **kwargs)
import numpy as np
from numba import mviewbuf
from numba.roc.hsadrv.devices import get_context
from .stubs import (
get_global_id,
get_global_size,
get_local_id,
get_local_size,
get_group_id,
get_work_dim,
get_num_groups,
barrier,
mem_fence,
shared,
wavebarrier,
activelanepermute_wavewidth,
ds_permute,
ds_bpermute,
)
from .decorators import (
jit,
)
from .enums import (
CLK_LOCAL_MEM_FENCE,
CLK_GLOBAL_MEM_FENCE
)
from .hsadrv.driver import hsa as _hsadrv
from .hsadrv import devicearray
class _AutoDeregister(object):
def __init__(self, args):
self.args = args
def __enter__(self):
pass
def __exit__(self, exc_type, exc_val, exc_tb):
deregister(*self.args)
def register(*args):
"""Register data into the HSA system
Returns a contextmanager for use in with-context for auto deregistration.
Use in context:
with hsa.register(array):
do_work_on_HSA(array)
"""
for data in args:
if isinstance(data, np.ndarray):
_hsadrv.hsa_memory_register(data.ctypes.data, data.nbytes)
else:
raise TypeError(type(data))
return _AutoDeregister(args)
def deregister(*args):
"""Deregister data from the HSA system
"""
for data in args:
if isinstance(data, np.ndarray):
_hsadrv.hsa_memory_deregister(data.ctypes.data, data.nbytes)
else:
raise TypeError(type(data))
def device_array(shape, dtype=np.float_, strides=None, order='C'):
"""device_array(shape, dtype=np.float_, strides=None, order='C')
Allocate an empty device ndarray. Similar to :meth:`numpy.empty`.
"""
shape, strides, dtype = _prepare_shape_strides_dtype(shape, strides, dtype,
order)
return devicearray.DeviceNDArray(shape=shape, strides=strides, dtype=dtype)
def device_array_like(ary):
"""Call roc.devicearray() with information from the array.
"""
return device_array(shape=ary.shape, dtype=ary.dtype, strides=ary.strides)
def to_device(obj, stream=None, context=None, copy=True, to=None):
"""to_device(obj, context, copy=True, to=None)
Allocate and transfer a numpy ndarray or structured scalar to the device.
To copy host->device a numpy array::
ary = numpy.arange(10)
d_ary = roc.to_device(ary)
The resulting ``d_ary`` is a ``DeviceNDArray``.
To copy device->host::
hary = d_ary.copy_to_host()
To copy device->host to an existing array::
ary = numpy.empty(shape=d_ary.shape, dtype=d_ary.dtype)
d_ary.copy_to_host(ary)
"""
context = context or get_context()
if to is None:
to = devicearray.from_array_like(obj)
if copy:
to.copy_to_device(obj, stream=stream, context=context)
return to
def stream():
return _hsadrv.create_stream()
def _fill_stride_by_order(shape, dtype, order):
nd = len(shape)
strides = [0] * nd
if order == 'C':
strides[-1] = dtype.itemsize
for d in reversed(range(nd - 1)):
strides[d] = strides[d + 1] * shape[d + 1]
elif order == 'F':
strides[0] = dtype.itemsize
for d in range(1, nd):
strides[d] = strides[d - 1] * shape[d - 1]
else:
raise ValueError("order must be either 'C' or 'F'")
return tuple(strides)
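As a quick check of the stride rule above (illustrative, not part of the module):

```python
import numpy as np

# For shape (2, 3) and float64: C order gives (24, 8) byte strides,
# F order gives (8, 16), matching numpy's own layouts.
assert _fill_stride_by_order((2, 3), np.dtype(np.float64), 'C') == (24, 8)
assert _fill_stride_by_order((2, 3), np.dtype(np.float64), 'F') == (8, 16)
```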
def _prepare_shape_strides_dtype(shape, strides, dtype, order):
dtype = np.dtype(dtype)
if isinstance(shape, int):
shape = (shape,)
if isinstance(strides, int):
strides = (strides,)
else:
if shape == ():
shape = (1,)
strides = strides or _fill_stride_by_order(shape, dtype, order)
return shape, strides, dtype
def _memory_size_from_info(shape, strides, itemsize):
"""Get the byte size of a contiguous memory buffer given the shape, strides
and itemsize.
"""
assert len(shape) == len(strides), "# dim mismatch"
ndim = len(shape)
s, e = mviewbuf.memoryview_get_extents_info(shape, strides, ndim, itemsize)
return e - s
def _host_array(finegrain, shape, dtype, strides, order):
from .hsadrv import devices
shape, strides, dtype = _prepare_shape_strides_dtype(shape, strides, dtype,
order)
bytesize = _memory_size_from_info(shape, strides, dtype.itemsize)
# TODO does allowing access by all dGPUs really work in a multi-GPU system?
agents = [c._agent for c in devices.get_all_contexts()]
buf = devices.get_cpu_context().memhostalloc(bytesize, finegrain=finegrain,
allow_access_to=agents)
arr = np.ndarray(shape=shape, strides=strides, dtype=dtype, order=order,
buffer=buf)
return arr.view(type=devicearray.HostArray)
def coarsegrain_array(shape, dtype=np.float_, strides=None, order='C'):
"""coarsegrain_array(shape, dtype=np.float_, strides=None, order='C')
Similar to np.empty().
"""
return _host_array(finegrain=False, shape=shape, dtype=dtype,
strides=strides, order=order)
def finegrain_array(shape, dtype=np.float_, strides=None, order='C'):
"""finegrain_array(shape, dtype=np.float_, strides=None, order='C')
Similar to np.empty().
"""
return _host_array(finegrain=True, shape=shape, dtype=dtype,
strides=strides, order=order)
from llvmlite import binding as ll
from llvmlite.llvmpy import core as lc
from numba.core import utils
from numba.core.codegen import BaseCPUCodegen, CodeLibrary
from .hlc import DATALAYOUT, TRIPLE, hlc
class HSACodeLibrary(CodeLibrary):
def _optimize_functions(self, ll_module):
pass
def _optimize_final_module(self):
pass
def _finalize_specific(self):
pass
def get_asm_str(self):
"""
Get the human-readable assembly.
"""
m = hlc.Module()
m.load_llvm(str(self._final_module))
out = m.finalize()
return str(out.hsail)
class JITHSACodegen(BaseCPUCodegen):
_library_class = HSACodeLibrary
def _init(self, llvm_module):
assert list(llvm_module.global_variables) == [], "Module isn't empty"
self._data_layout = DATALAYOUT[utils.MACHINE_BITS]
self._target_data = ll.create_target_data(self._data_layout)
def _create_empty_module(self, name):
ir_module = lc.Module(name)
ir_module.triple = TRIPLE
return ir_module
def _module_pass_manager(self):
raise NotImplementedError
def _function_pass_manager(self, llvm_module):
raise NotImplementedError
def _add_module(self, module):
pass
import copy
from collections import namedtuple
import ctypes
import re
import numpy as np
from numba.core.typing.templates import ConcreteTemplate
from numba.core import types, config, compiler
from .hlc import hlc
from .hsadrv import devices, driver, enums, drvapi
from .hsadrv.error import HsaKernelLaunchError
from numba.roc import gcn_occupancy
from numba.roc.hsadrv.driver import hsa, dgpu_present
from .hsadrv import devicearray
from numba.core.typing.templates import AbstractTemplate
from numba.core.compiler_lock import global_compiler_lock
@global_compiler_lock
def compile_hsa(pyfunc, return_type, args, debug):
# First compilation will trigger the initialization of the HSA backend.
from .descriptor import HSATargetDesc
typingctx = HSATargetDesc.typingctx
targetctx = HSATargetDesc.targetctx
# TODO handle debug flag
flags = compiler.Flags()
# Do not compile (generate native code), just lower (to LLVM)
flags.set('no_compile')
flags.set('no_cpython_wrapper')
flags.set('no_cfunc_wrapper')
flags.unset('nrt')
# Run compilation pipeline
cres = compiler.compile_extra(typingctx=typingctx,
targetctx=targetctx,
func=pyfunc,
args=args,
return_type=return_type,
flags=flags,
locals={})
# Link dependent libraries
# targetctx.link_dependencies(cres.llvm_module, cres.target_context.linking)
library = cres.library
library.finalize()
return cres
def compile_kernel(pyfunc, args, debug=False):
cres = compile_hsa(pyfunc, types.void, args, debug=debug)
func = cres.library.get_function(cres.fndesc.llvm_func_name)
kernel = cres.target_context.prepare_hsa_kernel(func, cres.signature.args)
hsakern = HSAKernel(llvm_module=kernel.module,
name=kernel.name,
argtypes=cres.signature.args)
return hsakern
def compile_device(pyfunc, return_type, args, debug=False):
cres = compile_hsa(pyfunc, return_type, args, debug=debug)
func = cres.library.get_function(cres.fndesc.llvm_func_name)
cres.target_context.mark_hsa_device(func)
devfn = DeviceFunction(cres)
class device_function_template(ConcreteTemplate):
key = devfn
cases = [cres.signature]
cres.typing_context.insert_user_function(devfn, device_function_template)
libs = [cres.library]
cres.target_context.insert_user_function(devfn, cres.fndesc, libs)
return devfn
def compile_device_template(pyfunc):
"""Compile a DeviceFunctionTemplate
"""
from .descriptor import HSATargetDesc
dft = DeviceFunctionTemplate(pyfunc)
class device_function_template(AbstractTemplate):
key = dft
def generic(self, args, kws):
assert not kws
return dft.compile(args)
typingctx = HSATargetDesc.typingctx
typingctx.insert_user_function(dft, device_function_template)
return dft
class DeviceFunctionTemplate(object):
"""Unmaterialized device function
"""
def __init__(self, pyfunc, debug=False):
self.py_func = pyfunc
self.debug = debug
# self.inline = inline
self._compileinfos = {}
def compile(self, args):
"""Compile the function for the given argument types.
Each signature is compiled once by caching the compiled function inside
this object.
"""
if args not in self._compileinfos:
cres = compile_hsa(self.py_func, None, args, debug=self.debug)
func = cres.library.get_function(cres.fndesc.llvm_func_name)
cres.target_context.mark_hsa_device(func)
first_definition = not self._compileinfos
self._compileinfos[args] = cres
libs = [cres.library]
if first_definition:
# First definition
cres.target_context.insert_user_function(self, cres.fndesc,
libs)
else:
cres.target_context.add_user_function(self, cres.fndesc, libs)
else:
cres = self._compileinfos[args]
return cres.signature
class DeviceFunction(object):
def __init__(self, cres):
self.cres = cres
def _ensure_list(val):
if not isinstance(val, (tuple, list)):
return [val]
else:
return list(val)
def _ensure_size_or_append(val, size):
n = len(val)
for _ in range(n, size):
val.append(1)
class HSAKernelBase(object):
"""Define interface for configurable kernels
"""
def __init__(self):
self.global_size = (1,)
self.local_size = (1,)
self.stream = None
def copy(self):
return copy.copy(self)
def configure(self, global_size, local_size=None, stream=None):
"""Configure the OpenCL kernel
local_size can be None
"""
global_size = _ensure_list(global_size)
if local_size is not None:
local_size = _ensure_list(local_size)
size = max(len(global_size), len(local_size))
_ensure_size_or_append(global_size, size)
_ensure_size_or_append(local_size, size)
clone = self.copy()
clone.global_size = tuple(global_size)
clone.local_size = tuple(local_size) if local_size else None
clone.stream = stream
return clone
def forall(self, nelem, local_size=64, stream=None):
"""Simplified configuration for 1D kernel launch
"""
return self.configure(nelem, min(nelem, local_size), stream=stream)
def __getitem__(self, args):
"""Mimick CUDA python's square-bracket notation for configuration.
This assumes a the argument to be:
`griddim, blockdim, stream`
The blockdim maps directly to local_size.
The actual global_size is computed by multiplying the local_size to
griddim.
"""
griddim = _ensure_list(args[0])
blockdim = _ensure_list(args[1])
size = max(len(griddim), len(blockdim))
_ensure_size_or_append(griddim, size)
_ensure_size_or_append(blockdim, size)
# Compute global_size
gs = [g * l for g, l in zip(griddim, blockdim)]
return self.configure(gs, blockdim, *args[2:])
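A quick check of the bracket arithmetic, using the base class directly (no device is needed just to configure):

```python
# kern[4, 64] means griddim=4, blockdim=64, so global_size = 4 * 64.
kern = HSAKernelBase()
configured = kern[4, 64]
assert configured.global_size == (256,)
assert configured.local_size == (64,)
```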
_CacheEntry = namedtuple("_CacheEntry", ['symbol', 'executable',
'kernarg_region'])
class _CachedProgram(object):
def __init__(self, entry_name, binary):
self._entry_name = entry_name
self._binary = binary
# key: hsa context
self._cache = {}
def get(self):
ctx = devices.get_context()
result = self._cache.get(ctx)
# The program does not exist as GCN yet.
if result is None:
# generate GCN
symbol = '{0}'.format(self._entry_name)
agent = ctx.agent
ba = bytearray(self._binary)
bblob = ctypes.c_byte * len(self._binary)
bas = bblob.from_buffer(ba)
code_ptr = drvapi.hsa_code_object_t()
driver.hsa.hsa_code_object_deserialize(
ctypes.addressof(bas),
len(self._binary),
None,
ctypes.byref(code_ptr)
)
code = driver.CodeObject(code_ptr)
ex = driver.Executable()
ex.load(agent, code)
ex.freeze()
symobj = ex.get_symbol(agent, symbol)
regions = agent.regions.globals
kernarg_region = None
for reg in regions:
if reg.host_accessible:
if reg.supports(enums.HSA_REGION_GLOBAL_FLAG_KERNARG):
kernarg_region = reg
break
assert kernarg_region is not None
# Cache the GCN program
result = _CacheEntry(symbol=symobj, executable=ex,
kernarg_region=kernarg_region)
self._cache[ctx] = result
return ctx, result
class HSAKernel(HSAKernelBase):
"""
A HSA kernel object
"""
def __init__(self, llvm_module, name, argtypes):
super(HSAKernel, self).__init__()
self._llvm_module = llvm_module
self.assembly, self.binary = self._generateGCN()
self.entry_name = name
self.argument_types = tuple(argtypes)
self._argloc = []
# cached program
self._cacheprog = _CachedProgram(entry_name=self.entry_name,
binary=self.binary)
self._parse_kernel_resource()
def _parse_kernel_resource(self):
"""
Temporary workaround for register limit
"""
m = re.search(r"\bwavefront_sgpr_count\s*=\s*(\d+)", self.assembly)
self._wavefront_sgpr_count = int(m.group(1))
m = re.search(r"\bworkitem_vgpr_count\s*=\s*(\d+)", self.assembly)
self._workitem_vgpr_count = int(m.group(1))
def _sentry_resource_limit(self):
# only check resource factors if either sgpr or vgpr is non-zero
#if (self._wavefront_sgpr_count > 0 or self._workitem_vgpr_count > 0):
group_size = np.prod(self.local_size)
limits = gcn_occupancy.get_limiting_factors(
group_size=group_size,
vgpr_per_workitem=self._workitem_vgpr_count,
sgpr_per_wave=self._wavefront_sgpr_count)
if limits.reasons:
fmt = 'insufficient resources to launch kernel due to:\n{}'
msg = fmt.format('\n'.join(limits.suggestions))
raise HsaKernelLaunchError(msg)
def _generateGCN(self):
hlcmod = hlc.Module()
hlcmod.load_llvm(str(self._llvm_module))
return hlcmod.generateGCN()
def bind(self):
"""
Bind kernel to device
"""
ctx, entry = self._cacheprog.get()
if entry.symbol.kernarg_segment_size > 0:
sz = ctypes.sizeof(ctypes.c_byte) *\
entry.symbol.kernarg_segment_size
kernargs = entry.kernarg_region.allocate(sz)
else:
kernargs = None
return ctx, entry.symbol, kernargs, entry.kernarg_region
def __call__(self, *args):
self._sentry_resource_limit()
ctx, symbol, kernargs, kernarg_region = self.bind()
# Unpack pyobject values into ctypes scalar values
expanded_values = []
# contains lambdas to execute on return
retr = []
for ty, val in zip(self.argument_types, args):
_unpack_argument(ty, val, expanded_values, retr)
# Insert kernel arguments
base = 0
for av in expanded_values:
# Adjust for alignment
align = ctypes.sizeof(av)
pad = _calc_padding_for_alignment(align, base)
base += pad
# Move to offset
offseted = kernargs.value + base
asptr = ctypes.cast(offseted, ctypes.POINTER(type(av)))
# Assign value
asptr[0] = av
# Increment offset
base += align
# Actual Kernel launch
qq = ctx.default_queue
if self.stream is None:
hsa.implicit_sync()
# Dispatch
signal = None
if self.stream is not None:
signal = hsa.create_signal(1)
qq.insert_barrier(self.stream._get_last_signal())
qq.dispatch(symbol, kernargs, workgroup_size=self.local_size,
grid_size=self.global_size, signal=signal)
if self.stream is not None:
self.stream._add_signal(signal)
# retrieve auto converted arrays
for wb in retr:
wb()
# Free kernel region
if kernargs is not None:
if self.stream is None:
kernarg_region.free(kernargs)
else:
self.stream._add_callback(lambda: kernarg_region.free(kernargs))
def _unpack_argument(ty, val, kernelargs, retr):
"""
Convert arguments to ctypes and append to kernelargs
"""
if isinstance(ty, types.Array):
c_intp = ctypes.c_ssize_t
# if a dgpu is present, move the data to the device.
if dgpu_present:
devary, conv = devicearray.auto_device(val, devices.get_context())
if conv:
retr.append(lambda: devary.copy_to_host(val))
data = devary.device_ctypes_pointer
else:
data = ctypes.c_void_p(val.ctypes.data)
meminfo = parent = ctypes.c_void_p(0)
nitems = c_intp(val.size)
itemsize = c_intp(val.dtype.itemsize)
kernelargs.append(meminfo)
kernelargs.append(parent)
kernelargs.append(nitems)
kernelargs.append(itemsize)
kernelargs.append(data)
for ax in range(val.ndim):
kernelargs.append(c_intp(val.shape[ax]))
for ax in range(val.ndim):
kernelargs.append(c_intp(val.strides[ax]))
elif isinstance(ty, types.Integer):
cval = getattr(ctypes, "c_%s" % ty)(val)
kernelargs.append(cval)
elif ty == types.float64:
cval = ctypes.c_double(val)
kernelargs.append(cval)
elif ty == types.float32:
cval = ctypes.c_float(val)
kernelargs.append(cval)
elif ty == types.boolean:
cval = ctypes.c_uint8(int(val))
kernelargs.append(cval)
elif ty == types.complex64:
kernelargs.append(ctypes.c_float(val.real))
kernelargs.append(ctypes.c_float(val.imag))
elif ty == types.complex128:
kernelargs.append(ctypes.c_double(val.real))
kernelargs.append(ctypes.c_double(val.imag))
else:
raise NotImplementedError(ty, val)
def _calc_padding_for_alignment(align, base):
"""
Returns byte padding required to move the base pointer into proper alignment
"""
rmdr = int(base) % align
if rmdr == 0:
return 0
else:
return align - rmdr
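For example (illustrative check of the padding rule):

```python
# A base offset of 5 needs 3 bytes of padding to reach 4-byte
# alignment; an offset of 16 is already 8-byte aligned.
assert _calc_padding_for_alignment(4, 5) == 3
assert _calc_padding_for_alignment(8, 16) == 0
```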
class AutoJitHSAKernel(HSAKernelBase):
def __init__(self, func):
super(AutoJitHSAKernel, self).__init__()
self.py_func = func
self.definitions = {}
from .descriptor import HSATargetDesc
self.typingctx = HSATargetDesc.typingctx
def __call__(self, *args):
kernel = self.specialize(*args)
cfg = kernel.configure(self.global_size, self.local_size, self.stream)
cfg(*args)
def specialize(self, *args):
argtypes = tuple([self.typingctx.resolve_argument_type(a)
for a in args])
kernel = self.definitions.get(argtypes)
if kernel is None:
kernel = compile_kernel(self.py_func, argtypes)
self.definitions[argtypes] = kernel
return kernel
from numba.core import types, sigutils
from .compiler import (compile_kernel, compile_device, AutoJitHSAKernel,
compile_device_template)
def jit(signature=None, device=False):
"""JIT compile a python function conforming to
the HSA-Python
"""
if signature is None:
return autojit(device=device)
elif not sigutils.is_signature(signature):
func = signature
return autojit(device=device)(func)
else:
if device:
return _device_jit(signature)
else:
return _kernel_jit(signature)
def autojit(device=False):
if device:
return _device_autojit
else:
return _kernel_autojit
def _device_jit(signature):
argtypes, restype = sigutils.normalize_signature(signature)
def _wrapped(pyfunc):
return compile_device(pyfunc, restype, argtypes)
return _wrapped
def _kernel_jit(signature):
argtypes, restype = sigutils.normalize_signature(signature)
if restype is not None and restype != types.void:
msg = "HSA kernel must have void return type but got {restype}"
raise TypeError(msg.format(restype=restype))
def _wrapped(pyfunc):
return compile_kernel(pyfunc, argtypes)
return _wrapped
def _device_autojit(pyfunc):
return compile_device_template(pyfunc)
def _kernel_autojit(pyfunc):
return AutoJitHSAKernel(pyfunc)
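A usage sketch for the decorators above, assuming a working ROC stack (toolchain plus HSA runtime):

```python
import numpy as np
from numba import roc

@roc.jit(device=True)
def add(a, b):
    return a + b

@roc.jit
def kernel(dst, a, b):
    i = roc.get_global_id(0)
    if i < dst.size:
        dst[i] = add(a[i], b[i])

a = np.arange(256, dtype=np.float32)
out = np.zeros_like(a)
kernel[4, 64](out, a, a)  # griddim=4, blockdim=64 -> 256 work items
```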
from numba.core.descriptors import TargetDescriptor
from numba.core.options import TargetOptions
from .target import HSATargetContext, HSATypingContext
class HSATargetOptions(TargetOptions):
OPTIONS = {}
class HSATargetDesc(TargetDescriptor):
options = HSATargetOptions
typingctx = HSATypingContext()
targetctx = HSATargetContext(typingctx)
import numpy as np
from numba.np.ufunc.deviceufunc import (UFuncMechanism, GenerializedUFunc,
GUFuncCallSteps)
from numba.roc.hsadrv.driver import dgpu_present
import numba.roc.hsadrv.devicearray as devicearray
import numba.roc.api as api
class HsaUFuncDispatcher(object):
"""
Invoke the HSA ufunc specialization for the given inputs.
"""
def __init__(self, types_to_retty_kernels):
self.functions = types_to_retty_kernels
def __call__(self, *args, **kws):
"""
*args: numpy arrays
**kws:
stream -- hsa stream; when defined, asynchronous mode is used.
out -- output array. Can be a numpy array or DeviceArrayBase
depending on the input arguments. Type must match
the input arguments.
"""
return HsaUFuncMechanism.call(self.functions, args, kws)
def reduce(self, arg, stream=0):
raise NotImplementedError
class HsaUFuncMechanism(UFuncMechanism):
"""
Provide HSA specialization
"""
DEFAULT_STREAM = 0
ARRAY_ORDER = 'A'
def is_device_array(self, obj):
if dgpu_present:
return devicearray.is_hsa_ndarray(obj)
else:
return isinstance(obj, np.ndarray)
def is_host_array(self, obj):
if dgpu_present:
return False
else:
return isinstance(obj, np.ndarray)
def to_device(self, hostary, stream):
if dgpu_present:
return api.to_device(hostary)
else:
return hostary
def launch(self, func, count, stream, args):
# ILP must match vectorize kernel source
ilp = 4
# Use more wavefronts to help hide latency
tpb = 64 * 2
count = (count + (ilp - 1)) // ilp
blockcount = (count + (tpb - 1)) // tpb
func[blockcount, tpb](*args)
def device_array(self, shape, dtype, stream):
if dgpu_present:
return api.device_array(shape=shape, dtype=dtype)
else:
return np.empty(shape=shape, dtype=dtype)
def broadcast_device(self, ary, shape):
if dgpu_present:
raise NotImplementedError('device broadcast_device NIY')
else:
ax_differs = [ax for ax in range(len(shape))
if ax >= ary.ndim
or ary.shape[ax] != shape[ax]]
missingdim = len(shape) - len(ary.shape)
strides = [0] * missingdim + list(ary.strides)
for ax in ax_differs:
strides[ax] = 0
return np.ndarray(shape=shape, strides=strides,
dtype=ary.dtype, buffer=ary)
class _HsaGUFuncCallSteps(GUFuncCallSteps):
__slots__ = ()
def is_device_array(self, obj):
if dgpu_present:
return devicearray.is_hsa_ndarray(obj)
else:
return True
def to_device(self, hostary):
if dgpu_present:
return api.to_device(hostary)
else:
return hostary
def to_host(self, devary, hostary):
if dgpu_present:
out = devary.copy_to_host(hostary)
return out
else:
pass
def device_array(self, shape, dtype):
if dgpu_present:
return api.device_array(shape=shape, dtype=dtype)
else:
return np.empty(shape=shape, dtype=dtype)
def launch_kernel(self, kernel, nelem, args):
kernel.configure(nelem, min(nelem, 64))(*args)
class HSAGenerializedUFunc(GenerializedUFunc):
@property
def _call_steps(self):
return _HsaGUFuncCallSteps
def _broadcast_scalar_input(self, ary, shape):
if dgpu_present:
return devicearray.DeviceNDArray(shape=shape,
strides=(0,),
dtype=ary.dtype,
dgpu_data=ary.dgpu_data)
else:
return np.lib.stride_tricks.as_strided(ary, shape=(shape,),
strides=(0,))
def _broadcast_add_axis(self, ary, newshape):
newax = len(newshape) - len(ary.shape)
# Add 0 strides for missing dimension
newstrides = (0,) * newax + ary.strides
if dgpu_present:
return devicearray.DeviceNDArray(shape=newshape,
strides=newstrides,
dtype=ary.dtype,
dgpu_data=ary.dgpu_data)
else:
raise NotImplementedError
CLK_LOCAL_MEM_FENCE = 0
CLK_GLOBAL_MEM_FENCE = 1
import math
from collections import namedtuple
# GCN architecture specific info
simd_per_cu = 4
wave_size = 64
vector_register_file_size = 64 * 2**10 # 64 kB
byte_per_VGPR = 4
vgpr_per_simd = vector_register_file_size // byte_per_VGPR
sgpr_per_simd = 512
max_wave_count = 10
max_inflight_wave_per_cu = max_wave_count * simd_per_cu
# XXX due to limit in AMDGPU backend
max_group_size = 256
_limits = namedtuple('_limits', ['allowed_wave_due_to_sgpr',
'allowed_wave_due_to_vgpr',
'allowed_wave',
'allowed_vgpr_per_workitem',
'occupancy',
'reasons',
'suggestions'])
def get_limiting_factors(group_size, vgpr_per_workitem, sgpr_per_wave):
def _ceil(x):
return int(math.ceil(x))
# these might be zero, for resource limit treat as 1
vgpr_per_workitem = vgpr_per_workitem if vgpr_per_workitem > 0 else 1
sgpr_per_wave = sgpr_per_wave if sgpr_per_wave > 0 else 1
workitem_per_simd = group_size / simd_per_cu
required_wave_count_per_simd = _ceil(workitem_per_simd / wave_size)
required_vgpr_per_wave = vgpr_per_workitem * wave_size
# limiting factor
allowed_wave_due_to_sgpr = sgpr_per_simd // sgpr_per_wave
allowed_wave_due_to_vgpr = vgpr_per_simd // required_vgpr_per_wave
allowed_wave = min(allowed_wave_due_to_sgpr, max_wave_count, allowed_wave_due_to_vgpr)
allowed_vgpr_per_workitem = _ceil(vgpr_per_simd / required_wave_count_per_simd / wave_size)
# reasons
reasons = set()
if allowed_wave_due_to_sgpr < required_wave_count_per_simd:
reasons.add('allowed_wave_due_to_sgpr')
if allowed_wave_due_to_vgpr < required_wave_count_per_simd:
reasons.add('allowed_wave_due_to_vgpr')
if allowed_wave < required_wave_count_per_simd:
reasons.add('allowed_wave')
if group_size > max_group_size:
reasons.add('group_size')
suggestions = [_suggestions[r] for r in sorted(reasons)]
# occupancy
inflight_wave_per_cu = (0 if reasons else
required_wave_count_per_simd * simd_per_cu)
occupancy = inflight_wave_per_cu / max_inflight_wave_per_cu
return _limits(allowed_wave_due_to_sgpr=allowed_wave_due_to_sgpr,
allowed_wave_due_to_vgpr=allowed_wave_due_to_vgpr,
allowed_wave=allowed_wave,
allowed_vgpr_per_workitem=allowed_vgpr_per_workitem,
occupancy=occupancy,
reasons=reasons,
suggestions=suggestions)
_suggestions = {}
_suggestions['allowed_wave_due_to_sgpr'] = (
"* Cannot allocate enough sGPRs for all resident wavefronts."
)
_suggestions['allowed_wave_due_to_vgpr'] = (
"* Cannot allocate enough vGPRs for all resident wavefronts."
)
_suggestions['allowed_wave'] = (
"* Launch requires too many wavefronts. Try reducing group-size."
)
_suggestions['group_size'] = (
"* Exceeds max group size (256)."
)
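A worked example of the model above: a 256-work-item group using 32 vGPRs per work item and 40 sGPRs per wave fits within every limit, so no reasons are reported and occupancy is 4/40:

```python
limits = get_limiting_factors(group_size=256,
                              vgpr_per_workitem=32,
                              sgpr_per_wave=40)
assert not limits.reasons        # nothing limits the launch
assert limits.occupancy == 0.1   # 1 wave/SIMD * 4 SIMDs / 40 max
```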
import os
# 32-bit private, local, and region pointers. 64-bit global, constant and flat.
# See:
# https://github.com/RadeonOpenCompute/llvm/blob/b20b796f65ab6ac12fac4ea32e1d89e1861dee6a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp#L270-L275
# Alloc goes into addrspace(5) (private)
DATALAYOUT = {
64: ("e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
"-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
"-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"),
}
TRIPLE = "amdgcn--amdhsa"
# Use libHLC by default; set env-var NUMBA_USE_LIBHLC to '0', 'no' or
# 'false' to use the command-line HLC instead.
if os.environ.get('NUMBA_USE_LIBHLC', '').lower() not in ['0', 'no', 'false']:
from numba.roc.hlc import libhlc as hlc
"""
Shared code for the low level compiler tooling
"""
from abc import abstractmethod, ABCMeta
import re
# These are for parsing labels and metadata
_re_labelname = re.compile(r"\n\.([0-9a-z_\.]+):", re.I) # label: .<stuff>:
_re_regname = re.compile(r"%\.([0-9a-z_]+)", re.I) # register: %.<stuff>
_re_metadata_def = re.compile(r"\!\d+\s*=")
_re_metadata_correct_usage = re.compile(r"metadata\s*\![{'\"]")
_re_metadata_ref = re.compile(r"\!\d+")
# These are for parsing alloca instructions
_re_alloca_quoted = re.compile(r'(.*)"(\$.*)".*')
_re_alloca_parts = re.compile(r'(.*)=(.*alloca(.*))')
def add_metadata_type(ir):
"""
Rewrite metadata since llvm3.6 dropped the "metadata" type prefix.
"""
buf = []
for line in ir.splitlines():
# If the line is a metadata definition
if _re_metadata_def.match(line):
# and does not already contain correct usage (maybe already fixed)
if None is _re_metadata_correct_usage.search(line):
line = line.replace('!{', 'metadata !{')
line = line.replace('!"', 'metadata !"')
def sub_metadata(m):
return "metadata {0}".format(m.group(0))
line = _re_metadata_ref.sub(sub_metadata, line)
line = line.lstrip('metadata ')
buf.append(line)
return '\n'.join(buf)
def rename_register(llvmir):
"""
HLC does not like variables with a '.' prefix.
"""
def repl(mat):
return '%_dot_.{0}'.format(mat.group(1))
return _re_regname.sub(repl, llvmir)
def rename_label(llvmir):
"""
HLC does not like labels with a '.' prefix.
"""
def repl(mat):
# Re-insert the newline consumed by the label regex.
return '\n_dot_.{0}:'.format(mat.group(1))
return _re_labelname.sub(repl, llvmir)
def adapt_llvm_version(llvmir):
"""
Adapt the LLVM IR to match the syntax required by HLC.
"""
llvmir = rename_register(llvmir)
llvmir = rename_label(llvmir)
# return add_metadata_type(llvmir)
return llvmir
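A small demonstration of the register-renaming rule (illustrative only):

```python
# '%.'-prefixed names are rewritten with a '_dot_' marker.
print(rename_register('%.val = add i32 %.val, 1'))
# -> %_dot_.val = add i32 %_dot_.val, 1
```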
def alloca_addrspace_correction(llvmir):
"""
Rewrite llvmir such that allocas go into addrspace(5) and are then
addrspacecast back to addrspace(0). Alloca into addrspace(5) is a
requirement of the data layout specification.
"""
lines = llvmir.splitlines()
mangle = '__tmp'
new_ir = []
for l in lines:
# pluck lines containing alloca
if 'alloca' in l:
assignee, alloca_match, ptrty = _re_alloca_parts.match(l).groups()
q_match = _re_alloca_quoted.match(assignee)
if q_match:
start, var = q_match.groups()
var = var.strip()
name_fmt = '%s"%s"'
old_name = name_fmt % (start, var)
new_name = name_fmt % (start, var + mangle)
else:
old_name = assignee.strip()
new_name = old_name + mangle
allocaline = "%s = %s, addrspace(5)" % (new_name, alloca_match)
castline_fmt = ("%s = addrspacecast %s addrspace(5)* "
"%s to %s addrspace(0)*")
castline = castline_fmt % (old_name, ptrty, new_name, ptrty)
new_ir.append(allocaline)
new_ir.append(castline)
else:
new_ir.append(l)
return '\n'.join(new_ir)
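An illustrative run of the rewrite: each alloca line becomes a pair of lines, an alloca into addrspace(5) under a mangled name plus an addrspacecast back to the original name (output is schematic; whitespace is not byte-exact):

```python
print(alloca_addrspace_correction('%x = alloca i32'))
# %x__tmp = alloca i32, addrspace(5)
# %x = addrspacecast i32 addrspace(5)* %x__tmp to i32 addrspace(0)*
```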
class _AMDGCNModule(metaclass=ABCMeta):
"""
The AMDGCN LLVM module contract
"""
@abstractmethod
def load_llvm(self, llvmir):
pass
@abstractmethod
def link_builtins(self, main):
pass
@abstractmethod
def generateGCN(self, llvmir):
pass
class AMDGCNModule(object):
"""
The AMDGCN LLVM module contract
"""
bitcodes = [
"opencl.amdgcn.bc",
"ocml.amdgcn.bc",
"ockl.amdgcn.bc",
"oclc_correctly_rounded_sqrt_off.amdgcn.bc",
"oclc_daz_opt_off.amdgcn.bc",
"oclc_finite_only_off.amdgcn.bc",
"oclc_isa_version_803.amdgcn.bc",
"oclc_unsafe_math_off.amdgcn.bc",
"irif.amdgcn.bc"
]
def __init__(self):
self._finalized = False
def _preprocess(self, llvmir):
version_adapted = adapt_llvm_version(llvmir)
alloca_fixed = alloca_addrspace_correction(version_adapted)
return alloca_fixed
def load_llvm(self, llvmir):
pass
def link_builtins(self, main):
pass
def generateGCN(self):
pass
import sys
import os
# where ROCM bitcode is installed
DEFAULT_ROCM_BC_PATH = '/opt/rocm/opencl/lib/x86_64/bitcode/'
ROCM_BC_PATH = os.environ.get("NUMBA_ROCM_BC_PATH", DEFAULT_ROCM_BC_PATH)
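An illustrative override (the path shown is an example only); it must happen before this module is first imported, since the value is read at import time:

```python
import os

os.environ["NUMBA_ROCM_BC_PATH"] = "/opt/rocm/opencl/lib/x86_64/bitcode/"
import numba.roc  # ROCM_BC_PATH then reflects the override
```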
# A temporary wrapper to connect to the HLC LLVM binaries.
# Currently, it connects to the command-line interface.
import sys
from subprocess import check_call, check_output
import subprocess
import tempfile
import os
import re
from collections import namedtuple
from numba.roc.hsadrv import devices
from .common import AMDGCNModule
from .config import ROCM_BC_PATH
from numba.roc.hlc import TRIPLE
from datetime import datetime
from contextlib import contextmanager
from numba.core import utils, config
from numba.roc.hsadrv.error import HsaSupportError
_real_check_call = check_call
NOISY_CMDLINE = False
@contextmanager
def error_pipe():
if NOISY_CMDLINE:
yield subprocess.STDOUT
else:
yield subprocess.DEVNULL
def check_call(*args, **kwargs):
# Timestamp each invocation so that it can be correlated with
# messages in the system and kernel logs.
try:
with error_pipe() as stderr:
if NOISY_CMDLINE:
print(datetime.now().strftime("%b %d %H:%M:%S"),
file=sys.stdout)
print('CMD: ' + ';'.join(args), file=sys.stdout)
ret = _real_check_call(*args, stderr=stderr, **kwargs)
except subprocess.CalledProcessError as e:
print(e)
raise
return ret
class CmdLine(object):
def _initialize(self):
if not self.initialized:
dev_ctx = devices.get_context()
target_cpu = dev_ctx.agent.name.decode('UTF-8')
self.target_cpu = "-mcpu %s" % target_cpu
self.CMD_OPT = ' '.join([
self.opt,
"-O3",
self.triple_flag,
self.target_cpu,
"-disable-simplify-libcalls",
"-verify",
"-S",
"-o {fout}",
"{fin}"])
self.CMD_VERIFY = ' '.join([
self.opt,
"-verify",
self.triple_flag,
self.target_cpu,
"-S",
"-o {fout}",
"{fin}"])
self.CMD_GEN_HSAIL = ' '.join([self.llc,
"-O2",
self.triple_flag,
self.target_cpu,
"-filetype=asm",
"-o {fout}",
"{fin}"])
self.CMD_GEN_BRIG = ' '.join([self.llc,
"-O2",
self.triple_flag,
self.target_cpu,
"-filetype=obj",
"-o {fout}",
"{fin}"])
self.CMD_LINK_BUILTINS = ' '.join([
self.llvm_link,
"-S",
"-o {fout}",
"{fin}",
"{lib}"])
self.CMD_LINK_LIBS = ' '.join([self.llvm_link,
"-S",
"-o {fout}",
"{fin}"])
self.CMD_LINK_BRIG = ' '.join([self.ld_lld,
"-shared",
"-o {fout}",
"{fin}"])
def __init__(self):
self._binary_path = os.environ.get('HSAILBIN', None)
def _setup_path(tool):
if self._binary_path is not None:
return os.path.join(self._binary_path, tool)
else:
binpath = os.path.join(sys.prefix, 'bin', tool)
return binpath
self._triple = TRIPLE
self.opt = _setup_path("opt")
self.llc = _setup_path("llc")
self.llvm_link = _setup_path("llvm-link")
self.ld_lld = _setup_path("ld.lld")
self.triple_flag = "-mtriple %s" % self._triple
self.initialized = False
def check_tooling(self):
# Make sure llc can actually target amdgcn. Ideally all tooling would
# be checked, but most of it prints nothing useful, so compilation for
# AMDGCN would have to be tested instead. This is a smoke-test-like
# check.
try:
if not os.path.isfile(self.llc):
raise HsaSupportError('llc not found')
output = check_output([self.llc, '--version'],
universal_newlines=True)
olines = [x.strip() for x in output.splitlines()]
tgtidx = olines.index('Registered Targets:')
targets = olines[tgtidx + 1:]
for tgt in targets:
if 'amdgcn' in tgt:
break
else:
msg = 'Command line tooling does not support "amdgcn" target'
raise HsaSupportError(msg)
except Exception:
raise
def verify(self, ipath, opath):
if not self.initialized:
self._initialize()
check_call(self.CMD_VERIFY.format(fout=opath, fin=ipath), shell=True)
def optimize(self, ipath, opath):
if not self.initialized:
self._initialize()
check_call(self.CMD_OPT.format(fout=opath, fin=ipath), shell=True)
def generate_hsail(self, ipath, opath):
if not self.initialized:
self._initialize()
check_call(self.CMD_GEN_HSAIL.format(fout=opath, fin=ipath), shell=True)
def generate_brig(self, ipath, opath):
if not self.initialized:
self._initialize()
check_call(self.CMD_GEN_BRIG.format(fout=opath, fin=ipath), shell=True)
def link_libs(self, ipath, libpaths, opath):
if not self.initialized:
self._initialize()
cmdline = self.CMD_LINK_LIBS.format(fout=opath, fin=ipath)
cmdline += ' '.join(["{0}".format(lib) for lib in libpaths])
check_call(cmdline, shell=True)
def link_brig(self, ipath, opath):
if not self.initialized:
self._initialize()
check_call(self.CMD_LINK_BRIG.format(fout=opath, fin=ipath), shell=True)
class Module(AMDGCNModule):
def __init__(self):
"""
Setup
"""
self._tmpdir = tempfile.mkdtemp()
self._tempfiles = []
self._linkfiles = []
self._cmd = CmdLine()
AMDGCNModule.__init__(self)
def __del__(self):
# NOTE: cleanup is intentionally skipped here; temporary files are
# left behind, which makes debugging the toolchain invocations easier.
return
self.close()
def close(self):
# Remove all temporary files
for afile in self._tempfiles:
os.unlink(afile)
# Remove directory
os.rmdir(self._tmpdir)
def _create_temp_file(self, name, mode='wb'):
path = self._track_temp_file(name)
fobj = open(path, mode=mode)
return fobj, path
def _track_temp_file(self, name):
path = os.path.join(self._tmpdir,
"{0}-{1}".format(len(self._tempfiles), name))
self._tempfiles.append(path)
return path
def load_llvm(self, llvmir):
"""
Load LLVM with HSAIL SPIR spec
"""
# Preprocess LLVM IR
llvmir = self._preprocess(llvmir)
# Create temp file to store the input file
tmp_llvm_ir, fin = self._create_temp_file("dump-llvm-ir")
with tmp_llvm_ir:
tmp_llvm_ir.write(llvmir.encode('ascii'))
# Create temp file for optimization
fout = self._track_temp_file("verified-llvm-ir")
self._cmd.verify(ipath=fin, opath=fout)
if config.DUMP_OPTIMIZED:
with open(fout, 'rb') as fin_opt:
print(fin_opt.read().decode('ascii'))
self._linkfiles.append(fout)
def link_builtins(self, ipath, opath):
# progressively link in all the bitcodes
for bc in self.bitcodes:
if bc != self.bitcodes[-1]:
tmp_opath = opath + bc.replace('/', '_').replace('.','_')
else:
tmp_opath = opath
lib = os.path.join(ROCM_BC_PATH, bc)
cmd = self._cmd.CMD_LINK_BUILTINS.format(fout=tmp_opath, fin=ipath, lib=lib)
check_call(cmd, shell=True)
ipath = tmp_opath
def generateGCN(self):
"""
Generate GCN from a module and also return the HSAIL code.
"""
assert not self._finalized, "Module already has GCN generated"
# Link dependencies libraries
llvmfile = self._linkfiles[0]
pre_builtin_path = self._track_temp_file("link-dep")
libpaths = self._linkfiles[1:]
self._cmd.link_libs(ipath=llvmfile, libpaths=libpaths,
opath=pre_builtin_path)
# Link library with the builtin modules
linked_path = self._track_temp_file("linked-path")
self.link_builtins(ipath=pre_builtin_path, opath=linked_path)
# Optimize
opt_path = self._track_temp_file("optimized-llvm-ir")
self._cmd.optimize(ipath=linked_path, opath=opt_path)
if config.DUMP_OPTIMIZED:
with open(opt_path, 'rb') as fin:
print(fin.read().decode('ascii'))
# Compile the llvm to HSAIL
hsail_path = self._track_temp_file("create-hsail")
self._cmd.generate_hsail(ipath=opt_path, opath=hsail_path)
# Compile the llvm to BRIG
brig_path = self._track_temp_file("create-brig")
self._cmd.generate_brig(ipath=opt_path, opath=brig_path)
# link
end_brig_path = self._track_temp_file("linked-brig")
self._cmd.link_brig(ipath = brig_path, opath=end_brig_path)
self._finalized = True
# Read HSAIL
with open(hsail_path, 'rb') as fin:
hsail = fin.read().decode('ascii')
# Read BRIG
with open(end_brig_path, 'rb') as fin:
brig = fin.read()
if config.DUMP_ASSEMBLY:
print(hsail)
return namedtuple('FinalizerResult', ['hsail', 'brig'])(hsail, brig)
import os
import sys
from collections import namedtuple
from ctypes import (c_size_t, byref, c_char_p, c_void_p, Structure, CDLL,
POINTER, create_string_buffer, c_int, addressof,
c_byte)
import tempfile
import os
import re
import weakref
from numba.roc.hsadrv import devices
from .common import AMDGCNModule
from numba.roc.hlc.hlc import CmdLine
from numba.core import config
# the CLI tooling is needed for the linking phase at present
cli = CmdLine()
class OpaqueModuleRef(Structure):
pass
moduleref_ptr = POINTER(OpaqueModuleRef)
def set_option(*opt):
"""
Use this to set debug flags for libHLC, using the same options
available to LLVM, e.g. -debug-pass=Structure.
"""
inp = [create_string_buffer(x.encode('ascii')) for x in (('libhlc',) + opt)]
argc = len(inp)
argv = (c_char_p * argc)()
for i in range(argc):
argv[i] = addressof(inp[i])
hlc.ROC_SetCommandLineOption(argc, byref(argv))
class Error(Exception):
pass
class HLC(object):
"""
LibHLC wrapper interface
"""
hlc = None
def __init__(self):
# Lazily load the libHLC library
bitcode_path = os.path.join(sys.prefix, 'share', 'rocmtools')
assert os.path.exists(bitcode_path) and os.path.isdir(bitcode_path)
self.bitcode_path = bitcode_path
dev_ctx = devices.get_context()
target_cpu = dev_ctx.agent.name
self.target_cpu = target_cpu
if self.hlc is None:
try:
hlc = CDLL(os.path.join(sys.prefix, 'lib', 'librocmlite.so'))
except OSError:
raise ImportError("librocmlite.so cannot be found. Please "
"install the roctools package by: "
"conda install -c numba roctools")
else:
hlc.ROC_ParseModule.restype = moduleref_ptr
hlc.ROC_ParseBitcode.restype = moduleref_ptr
hlc.ROC_ModuleEmitBRIG.restype = c_size_t
hlc.ROC_Initialize()
weakref.finalize(hlc, hlc.ROC_Finalize)
hlc.ROC_SetCommandLineOption.argtypes = [
c_int,
c_void_p,
]
type(self).hlc = hlc
def parse_assembly(self, ir):
if isinstance(ir, str):
ir = ir.encode("latin1")
buf = create_string_buffer(ir)
mod = self.hlc.ROC_ParseModule(buf)
if not mod:
raise Error("Failed to parse assembly")
return mod
def parse_bitcode(self, bitcode):
buf = create_string_buffer(bitcode, len(bitcode))
mod = self.hlc.ROC_ParseBitcode(buf, c_size_t(len(bitcode)))
if not mod:
raise Error("Failed to parse bitcode")
return mod
def optimize(self, mod, opt=3, size=0, verify=1):
if not self.hlc.ROC_ModuleOptimize(mod, int(opt), int(size),
int(verify), c_char_p(self.target_cpu)):
raise Error("Failed to optimize module")
def link(self, dst, src):
if not self.hlc.ROC_ModuleLinkIn(dst, src):
raise Error("Failed to link modules")
def to_hsail(self, mod, opt=2):
buf = c_char_p(0)
if not self.hlc.ROC_ModuleEmitHSAIL(mod, int(opt),
c_char_p(self.target_cpu), byref(buf)):
raise Error("Failed to emit HSAIL")
ret = buf.value.decode("latin1")
self.hlc.ROC_DisposeString(buf)
return ret
def _link_brig(self, upbrig_loc, patchedbrig_loc):
cli.link_brig(upbrig_loc, patchedbrig_loc)
def to_brig(self, mod, opt=2):
bufptr = c_void_p(0)
size = self.hlc.ROC_ModuleEmitBRIG(mod, int(opt),
c_char_p(self.target_cpu), byref(bufptr))
if not size:
raise Error("Failed to emit BRIG")
buf = (c_byte * size).from_address(bufptr.value)
ret = bytes(buf)
self.hlc.ROC_DisposeString(buf)
# Now we have an ELF; this needs patching with ld.lld, which doesn't
# have an API. So we write out `ret` to a temporary file, then call
# the ld.lld ELF linker main() on it to generate a patched ELF
# temporary file output, which we read back in.
# tmpdir, not using a ctx manager as debugging is easier without
tmpdir = tempfile.mkdtemp()
tmp_files = []
# write out unpatched BRIG
upbrig_file = "unpatched.brig"
upbrig_loc = os.path.join(tmpdir, upbrig_file)
with open(upbrig_loc, "wb") as up_brig_fobj:
up_brig_fobj.write(ret)
tmp_files.append(upbrig_loc)
# record the location of the patched ELF
patchedbrig_file = "patched.brig"
patchedbrig_loc = os.path.join(tmpdir, patchedbrig_file)
# call out to ld.lld to patch
self._link_brig(upbrig_loc, patchedbrig_loc)
# read back in brig temporary.
with open(patchedbrig_loc, "rb") as p_brig_fobj:
patchedBrig = p_brig_fobj.read()
tmp_files.append(patchedbrig_loc)
# Remove all temporary files
for afile in tmp_files:
os.unlink(afile)
# Remove directory
os.rmdir(tmpdir)
return patchedBrig
def to_string(self, mod):
buf = c_char_p(0)
self.hlc.ROC_ModulePrint(mod, byref(buf))
ret = buf.value.decode("latin1")
self.hlc.ROC_DisposeString(buf)
return ret
def destroy_module(self, mod):
self.hlc.ROC_ModuleDestroy(mod)
class Module(AMDGCNModule):
def __init__(self):
self._llvm_modules = []
self._hlc = HLC()
AMDGCNModule.__init__(self)
def load_llvm(self, llvmir):
"""
Load LLVM with HSAIL SPIR spec
"""
# Preprocess LLVM IR
# because HLC does not handle dots in LLVM variable names
llvmir = self._preprocess(llvmir)
mod = self._hlc.parse_assembly(llvmir)
if config.DUMP_OPTIMIZED:
print(self._hlc.to_string(mod))
self._llvm_modules.append(mod)
def link_builtins(self, main):
for bc in self.bitcodes:
bc_path = os.path.join(self._hlc.bitcode_path, bc)
with open(bc_path, 'rb') as builtin:
buf = builtin.read()
mod = self._hlc.parse_bitcode(buf)
self._hlc.link(main, mod)
def generateGCN(self):
"""
Finalize module and return the HSAIL code
"""
assert not self._finalized, "Module finalized already"
# Link dependencies
main = self._llvm_modules[0]
for dep in self._llvm_modules[1:]:
self._hlc.link(main, dep)
# link bitcode
self.link_builtins(main)
# Optimize
self._hlc.optimize(main)
if config.DUMP_OPTIMIZED:
print(self._hlc.to_string(main))
# create HSAIL
hsail = self._hlc.to_hsail(main)
# Finalize the llvm to BRIG
brig = self._hlc.to_brig(main)
self._finalized = True
# Clean up main; other modules are destroyed at linking
self._hlc.destroy_module(main)
if config.DUMP_ASSEMBLY:
print(hsail)
return namedtuple('FinalizerResult', ['hsail', 'brig'])(hsail, brig)
from numba.core import types
from numba.core.typing.npydecl import parse_dtype, parse_shape
from numba.core.typing.templates import (AttributeTemplate, AbstractTemplate,
CallableTemplate, ConcreteTemplate,
signature, Registry)
from numba import roc
registry = Registry()
intrinsic = registry.register
intrinsic_attr = registry.register_attr
intrinsic_global = registry.register_global
# =============================== NOTE ===============================
# Even though the following functions return size_t in the OpenCL
# standard, it should be rare (and unrealistic) to have 2**63 work
# items. We choose intp (signed 64-bit in the large model) because of
# the potential loss of precision in coerce(intp, uintp), which
# results in double.
@intrinsic
class Hsa_get_global_id(ConcreteTemplate):
key = roc.get_global_id
cases = [signature(types.intp, types.uint32)]
@intrinsic
class Hsa_get_local_id(ConcreteTemplate):
key = roc.get_local_id
cases = [signature(types.intp, types.uint32)]
@intrinsic
class Hsa_get_group_id(ConcreteTemplate):
key = roc.get_group_id
cases = [signature(types.intp, types.uint32)]
@intrinsic
class Hsa_get_num_groups(ConcreteTemplate):
key = roc.get_num_groups
cases = [signature(types.intp, types.uint32)]
@intrinsic
class Hsa_get_work_dim(ConcreteTemplate):
key = roc.get_work_dim
cases = [signature(types.uint32)]
@intrinsic
class Hsa_get_global_size(ConcreteTemplate):
key = roc.get_global_size
cases = [signature(types.intp, types.uint32)]
@intrinsic
class Hsa_get_local_size(ConcreteTemplate):
key = roc.get_local_size
cases = [signature(types.intp, types.uint32)]
@intrinsic
class Hsa_barrier(ConcreteTemplate):
key = roc.barrier
cases = [signature(types.void, types.uint32),
signature(types.void)]
@intrinsic
class Hsa_mem_fence(ConcreteTemplate):
key = roc.mem_fence
cases = [signature(types.void, types.uint32)]
@intrinsic
class Hsa_wavebarrier(ConcreteTemplate):
key = roc.wavebarrier
cases = [signature(types.void)]
@intrinsic
class Hsa_activelanepermute_wavewidth(ConcreteTemplate):
key = roc.activelanepermute_wavewidth
# parameter: src, laneid, identity, useidentity
cases = [signature(ty, ty, types.uint32, ty, types.bool_)
for ty in (types.integer_domain|types.real_domain)]
class _Hsa_ds_permuting(ConcreteTemplate):
# parameter: index, source
cases = [signature(types.int32, types.int32, types.int32),
signature(types.int32, types.int64, types.int32),
signature(types.float32, types.int32, types.float32),
signature(types.float32, types.int64, types.float32)]
unsafe_casting = False
@intrinsic
class Hsa_ds_permute(_Hsa_ds_permuting):
key = roc.ds_permute
@intrinsic
class Hsa_ds_bpermute(_Hsa_ds_permuting):
key = roc.ds_bpermute
# hsa.shared submodule -------------------------------------------------------
@intrinsic
class Hsa_shared_array(CallableTemplate):
key = roc.shared.array
def generic(self):
def typer(shape, dtype):
# Only integer literals and tuples of integer literals are valid
# shapes
if isinstance(shape, types.Integer):
if not isinstance(shape, types.IntegerLiteral):
return None
elif isinstance(shape, (types.Tuple, types.UniTuple)):
if any([not isinstance(s, types.IntegerLiteral) for s in shape]):
return None
else:
return None
ndim = parse_shape(shape)
nb_dtype = parse_dtype(dtype)
if nb_dtype is not None and ndim is not None:
return types.Array(dtype=nb_dtype, ndim=ndim, layout='C')
return typer
@intrinsic_attr
class HsaSharedTemplate(AttributeTemplate):
key = types.Module(roc.shared)
def resolve_array(self, mod):
return types.Function(Hsa_shared_array)
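A kernel sketch exercising the typing rule above (the shape must be an integer literal or a tuple of integer literals); assumes a working ROC stack:

```python
from numba import roc, float32

@roc.jit
def reverse64(arr):
    tmp = roc.shared.array(64, dtype=float32)  # literal shape: accepted
    i = roc.get_local_id(0)
    tmp[i] = arr[i]
    roc.barrier()
    arr[i] = tmp[63 - i]
```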
# hsa.atomic submodule -------------------------------------------------------
@intrinsic
class Hsa_atomic_add(AbstractTemplate):
key = roc.atomic.add
def generic(self, args, kws):
assert not kws
ary, idx, val = args
if ary.ndim == 1:
return signature(ary.dtype, ary, types.intp, ary.dtype)
elif ary.ndim > 1:
return signature(ary.dtype, ary, idx, ary.dtype)
@intrinsic_attr
class HsaAtomicTemplate(AttributeTemplate):
key = types.Module(roc.atomic)
def resolve_add(self, mod):
return types.Function(Hsa_atomic_add)
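A sketch of the 1-D case accepted by the template above (again assuming a working ROC stack):

```python
from numba import roc

@roc.jit
def count(bins, idx):
    i = roc.get_global_id(0)
    roc.atomic.add(bins, idx[i], 1)  # signature: (ary, intp, ary.dtype)
```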
# hsa module -----------------------------------------------------------------
@intrinsic_attr
class HsaModuleTemplate(AttributeTemplate):
key = types.Module(roc)
def resolve_get_global_id(self, mod):
return types.Function(Hsa_get_global_id)
def resolve_get_local_id(self, mod):
return types.Function(Hsa_get_local_id)
def resolve_get_global_size(self, mod):
return types.Function(Hsa_get_global_size)
def resolve_get_local_size(self, mod):
return types.Function(Hsa_get_local_size)
def resolve_get_num_groups(self, mod):
return types.Function(Hsa_get_num_groups)
def resolve_get_work_dim(self, mod):
return types.Function(Hsa_get_work_dim)
def resolve_get_group_id(self, mod):
return types.Function(Hsa_get_group_id)
def resolve_barrier(self, mod):
return types.Function(Hsa_barrier)
def resolve_mem_fence(self, mod):
return types.Function(Hsa_mem_fence)
def resolve_wavebarrier(self, mod):
return types.Function(Hsa_wavebarrier)
def resolve_activelanepermute_wavewidth(self, mod):
return types.Function(Hsa_activelanepermute_wavewidth)
def resolve_ds_permute(self, mod):
return types.Function(Hsa_ds_permute)
def resolve_ds_bpermute(self, mod):
return types.Function(Hsa_ds_bpermute)
def resolve_shared(self, mod):
return types.Module(roc.shared)
def resolve_atomic(self, mod):
return types.Module(roc.atomic)
# intrinsic
intrinsic_global(roc, types.Module(roc))
"""HSA driver
This submodule contains low level bindings to HSA
"""