Commit 5c70ef66 authored by dugupeiwen

update roc

parent 1fb0017a
"""
An HSA dGPU-backed ndarray is recognized by checking the __hsa_memory__
attribute on the object. If it exists and evaluates to True, the object must
define shape, strides, dtype and size attributes similar to a NumPy ndarray.
"""
import warnings
import math
import copy
import weakref
from ctypes import c_void_p
import numpy as np
from numba.roc.hsadrv import driver as _driver
from numba.roc.hsadrv import devices
from numba.core import types
from .error import HsaContextMismatchError
from numba.misc import dummyarray
from numba.np import numpy_support
def is_hsa_ndarray(obj):
"Check if an object is a HSA ndarray"
return getattr(obj, '__hsa_ndarray__', False)
def verify_hsa_ndarray_interface(obj):
"Verify the HSA ndarray interface for an obj"
require_hsa_ndarray(obj)
def requires_attr(attr, typ):
if not hasattr(obj, attr):
raise AttributeError(attr)
if not isinstance(getattr(obj, attr), typ):
raise AttributeError('%s must be of type %s' % (attr, typ))
requires_attr('shape', tuple)
requires_attr('strides', tuple)
requires_attr('dtype', np.dtype)
requires_attr('size', int)
def require_hsa_ndarray(obj):
"Raises ValueError if is_hsa_ndarray(obj) evaluates False"
if not is_hsa_ndarray(obj):
raise ValueError('require an hsa ndarray object')
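# Illustrative sketch (the class below is hypothetical, not part of this
# module): any object that carries the __hsa_ndarray__ marker and NumPy-like
# metadata satisfies the duck-typed interface checked above.
def _example_hsa_ndarray_protocol():
    class FakeDeviceArray(object):
        __hsa_ndarray__ = True
        __hsa_memory__ = True
        shape = (4,)
        strides = (8,)
        dtype = np.dtype(np.float64)
        size = 4
    obj = FakeDeviceArray()
    require_hsa_ndarray(obj)           # passes: marker evaluates to True
    verify_hsa_ndarray_interface(obj)  # passes: all attributes type-check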
class DeviceNDArrayBase(object):
"""Base class for an on dGPU NDArray representation cf. numpy.ndarray
"""
__hsa_memory__ = True
__hsa_ndarray__ = True # As a result, a dgpu_data attribute must be present
def __init__(self, shape, strides, dtype, dgpu_data=None):
"""
Args
----
shape
array shape.
strides
array strides.
dtype
data type as numpy.dtype.
dgpu_data
user provided device memory for the ndarray data buffer
"""
if isinstance(shape, int):
shape = (shape,)
if isinstance(strides, int):
strides = (strides,)
self.ndim = len(shape)
if len(strides) != self.ndim:
raise ValueError('strides do not match ndim')
self._dummy = dummyarray.Array.from_desc(0, shape, strides,
dtype.itemsize)
self.shape = tuple(shape)
self.strides = tuple(strides)
self.dtype = np.dtype(dtype)
self.size = int(np.prod(self.shape))
# prepare dgpu memory
if self.size > 0:
if dgpu_data is None:
from numba.roc.api import _memory_size_from_info
self.alloc_size = _memory_size_from_info(self.shape,
self.strides, self.dtype.itemsize)
# find a coarse region on the dGPU
dgpu_data = devices.get_context().mempoolalloc(self.alloc_size)
else: # we have some preallocated dgpu_memory
sz = getattr(dgpu_data, '_hsa_memsize_', None)
if sz is None:
raise ValueError('dgpu_data has no _hsa_memsize_ attribute')
assert sz >= 0
self.alloc_size = sz
else:
dgpu_data = None
self.alloc_size = 0
self.dgpu_data = dgpu_data
@property
def _context(self):
return self.dgpu_data.context
@property
def _numba_type_(self):
"""
Magic attribute expected by Numba to get the numba type that
represents this object.
"""
dtype = numpy_support.from_dtype(self.dtype)
return types.Array(dtype, self.ndim, 'A')
@property
def device_ctypes_pointer(self):
"""Returns the ctypes pointer to the GPU data buffer
"""
if self.dgpu_data is None:
return c_void_p(0)
else:
return self.dgpu_data.device_ctypes_pointer
def copy_to_device(self, ary, stream=None, context=None):
"""Copy `ary` to `self`.
If `ary` is an HSA memory object, perform a device-to-device transfer.
Otherwise, perform a host-to-device transfer.
If `stream` is a stream object, an async copy is used.
"""
if ary.size == 0:
# Nothing to do
return
if context is not None:
if self.dgpu_data is not None:
expect, got = self._context, context
if expect.unproxy != got.unproxy:
raise HsaContextMismatchError(expect=expect, got=got)
else:
context = self._context
# TODO: Worry about multiple dGPUs
#if _driver.is_device_memory(ary):
# sz = min(self.alloc_size, ary.alloc_size)
# _driver.device_to_device(self, ary, sz)
#else:
# sz = min(_driver.host_memory_size(ary), self.alloc_size)
sz = self.alloc_size
# host_to_dGPU(context, dst, src, size):
if stream is None:
_driver.hsa.implicit_sync()
if isinstance(ary, DeviceNDArray):
_driver.dGPU_to_dGPU(self._context, self, ary, sz)
else:
_driver.host_to_dGPU(self._context, self, ary, sz)
else:
if isinstance(ary, DeviceNDArray):
_driver.async_dGPU_to_dGPU(dst_ctx=self._context,
src_ctx=ary._context,
dst=self, src=ary, size=sz,
stream=stream)
else:
_driver.async_host_to_dGPU(dst_ctx=self._context,
src_ctx=devices.get_cpu_context(),
dst=self, src=ary, size=sz,
stream=stream)
def copy_to_host(self, ary=None, stream=None):
"""Copy ``self`` to ``ary`` or create a new Numpy ndarray
if ``ary`` is ``None``.
The transfer is synchronous: the function returns after the copy
is finished.
Always returns the host array.
Example::
import numpy as np
from numba import hsa
arr = np.arange(1000)
d_arr = hsa.to_device(arr)
my_kernel[100, 100](d_arr)
result_array = d_arr.copy_to_host()
"""
if ary is None: # destination does not exist
hostary = np.empty(shape=self.alloc_size, dtype=np.byte)
else: # destination does exist, it's `ary`, check it
if ary.dtype != self.dtype:
raise TypeError('incompatible dtype')
if ary.shape != self.shape:
scalshapes = (), (1,)
if not (ary.shape in scalshapes and self.shape in scalshapes):
raise TypeError('incompatible shape; device %s; host %s' %
(self.shape, ary.shape))
if ary.strides != self.strides:
scalstrides = (), (self.dtype.itemsize,)
if not (ary.strides in scalstrides and
self.strides in scalstrides):
raise TypeError('incompatible strides; device %s; host %s' %
(self.strides, ary.strides))
hostary = ary # this is supposed to be a ptr for writing
# a location for the data exists as `hostary`
assert self.alloc_size >= 0, "Negative memory size"
context = self._context
# copy the data from the device to the hostary
if self.alloc_size != 0:
sz = self.alloc_size
if stream is None:
_driver.hsa.implicit_sync()
_driver.dGPU_to_host(context, hostary, self, sz)
else:
_driver.async_dGPU_to_host(dst_ctx=devices.get_cpu_context(),
src_ctx=self._context,
dst=hostary, src=self,
size=sz, stream=stream)
# if the location for the data was originally None
# then create a new ndarray and plumb in the new memory
if ary is None:
if self.size == 0:
hostary = np.ndarray(shape=self.shape, dtype=self.dtype,
buffer=hostary)
else:
hostary = np.ndarray(shape=self.shape, dtype=self.dtype,
strides=self.strides, buffer=hostary)
else: # else hostary points to ary and now has the right memory
hostary = ary
return hostary
def as_hsa_arg(self):
"""Returns a device memory object that is used as the argument.
"""
return self.dgpu_data
class DeviceNDArray(DeviceNDArrayBase):
'''
An on-dGPU array type
'''
def is_f_contiguous(self):
'''
Return true if the array is Fortran-contiguous.
'''
return self._dummy.is_f_contig
def is_c_contiguous(self):
'''
Return true if the array is C-contiguous.
'''
return self._dummy.is_c_contig
def reshape(self, *newshape, **kws):
"""
Reshape the array without changing its contents, similarly to
:meth:`numpy.ndarray.reshape`. Example::
d_arr = d_arr.reshape(20, 50, order='F')
"""
if len(newshape) == 1 and isinstance(newshape[0], (tuple, list)):
newshape = newshape[0]
cls = type(self)
if newshape == self.shape:
# nothing to do
return cls(shape=self.shape, strides=self.strides,
dtype=self.dtype, dgpu_data=self.dgpu_data)
newarr, extents = self._dummy.reshape(*newshape, **kws)
if extents == [self._dummy.extent]:
return cls(shape=newarr.shape, strides=newarr.strides,
dtype=self.dtype, dgpu_data=self.dgpu_data)
else:
raise NotImplementedError("operation requires copying")
def ravel(self, order='C'):
'''
Flatten the array without changing its contents, similar to
:meth:`numpy.ndarray.ravel`.
'''
cls = type(self)
newarr, extents = self._dummy.ravel(order=order)
if extents == [self._dummy.extent]:
return cls(shape=newarr.shape, strides=newarr.strides,
dtype=self.dtype, dgpu_data=self.dgpu_data)
else:
raise NotImplementedError("operation requires copying")
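# Usage sketch, assuming a ROC-capable machine and that numba.roc exposes
# to_device (mirroring the copy_to_host example above): reshape() and ravel()
# only relabel the extents of the existing allocation; a request that would
# need data movement raises NotImplementedError instead of silently copying.
def _example_device_reshape():
    import numpy as np
    from numba import roc
    d_arr = roc.to_device(np.arange(100).reshape(10, 10))
    flat = d_arr.ravel()           # view over the same dgpu_data
    square = flat.reshape(20, 5)   # still no copy
    assert square.dgpu_data is d_arr.dgpu_data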
class HostArray(np.ndarray):
__hsa_memory__ = True
@property
def device_ctypes_pointer(self):
return self.ctypes.data_as(c_void_p)
def from_array_like(ary, dgpu_data=None):
"Create a DeviceNDArray object that is like ary."
if ary.ndim == 0:
ary = ary.reshape(1)
return DeviceNDArray(ary.shape, ary.strides, ary.dtype,
dgpu_data=dgpu_data)
errmsg_contiguous_buffer = ("Array contains non-contiguous buffer and cannot "
"be transferred as a single memory region. Please "
"ensure contiguous buffer with numpy "
".ascontiguousarray()")
def _single_buffer(ary):
i = np.argmax(ary.strides)
size = ary.strides[i] * ary.shape[i]
return size == ary.nbytes
def sentry_contiguous(ary):
if not ary.flags['C_CONTIGUOUS'] and not ary.flags['F_CONTIGUOUS']:
if ary.strides[0] == 0:
# Broadcasted, ensure inner contiguous
return sentry_contiguous(ary[0])
elif _single_buffer(ary):
return True
else:
raise ValueError(errmsg_contiguous_buffer)
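# Behaviour sketch for the sentry: contiguous layouts and single-buffer views
# pass, while a strided view with gaps between elements is rejected.
def _example_sentry_contiguous():
    a = np.arange(16).reshape(4, 4)
    sentry_contiguous(a)      # OK: C-contiguous
    sentry_contiguous(a.T)    # OK: F-contiguous
    try:
        sentry_contiguous(a[:, ::2])   # gaps between the columns
    except ValueError:
        pass    # rejected with errmsg_contiguous_buffer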
def auto_device(obj, context, stream=None, copy=True):
"""
Create a DeviceArray like obj and optionally copy data from
host to device. If obj already represents device memory, it is returned and
no copy is made.
"""
if _driver.is_device_memory(obj): # it's already on the dGPU
return obj, False
else: # needs to be copied to the dGPU
sentry_contiguous(obj)
devobj = from_array_like(obj)
if copy:
devobj.copy_to_device(obj, stream=stream, context=context)
return devobj, True
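# Usage sketch: auto_device is the funnel that lets callers pass either host
# or device arrays; the returned flag reports whether a device copy was made
# (and therefore whether results must be copied back afterwards).
def _example_auto_device():
    ctx = devices.get_context()
    host_ary = np.arange(10)
    devary, conv = auto_device(host_ary, ctx)
    assert conv                           # host input: a transfer happened
    same, conv2 = auto_device(devary, ctx)
    assert same is devary and not conv2   # device input: returned as-is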
"""
Expose each GPU device directly
"""
import functools
from .driver import hsa as driver, Context as _Context
from numba.roc import servicelib
class _culist(object):
"""A thread local list of GPU instances
"""
def __init__(self):
self._lst = None
@property
def _gpus(self):
if not self._lst:
self._lst = self._init_gpus()
return self._lst
def _init_gpus(self):
gpus = []
for com in driver.components:
gpus.append(CU(com))
return gpus
def __getitem__(self, item):
return self._gpus[item]
def append(self, item):
return self._gpus.append(item)
def __len__(self):
return len(self._gpus)
def __nonzero__(self):
return bool(self._gpus)
def __iter__(self):
return iter(self._gpus)
__bool__ = __nonzero__
def reset(self):
for gpu in self:
gpu.reset()
@property
def current(self):
"""Get the current GPU object associated with the thread
"""
return _custack.top
cus = _culist()
del _culist
class CU(object):
def __init__(self, cu):
self._cu = cu
self._context = None
def __getattr__(self, key):
"""Redirect to self._gpu
"""
if key.startswith('_'):
raise AttributeError(key)
return getattr(self._cu, key)
def __repr__(self):
return repr(self._cu)
def associate_context(self):
"""Associate the context of this GPU to the running thread
"""
# No context was created for this GPU
if self._context is None:
self._context = self._cu.create_context()
return self._context
def __enter__(self):
self.associate_context()
_custack.push(self)
def __exit__(self, exc_type, exc_val, exc_tb):
assert _get_device() is self
self._context.pop()
_custack.pop()
def reset(self):
if self._context:
self._context.reset()
self._context = None
_cpu_context = None
def get_cpu_context():
global _cpu_context
if _cpu_context is None:
cpu_agent = [a for a in driver.agents if not a.is_component][0]
_cpu_context = _Context(cpu_agent)
return _cpu_context
def get_gpu(i):
return cus[i]
def get_num_gpus():
return len(cus)
_custack = servicelib.TLStack()
def _get_device(devnum=0):
"""Get the current device or use a device by device number.
"""
if not _custack:
_custack.push(get_gpu(devnum))
return _custack.top
def get_context(devnum=0):
"""Get the current device or use a device by device number, and
return the HSA context.
"""
return _get_device(devnum=devnum).associate_context()
def get_all_contexts():
return [get_context(i) for i in range(get_num_gpus())]
def require_context(fn):
"""
A decorator to ensure a context for the HSA subsystem
"""
@functools.wraps(fn)
def _require_cu_context(*args, **kws):
get_context()
return fn(*args, **kws)
return _require_cu_context
def reset():
cus.reset()
_custack.clear()
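# Usage sketch for this module: contexts are created lazily, once per CU, and
# cached on the wrapper, so repeated lookups return the same object.
def _example_device_management():
    ctx = get_context()             # device 0, created on first use
    assert ctx is get_context()     # cached: the same context comes back
    assert get_num_gpus() == len(get_all_contexts())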
"""
HSA driver bridge implementation
"""
from collections.abc import Sequence
import sys
import atexit
import os
import ctypes
import struct
import traceback
import weakref
import logging
from contextlib import contextmanager
from collections import defaultdict, deque
from functools import total_ordering
from numba import mviewbuf
from numba.core import utils, config
from .error import HsaSupportError, HsaDriverError, HsaApiError
from numba.roc.hsadrv import enums, enums_ext, drvapi
import numpy as np
_logger = logging.getLogger(__name__)
class HsaKernelTimedOut(HsaDriverError):
pass
def _device_type_to_string(device):
try:
return ['CPU', 'GPU', 'DSP'][device]
except IndexError:
return 'Unknown'
DEFAULT_HSA_DRIVER = '/opt/rocm/lib/libhsa-runtime64.so'
def _find_driver():
envpath = os.environ.get('NUMBA_HSA_DRIVER', DEFAULT_HSA_DRIVER)
if envpath == '0':
# Force fail
_raise_driver_not_found()
# Determine DLL type
if (struct.calcsize('P') != 8
or sys.platform == 'win32'
or sys.platform == 'darwin'):
_raise_platform_not_supported()
else:
# Assume to be *nix like and 64 bit
dlloader = ctypes.CDLL
dldir = ['/usr/lib', '/usr/lib64']
dlname = 'libhsa-runtime64.so'
if envpath is not None:
try:
envpath = os.path.abspath(envpath)
except ValueError:
raise HsaSupportError("NUMBA_HSA_DRIVER %s is not a valid path" %
envpath)
if not os.path.isfile(envpath):
raise HsaSupportError("NUMBA_HSA_DRIVER %s is not a valid file "
"path. Note it must be a filepath of the .so/"
".dll/.dylib or the driver" % envpath)
candidates = [envpath]
else:
# First search for the name in the default library path.
# If that is not found, try the specific path.
candidates = [dlname] + [os.path.join(x, dlname) for x in dldir]
# Load the driver; Collect driver error information
path_not_exist = []
driver_load_error = []
for path in candidates:
try:
dll = dlloader(path)
except OSError as e:
# Problem opening the DLL
path_not_exist.append(not os.path.isfile(path))
driver_load_error.append(e)
else:
return dll
# Problem loading driver
if all(path_not_exist):
_raise_driver_not_found()
else:
errmsg = '\n'.join(str(e) for e in driver_load_error)
_raise_driver_error(errmsg)
PLATFORM_NOT_SUPPORTED_ERROR = """
HSA is not currently supported on this platform ({0}).
"""
def _raise_platform_not_supported():
raise HsaSupportError(PLATFORM_NOT_SUPPORTED_ERROR.format(sys.platform))
DRIVER_NOT_FOUND_MSG = """
The HSA runtime library cannot be found.
If you are sure that HSA is installed, try setting the environment
variable NUMBA_HSA_DRIVER with the file path of the HSA runtime shared
library.
"""
def _raise_driver_not_found():
raise HsaSupportError(DRIVER_NOT_FOUND_MSG)
DRIVER_LOAD_ERROR_MSG = """
An HSA runtime library was found, but failed to load with error:
%s
"""
def _raise_driver_error(e):
raise HsaSupportError(DRIVER_LOAD_ERROR_MSG % e)
MISSING_FUNCTION_ERRMSG = """driver missing function: %s.
"""
class Recycler(object):
def __init__(self):
self._garbage = []
self.enabled = True
def free(self, obj):
self._garbage.append(obj)
self.service()
def _cleanup(self):
for obj in self._garbage:
obj._finalizer(obj)
del self._garbage[:]
def service(self):
if self.enabled:
if len(self._garbage) > 10:
self._cleanup()
def drain(self):
self._cleanup()
self.enabled = False
# The Driver ###########################################################
class Driver(object):
"""
Driver API functions are lazily bound.
"""
_singleton = None
_agent_map = None
_api_prototypes = drvapi.API_PROTOTYPES # avoid premature GC at exit
_hsa_properties = {
'version_major': (enums.HSA_SYSTEM_INFO_VERSION_MAJOR, ctypes.c_uint16),
'version_minor': (enums.HSA_SYSTEM_INFO_VERSION_MINOR, ctypes.c_uint16),
'timestamp': (enums.HSA_SYSTEM_INFO_TIMESTAMP, ctypes.c_uint64),
'timestamp_frequency': (enums.HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, ctypes.c_uint16),
'signal_max_wait': (enums.HSA_SYSTEM_INFO_SIGNAL_MAX_WAIT, ctypes.c_uint64),
}
def __new__(cls):
obj = cls._singleton
if obj is not None:
return obj
else:
obj = object.__new__(cls)
cls._singleton = obj
return obj
def __init__(self):
try:
if config.DISABLE_HSA:
raise HsaSupportError("HSA disabled by user")
self.lib = _find_driver()
self.is_initialized = False
self.initialization_error = None
except HsaSupportError as e:
self.is_initialized = True
self.initialization_error = e
self._agent_map = None
self._programs = {}
self._recycler = Recycler()
self._active_streams = weakref.WeakSet()
def _initialize_api(self):
if self.is_initialized:
return
self.is_initialized = True
try:
self.hsa_init()
except HsaApiError as e:
self.initialization_error = e
raise HsaDriverError("Error at driver init: \n%s:" % e)
else:
@atexit.register
def shutdown():
try:
for agent in self.agents:
agent.release()
except AttributeError:
# no agents were initialised,
# so self.agents isn't present
pass
else:
self._recycler.drain()
def _initialize_agents(self):
if self._agent_map is not None:
return
self._initialize_api()
agent_ids = []
def on_agent(agent_id, ctxt):
agent_ids.append(agent_id)
return enums.HSA_STATUS_SUCCESS
callback = drvapi.HSA_ITER_AGENT_CALLBACK_FUNC(on_agent)
self.hsa_iterate_agents(callback, None)
agent_map = dict((agent_id, Agent(agent_id)) for agent_id in agent_ids)
self._agent_map = agent_map
@property
def is_available(self):
self._initialize_api()
return self.initialization_error is None
@property
def agents(self):
self._initialize_agents()
return self._agent_map.values()
def create_program(self, model=enums.HSA_MACHINE_MODEL_LARGE,
profile=enums.HSA_PROFILE_FULL,
rounding_mode=enums.HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT,
options=None):
program = drvapi.hsa_ext_program_t()
assert options is None
self.hsa_ext_program_create(model, profile, rounding_mode,
options, ctypes.byref(program))
return Program(program)
def create_signal(self, initial_value, consumers=None):
if consumers is None:
consumers = tuple(self.agents)
consumers_len = len(consumers)
consumers_type = drvapi.hsa_agent_t * consumers_len
consumers = consumers_type(*[c._id for c in consumers])
result = drvapi.hsa_signal_t()
self.hsa_signal_create(initial_value, consumers_len, consumers,
ctypes.byref(result))
return Signal(result.value)
def __getattr__(self, fname):
# Initialize driver
self._initialize_api()
# First try if it is an hsa property
try:
enum, typ = self._hsa_properties[fname]
result = typ()
self.hsa_system_get_info(enum, ctypes.byref(result))
return result.value
except KeyError:
pass
# if not a property... try if it is an api call
try:
proto = self._api_prototypes[fname]
except KeyError:
raise AttributeError(fname)
if self.initialization_error is not None:
raise HsaSupportError("Error at driver init: \n%s:" %
self.initialization_error)
# Find function in driver library
libfn = self._find_api(fname)
for key, val in proto.items():
setattr(libfn, key, val)
def driver_wrapper(fn):
def wrapped(*args, **kwargs):
_logger.debug('call driver api: %s', fname)
return fn(*args, **kwargs)
return wrapped
retval = driver_wrapper(libfn)
setattr(self, fname, retval)
return retval
def _find_api(self, fname):
# Try regular
try:
return getattr(self.lib, fname)
except AttributeError:
pass
# Not found.
# Delay missing function error to use
def absent_function(*args, **kws):
raise HsaDriverError(MISSING_FUNCTION_ERRMSG % fname)
setattr(self, fname, absent_function)
return absent_function
@property
def components(self):
"""Returns a ordered list of components
The first device should be picked first
"""
return list(filter(lambda a: a.is_component, reversed(sorted(
self.agents))))
def create_stream(self):
st = Stream()
self._active_streams.add(st)
return st
def implicit_sync(self):
"""
Implicit synchronization for all asynchronous streams
across all devices.
"""
_logger.info("implicit sync")
for st in self._active_streams:
st.synchronize()
hsa = Driver()
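# Behaviour sketch: attribute access on the singleton either answers a system
# property query (via hsa_system_get_info) or lazily binds a C entry point;
# __getattr__ then caches the bound wrapper on the instance, so only the
# first access pays the lookup cost.
def _example_lazy_binding():
    major = hsa.version_major           # property: HSA_SYSTEM_INFO_VERSION_MAJOR
    fn = hsa.hsa_signal_create          # API call: found, wrapped and cached
    assert fn is hsa.hsa_signal_create  # later accesses hit the cached attribute
    return major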
class HsaWrapper(object):
def __getattr__(self, fname):
try:
enum, typ = self._hsa_properties[fname]
except KeyError:
raise AttributeError(
"%r object has no attribute %r" % (self.__class__, fname))
func = getattr(hsa, self._hsa_info_function)
result = typ()
is_array_type = hasattr(typ, '_length_')
# if the result is not a ctypes array, pass it by reference
result_buff = result if is_array_type else ctypes.byref(result)
func(self._id, enum, result_buff)
if not is_array_type or typ._type_ == ctypes.c_char:
return result.value
else:
return list(result)
def __dir__(self):
return sorted(set(dir(type(self)) +
list(self.__dict__.keys()) +
list(self._hsa_properties.keys())))
@total_ordering
class Agent(HsaWrapper):
"""Abstracts a HSA compute agent.
This will wrap and provide an OO interface for hsa_agent_t C-API elements
"""
# Note this will be handled in a rather unconventional way. When agents get
# initialized by the driver, a set of instances for all the available agents
# will be created. After that creation, the __new__ and __init__ methods will
# be replaced, and the constructor will act as a mapping from an agent_id to
# the equivalent Agent object. Any attempt to create an Agent with a non
# existing agent_id will result in an error.
#
# the logic for this resides in Driver._initialize_agents
_hsa_info_function = 'hsa_agent_get_info'
_hsa_properties = {
'name': (enums.HSA_AGENT_INFO_NAME, ctypes.c_char * 64),
'vendor_name': (enums.HSA_AGENT_INFO_VENDOR_NAME, ctypes.c_char * 64),
'feature': (enums.HSA_AGENT_INFO_FEATURE, drvapi.hsa_agent_feature_t),
'wavefront_size': (
enums.HSA_AGENT_INFO_WAVEFRONT_SIZE, ctypes.c_uint32),
'workgroup_max_dim': (
enums.HSA_AGENT_INFO_WORKGROUP_MAX_DIM, ctypes.c_uint16 * 3),
'grid_max_dim': (enums.HSA_AGENT_INFO_GRID_MAX_DIM, drvapi.hsa_dim3_t),
'grid_max_size': (enums.HSA_AGENT_INFO_GRID_MAX_SIZE, ctypes.c_uint32),
'fbarrier_max_size': (
enums.HSA_AGENT_INFO_FBARRIER_MAX_SIZE, ctypes.c_uint32),
'queues_max': (enums.HSA_AGENT_INFO_QUEUES_MAX, ctypes.c_uint32),
'queue_max_size': (
enums.HSA_AGENT_INFO_QUEUE_MAX_SIZE, ctypes.c_uint32),
'queue_type': (
enums.HSA_AGENT_INFO_QUEUE_TYPE, drvapi.hsa_queue_type_t),
'node': (enums.HSA_AGENT_INFO_NODE, ctypes.c_uint32),
'_device': (enums.HSA_AGENT_INFO_DEVICE, drvapi.hsa_device_type_t),
'cache_size': (enums.HSA_AGENT_INFO_CACHE_SIZE, ctypes.c_uint32 * 4),
'isa': (enums.HSA_AGENT_INFO_ISA, drvapi.hsa_isa_t),
}
def __init__(self, agent_id):
# This init will only happen when initializing the agents. After
# the agent initialization the instances of this class are considered
# initialized and locked, so this method will be removed.
self._id = agent_id
self._recycler = hsa._recycler
self._queues = set()
self._initialize_regions()
self._initialize_mempools()
@property
def device(self):
return _device_type_to_string(self._device)
@property
def is_component(self):
return (self.feature & enums.HSA_AGENT_FEATURE_KERNEL_DISPATCH) != 0
@property
def regions(self):
return self._regions
@property
def mempools(self):
return self._mempools
@property
def wavebits(self):
"""
log2(wavefront_size)
"""
# assume wavefront_size will always be a power of 2
return bin(self.wavefront_size)[::-1].index('1')
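# Worked example of the trick above: bin(64) == '0b1000000', so the reversed
# string is '0000001b0' and .index('1') == 6, i.e. log2(64) for a 64-lane
# wavefront.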
def _initialize_regions(self):
region_ids = []
def on_region(region_id, ctxt):
region_ids.append(region_id)
return enums.HSA_STATUS_SUCCESS
callback = drvapi.HSA_AGENT_ITERATE_REGIONS_CALLBACK_FUNC(on_region)
hsa.hsa_agent_iterate_regions(self._id, callback, None)
self._regions = _RegionList([MemRegion.instance_for(self, region_id)
for region_id in region_ids])
def _initialize_mempools(self):
mempool_ids = []
def on_region(_id, ctxt=None):
mempool_ids.append(_id)
return enums.HSA_STATUS_SUCCESS
callback = drvapi.HSA_AMD_AGENT_ITERATE_MEMORY_POOLS_CALLBACK(on_region)
hsa.hsa_amd_agent_iterate_memory_pools(self._id, callback, None)
self._mempools = _RegionList([MemPool.instance_for(self, mempool_id)
for mempool_id in mempool_ids])
def _create_queue(self, size, callback=None, data=None,
private_segment_size=None, group_segment_size=None,
queue_type=None):
assert queue_type is not None
assert size <= self.queue_max_size
cb_typ = drvapi.HSA_QUEUE_CALLBACK_FUNC
cb = ctypes.cast(None, cb_typ) if callback is None else cb_typ(callback)
result = ctypes.POINTER(drvapi.hsa_queue_t)()
private_segment_size = (ctypes.c_uint32(-1)
if private_segment_size is None
else private_segment_size)
group_segment_size = (ctypes.c_uint32(-1)
if group_segment_size is None
else group_segment_size)
hsa.hsa_queue_create(self._id, size, queue_type, cb, data,
private_segment_size, group_segment_size,
ctypes.byref(result))
q = Queue(self, result)
self._queues.add(q)
return weakref.proxy(q)
def create_queue_single(self, *args, **kwargs):
kwargs['queue_type'] = enums.HSA_QUEUE_TYPE_SINGLE
return self._create_queue(*args, **kwargs)
def create_queue_multi(self, *args, **kwargs):
kwargs['queue_type'] = enums.HSA_QUEUE_TYPE_MULTI
return self._create_queue(*args, **kwargs)
def release(self):
"""
Release all resources
Called at system teardown
"""
for q in list(self._queues):
q.release()
def release_queue(self, queue):
self._queues.remove(queue)
self._recycler.free(queue)
def __repr__(self):
return "<HSA agent ({0}): {1} {2} '{3}'{4}>".format(self._id,
self.device,
self.vendor_name,
self.name,
" (component)" if self.is_component else "")
def _rank(self):
return (self.is_component, self.grid_max_size, self._device)
def __lt__(self, other):
if isinstance(other, Agent):
return self._rank() < other._rank()
else:
return NotImplemented
def __eq__(self, other):
if isinstance(other, Agent):
return self._rank() == other._rank()
else:
return NotImplemented
def __hash__(self):
return hash(self._rank())
def create_context(self):
return Context(self)
class _RegionList(Sequence):
__slots__ = '_all', 'globals', 'readonlys', 'privates', 'groups'
def __init__(self, lst):
self._all = tuple(lst)
self.globals = tuple(x for x in lst if x.kind == 'global')
self.readonlys = tuple(x for x in lst if x.kind == 'readonly')
self.privates = tuple(x for x in lst if x.kind == 'private')
self.groups = tuple(x for x in lst if x.kind == 'group')
def __len__(self):
return len(self._all)
def __contains__(self, item):
return item in self._all
def __reversed__(self):
return reversed(self._all)
def __getitem__(self, idx):
return self._all[idx]
class MemPool(HsaWrapper):
"""Abstracts a HSA mem pool.
This will wrap and provide an OO interface for hsa_amd_memory_pool_t
C-API elements
"""
_hsa_info_function = 'hsa_amd_memory_pool_get_info'
_hsa_properties = {
'segment': (
enums_ext.HSA_AMD_MEMORY_POOL_INFO_SEGMENT,
drvapi.hsa_amd_segment_t
),
'_flags': (
enums_ext.HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS,
ctypes.c_uint32
),
'size': (enums_ext.HSA_AMD_MEMORY_POOL_INFO_SIZE,
ctypes.c_size_t),
'alloc_allowed': (enums_ext.HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
ctypes.c_bool),
'alloc_granule': (enums_ext.HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE,
ctypes.c_size_t),
'alloc_alignment': (enums_ext.HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT,
ctypes.c_size_t),
'accessible_by_all': (enums_ext.HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL,
ctypes.c_bool),
}
_segment_name_map = {
enums_ext.HSA_AMD_SEGMENT_GLOBAL: 'global',
enums_ext.HSA_AMD_SEGMENT_READONLY: 'readonly',
enums_ext.HSA_AMD_SEGMENT_PRIVATE: 'private',
enums_ext.HSA_AMD_SEGMENT_GROUP: 'group',
}
def __init__(self, agent, pool):
"""Do not instantiate MemPool objects directly, use the factory class
method 'instance_for' to ensure MemPool identity"""
self._id = pool
self._owner_agent = agent
self._as_parameter_ = self._id
@property
def kind(self):
return self._segment_name_map[self.segment]
@property
def agent(self):
return self._owner_agent
def supports(self, check_flag):
"""
Determines if a given feature is supported by this MemPool.
Feature flags are found in "./enums_ext.py" under:
* hsa_amd_memory_pool_global_flag_t
Params:
check_flag: Feature flag to test
"""
if self.kind == 'global':
return self._flags & check_flag
else:
return False
def allocate(self, nbytes):
assert self.alloc_allowed
assert nbytes >= 0
buff = ctypes.c_void_p()
flags = ctypes.c_uint32(0) # From API docs "Must be 0"!
hsa.hsa_amd_memory_pool_allocate(self._id, nbytes, flags, ctypes.byref(buff))
if buff.value is None:
raise HsaDriverError("Failed to allocate from {}".format(self))
return buff
_instance_dict = {}
@classmethod
def instance_for(cls, owner, _id):
try:
return cls._instance_dict[_id]
except KeyError:
new_instance = cls(owner, _id)
cls._instance_dict[_id] = new_instance
return new_instance
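# Identity sketch: instance_for memoizes on the raw pool id, so repeated
# lookups hand back the very same wrapper object (agent and pool_id here are
# assumed, already-initialized values).
def _example_mempool_identity(agent, pool_id):
    a = MemPool.instance_for(agent, pool_id)
    b = MemPool.instance_for(agent, pool_id)
    assert a is b   # same id, same MemPool instance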
class MemRegion(HsaWrapper):
"""Abstracts a HSA memory region.
This will wrap and provide an OO interface for hsa_region_t C-API elements
"""
_hsa_info_function = 'hsa_region_get_info'
_hsa_properties = {
'segment': (
enums.HSA_REGION_INFO_SEGMENT,
drvapi.hsa_region_segment_t
),
'_flags': (
enums.HSA_REGION_INFO_GLOBAL_FLAGS,
drvapi.hsa_region_global_flag_t
),
'host_accessible': (enums_ext.HSA_AMD_REGION_INFO_HOST_ACCESSIBLE,
ctypes.c_bool),
'size': (enums.HSA_REGION_INFO_SIZE,
ctypes.c_size_t),
'alloc_max_size': (enums.HSA_REGION_INFO_ALLOC_MAX_SIZE,
ctypes.c_size_t),
'alloc_alignment': (enums.HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
ctypes.c_size_t),
'alloc_granule': (enums.HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
ctypes.c_size_t),
'alloc_allowed': (enums.HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
ctypes.c_bool),
}
_segment_name_map = {
enums.HSA_REGION_SEGMENT_GLOBAL: 'global',
enums.HSA_REGION_SEGMENT_READONLY: 'readonly',
enums.HSA_REGION_SEGMENT_PRIVATE: 'private',
enums.HSA_REGION_SEGMENT_GROUP: 'group',
}
def __init__(self, agent, region_id):
"""Do not instantiate MemRegion objects directly, use the factory class
method 'instance_for' to ensure MemRegion identity"""
self._id = region_id
self._owner_agent = agent
self._as_parameter_ = self._id
@property
def kind(self):
return self._segment_name_map[self.segment]
@property
def agent(self):
return self._owner_agent
def supports(self, check_flag):
"""
Determines if a given feature is supported by this MemRegion.
Feature flags are found in "./enums.py" under:
* hsa_region_global_flag_t
Params:
check_flag: Feature flag to test
"""
if self.kind == 'global':
return self._flags & check_flag
else:
return False
def allocate(self, nbytes):
assert self.alloc_allowed
assert nbytes <= self.alloc_max_size
assert nbytes >= 0
buff = ctypes.c_void_p()
hsa.hsa_memory_allocate(self._id, nbytes, ctypes.byref(buff))
return buff
def free(self, ptr):
hsa.hsa_memory_free(ptr)
_instance_dict = {}
@classmethod
def instance_for(cls, owner, _id):
try:
return cls._instance_dict[_id]
except KeyError:
new_instance = cls(owner, _id)
cls._instance_dict[_id] = new_instance
return new_instance
class Queue(object):
def __init__(self, agent, queue_ptr):
"""The id in a queue is a pointer to the queue object returned by hsa_queue_create.
The Queue object has ownership on that queue object"""
self._agent = weakref.proxy(agent)
self._id = queue_ptr
self._as_parameter_ = self._id
self._finalizer = hsa.hsa_queue_destroy
def release(self):
self._agent.release_queue(self)
def __getattr__(self, fname):
return getattr(self._id.contents, fname)
@contextmanager
def _get_packet(self, packet_type):
# Write AQL packet at the calculated queue index address
queue_struct = self._id.contents
queue_mask = queue_struct.size - 1
assert (ctypes.sizeof(packet_type) ==
ctypes.sizeof(drvapi.hsa_kernel_dispatch_packet_t))
packet_array_t = (packet_type * queue_struct.size)
# Obtain the current queue write index
index = hsa.hsa_queue_add_write_index_acq_rel(self._id, 1)
while True:
read_offset = hsa.hsa_queue_load_read_index_acquire(self._id)
if read_offset <= index < read_offset + queue_struct.size:
break
queue_offset = index & queue_mask
queue = packet_array_t.from_address(queue_struct.base_address)
packet = queue[queue_offset]
# zero init
ctypes.memset(ctypes.addressof(packet), 0, ctypes.sizeof(packet_type))
yield packet
# The write index was already incremented by add_write_index above;
# ring the doorbell to notify the packet processor
hsa.hsa_signal_store_release(self._id.contents.doorbell_signal, index)
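# Index arithmetic sketch for _get_packet: queue sizes are powers of two, so
# 'index & (size - 1)' wraps the ever-increasing write index into a slot.
# E.g. with size == 8, write index 11 maps to slot 11 & 7 == 3; the loop
# above spins while that slot's packet has not yet been consumed
# (i.e. until read_offset <= index < read_offset + size holds).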
def insert_barrier(self, dep_signal):
with self._get_packet(drvapi.hsa_barrier_and_packet_t) as packet:
# Populate packet
packet.dep_signal0 = dep_signal._id
header = 0
header |= enums.HSA_FENCE_SCOPE_SYSTEM << enums.HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE
header |= enums.HSA_FENCE_SCOPE_SYSTEM << enums.HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE
header |= enums.HSA_PACKET_TYPE_BARRIER_AND << enums.HSA_PACKET_HEADER_TYPE
header |= 1 << enums.HSA_PACKET_HEADER_BARRIER
# Original example calls for an atomic store.
# Since we are on x86, store of aligned 16 bit is atomic.
# The C code is
# __atomic_store_n((uint16_t*)(&dispatch_packet->header), header, __ATOMIC_RELEASE);
packet.header = header
def dispatch(self, symbol, kernargs,
workgroup_size=None,
grid_size=None,
signal=None):
_logger.info("dispatch %s", symbol.name)
dims = len(workgroup_size)
assert dims == len(grid_size)
assert 0 < dims <= 3
assert grid_size >= workgroup_size
if workgroup_size > tuple(self._agent.workgroup_max_dim)[:dims]:
msg = "workgroupsize is too big {0} > {1}"
raise HsaDriverError(msg.format(workgroup_size,
tuple(self._agent.workgroup_max_dim)[:dims]))
s = signal if signal is not None else hsa.create_signal(1)
# Note: following vector_copy.c
with self._get_packet(drvapi.hsa_kernel_dispatch_packet_t) as packet:
# Populate packet
packet.setup |= dims << enums.HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS
packet.workgroup_size_x = workgroup_size[0]
packet.workgroup_size_y = workgroup_size[1] if dims > 1 else 1
packet.workgroup_size_z = workgroup_size[2] if dims > 2 else 1
packet.grid_size_x = grid_size[0]
packet.grid_size_y = grid_size[1] if dims > 1 else 1
packet.grid_size_z = grid_size[2] if dims > 2 else 1
packet.completion_signal = s._id
packet.kernel_object = symbol.kernel_object
packet.kernarg_address = (0 if kernargs is None
else kernargs.value)
packet.private_segment_size = symbol.private_segment_size
packet.group_segment_size = symbol.group_segment_size
header = 0
header |= enums.HSA_FENCE_SCOPE_SYSTEM << enums.HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE
header |= enums.HSA_FENCE_SCOPE_SYSTEM << enums.HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE
header |= enums.HSA_PACKET_TYPE_KERNEL_DISPATCH << enums.HSA_PACKET_HEADER_TYPE
# Original example calls for an atomic store.
# Since we are on x86, store of aligned 16 bit is atomic.
# The C code is
# __atomic_store_n((uint16_t*)(&dispatch_packet->header), header, __ATOMIC_RELEASE);
packet.header = header
# Wait on the dispatch completion signal
# synchronous if no signal was provided
if signal is None:
_logger.info('wait for synchronous kernel to complete')
timeout = 10
if not s.wait_until_ne_one(timeout=timeout):
msg = "Kernel timed out after {timeout} second"
raise HsaKernelTimedOut(msg.format(timeout=timeout))
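# Header packing sketch, assuming the standard HSA 1.0 enum values
# (KERNEL_DISPATCH == 2, HEADER_TYPE at bit offset 0, SYSTEM fence scope == 2
# at acquire/release offsets 9 and 11): the header above works out to
# (2 << 9) | (2 << 11) | 2 == 0x1402, stored with a single aligned
# 16-bit write.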
def __dir__(self):
return sorted(set(dir(self._id.contents) +
list(self.__dict__.keys())))
def owned(self):
return ManagedQueueProxy(self)
class ManagedQueueProxy(object):
def __init__(self, queue):
self._queue = weakref.ref(queue)
def __getattr__(self, item):
return getattr(self._queue(), item)
class Signal(object):
"""The id for the signal is going to be the hsa_signal_t returned by create_signal.
Lifetime of the underlying signal will be tied with this object".
Note that it is likely signals will have lifetime issues."""
def __init__(self, signal_id):
self._id = signal_id
self._as_parameter_ = self._id
weakref.finalize(self, hsa.hsa_signal_destroy, self._id)
def load_relaxed(self):
return hsa.hsa_signal_load_relaxed(self._id)
def load_acquire(self):
return hsa.hsa_signal_load_acquire(self._id)
def wait_until_ne_one(self, timeout=None):
"""
Returns a boolean to indicate whether the wait timed out
"""
one = 1
mhz = 10 ** 6
if timeout is None:
# Infinite
expire = -1 # UINT_MAX
else:
# timeout as seconds
expire = timeout * hsa.timestamp_frequency * mhz
# XXX: using an active wait instead of a blocked wait seems to avoid a hang in docker
hsa.hsa_signal_wait_acquire(self._id, enums.HSA_SIGNAL_CONDITION_NE,
one, expire,
enums.HSA_WAIT_STATE_ACTIVE)
return self.load_relaxed() != one
class BrigModule(object):
def __init__(self, brig_buffer):
"""
Take a byte buffer of a Brig module
"""
buf = ctypes.create_string_buffer(brig_buffer)
self._buffer = buf
self._id = ctypes.cast(ctypes.addressof(buf),
drvapi.hsa_ext_module_t)
@classmethod
def from_file(cls, file_name):
with open(file_name, 'rb') as fin:
buf = fin.read()
return BrigModule(buf)
def __len__(self):
return len(self._buffer)
def __repr__(self):
return "<BrigModule id={0} size={1}bytes>".format(hex(id(self)),
len(self))
class Program(object):
def __init__(self, model=enums.HSA_MACHINE_MODEL_LARGE,
profile=enums.HSA_PROFILE_FULL,
rounding_mode=enums.HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT,
options=None, version_major=1, version_minor=0):
self._id = drvapi.hsa_ext_program_t()
assert options is None
def check_fptr_return(hsa_status):
if hsa_status is not enums.HSA_STATUS_SUCCESS:
msg = ctypes.c_char_p()
hsa.hsa_status_string(hsa_status, ctypes.byref(msg))
_logger.info(msg.value.decode("utf-8"))
sys.exit(-hsa_status)
support = ctypes.c_bool(0)
hsa.hsa_system_extension_supported(enums.HSA_EXTENSION_FINALIZER,
version_major,
version_minor,
ctypes.byref(support))
assert support.value, ('HSA system extension %s.%s not supported' %
(version_major, version_minor))
# struct of function pointers
self._ftabl = drvapi.hsa_ext_finalizer_1_00_pfn_t()
# populate struct
hsa.hsa_system_get_extension_table(enums.HSA_EXTENSION_FINALIZER,
version_major,
version_minor,
ctypes.byref(self._ftabl))
ret = self._ftabl.hsa_ext_program_create(model, profile,
rounding_mode, options,
ctypes.byref(self._id))
check_fptr_return(ret)
self._as_parameter_ = self._id
weakref.finalize(self, self._ftabl.hsa_ext_program_destroy,
self._id)
def add_module(self, module):
self._ftabl.hsa_ext_program_add_module(self._id, module._id)
def finalize(self, isa, callconv=0, options=None):
"""
The program object is safe to be deleted after ``finalize``.
"""
code_object = drvapi.hsa_code_object_t()
control_directives = drvapi.hsa_ext_control_directives_t()
ctypes.memset(ctypes.byref(control_directives), 0,
ctypes.sizeof(control_directives))
self._ftabl.hsa_ext_program_finalize(self._id,
isa,
callconv,
control_directives,
options,
enums.HSA_CODE_OBJECT_TYPE_PROGRAM,
ctypes.byref(code_object))
return CodeObject(code_object)
class CodeObject(object):
def __init__(self, code_object):
self._id = code_object
self._as_parameter_ = self._id
weakref.finalize(self, hsa.hsa_code_object_destroy, self._id)
class Executable(object):
def __init__(self):
ex = drvapi.hsa_executable_t()
hsa.hsa_executable_create(enums.HSA_PROFILE_FULL,
enums.HSA_EXECUTABLE_STATE_UNFROZEN,
None,
ctypes.byref(ex))
self._id = ex
self._as_parameter_ = self._id
weakref.finalize(self, hsa.hsa_executable_destroy, self._id)
def load(self, agent, code_object):
hsa.hsa_executable_load_code_object(self._id, agent._id,
code_object._id, None)
def freeze(self):
"""Freeze executable before we can query for symbol"""
hsa.hsa_executable_freeze(self._id, None)
def get_symbol(self, agent, name):
symbol = drvapi.hsa_executable_symbol_t()
hsa.hsa_executable_get_symbol(self._id, None,
ctypes.create_string_buffer(
name.encode('ascii')),
agent._id, 0,
ctypes.byref(symbol))
return Symbol(name, symbol)
class Symbol(HsaWrapper):
_hsa_info_function = 'hsa_executable_symbol_get_info'
_hsa_properties = {
'kernel_object': (
enums.HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
ctypes.c_uint64,
),
'kernarg_segment_size': (
enums.HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
ctypes.c_uint32,
),
'group_segment_size': (
enums.HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
ctypes.c_uint32,
),
'private_segment_size': (
enums.HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
ctypes.c_uint32,
),
}
def __init__(self, name, symbol_id):
self._id = symbol_id
self.name = name
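# End-to-end sketch of the finalization pipeline these classes implement,
# assuming `agent` is a kernel-dispatch-capable Agent and `brig_path` points
# at a finalizable BRIG module:
def _example_finalize_pipeline(agent, brig_path, kernel_name):
    module = BrigModule.from_file(brig_path)
    program = Program()
    program.add_module(module)
    code = program.finalize(agent.isa)
    ex = Executable()
    ex.load(agent, code)
    ex.freeze()                                 # freeze before symbol lookup
    return ex.get_symbol(agent, kernel_name)    # ready for Queue.dispatch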
class MemoryPointer(object):
__hsa_memory__ = True
def __init__(self, context, pointer, size, finalizer=None):
assert isinstance(context, Context)
self.context = context
self.device_pointer = pointer
self.size = size
self._hsa_memsize_ = size
self.finalizer = finalizer
self.is_managed = finalizer is not None
self.is_alive = True
self.refct = 0
def __del__(self):
try:
if self.is_managed and self.is_alive:
self.finalizer()
except:
traceback.print_exc()
def own(self):
return OwnedPointer(weakref.proxy(self))
def free(self):
"""
Forces the device memory to the trash.
"""
if self.is_managed:
if not self.is_alive:
raise RuntimeError("Freeing dead memory")
self.finalizer()
self.is_alive = False
def view(self):
pointer = self.device_pointer.value
view = MemoryPointer(self.context, pointer, self.size)
return OwnedPointer(weakref.proxy(self), view)
@property
def device_ctypes_pointer(self):
return self.device_pointer
def allow_access_to(self, *agents):
"""
Grant access to given *agents*.
Upon return, only the listed agents and the owner agent have direct
access to this pointer.
"""
ct = len(agents)
if ct == 0:
return
agent_array = (ct * drvapi.hsa_agent_t)(*[a._id for a in agents])
hsa.hsa_amd_agents_allow_access(ct, agent_array, None,
self.device_pointer)
class HostMemory(mviewbuf.MemAlloc):
def __init__(self, context, owner, pointer, size):
self.context = context
self.owned = owner
self.size = size
self.host_pointer = pointer
self.handle = self.host_pointer
# For buffer interface
self._buflen_ = self.size
self._bufptr_ = self.host_pointer.value
def own(self):
return self
class OwnedPointer(object):
def __init__(self, memptr, view=None):
self._mem = memptr
self._mem.refct += 1
if view is None:
self._view = self._mem
else:
assert not view.is_managed
self._view = view
def __del__(self):
try:
self._mem.refct -= 1
assert self._mem.refct >= 0
if self._mem.refct == 0:
self._mem.free()
except ReferenceError:
pass
except:
traceback.print_exc()
def __getattr__(self, fname):
"""Proxy MemoryPointer methods
"""
return getattr(self._view, fname)
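# Lifetime sketch: MemoryPointer keeps a refct that each OwnedPointer bumps
# on construction and drops on collection; the allocation is freed only when
# the last owner dies (`memptr` is an assumed MemoryPointer from memalloc).
def _example_ownership(memptr):
    owner_a = memptr.own()   # refct -> 1
    owner_b = memptr.own()   # refct -> 2
    del owner_a              # refct -> 1: allocation stays alive
    del owner_b              # refct -> 0: MemoryPointer.free() runs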
class Context(object):
"""
A context is associated with a component
"""
"""
Parameters:
agent the agent, and instance of the class Agent
"""
# a weak set of active Stream objects
_active_streams = weakref.WeakSet()
def __init__(self, agent):
self._agent = weakref.proxy(agent)
if self._agent.is_component: # only components have queues
qs = agent.queue_max_size
defq = self._agent.create_queue_multi(qs, callback=self._callback)
self._defaultqueue = defq.owned()
self.allocations = utils.UniqueDict()
# get pools
coarse_flag = enums_ext.HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED
fine_flag = enums_ext.HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED
alloc_mps = [mp for mp in agent.mempools.globals if mp.alloc_allowed]
self._coarsegrain_mempool = None
self._finegrain_mempool = None
for mp in alloc_mps:
if mp.supports(coarse_flag):
self._coarsegrain_mempool = mp
if mp.supports(fine_flag):
self._finegrain_mempool = mp
def _callback(self, status, queue):
drvapi._check_error(status, queue)
sys.exit(1)
@property
def unproxy(self):
# This is a trick to help handle weakproxy comparison with actual
# instance.
# See https://stackoverflow.com/a/49319989 for inspiration and the
# whole page for more general discussion.
return self
@property
def default_queue(self):
return self._defaultqueue
@property
def agent(self):
return self._agent
@property
def coarsegrain_mempool(self):
if self._coarsegrain_mempool is None:
msg = 'coarsegrain mempool is not available in {}'.format(self._agent)
raise ValueError(msg)
return self._coarsegrain_mempool
@property
def finegrain_mempool(self):
if self._finegrain_mempool is None:
msg = 'finegrain mempool is not available in {}'.format(self._agent)
raise ValueError(msg)
return self._finegrain_mempool
def memalloc(self, nbytes, memTypeFlags=None, hostAccessible=True):
"""
Allocates memory.
Parameters:
nbytes the number of bytes to allocate.
memTypeFlags the flags for which the memory region must have support,\
due to the inherent rawness of the underlying call, the\
validity of the flag is not checked, cf. C language.
hostAccessible boolean as to whether the region in which the\
allocation takes place should be host accessible
"""
hw = self._agent.device
all_reg = self._agent.regions
flag_ok_r = list() # regions which pass the memTypeFlags test
regions = list()
# don't support DSP
if hw == "GPU" or hw == "CPU":
# check user requested flags
if memTypeFlags is not None:
for r in all_reg:
count = 0
for flags in memTypeFlags:
if r.supports(flags):
count += 1
if count == len(memTypeFlags):
flag_ok_r.append(r)
else:
flag_ok_r = all_reg
# check system required flags for allocation
for r in flag_ok_r:
# check the mem region is coarse grained if dGPU present
# TODO: this probably ought to explicitly check for a dGPU.
if (hw == "GPU" and
not r.supports(enums.HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)):
continue
# check accessibility criteria
if hostAccessible:
if r.host_accessible:
regions.append(r)
else:
if not r.host_accessible:
regions.append(r)
else:
raise RuntimeError("Unknown device type string \"%s\"" % hw)
assert len(regions) > 0, "No suitable memory regions found."
# walk through valid regions trying to malloc until there's none left
mem = None
for region in regions:
try:
mem = region.allocate(nbytes)
except HsaApiError: # try next memory region if an allocation fails
pass
else: # allocation succeeded, stop looking for memory
break
if mem is None:
raise RuntimeError("Memory allocation failed. No agent/region "
"combination could meet allocation constraints "
"(hardware = %s, size = %s, flags = %s)."
% (hw, nbytes, memTypeFlags))
fin = _make_mem_finalizer(hsa.hsa_memory_free)
ret = MemoryPointer(weakref.proxy(self), mem, nbytes,
finalizer=fin(self, mem))
if mem.value is None:
raise RuntimeError("MemoryPointer has no value")
self.allocations[mem.value] = ret
return ret.own()
def mempoolalloc(self, nbytes, allow_access_to=(), finegrain=False):
"""
Allocates memory in a memory pool.
Parameters:
*nbytes* the number of bytes to allocate.
*allow_access_to* agents that should be granted access to the allocation.
*finegrain* whether to allocate from the fine-grained pool.
"""
mempool = (self.finegrain_mempool
if finegrain
else self.coarsegrain_mempool)
buff = mempool.allocate(nbytes)
fin = _make_mem_finalizer(hsa.hsa_amd_memory_pool_free)
mp = MemoryPointer(weakref.proxy(self), buff, nbytes,
finalizer=fin(self, buff))
mp.allow_access_to(*allow_access_to)
self.allocations[buff.value] = mp
return mp.own()
def memhostalloc(self, size, finegrain, allow_access_to):
mem = self.mempoolalloc(size, allow_access_to=allow_access_to,
finegrain=finegrain)
return HostMemory(weakref.proxy(self), owner=mem,
pointer=mem.device_pointer, size=mem.size)
class Stream(object):
"""
An asynchronous stream for async API
"""
def __init__(self):
self._signals = deque()
self._callbacks = defaultdict(list)
def _add_signal(self, signal):
"""
Add a signal that corresponds to an async task.
"""
# XXX: too many pending signals seem to cause async copy to hang
if len(self._signals) > 100:
self._sync(50)
self._signals.append(signal)
def _add_callback(self, callback):
assert callable(callback)
self._callbacks[self._get_last_signal()].append(callback)
def _get_last_signal(self):
"""
Get the last signal.
"""
return self._signals[-1] if self._signals else None
def synchronize(self):
"""
Synchronize the stream.
"""
self._sync(len(self._signals))
def _sync(self, limit):
ct = 0
while self._signals:
if ct >= limit:
break
sig = self._signals.popleft()
if sig.load_relaxed() == 1:
sig.wait_until_ne_one()
for cb in self._callbacks[sig]:
cb()
del self._callbacks[sig]
ct += 1
@contextmanager
def auto_synchronize(self):
'''
A context manager that waits for all commands in this stream to execute
and commits any pending memory transfers upon exiting the context.
'''
yield self
self.synchronize()
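# Usage sketch: pair a Stream with the async copy helpers below so that all
# pending transfers are committed when the block exits.
def _example_stream_usage():
    stream = hsa.create_stream()
    with stream.auto_synchronize():
        pass   # enqueue async_host_to_dGPU / async_dGPU_to_host work here
    # on exit, synchronize() drained every pending signal and ran callbacks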
def _make_mem_finalizer(dtor):
"""
finalises memory
Parameters:
dtor a function that will delete/free held memory from a reference
Returns:
Finalising function
"""
def mem_finalize(context, handle):
allocations = context.allocations
sync = hsa.implicit_sync
def core():
_logger.info("Current allocations: %s", allocations)
if allocations:
_logger.info("Attempting delete on %s" % handle.value)
del allocations[handle.value]
sync() # implicit sync
dtor(handle)
return core
return mem_finalize
def device_pointer(obj):
"Get the device pointer as an integer"
return device_ctypes_pointer(obj).value
def device_ctypes_pointer(obj):
"Get the ctypes object for the device pointer"
if obj is None:
return c_void_p(0)
require_device_memory(obj)
return obj.device_ctypes_pointer
def is_device_memory(obj):
"""All HSA dGPU memory object is recognized as an instance with the
attribute "__hsa_memory__" defined and its value evaluated to True.
All HSA memory object should also define an attribute named
"device_pointer" which value is an int(or long) object carrying the pointer
value of the device memory address. This is not tested in this method.
"""
return getattr(obj, '__hsa_memory__', False)
def require_device_memory(obj):
"""A sentry for methods that accept HSA memory object.
"""
if not is_device_memory(obj):
raise Exception("Not a HSA memory object.")
def host_pointer(obj):
"""
NOTE: The underlying data pointer from the host data buffer is used and
it must not be changed until the operation, which can be asynchronous,
completes.
"""
if isinstance(obj, int):
return obj
forcewritable = isinstance(obj, np.void)
return mviewbuf.memoryview_get_buffer(obj, forcewritable)
def host_to_dGPU(context, dst, src, size):
"""
Copy data from a host memory region to a dGPU.
Parameters:
context the dGPU context
dst a pointer to the destination location in dGPU memory
src a pointer to the source location in host memory
size the size (in bytes) of data to transfer
"""
_logger.info("CPU->dGPU")
if size < 0:
raise ValueError("Invalid size given: %s" % size)
hsa.hsa_memory_copy(device_pointer(dst), host_pointer(src), size)
def dGPU_to_host(context, dst, src, size):
"""
Copy data from a dGPU memory region to the host.
Parameters:
context the dGPU context
dst a pointer to the destination location in host memory
src a pointer to the source location in dGPU memory
size the size (in bytes) of data to transfer
"""
_logger.info("dGPU->CPU")
if size < 0:
raise ValueError("Invalid size given: %s" % size)
hsa.hsa_memory_copy(host_pointer(dst), device_pointer(src), size)
def dGPU_to_dGPU(context, dst, src, size):
_logger.info("dGPU->dGPU")
if size < 0:
raise ValueError("Invalid size given: %s" % size)
hsa.hsa_memory_copy(device_pointer(dst), device_pointer(src), size)
def async_host_to_dGPU(dst_ctx, src_ctx, dst, src, size, stream):
_logger.info("Async CPU->dGPU")
async_copy_dgpu(dst_ctx=dst_ctx, src_ctx=src_ctx,
src=host_pointer(src), dst=device_pointer(dst),
size=size, stream=stream)
def async_dGPU_to_host(dst_ctx, src_ctx, dst, src, size, stream):
_logger.info("Async dGPU->CPU")
async_copy_dgpu(dst_ctx=dst_ctx, src_ctx=src_ctx,
dst=host_pointer(dst), src=device_pointer(src),
size=size, stream=stream)
def async_dGPU_to_dGPU(dst_ctx, src_ctx, dst, src, size, stream):
_logger.info("Async dGPU->dGPU")
async_copy_dgpu(dst_ctx=dst_ctx, src_ctx=src_ctx,
dst=device_pointer(dst), src=device_pointer(src),
size=size, stream=stream)
def async_copy_dgpu(dst_ctx, src_ctx, dst, src, size, stream):
if size < 0:
raise ValueError("Invalid size given: %s" % size)
completion_signal = hsa.create_signal(1)
dependent_signal = stream._get_last_signal()
if dependent_signal is not None:
dsignal = drvapi.hsa_signal_t(dependent_signal._id)
signals = (1, ctypes.byref(dsignal), completion_signal)
else:
signals = (0, None, completion_signal)
hsa.hsa_amd_memory_async_copy(dst, dst_ctx._agent._id,
src, src_ctx._agent._id,
size, *signals)
stream._add_signal(completion_signal)
def dgpu_count():
"""
Returns the number of discrete GPUs present on the current machine.
"""
ngpus = 0
try:
for a in hsa.agents:
if a.is_component and a.device == 'GPU':
ngpus += 1
except:
pass
return ngpus
"""
True if a dGPU is present in the current machine.
"""
dgpu_present = dgpu_count() > 0
import ctypes
import warnings
from numba.core import utils
from numba.roc.hsadrv import enums
from .error import HsaApiError, HsaWarning
_PTR = ctypes.POINTER
# This deals with types which are defined as
# typedef struct { uint64_t handle;};
handle_struct = ctypes.c_uint64
#------------------------------------------------------------------------------
# HSA types from hsa.h, ordered as per header file
hsa_status_t = ctypes.c_int # enum
class hsa_dim3_t(ctypes.Structure):
_fields_ = [
('x', ctypes.c_uint32),
('y', ctypes.c_uint32),
('z', ctypes.c_uint32)
]
hsa_access_permission_t = ctypes.c_int # enum
hsa_endianness_t = ctypes.c_int # enum
hsa_machine_model_t = ctypes.c_int # enum
hsa_profile_t = ctypes.c_int # enum
hsa_system_info_t = ctypes.c_int # enum
hsa_extension_t = ctypes.c_int # enum
hsa_agent_t = handle_struct
hsa_agent_feature_t = ctypes.c_int # enum
hsa_device_type_t = ctypes.c_int # enum
hsa_default_float_rounding_mode_t = ctypes.c_int # enum
hsa_agent_info_t = ctypes.c_int # enum
hsa_exception_policy_t = ctypes.c_int # enum
hsa_signal_t = handle_struct
hsa_signal_value_t = ctypes.c_uint64 if enums.HSA_LARGE_MODEL else ctypes.c_uint32
hsa_signal_condition_t = ctypes.c_int # enum
hsa_wait_state_t = ctypes.c_int # enum
hsa_region_t = handle_struct
hsa_queue_type_t = ctypes.c_int # enum
hsa_queue_feature_t = ctypes.c_int # enum
class hsa_queue_t(ctypes.Structure):
"""In theory, this should be aligned to 64 bytes. In any case, allocation
of this structure is done by the hsa library"""
_fields_ = [
('type', hsa_queue_type_t),
('features', ctypes.c_uint32),
('base_address', ctypes.c_void_p), # if LARGE MODEL
('doorbell_signal', hsa_signal_t),
('size', ctypes.c_uint32),
('reserved1', ctypes.c_uint32),
('id', ctypes.c_uint32),
]
hsa_packet_type_t = ctypes.c_int # enum
hsa_fence_scope_t = ctypes.c_int # enum
hsa_packet_header_t = ctypes.c_int # enum
hsa_packet_header_width_t = ctypes.c_int # enum
hsa_kernel_dispatch_packet_setup_t = ctypes.c_int # enum
hsa_kernel_dispatch_packet_setup_width_t = ctypes.c_int # enum
class hsa_kernel_dispatch_packet_t(ctypes.Structure):
_fields_ = [
('header', ctypes.c_uint16),
('setup', ctypes.c_uint16),
('workgroup_size_x', ctypes.c_uint16),
('workgroup_size_y', ctypes.c_uint16),
('workgroup_size_z', ctypes.c_uint16),
('reserved0', ctypes.c_uint16), # Must be zero
('grid_size_x', ctypes.c_uint32),
('grid_size_y', ctypes.c_uint32),
('grid_size_z', ctypes.c_uint32),
('private_segment_size', ctypes.c_uint32),
('group_segment_size', ctypes.c_uint32),
('kernel_object', ctypes.c_uint64),
# NOTE: Small model not dealt with properly...!
# ifdef HSA_LARGE_MODEL
('kernarg_address', ctypes.c_uint64),
# SMALL Machine has a reserved uint32
('reserved2', ctypes.c_uint64), # Must be zero
('completion_signal', hsa_signal_t),
]
class hsa_agent_dispatch_packet_t(ctypes.Structure):
"""This should be aligned to HSA_PACKET_ALIGN_BYTES (64)"""
_fields_ = [
('header', ctypes.c_uint16),
('type', ctypes.c_uint16),
('reserved0', ctypes.c_uint32),
# NOTE: Small model not dealt with properly...!
('return_address', ctypes.c_void_p),
('arg', ctypes.c_uint64 * 4),
('reserved2', ctypes.c_uint64),
('completion_signal', hsa_signal_t),
]
class hsa_barrier_and_packet_t(ctypes.Structure):
_fields_ = [
('header', ctypes.c_uint16),
('reserved0', ctypes.c_uint16),
('reserved1', ctypes.c_uint32),
('dep_signal0', hsa_signal_t),
('dep_signal1', hsa_signal_t),
('dep_signal2', hsa_signal_t),
('dep_signal3', hsa_signal_t),
('dep_signal4', hsa_signal_t),
('reserved2', ctypes.c_uint64),
('completion_signal', hsa_signal_t),
]
hsa_barrier_or_packet_t = hsa_barrier_and_packet_t
hsa_region_segment_t = ctypes.c_int # enum
hsa_region_global_flag_t = ctypes.c_int # enum
hsa_region_info_t = ctypes.c_int # enum
hsa_symbol_kind_t = ctypes.c_int # enum
hsa_variable_allocation_t = ctypes.c_int # enum
hsa_symbol_linkage_t = ctypes.c_int # enum
hsa_variable_segment_t = ctypes.c_int # enum
hsa_isa_t = handle_struct
hsa_isa_info_t = ctypes.c_int # enum
hsa_code_object_t = handle_struct
hsa_callback_data_t = handle_struct
hsa_code_object_type_t = ctypes.c_int # enum
hsa_code_object_info_t = ctypes.c_int # enum
hsa_code_symbol_t = handle_struct
hsa_code_symbol_info_t = ctypes.c_int # enum
hsa_executable_t = handle_struct
hsa_executable_state_t = ctypes.c_int # enum
hsa_executable_info_t = ctypes.c_int # enum
hsa_executable_symbol_t = handle_struct
hsa_executable_symbol_info_t = ctypes.c_int # enum
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
# HSA types from Brig.h, ordered as per header file
# NOTE: not all of the definitions are needed
BrigVersion32_t = ctypes.c_uint32
MODULE_IDENTIFICATION_LENGTH=8
class BrigModuleHeader(ctypes.Structure):
_fields_ = [
('identification', ctypes.c_char*MODULE_IDENTIFICATION_LENGTH),
('brigMajor', BrigVersion32_t),
('brigMinor', BrigVersion32_t),
('byteCount', ctypes.c_uint64),
('hash', ctypes.c_uint8*64),
('reserved', ctypes.c_uint32),
('sectionCount', ctypes.c_uint32),
('sectionIndex', ctypes.c_uint64),
]
BrigModule_t = _PTR(BrigModuleHeader)
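# Parsing sketch: the header can be lifted straight off a BRIG byte buffer
# with ctypes, e.g. to sanity-check the module identification bytes before
# finalization (`brig_bytes` is an assumed, already-loaded module image).
def _example_read_brig_header(brig_bytes):
    header = BrigModuleHeader.from_buffer_copy(brig_bytes)
    assert header.identification == b'HSA BRIG'   # 8-byte BRIG magic
    return header.byteCount, header.sectionCount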
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
# HSA types from hsa_ext_amd.h, ordered as per header file
hsa_amd_agent_info_t = ctypes.c_int # enum
hsa_amd_region_info_t = ctypes.c_int # enum
hsa_amd_coherency_type_t = ctypes.c_int # enum
class hsa_amd_profiling_dispatch_time_t(ctypes.Structure):
_fields_ = [
('start', ctypes.c_uint64),
('end', ctypes.c_uint64),
]
# typedef bool (*hsa_amd_signal_handler)(hsa_signal_value_t value, void* arg);
# NOTE: a ctypes CFUNCTYPE already models the C function pointer itself,
# so no extra _PTR() wrapping is needed for this typedef.
hsa_amd_signal_handler = ctypes.CFUNCTYPE(ctypes.c_bool,
                                          hsa_signal_value_t,
                                          ctypes.c_void_p)
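# Illustrative sketch (comment only): wrapping a Python function as an async
# signal handler. The wrapped object must be kept alive for as long as the
# runtime may invoke it.
#
#   def _on_signal(value, arg):
#       return False  # False -> do not invoke the handler again
#   handler = hsa_amd_signal_handler(_on_signal)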
hsa_amd_segment_t = ctypes.c_int # enum
hsa_amd_memory_pool_t = handle_struct
hsa_amd_memory_pool_global_flag_t = ctypes.c_int # enum
hsa_amd_memory_pool_info_t = ctypes.c_int # enum
hsa_amd_memory_pool_access_t = ctypes.c_int # enum
hsa_amd_link_info_type_t = ctypes.c_int # enum
hsa_amd_memory_pool_link_info_t = ctypes.c_int # enum
hsa_amd_agent_memory_pool_info_t = ctypes.c_int # enum
class hsa_amd_image_descriptor_t(ctypes.Structure):
_fields_ = [
('version', ctypes.c_uint32),
('deviceID', ctypes.c_uint32),
('data', ctypes.c_uint32*1),
]
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
# HSA types from hsa_ext_finalize.h, ordered as per header file
hsa_ext_module_t = BrigModule_t
hsa_ext_program_t = handle_struct
hsa_ext_program_info_t = ctypes.c_int # enum
hsa_ext_finalizer_call_convention_t = ctypes.c_int # enum
class hsa_ext_control_directives_t(ctypes.Structure):
_fields_ = [
('control_directives_mask', ctypes.c_uint64),
('break_exceptions_mask', ctypes.c_uint16),
('detect_exceptions_mask', ctypes.c_uint16),
('max_dynamic_group_size', ctypes.c_uint32),
('max_flat_grid_size', ctypes.c_uint64),
('max_flat_workgroup_size', ctypes.c_uint32),
('reserved1', ctypes.c_uint32),
('required_grid_size', ctypes.c_uint64*3),
('required_workgroup_size', hsa_dim3_t),
('required_dim', ctypes.c_uint8),
('reserved2', ctypes.c_uint8*75),
]
# Function pointer types that are used in the "hsa_ext_finalizer_1_00_pfn_t"
# struct of pointers below.
HSA_EXT_PROGRAM_CREATE_FPTR = ctypes.CFUNCTYPE(
hsa_status_t, # return value
hsa_machine_model_t, # machine_model
hsa_profile_t, # profile
hsa_default_float_rounding_mode_t, # default_float_rounding_mode
ctypes.c_char_p, # options
_PTR(hsa_ext_program_t)) # program
HSA_EXT_PROGRAM_DESTROY_FPTR = ctypes.CFUNCTYPE(
hsa_status_t, # return value
hsa_ext_program_t) # program
HSA_EXT_PROGRAM_ADD_MODULE_FPTR = ctypes.CFUNCTYPE(
hsa_status_t, # return value
hsa_ext_program_t, # program
hsa_ext_module_t) # module
HSA_EXT_PROGRAM_ITERATE_MODULES_CALLBACK_FUNC = ctypes.CFUNCTYPE(
hsa_status_t, # return
hsa_ext_program_t, # program
hsa_ext_module_t, # module
ctypes.c_void_p) # data
HSA_EXT_PROGRAM_ITERATE_MODULES_FPTR = ctypes.CFUNCTYPE(
hsa_status_t, # return value
hsa_ext_program_t, # program
HSA_EXT_PROGRAM_ITERATE_MODULES_CALLBACK_FUNC, # callback
ctypes.c_void_p) # data
HSA_EXT_PROGRAM_GET_INFO_FPTR = ctypes.CFUNCTYPE(
hsa_status_t, # return value
hsa_ext_program_t, # program
hsa_ext_program_info_t, # attribute
ctypes.c_void_p) # value
HSA_EXT_PROGRAM_FINALIZE_FPTR = ctypes.CFUNCTYPE(
hsa_status_t, # return value
hsa_ext_program_t, # program
hsa_isa_t, # isa
ctypes.c_int32, # call_convention
hsa_ext_control_directives_t, # control_directives
    ctypes.c_char_p,  # options
    hsa_code_object_type_t,  # code_object_type
_PTR(hsa_code_object_t)) # code_object
# this struct holds function pointers
class hsa_ext_finalizer_1_00_pfn_t(ctypes.Structure):
_fields_ = [
('hsa_ext_program_create', HSA_EXT_PROGRAM_CREATE_FPTR),
('hsa_ext_program_destroy', HSA_EXT_PROGRAM_DESTROY_FPTR),
('hsa_ext_program_add_module', HSA_EXT_PROGRAM_ADD_MODULE_FPTR),
('hsa_ext_program_iterate_modules',
HSA_EXT_PROGRAM_ITERATE_MODULES_FPTR),
('hsa_ext_program_get_info', HSA_EXT_PROGRAM_GET_INFO_FPTR),
('hsa_ext_program_finalize', HSA_EXT_PROGRAM_FINALIZE_FPTR)
]
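# Illustrative sketch (comment only): obtaining the finalizer function table
# at runtime, per the NOTE at the end of API_PROTOTYPES below. `hsa` is a
# hypothetical handle to the bound library; `machine_model`, `profile` and
# `rounding_mode` are hypothetical enum values.
#
#   table = hsa_ext_finalizer_1_00_pfn_t()
#   hsa.hsa_system_get_extension_table(enums.HSA_EXTENSION_FINALIZER,
#                                      1, 0, ctypes.byref(table))
#   program = hsa_ext_program_t()
#   table.hsa_ext_program_create(machine_model, profile, rounding_mode,
#                                None, ctypes.byref(program))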
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
# HSA types from hsa_ext_image.h (NOTE: support incomplete)
hsa_ext_image_t = handle_struct
hsa_ext_image_geometry_t = ctypes.c_int # enum
hsa_ext_image_channel_type_t = ctypes.c_int # enum
hsa_ext_image_channel_order_t = ctypes.c_int # enum
class hsa_ext_image_format_t(ctypes.Structure):
_fields_ = [
("channel_type", hsa_ext_image_channel_type_t),
("channel_order", hsa_ext_image_channel_order_t)
]
class hsa_ext_image_descriptor_t(ctypes.Structure):
_fields_ = [
("geometry", hsa_ext_image_geometry_t),
("width", ctypes.c_size_t),
("height", ctypes.c_size_t),
("depth", ctypes.c_size_t),
("array_size", ctypes.c_size_t),
("format", hsa_ext_image_format_t)
]
hsa_ext_image_capability_t = ctypes.c_int # enum
class hsa_ext_image_data_info_t(ctypes.Structure):
_fields_ = [
("size", ctypes.c_size_t),
("alignment", ctypes.c_size_t),
]
class hsa_ext_image_region_t(ctypes.Structure):
    _fields_ = [
        ("offset", hsa_dim3_t),
        ("range", hsa_dim3_t),
    ]
hsa_ext_sampler_t = handle_struct
hsa_ext_sampler_addressing_mode_t = ctypes.c_int # enum
hsa_ext_sampler_coordinate_mode_t = ctypes.c_int # enum
hsa_ext_sampler_filter_mode_t = ctypes.c_int # enum
class hsa_ext_sampler_descriptor_t(ctypes.Structure):
_fields_ = [
("coordinate_mode", hsa_ext_sampler_coordinate_mode_t),
("filter_mode", hsa_ext_sampler_filter_mode_t),
("address_mode", hsa_ext_sampler_addressing_mode_t)
]
#NOTE: Not implemented yet: hsa_ext_images_1_00_pfn_t
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
# callbacks that have no related typedef in the hsa include files
HSA_ITER_AGENT_CALLBACK_FUNC = ctypes.CFUNCTYPE(
hsa_status_t, # return value
hsa_agent_t, # agent
ctypes.py_object) # this is a c_void_p used to wrap a python object
HSA_QUEUE_CALLBACK_FUNC = ctypes.CFUNCTYPE(
None, # return value
hsa_status_t,
_PTR(hsa_queue_t),
ctypes.py_object) # this is a c_void_p used to wrap a python object
HSA_AGENT_ITERATE_REGIONS_CALLBACK_FUNC = ctypes.CFUNCTYPE(
hsa_status_t, # return value
hsa_region_t, # region
ctypes.py_object) # this is a c_void_p used to wrap a python object
# hsa_status_t (*callback)(hsa_code_object_t code_object, hsa_code_symbol_t symbol, void* data),
HSA_CODE_OBJECT_ITERATE_SYMBOLS_CALLBACK = ctypes.CFUNCTYPE(
hsa_status_t, # return value
hsa_code_object_t,
hsa_code_symbol_t,
ctypes.py_object) # this is a c_void_p used to wrap a python object
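# hsa_status_t (*callback)(hsa_executable_t executable,
#                          hsa_executable_symbol_t symbol, void* data)
# NOTE: typedef added so that hsa_executable_iterate_symbols below can
# declare its callback argument; it mirrors the code-object variant above.
HSA_EXECUTABLE_ITERATE_SYMBOLS_CALLBACK = ctypes.CFUNCTYPE(
    hsa_status_t,  # return value
    hsa_executable_t,
    hsa_executable_symbol_t,
    ctypes.c_void_p)  # data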
# hsa_status_t (*alloc_callback)(size_t size, hsa_callback_data_t data, void **address),
HSA_ALLOC_CALLBACK_FUNCTION = ctypes.CFUNCTYPE(
hsa_status_t, # return value
ctypes.c_size_t,
hsa_callback_data_t,
_PTR(ctypes.c_void_p) # this might need to be a ptr to a py_object
)
void_fn_ptr = ctypes.CFUNCTYPE(
None,
ctypes.c_void_p) # this might need to be a ptr to a py_object
# hsa_status_t (*callback)(hsa_amd_memory_pool_t memory_pool, void* data)
HSA_AMD_AGENT_ITERATE_MEMORY_POOLS_CALLBACK = ctypes.CFUNCTYPE(
hsa_status_t,
hsa_amd_memory_pool_t,
ctypes.c_void_p) # this is a c_void_p used to wrap a python object
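# Illustrative sketch (comment only): wrapping a Python function as an
# agent-iteration callback. Declaring the last parameter as py_object lets
# an arbitrary Python object travel through the C void* argument.
#
#   def _visit(agent, pyobj):
#       pyobj.append(agent)
#       return enums.HSA_STATUS_SUCCESS
#   cb = HSA_ITER_AGENT_CALLBACK_FUNC(_visit)
#   agents = []
#   hsa.hsa_iterate_agents(cb, agents)  # `hsa` is the bound library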
#------------------------------------------------------------------------------
# Functions used by API calls returning hsa_status_t to check for errors ######
def _build_reverse_error_warn_maps():
err_map = utils.UniqueDict()
warn_map = utils.UniqueDict()
for name in [name for name in dir(enums) if name.startswith('HSA_')]:
code = getattr(enums, name)
if 'STATUS_ERROR' in name:
err_map[code] = name
elif 'STATUS_INFO' in name:
warn_map[code] = name
else:
pass # should we warn here?
return err_map, warn_map
ERROR_MAP, WARN_MAP = _build_reverse_error_warn_maps()
def _check_error(result, func, arguments):
if result != enums.HSA_STATUS_SUCCESS:
if result >= enums.HSA_STATUS_ERROR:
errname = ERROR_MAP.get(result, "UNKNOWN_HSA_ERROR")
msg = "Call to {0} returned {1}".format(func.__name__, errname)
raise HsaApiError(result, msg)
else:
warnname = WARN_MAP.get(result, "UNKNOWN_HSA_INFO")
msg = "Call to {0} returned {1}".format(func.__name__, warnname)
warnings.warn(msg, HsaWarning)
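# Illustrative sketch (comment only): how an entry of API_PROTOTYPES below
# would typically be attached to the loaded runtime library. `lib` is a
# hypothetical ctypes.CDLL handle.
#
#   lib = ctypes.CDLL('libhsa-runtime64.so')
#   for name, proto in API_PROTOTYPES.items():
#       fn = getattr(lib, name)
#       fn.restype = proto['restype']
#       fn.argtypes = proto['argtypes']
#       if 'errcheck' in proto:
#           fn.errcheck = proto['errcheck']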
# The API prototypes
# These are ordered as per the header files.
API_PROTOTYPES = {
#------------------------------------------------------------------------------
# HSA functions from hsa.h, ordered as per header file.
# hsa_status_t hsa_status_string(
# hsa_status_t status,
# const char **status_string);
'hsa_status_string': {
'restype': hsa_status_t,
'argtypes': [hsa_status_t, _PTR(ctypes.c_char_p)],
'errcheck': _check_error
},
# hsa_status_t hsa_init(void)
'hsa_init': {
'restype': hsa_status_t,
'argtypes': [],
'errcheck': _check_error
},
# hsa_status_t hsa_shut_down(void)
'hsa_shut_down': {
'restype': hsa_status_t,
'argtypes': [],
'errcheck': _check_error
},
# hsa_status_t hsa_system_get_info(hsa_system_info_t, void*)
'hsa_system_get_info': {
'restype': hsa_status_t,
'argtypes': [hsa_system_info_t, ctypes.c_void_p],
'errcheck': _check_error
},
# hsa_status_t HSA_API hsa_system_extension_supported(uint16_t, uint16_t,
# uint16_t, bool *);
'hsa_system_extension_supported': {
'restype': hsa_status_t,
'argtypes': [ctypes.c_uint16, # extension
ctypes.c_uint16, # version_major
ctypes.c_uint16, # version_minor
_PTR(ctypes.c_bool)], # result
'errcheck': _check_error
},
# hsa_status_t hsa_system_get_extension_table(uint16_t, uint16_t,
# uint16_t, void *);
'hsa_system_get_extension_table': {
'restype': hsa_status_t,
'argtypes': [ctypes.c_uint16, # extension
ctypes.c_uint16, # version_major
ctypes.c_uint16, # version_minor
ctypes.c_void_p], # result
'errcheck': _check_error
},
# hsa_status_t hsa_agent_get_info(hsa_agent_t, hsa_agent_info_t, void*)
'hsa_agent_get_info': {
'restype': hsa_status_t,
'argtypes': [hsa_agent_t, hsa_agent_info_t, ctypes.c_void_p],
'errcheck': _check_error
},
# hsa_status_t hsa_iterate_agents(hsa_status_t(*)(hsa_agent_t, void*),
# void*)
'hsa_iterate_agents': {
'restype': hsa_status_t,
'argtypes': [HSA_ITER_AGENT_CALLBACK_FUNC, ctypes.py_object],
'errcheck': _check_error
},
# hsa_status_t hsa_agent_get_exception_policies(hsa_agent_t agent,
# hsa_profile_t profile,
# uint16_t *mask);
'hsa_agent_get_exception_policies': {
'restype': hsa_status_t,
'argtypes': [hsa_agent_t, hsa_profile_t, _PTR(ctypes.c_uint16)],
'errcheck': _check_error
},
# hsa_status_t hsa_agent_extension_supported(uint16_t extension, hsa_agent_t agent,
# uint16_t version_major,
# uint16_t version_minor, bool *result);
'hsa_agent_extension_supported': {
'restype': hsa_status_t,
'argtypes': [ctypes.c_uint16, hsa_agent_t, ctypes.c_uint16, ctypes.c_uint16,
_PTR(ctypes.c_bool)],
'errcheck': _check_error
},
#--------------------------------------------------------------------------
# Signals
#--------------------------------------------------------------------------
# hsa_status_t hsa_signal_create(
# hsa_signal_value_t initial_value,
# uint32_t agent_count,
# const hsa_agent_t *agents,
# hsa_signal_t *signal)
'hsa_signal_create': {
'restype': hsa_status_t,
'argtypes': [hsa_signal_value_t,
ctypes.c_uint32,
_PTR(hsa_agent_t),
_PTR(hsa_signal_t)],
'errcheck': _check_error
},
# hsa_status_t hsa_signal_destroy(
# hsa_signal_t signal)
'hsa_signal_destroy': {
'restype': hsa_status_t,
'argtypes': [hsa_signal_t],
'errcheck': _check_error
},
# hsa_signal_value_t hsa_signal_load_acquire(
# hsa_signal_t signal);
'hsa_signal_load_acquire': {
'restype': hsa_signal_value_t,
'argtypes': [hsa_signal_t],
},
# hsa_signal_value_t hsa_signal_load_relaxed(
# hsa_signal_t signal);
'hsa_signal_load_relaxed': {
'restype': hsa_signal_value_t,
'argtypes': [hsa_signal_t],
},
# void hsa_signal_store_relaxed(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_store_relaxed': {
'restype': None,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# void hsa_signal_store_release(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_store_release': {
'restype': None,
'argtypes': [hsa_signal_t, hsa_signal_value_t],
},
# hsa_signal_value_t hsa_signal_exchange_acq_rel(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_exchange_acq_rel': {
'restype': hsa_signal_value_t,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# hsa_signal_value_t hsa_signal_exchange_acquire(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_exchange_acquire': {
'restype': hsa_signal_value_t,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# hsa_signal_value_t hsa_signal_exchange_relaxed(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_exchange_relaxed': {
'restype': hsa_signal_value_t,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# hsa_signal_value_t hsa_signal_exchange_release(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_exchange_release': {
'restype': hsa_signal_value_t,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# hsa_signal_value_t hsa_signal_cas_acq_rel(
# hsa_signal_t signal,
# hsa_signal_value_t expected,
# hsa_signal_value_t value);
'hsa_signal_cas_acq_rel': {
'restype': hsa_signal_value_t,
'argtypes': [hsa_signal_t, hsa_signal_value_t, hsa_signal_value_t]
},
# hsa_signal_value_t hsa_signal_cas_acquire(
# hsa_signal_t signal,
# hsa_signal_value_t expected,
# hsa_signal_value_t value);
'hsa_signal_cas_acquire': {
'restype': hsa_signal_value_t,
'argtypes': [hsa_signal_t, hsa_signal_value_t, hsa_signal_value_t]
},
# hsa_signal_value_t hsa_signal_cas_relaxed(
# hsa_signal_t signal,
# hsa_signal_value_t expected,
# hsa_signal_value_t value);
'hsa_signal_cas_relaxed': {
'restype': hsa_signal_value_t,
'argtypes': [hsa_signal_t, hsa_signal_value_t, hsa_signal_value_t]
},
# hsa_signal_value_t hsa_signal_cas_release(
# hsa_signal_t signal,
# hsa_signal_value_t expected,
# hsa_signal_value_t value);
'hsa_signal_cas_release': {
'restype': hsa_signal_value_t,
'argtypes': [hsa_signal_t, hsa_signal_value_t, hsa_signal_value_t]
},
# void hsa_signal_add_acq_rel(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_add_acq_rel': {
'restype': None,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# void hsa_signal_add_acquire(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_add_acquire': {
'restype': None,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# void hsa_signal_add_relaxed(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_add_relaxed': {
'restype': None,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# void hsa_signal_add_release(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_add_release': {
'restype': None,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# void hsa_signal_subtract_acq_rel(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_subtract_acq_rel': {
'restype': None,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# void hsa_signal_subtract_acquire(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_subtract_acquire': {
'restype': None,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# void hsa_signal_subtract_relaxed(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_subtract_relaxed': {
'restype': None,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# void hsa_signal_subtract_release(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_subtract_release': {
'restype': None,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# void hsa_signal_and_acq_rel(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_and_acq_rel': {
'restype': None,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# void hsa_signal_and_acquire(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_and_acquire': {
'restype': None,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# void hsa_signal_and_relaxed(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_and_relaxed': {
'restype': None,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# void hsa_signal_and_release(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_and_release': {
'restype': None,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# void hsa_signal_or_acq_rel(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_or_acq_rel': {
'restype': None,
'argtypes': [hsa_signal_t,
hsa_signal_value_t]
},
# void hsa_signal_or_acquire(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_or_acquire': {
'restype': None,
'argtypes': [hsa_signal_t,
hsa_signal_value_t]
},
# void hsa_signal_or_relaxed(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_or_relaxed': {
'restype': None,
'argtypes': [hsa_signal_t,
hsa_signal_value_t]
},
# void hsa_signal_or_release(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_or_release': {
'restype': None,
'argtypes': [hsa_signal_t,
hsa_signal_value_t]
},
# void hsa_signal_xor_acq_rel(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_xor_acq_rel': {
'restype': None,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# void hsa_signal_xor_acquire(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_xor_acquire': {
'restype': None,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# void hsa_signal_xor_relaxed(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_xor_relaxed': {
'restype': None,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# void hsa_signal_xor_release(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_xor_release': {
'restype': None,
'argtypes': [hsa_signal_t, hsa_signal_value_t]
},
# hsa_signal_value_t HSA_API
# hsa_signal_wait_acquire(hsa_signal_t signal,
# hsa_signal_condition_t condition,
# hsa_signal_value_t compare_value,
# uint64_t timeout_hint,
# hsa_wait_state_t wait_state_hint);
'hsa_signal_wait_acquire': {
'restype': hsa_signal_value_t,
'argtypes': [hsa_signal_t,
hsa_signal_condition_t,
hsa_signal_value_t,
ctypes.c_uint64,
hsa_wait_state_t]
},
# hsa_signal_value_t hsa_signal_wait_relaxed(
# hsa_signal_t signal,
# hsa_signal_condition_t condition,
# hsa_signal_value_t compare_value,
# uint64_t timeout_hint,
# hsa_wait_state_t wait_state_hint);
'hsa_signal_wait_relaxed': {
'restype': hsa_signal_value_t,
'argtypes': [hsa_signal_t,
hsa_signal_condition_t,
hsa_signal_value_t,
ctypes.c_uint64,
hsa_wait_state_t],
},
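    # Illustrative usage sketch (comment only): block until a completion
    # signal drops to zero. `hsa` is the bound library and `sig` a
    # hypothetical hsa_signal_t; ctypes.c_uint64(-1).value serves as the
    # "no timeout" hint.
    #
    #   hsa.hsa_signal_wait_acquire(sig, enums.HSA_SIGNAL_CONDITION_EQ, 0,
    #                               ctypes.c_uint64(-1).value,
    #                               enums.HSA_WAIT_STATE_BLOCKED)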
#--------------------------------------------------------------------------
# Queues
#--------------------------------------------------------------------------
# hsa_status_t HSA_API
# hsa_queue_create(hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
# void (*callback)(hsa_status_t status, hsa_queue_t *source,
# void *data),
# void *data, uint32_t private_segment_size,
# uint32_t group_segment_size, hsa_queue_t **queue);
'hsa_queue_create': {
'restype': hsa_status_t,
'argtypes': [hsa_agent_t,
ctypes.c_uint32,
hsa_queue_type_t,
HSA_QUEUE_CALLBACK_FUNC,
ctypes.c_void_p, # data
ctypes.c_uint32, # private segment size
ctypes.c_uint32, # group segment size
_PTR(_PTR(hsa_queue_t))],
'errcheck': _check_error
},
# hsa_status_t
# hsa_soft_queue_create(hsa_region_t region, uint32_t size,
# hsa_queue_type_t type, uint32_t features,
# hsa_signal_t doorbell_signal, hsa_queue_t **queue);
'hsa_soft_queue_create': {
'restype': hsa_status_t,
'argtypes': [hsa_region_t,
ctypes.c_uint32,
hsa_queue_type_t,
ctypes.c_uint32,
hsa_signal_t,
_PTR(_PTR(hsa_queue_t))],
'errcheck': _check_error
},
# hsa_status_t hsa_queue_destroy(
# hsa_queue_t *queue)
'hsa_queue_destroy': {
'restype': hsa_status_t,
'argtypes': [_PTR(hsa_queue_t)],
'errcheck': _check_error
},
# hsa_status_t hsa_queue_inactivate(hsa_queue_t *queue);
'hsa_queue_inactivate': {
'restype': hsa_status_t,
'argtypes': [_PTR(hsa_queue_t)],
'errcheck': _check_error
},
# uint64_t hsa_queue_load_read_index_acquire(hsa_queue_t *queue);
'hsa_queue_load_read_index_acquire': {
'restype': ctypes.c_uint64,
'argtypes': [_PTR(hsa_queue_t)]
},
# uint64_t hsa_queue_load_read_index_relaxed(hsa_queue_t *queue);
'hsa_queue_load_read_index_relaxed': {
'restype': ctypes.c_uint64,
'argtypes': [_PTR(hsa_queue_t)]
},
# uint64_t hsa_queue_load_write_index_acquire(hsa_queue_t *queue);
'hsa_queue_load_write_index_acquire': {
'restype': ctypes.c_uint64,
'argtypes': [_PTR(hsa_queue_t)]
},
# uint64_t hsa_queue_load_write_index_relaxed(hsa_queue_t *queue);
'hsa_queue_load_write_index_relaxed': {
'restype': ctypes.c_uint64,
'argtypes': [_PTR(hsa_queue_t)]
},
# void hsa_queue_store_write_index_relaxed(hsa_queue_t *queue, uint64_t value);
'hsa_queue_store_write_index_relaxed': {
'restype': None,
'argtypes': [_PTR(hsa_queue_t), ctypes.c_uint64]
},
# void hsa_queue_store_write_index_release(hsa_queue_t *queue, uint64_t value);
'hsa_queue_store_write_index_release': {
'restype': None,
'argtypes': [_PTR(hsa_queue_t), ctypes.c_uint64]
},
# uint64_t hsa_queue_cas_write_index_acq_rel(
# hsa_queue_t *queue,
# uint64_t expected,
# uint64_t value);
'hsa_queue_cas_write_index_acq_rel': {
'restype': ctypes.c_uint64,
'argtypes': [_PTR(hsa_queue_t), ctypes.c_uint64, ctypes.c_uint64]
},
# uint64_t hsa_queue_cas_write_index_acquire(
# hsa_queue_t *queue,
# uint64_t expected,
# uint64_t value);
'hsa_queue_cas_write_index_acquire': {
'restype': ctypes.c_uint64,
'argtypes': [_PTR(hsa_queue_t), ctypes.c_uint64, ctypes.c_uint64]
},
# uint64_t hsa_queue_cas_write_index_relaxed(
# hsa_queue_t *queue,
# uint64_t expected,
# uint64_t value);
'hsa_queue_cas_write_index_relaxed': {
'restype': ctypes.c_uint64,
'argtypes': [_PTR(hsa_queue_t), ctypes.c_uint64, ctypes.c_uint64]
},
# uint64_t hsa_queue_cas_write_index_release(
# hsa_queue_t *queue,
# uint64_t expected,
# uint64_t value);
'hsa_queue_cas_write_index_release': {
'restype': ctypes.c_uint64,
'argtypes': [_PTR(hsa_queue_t), ctypes.c_uint64, ctypes.c_uint64]
},
# uint64_t hsa_queue_add_write_index_acq_rel(
# hsa_queue_t *queue,
# uint64_t value);
'hsa_queue_add_write_index_acq_rel': {
'restype': ctypes.c_uint64,
'argtypes': [_PTR(hsa_queue_t), ctypes.c_uint64]
},
# uint64_t hsa_queue_add_write_index_acquire(
# hsa_queue_t *queue,
# uint64_t value);
'hsa_queue_add_write_index_acquire': {
'restype': ctypes.c_uint64,
'argtypes': [_PTR(hsa_queue_t), ctypes.c_uint64]
},
# uint64_t hsa_queue_add_write_index_relaxed(
# hsa_queue_t *queue,
# uint64_t value);
'hsa_queue_add_write_index_relaxed': {
'restype': ctypes.c_uint64,
'argtypes': [_PTR(hsa_queue_t), ctypes.c_uint64]
},
# uint64_t hsa_queue_add_write_index_release(
# hsa_queue_t *queue,
# uint64_t value);
'hsa_queue_add_write_index_release': {
'restype': ctypes.c_uint64,
'argtypes': [_PTR(hsa_queue_t), ctypes.c_uint64]
},
# void hsa_queue_store_read_index_relaxed(
# hsa_queue_t *queue,
# uint64_t value);
'hsa_queue_store_read_index_relaxed': {
'restype': None,
'argtypes': [_PTR(hsa_queue_t), ctypes.c_uint64]
},
# void hsa_queue_store_read_index_release(
# hsa_queue_t *queue,
# uint64_t value);
'hsa_queue_store_read_index_release': {
'restype': None,
'argtypes': [_PTR(hsa_queue_t), ctypes.c_uint64]
},
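    # Illustrative usage sketch (comment only): reserving a packet slot by
    # bumping the write index, then ringing the doorbell once the packet has
    # been written. `queue` is a hypothetical _PTR(hsa_queue_t) value.
    #
    #   index = hsa.hsa_queue_add_write_index_acq_rel(queue, 1)
    #   ... write the AQL packet at slot (index % queue.contents.size) ...
    #   hsa.hsa_signal_store_release(queue.contents.doorbell_signal, index)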
#--------------------------------------------------------------------------
# Memory
#--------------------------------------------------------------------------
# hsa_status_t hsa_region_get_info(
# hsa_region_t region,
# hsa_region_info_t attribute,
# void *value);
'hsa_region_get_info': {
'restype': hsa_status_t,
'argtypes': [hsa_region_t, hsa_region_info_t, ctypes.c_void_p],
'errcheck': _check_error,
},
# hsa_status_t hsa_agent_iterate_regions(
# hsa_agent_t agent,
# hsa_status_t (*callback)(hsa_region_t region, void *data),
# void *data);
'hsa_agent_iterate_regions': {
'restype': hsa_status_t,
'argtypes': [hsa_agent_t,
HSA_AGENT_ITERATE_REGIONS_CALLBACK_FUNC,
ctypes.py_object],
'errcheck': _check_error
},
# hsa_status_t hsa_memory_allocate(
# hsa_region_t region,
# size_t size,
# void **ptr);
'hsa_memory_allocate': {
'restype': hsa_status_t,
'argtypes': [hsa_region_t, ctypes.c_size_t, _PTR(ctypes.c_void_p)],
'errcheck': _check_error
},
# hsa_status_t hsa_memory_free(
# void *ptr);
'hsa_memory_free': {
'restype': hsa_status_t,
'argtypes': [ctypes.c_void_p],
'errcheck': _check_error
},
# hsa_status_t HSA_API hsa_memory_copy(
# void * dst,
# const void * src,
# size_t size);
'hsa_memory_copy': {
'restype': hsa_status_t,
'argtypes': [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t],
'errcheck': _check_error
},
# hsa_status_t HSA_API hsa_memory_assign_agent(void *ptr,
# hsa_agent_t agent,
# hsa_access_permission_t access);
'hsa_memory_assign_agent': {
'restype': hsa_status_t,
'argtypes': [ctypes.c_void_p, hsa_agent_t, hsa_access_permission_t],
'errcheck': _check_error
},
# hsa_status_t hsa_memory_register(
# void *address,
# size_t size);
'hsa_memory_register': {
'restype': hsa_status_t,
'argtypes': [ctypes.c_void_p, ctypes.c_size_t],
'errcheck': _check_error
},
# hsa_status_t hsa_memory_deregister(
# void *address,
# size_t size);
'hsa_memory_deregister': {
'restype': hsa_status_t,
'argtypes': [ctypes.c_void_p, ctypes.c_size_t],
'errcheck': _check_error
},
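    # Illustrative usage sketch (comment only): allocating from a region,
    # copying host data in, and freeing. `region` is a hypothetical
    # hsa_region_t and `host_buf` a ctypes buffer of `nbytes` bytes.
    #
    #   ptr = ctypes.c_void_p()
    #   hsa.hsa_memory_allocate(region, nbytes, ctypes.byref(ptr))
    #   hsa.hsa_memory_copy(ptr, host_buf, nbytes)
    #   hsa.hsa_memory_free(ptr)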
#--------------------------------------------------------------------------
# Code Object functions
#--------------------------------------------------------------------------
# hsa_status_t HSA_API hsa_isa_from_name(const char* name,
# hsa_isa_t* isa);
'hsa_isa_from_name': {
'restype': hsa_status_t,
'argtypes': [ctypes.c_char_p, _PTR(hsa_isa_t)],
'errcheck': _check_error
},
# hsa_status_t HSA_API hsa_isa_get_info(hsa_isa_t isa,
# hsa_isa_info_t attribute,
# uint32_t index,
# void* value);
    'hsa_isa_get_info': {
        'restype': hsa_status_t,
        'argtypes': [hsa_isa_t, hsa_isa_info_t, ctypes.c_uint32,
                     ctypes.c_void_p],
        'errcheck': _check_error
    },
# hsa_status_t HSA_API hsa_isa_compatible(hsa_isa_t code_object_isa,
# hsa_isa_t agent_isa,
# bool* result);
'hsa_isa_compatible': {
'restype': hsa_status_t,
'argtypes': [hsa_isa_t, hsa_isa_t, _PTR(ctypes.c_bool)],
'errcheck': _check_error
},
# hsa_status_t HSA_API hsa_code_object_serialize(
# hsa_code_object_t code_object,
# hsa_status_t (*alloc_callback)(size_t size,
# hsa_callback_data_t data, void **address),
# hsa_callback_data_t callback_data,
# const char *options,
# void **serialized_code_object,
# size_t *serialized_code_object_size);
    'hsa_code_object_serialize': {
        'restype': hsa_status_t,
        'argtypes': [hsa_code_object_t,            # code_object
                     HSA_ALLOC_CALLBACK_FUNCTION,  # alloc_callback
                     hsa_callback_data_t,          # callback_data
                     ctypes.c_char_p,              # options
                     _PTR(ctypes.c_void_p),        # serialized_code_object
                     _PTR(ctypes.c_size_t)],       # serialized_code_object_size
        'errcheck': _check_error
    },
# hsa_status_t HSA_API hsa_code_object_deserialize(
# void *serialized_code_object,
# size_t serialized_code_object_size,
# const char *options,
# hsa_code_object_t *code_object);
'hsa_code_object_deserialize': {
'restype': hsa_status_t,
'argtypes': [ctypes.c_void_p,
ctypes.c_size_t,
ctypes.c_char_p,
_PTR(hsa_code_object_t)],
'errcheck': _check_error
},
# hsa_status_t HSA_API hsa_code_object_destroy(
# hsa_code_object_t code_object);
'hsa_code_object_destroy': {
'restype': hsa_status_t,
'argtypes': [hsa_code_object_t],
'errcheck': _check_error
},
# hsa_status_t HSA_API hsa_code_object_get_info(
# hsa_code_object_t code_object,
# hsa_code_object_info_t attribute,
# void *value);
'hsa_code_object_get_info': {
'restype': hsa_status_t,
'argtypes': [hsa_code_object_t,
hsa_code_object_info_t,
ctypes.c_void_p
],
'errcheck': _check_error
},
# hsa_status_t HSA_API hsa_code_object_get_symbol(
# hsa_code_object_t code_object,
# const char *symbol_name,
# hsa_code_symbol_t *symbol);
'hsa_code_object_get_symbol': {
'restype': hsa_status_t,
'argtypes': [hsa_code_object_t,
ctypes.c_char_p,
_PTR(hsa_code_symbol_t)
],
'errcheck': _check_error
},
# hsa_status_t HSA_API hsa_code_symbol_get_info(
# hsa_code_symbol_t code_symbol,
# hsa_code_symbol_info_t attribute,
# void *value);
'hsa_code_symbol_get_info': {
'restype': hsa_status_t,
'argtypes': [hsa_code_symbol_t,
hsa_code_symbol_info_t,
ctypes.c_void_p
],
'errcheck': _check_error
},
# hsa_status_t HSA_API hsa_code_object_iterate_symbols(
# hsa_code_object_t code_object,
# hsa_status_t (*callback)(hsa_code_object_t code_object, hsa_code_symbol_t symbol, void* data),
# void* data);
'hsa_code_object_iterate_symbols': {
'restype': hsa_status_t,
'argtypes': [hsa_code_object_t,
HSA_CODE_OBJECT_ITERATE_SYMBOLS_CALLBACK,
ctypes.c_void_p
],
'errcheck': _check_error
},
#--------------------------------------------------------------------------
# Executable functions
#--------------------------------------------------------------------------
# hsa_status_t HSA_API hsa_executable_create(
# hsa_profile_t profile,
# hsa_executable_state_t executable_state,
# const char *options,
# hsa_executable_t *executable);
"hsa_executable_create": {
'restype': hsa_status_t,
'argtypes': [hsa_profile_t,
hsa_executable_state_t,
ctypes.c_char_p,
ctypes.POINTER(hsa_executable_t)],
'errcheck': _check_error,
},
# hsa_status_t HSA_API hsa_executable_destroy(
# hsa_executable_t executable);
"hsa_executable_destroy": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
hsa_executable_t,
],
},
# hsa_status_t HSA_API hsa_executable_load_code_object(
# hsa_executable_t executable,
# hsa_agent_t agent,
# hsa_code_object_t code_object,
# const char *options);
"hsa_executable_load_code_object": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
hsa_executable_t,
hsa_agent_t,
hsa_code_object_t,
ctypes.c_char_p,
],
},
# hsa_status_t HSA_API hsa_executable_freeze(
# hsa_executable_t executable,
# const char *options);
"hsa_executable_freeze": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
hsa_executable_t,
ctypes.c_char_p,
],
},
# hsa_status_t HSA_API hsa_executable_get_info(
# hsa_executable_t executable,
# hsa_executable_info_t attribute,
# void *value);
"hsa_executable_get_info": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
hsa_executable_t,
hsa_executable_info_t,
ctypes.c_void_p
],
},
# hsa_status_t HSA_API hsa_executable_global_variable_define(
# hsa_executable_t executable,
# const char *variable_name,
# void *address);
"hsa_executable_global_variable_define": {
'restype': hsa_status_t,
'argtypes': [hsa_executable_t,
ctypes.c_char_p,
ctypes.c_void_p],
'errcheck': _check_error,
},
# hsa_status_t HSA_API hsa_executable_agent_global_variable_define(
# hsa_executable_t executable,
# hsa_agent_t agent,
# const char *variable_name,
# void *address);
"hsa_executable_agent_global_variable_define": {
'restype': hsa_status_t,
'argtypes': [hsa_executable_t,
hsa_agent_t,
ctypes.c_char_p,
ctypes.c_void_p],
'errcheck': _check_error,
},
# hsa_status_t HSA_API hsa_executable_readonly_variable_define(
# hsa_executable_t executable,
# hsa_agent_t agent,
# const char *variable_name,
# void *address);
"hsa_executable_readonly_variable_define": {
'restype': hsa_status_t,
'argtypes': [hsa_executable_t,
hsa_agent_t,
ctypes.c_char_p,
ctypes.c_void_p],
'errcheck': _check_error,
},
# hsa_status_t HSA_API hsa_executable_validate(
# hsa_executable_t executable,
# uint32_t* result);
"hsa_executable_validate": {
'restype': hsa_status_t,
'argtypes': [hsa_executable_t,
_PTR(ctypes.c_uint32)],
'errcheck': _check_error,
},
# hsa_status_t HSA_API hsa_executable_get_symbol(
# hsa_executable_t executable,
# const char *module_name,
# const char *symbol_name,
# hsa_agent_t agent,
# int32_t call_convention,
# hsa_executable_symbol_t *symbol);
"hsa_executable_get_symbol": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
hsa_executable_t,
ctypes.c_char_p, # module_name (must be NULL for program linkage)
ctypes.c_char_p, # symbol_name
hsa_agent_t,
ctypes.c_int32,
ctypes.POINTER(hsa_executable_symbol_t),
],
},
# hsa_status_t HSA_API hsa_executable_symbol_get_info(
# hsa_executable_symbol_t executable_symbol,
# hsa_executable_symbol_info_t attribute,
# void *value);
"hsa_executable_symbol_get_info": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
hsa_executable_symbol_t,
hsa_executable_symbol_info_t,
ctypes.c_void_p,
],
},
    # hsa_status_t HSA_API hsa_executable_iterate_symbols(
    #     hsa_executable_t executable,
    #     hsa_status_t (*callback)(hsa_executable_t executable, hsa_executable_symbol_t symbol, void* data),
    #     void* data);
    "hsa_executable_iterate_symbols": {
        'errcheck': _check_error,
        'restype': hsa_status_t,
        'argtypes': [
            hsa_executable_t,
            HSA_EXECUTABLE_ITERATE_SYMBOLS_CALLBACK,
            ctypes.c_void_p,
        ],
    },
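    # Illustrative usage sketch (comment only): the typical executable
    # lifecycle using the entries above. `profile`, `state`, `agent` and
    # `code_object` are hypothetical values obtained elsewhere.
    #
    #   ex = hsa_executable_t()
    #   hsa.hsa_executable_create(profile, state, None, ctypes.byref(ex))
    #   hsa.hsa_executable_load_code_object(ex, agent, code_object, None)
    #   hsa.hsa_executable_freeze(ex, None)
    #   sym = hsa_executable_symbol_t()
    #   hsa.hsa_executable_get_symbol(ex, None, b"&my_kernel", agent, 0,
    #                                 ctypes.byref(sym))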
#--------------------------------------------------------------------------
# AMD extensions from hsa_ext_amd.h
#--------------------------------------------------------------------------
# hsa_status_t HSA_API hsa_amd_coherency_get_type(hsa_agent_t agent,
# hsa_amd_coherency_type_t* type);
"hsa_amd_coherency_get_type": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
hsa_agent_t,
_PTR(hsa_amd_coherency_type_t),
],
},
# hsa_status_t HSA_API hsa_amd_coherency_set_type(hsa_agent_t agent,
# hsa_amd_coherency_type_t type);
"hsa_amd_coherency_get_type": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
hsa_agent_t,
hsa_amd_coherency_type_t,
],
},
# hsa_status_t HSA_API
# hsa_amd_profiling_set_profiler_enabled(hsa_queue_t* queue, int enable);
"hsa_amd_profiling_set_profiler_enabled": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
_PTR(hsa_queue_t),
ctypes.c_int,
],
},
# hsa_status_t HSA_API hsa_amd_profiling_get_dispatch_time(
# hsa_agent_t agent, hsa_signal_t signal,
# hsa_amd_profiling_dispatch_time_t* time);
"hsa_amd_profiling_get_dispatch_time": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
hsa_agent_t,
hsa_signal_t,
_PTR(hsa_amd_profiling_dispatch_time_t)
],
},
# hsa_status_t HSA_API
# hsa_amd_profiling_convert_tick_to_system_domain(hsa_agent_t agent,
# uint64_t agent_tick,
# uint64_t* system_tick);
"hsa_amd_profiling_convert_tick_to_system_domain": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
ctypes.c_uint64,
_PTR(ctypes.c_uint64)
],
},
# hsa_status_t HSA_API
# hsa_amd_signal_async_handler(hsa_signal_t signal,
# hsa_signal_condition_t cond,
# hsa_signal_value_t value,
# hsa_amd_signal_handler handler, void* arg);
"hsa_amd_signal_async_handler": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
hsa_signal_t,
hsa_signal_condition_t,
hsa_signal_value_t,
hsa_amd_signal_handler,
ctypes.c_void_p,
],
},
    # hsa_status_t HSA_API
    # hsa_amd_async_function(void (*callback)(void* arg), void* arg);
    "hsa_amd_async_function": {
        'errcheck': _check_error,
        'restype': hsa_status_t,
        'argtypes': [
            void_fn_ptr,  # the CFUNCTYPE is itself the function pointer
            ctypes.c_void_p,
        ],
    },
#uint32_t HSA_API
#hsa_amd_signal_wait_any(uint32_t signal_count, hsa_signal_t* signals,
# hsa_signal_condition_t* conds,
# hsa_signal_value_t* values, uint64_t timeout_hint,
# hsa_wait_state_t wait_hint,
# hsa_signal_value_t* satisfying_value);
"hsa_amd_signal_wait_any": {
'errcheck': _check_error,
'restype': ctypes.c_uint32,
'argtypes': [
ctypes.c_uint32,
_PTR(hsa_signal_t),
_PTR(hsa_signal_condition_t),
_PTR(hsa_signal_value_t),
ctypes.c_uint64,
hsa_wait_state_t,
_PTR(hsa_signal_value_t),
],
},
# hsa_status_t HSA_API hsa_amd_image_get_info_max_dim(hsa_agent_t agent,
# hsa_agent_info_t attribute,
# void* value);
"hsa_amd_image_get_info_max_dim": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
hsa_agent_t,
hsa_agent_info_t,
ctypes.c_void_p,
],
},
# hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue,
# uint32_t num_cu_mask_count,
# const uint32_t* cu_mask);
"hsa_amd_queue_cu_set_mask": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
_PTR(hsa_queue_t),
ctypes.c_uint32,
_PTR(ctypes.c_uint32)
],
},
# hsa_status_t HSA_API
# hsa_amd_memory_pool_get_info(hsa_amd_memory_pool_t memory_pool,
# hsa_amd_memory_pool_info_t attribute,
# void* value);
"hsa_amd_memory_pool_get_info": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
hsa_amd_memory_pool_t,
hsa_amd_memory_pool_info_t,
ctypes.c_void_p
],
},
# hsa_status_t HSA_API hsa_amd_agent_iterate_memory_pools(
# hsa_agent_t agent,
# hsa_status_t (*callback)(hsa_amd_memory_pool_t memory_pool, void* data),
# void* data);
"hsa_amd_agent_iterate_memory_pools": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
hsa_agent_t,
HSA_AMD_AGENT_ITERATE_MEMORY_POOLS_CALLBACK,
ctypes.c_void_p
],
},
# hsa_status_t HSA_API hsa_amd_memory_pool_allocate
# (hsa_amd_memory_pool_t memory_pool, size_t size,
# uint32_t flags, void** ptr);
"hsa_amd_memory_pool_allocate": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
hsa_amd_memory_pool_t,
ctypes.c_size_t,
ctypes.c_uint32,
_PTR(ctypes.c_void_p)
],
},
# hsa_status_t HSA_API hsa_amd_memory_pool_free(void* ptr);
"hsa_amd_memory_pool_free": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
ctypes.c_void_p
],
},
# hsa_status_t HSA_API hsa_amd_memory_async_copy(void* dst,
# hsa_agent_t dst_agent, const void* src,
# hsa_agent_t src_agent, size_t size,
# uint32_t num_dep_signals,
# const hsa_signal_t* dep_signals,
# hsa_signal_t completion_signal);
"hsa_amd_memory_async_copy": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
ctypes.c_void_p,
hsa_agent_t,
ctypes.c_void_p,
hsa_agent_t,
ctypes.c_size_t,
ctypes.c_uint32,
_PTR(hsa_signal_t),
hsa_signal_t
],
},
# hsa_status_t HSA_API hsa_amd_agent_memory_pool_get_info(
# hsa_agent_t agent, hsa_amd_memory_pool_t memory_pool,
# hsa_amd_agent_memory_pool_info_t attribute, void* value);
"hsa_amd_agent_memory_pool_get_info": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
hsa_agent_t,
hsa_amd_memory_pool_t,
hsa_amd_agent_memory_pool_info_t,
ctypes.c_void_p
],
},
# hsa_status_t HSA_API
# hsa_amd_agents_allow_access(uint32_t num_agents, const hsa_agent_t* agents,
# const uint32_t* flags, const void* ptr);
"hsa_amd_agents_allow_access": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
ctypes.c_uint32,
_PTR(hsa_agent_t),
_PTR(ctypes.c_uint32),
ctypes.c_void_p
],
},
# hsa_status_t HSA_API
# hsa_amd_memory_pool_can_migrate(hsa_amd_memory_pool_t src_memory_pool,
# hsa_amd_memory_pool_t dst_memory_pool,
# bool* result);
"hsa_amd_memory_pool_can_migrate": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
hsa_amd_memory_pool_t,
hsa_amd_memory_pool_t,
_PTR(ctypes.c_bool)
],
},
# hsa_status_t HSA_API hsa_amd_memory_migrate(const void* ptr,
# hsa_amd_memory_pool_t memory_pool,
# uint32_t flags);
"hsa_amd_memory_migrate": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
ctypes.c_void_p,
hsa_amd_memory_pool_t,
ctypes.c_uint32
],
},
# hsa_status_t HSA_API hsa_amd_memory_lock(void* host_ptr, size_t size,
# hsa_agent_t* agents, int num_agent,
# void** agent_ptr);
"hsa_amd_memory_lock": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
ctypes.c_void_p,
ctypes.c_size_t,
_PTR(hsa_agent_t),
ctypes.c_int,
_PTR(ctypes.c_void_p)
],
},
# hsa_status_t HSA_API hsa_amd_memory_unlock(void* host_ptr);
"hsa_amd_memory_unlock": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
ctypes.c_void_p
],
},
# hsa_status_t HSA_API
# hsa_amd_memory_fill(void* ptr, uint32_t value, size_t count);
"hsa_amd_memory_unlock": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
ctypes.c_void_p
],
},
# hsa_status_t HSA_API hsa_amd_interop_map_buffer(uint32_t num_agents,
# hsa_agent_t* agents,
# int interop_handle,
# uint32_t flags,
# size_t* size,
# void** ptr,
# size_t* metadata_size,
# const void** metadata);
"hsa_amd_interop_map_buffer": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
ctypes.c_uint32,
_PTR(hsa_agent_t),
ctypes.c_int,
ctypes.c_uint32,
_PTR(ctypes.c_size_t),
_PTR(ctypes.c_void_p),
_PTR(ctypes.c_size_t),
_PTR(ctypes.c_void_p),
],
},
# hsa_status_t HSA_API hsa_amd_interop_unmap_buffer(void* ptr);
"hsa_amd_interop_map_buffer": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
_PTR(ctypes.c_void_p),
],
},
# hsa_status_t HSA_API hsa_amd_image_create(
# hsa_agent_t agent,
# const hsa_ext_image_descriptor_t *image_descriptor,
# const hsa_amd_image_descriptor_t *image_layout,
# const void *image_data,
# hsa_access_permission_t access_permission,
# hsa_ext_image_t *image
# );
"hsa_amd_image_create": {
'errcheck': _check_error,
'restype': hsa_status_t,
'argtypes': [
hsa_agent_t,
_PTR(hsa_ext_image_descriptor_t),
_PTR(hsa_amd_image_descriptor_t),
ctypes.c_void_p,
hsa_access_permission_t,
            _PTR(hsa_ext_image_t)  # image (output pointer)
],
},
#--------------------------------------------------------------------------
# Functions from hsa_ext_finalize.h
# NOTE: To access these functions use the hsa_ext_finalizer_1_00_pfn_t
# struct.
#--------------------------------------------------------------------------
}
"""Enum values for HSA
Note that Python namespacing could be used to avoid the C-like
prefixing, but we choose to keep the same names as found in the C
enums, in order to match the documentation.
"""
import ctypes
HSA_LARGE_MODEL = ctypes.sizeof(ctypes.c_void_p) == 8
# hsa_status_t
# The function has been executed successfully.
HSA_STATUS_SUCCESS = 0x0
# A traversal over a list of elements has been interrupted by the
# application before completing.
HSA_STATUS_INFO_BREAK = 0x1
# A generic error has occurred.
HSA_STATUS_ERROR = 0x1000
# One of the actual arguments does not meet a precondition stated in the
# documentation of the corresponding formal argument.
HSA_STATUS_ERROR_INVALID_ARGUMENT = 0x1001
# The requested queue creation is not valid.
HSA_STATUS_ERROR_INVALID_QUEUE_CREATION = 0x1002
# The requested allocation is not valid.
HSA_STATUS_ERROR_INVALID_ALLOCATION = 0x1003
# The agent is invalid.
HSA_STATUS_ERROR_INVALID_AGENT = 0x1004
# The memory region is invalid.
HSA_STATUS_ERROR_INVALID_REGION = 0x1005
# The signal is invalid.
HSA_STATUS_ERROR_INVALID_SIGNAL = 0x1006
# The queue is invalid.
HSA_STATUS_ERROR_INVALID_QUEUE = 0x1007
# The HSA runtime failed to allocate the necessary resources. This error
# may also occur when the HSA runtime needs to spawn threads or create
# internal OS-specific events.
HSA_STATUS_ERROR_OUT_OF_RESOURCES = 0x1008
# The AQL packet is malformed.
HSA_STATUS_ERROR_INVALID_PACKET_FORMAT = 0x1009
# An error has been detected while releasing a resource.
HSA_STATUS_ERROR_RESOURCE_FREE = 0x100A
# An API other than ::hsa_init has been invoked while the reference count
# of the HSA runtime is 0.
HSA_STATUS_ERROR_NOT_INITIALIZED = 0x100B
# The maximum reference count for the object has been reached.
HSA_STATUS_ERROR_REFCOUNT_OVERFLOW = 0x100C
# The arguments passed to a function are not compatible.
HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS = 0x100D
# The index is invalid.
HSA_STATUS_ERROR_INVALID_INDEX = 0x100E
# The instruction set architecture is invalid.
HSA_STATUS_ERROR_INVALID_ISA = 0x100F
# The instruction set architecture name is invalid.
HSA_STATUS_ERROR_INVALID_ISA_NAME = 0x1017
# The code object is invalid.
HSA_STATUS_ERROR_INVALID_CODE_OBJECT = 0x1010
# The executable is invalid.
HSA_STATUS_ERROR_INVALID_EXECUTABLE = 0x1011
# The executable is frozen.
HSA_STATUS_ERROR_FROZEN_EXECUTABLE = 0x1012
# There is no symbol with the given name.
HSA_STATUS_ERROR_INVALID_SYMBOL_NAME = 0x1013
# The variable is already defined.
HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED = 0x1014
# The variable is undefined.
HSA_STATUS_ERROR_VARIABLE_UNDEFINED = 0x1015
# An HSAIL operation resulted in a hardware exception.
HSA_STATUS_ERROR_EXCEPTION = 0x1016
# hsa_packet_type_t
HSA_PACKET_TYPE_VENDOR_SPECIFIC = 0
# The packet has been processed in the past, but has not been reassigned to
# the packet processor. A packet processor must not process a packet of this
# type. All queues support this packet type.
HSA_PACKET_TYPE_INVALID = 1
# Packet used by agents for dispatching jobs to kernel agents. Not all
# queues support packets of this type (see ::hsa_queue_feature_t).
HSA_PACKET_TYPE_KERNEL_DISPATCH = 2
# Packet used by agents to delay processing of subsequent packets, and to
# express complex dependencies between multiple packets. All queues support
# this packet type.
HSA_PACKET_TYPE_BARRIER_AND = 3
# Packet used by agents for dispatching jobs to agents. Not all
# queues support packets of this type (see ::hsa_queue_feature_t).
HSA_PACKET_TYPE_AGENT_DISPATCH = 4
# Packet used by agents to delay processing of subsequent packets, and to
# express complex dependencies between multiple packets. All queues support
# this packet type.
HSA_PACKET_TYPE_BARRIER_OR = 5
# hsa_queue_type_t
HSA_QUEUE_TYPE_MULTI = 0
HSA_QUEUE_TYPE_SINGLE = 1
# hsa_queue_feature_t
HSA_QUEUE_FEATURE_KERNEL_DISPATCH = 1
HSA_QUEUE_FEATURE_AGENT_DISPATCH = 2
# hsa_fence_scope_t
HSA_FENCE_SCOPE_NONE = 0
HSA_FENCE_SCOPE_AGENT = 1
HSA_FENCE_SCOPE_SYSTEM = 2
# hsa_wait_state_t
# The application thread may be rescheduled while waiting on the signal.
HSA_WAIT_STATE_BLOCKED = 0
# The application thread stays active while waiting on a signal.
HSA_WAIT_STATE_ACTIVE = 1
# hsa_signal_condition_t
HSA_SIGNAL_CONDITION_EQ = 0
HSA_SIGNAL_CONDITION_NE = 1
HSA_SIGNAL_CONDITION_LT = 2
HSA_SIGNAL_CONDITION_GTE = 3
# # hsa_dim_t
# HSA_DIM_X = 0
# HSA_DIM_Y = 1
# HSA_DIM_Z = 2
# hsa_extension_t
HSA_EXTENSION_FINALIZER = 0
HSA_EXTENSION_IMAGES = 1
HSA_EXTENSION_AMD_PROFILER = 2
# hsa_agent_feature_t
HSA_AGENT_FEATURE_KERNEL_DISPATCH = 1
HSA_AGENT_FEATURE_AGENT_DISPATCH = 2
# hsa_device_type_t
HSA_DEVICE_TYPE_CPU = 0
HSA_DEVICE_TYPE_GPU = 1
HSA_DEVICE_TYPE_DSP = 2
# hsa_system_info_t
HSA_SYSTEM_INFO_VERSION_MAJOR = 0
HSA_SYSTEM_INFO_VERSION_MINOR = 1
HSA_SYSTEM_INFO_TIMESTAMP = 2
HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY = 3
HSA_SYSTEM_INFO_SIGNAL_MAX_WAIT = 4
HSA_SYSTEM_INFO_ENDIANNESS = 5
HSA_SYSTEM_INFO_MACHINE_MODEL = 6
HSA_SYSTEM_INFO_EXTENSIONS = 7
# hsa_agent_info_t
# Agent name. The type of this attribute is a NUL-terminated char[64]. If
# the name of the agent uses less than 63 characters, the rest of the
# array must be filled with NULs.
HSA_AGENT_INFO_NAME = 0
# Name of vendor. The type of this attribute is a NUL-terminated char[64]. If
# the name of the vendor uses less than 63 characters, the rest of the array
# must be filled with NULs.
HSA_AGENT_INFO_VENDOR_NAME = 1
# Agent capability. The type of this attribute is ::hsa_agent_feature_t.
HSA_AGENT_INFO_FEATURE = 2
# Machine model supported by the agent. The type of this attribute is
# ::hsa_machine_model_t.
HSA_AGENT_INFO_MACHINE_MODEL = 3
# Profile supported by the agent. The type of this attribute is
# ::hsa_profile_t.
HSA_AGENT_INFO_PROFILE = 4
# Default floating-point rounding mode. The type of this attribute is
# ::hsa_default_float_rounding_mode_t, but the value
# ::HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT is not allowed.
HSA_AGENT_INFO_DEFAULT_FLOAT_ROUNDING_MODE = 5
# Default floating-point rounding modes supported by the agent in the Base
# profile. The type of this attribute is a mask of
# ::hsa_default_float_rounding_mode_t. The default floating-point rounding
# mode (::HSA_AGENT_INFO_DEFAULT_FLOAT_ROUNDING_MODE) bit must not be set.
HSA_AGENT_INFO_BASE_PROFILE_DEFAULT_FLOAT_ROUNDING_MODES = 23
# Flag indicating that the f16 HSAIL operation is at least as fast as the
# f32 operation in the current agent. The value of this attribute is
# undefined if the agent is not a kernel agent. The type of this
# attribute is bool.
HSA_AGENT_INFO_FAST_F16_OPERATION = 24
# Number of work-items in a wavefront. Must be a power of 2 in the range
# [1,256]. The value of this attribute is undefined if the agent is not
# a kernel agent. The type of this attribute is uint32_t.
HSA_AGENT_INFO_WAVEFRONT_SIZE = 6
# Maximum number of work-items of each dimension of a work-group. Each
# maximum must be greater than 0. No maximum can exceed the value of
# ::HSA_AGENT_INFO_WORKGROUP_MAX_SIZE. The value of this attribute is
# undefined if the agent is not a kernel agent. The type of this
# attribute is uint16_t[3].
HSA_AGENT_INFO_WORKGROUP_MAX_DIM = 7
# Maximum total number of work-items in a work-group. The value of this
# attribute is undefined if the agent is not a kernel agent. The type
# of this attribute is uint32_t.
HSA_AGENT_INFO_WORKGROUP_MAX_SIZE = 8
# Maximum number of work-items of each dimension of a grid. Each maximum must
# be greater than 0, and must not be smaller than the corresponding value in
# ::HSA_AGENT_INFO_WORKGROUP_MAX_DIM. No maximum can exceed the value of
# ::HSA_AGENT_INFO_GRID_MAX_SIZE. The value of this attribute is undefined if
# the agent is not a kernel agent. The type of this attribute is
# ::hsa_dim3_t.
HSA_AGENT_INFO_GRID_MAX_DIM = 9
# Maximum total number of work-items in a grid. The value of this attribute
# is undefined if the agent is not a kernel agent. The type of this
# attribute is uint32_t.
HSA_AGENT_INFO_GRID_MAX_SIZE = 10
# Maximum number of fbarriers per work-group. Must be at least 32. The value
# of this attribute is undefined if the agent is not a kernel agent. The
# type of this attribute is uint32_t.
HSA_AGENT_INFO_FBARRIER_MAX_SIZE = 11
# Maximum number of queues that can be active (created but not destroyed) at
# one time in the agent. The type of this attribute is uint32_t.
HSA_AGENT_INFO_QUEUES_MAX = 12
# Minimum number of packets that a queue created in the agent
# can hold. Must be a power of 2 greater than 0. Must not exceed
# the value of ::HSA_AGENT_INFO_QUEUE_MAX_SIZE. The type of this
# attribute is uint32_t.
HSA_AGENT_INFO_QUEUE_MIN_SIZE = 13
# Maximum number of packets that a queue created in the agent can
# hold. Must be a power of 2 greater than 0. The type of this attribute
# is uint32_t.
HSA_AGENT_INFO_QUEUE_MAX_SIZE = 14
# Type of a queue created in the agent. The type of this attribute is
# ::hsa_queue_type_t.
HSA_AGENT_INFO_QUEUE_TYPE = 15
# Identifier of the NUMA node associated with the agent. The type of this
# attribute is uint32_t.
HSA_AGENT_INFO_NODE = 16
# Type of hardware device associated with the agent. The type of this
# attribute is ::hsa_device_type_t.
HSA_AGENT_INFO_DEVICE = 17
# Array of data cache sizes (L1..L4). Each size is expressed in bytes. A size
# of 0 for a particular level indicates that there is no cache information
# for that level. The type of this attribute is uint32_t[4].
HSA_AGENT_INFO_CACHE_SIZE = 18
# Instruction set architecture of the agent. The type of this attribute
# is ::hsa_isa_t.
HSA_AGENT_INFO_ISA = 19
# Bit-mask indicating which extensions are supported by the agent. An
# extension with an ID of @p i is supported if the bit at position @p i is
# set. The type of this attribute is uint8_t[128].
HSA_AGENT_INFO_EXTENSIONS = 20
# Major version of the HSA runtime specification supported by the
# agent. The type of this attribute is uint16_t.
HSA_AGENT_INFO_VERSION_MAJOR = 21
# Minor version of the HSA runtime specification supported by the
# agent. The type of this attribute is uint16_t.
HSA_AGENT_INFO_VERSION_MINOR = 22
# hsa_region_segment_t
# Global segment. Used to hold data that is shared by all agents.
HSA_REGION_SEGMENT_GLOBAL = 0
# Read-only segment. Used to hold data that remains constant during the
# execution of a kernel.
HSA_REGION_SEGMENT_READONLY = 1
# Private segment. Used to hold data that is local to a single work-item.
HSA_REGION_SEGMENT_PRIVATE = 2
# Group segment. Used to hold data that is shared by the work-items of a
# work-group.
HSA_REGION_SEGMENT_GROUP = 3
# hsa_region_global_flag_t
# The application can use memory in the region to store kernel arguments, and
# provide the values for the kernarg segment of a kernel dispatch. If this
# flag is set, then ::HSA_REGION_GLOBAL_FLAG_FINE_GRAINED must be set.
HSA_REGION_GLOBAL_FLAG_KERNARG = 1
# Updates to memory in this region are immediately visible to all the
# agents under the terms of the HSA memory model. If this
# flag is set, then ::HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED must not be set.
HSA_REGION_GLOBAL_FLAG_FINE_GRAINED = 2
# Updates to memory in this region can be performed by a single agent at
# a time. If a different agent in the system is allowed to access the
# region, the application must explicitly invoke ::hsa_memory_assign_agent
# in order to transfer ownership to that agent for a particular buffer.
HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED = 4
# hsa_region_info_t
# Segment where memory in the region can be used. The type of this
# attribute is ::hsa_region_segment_t.
HSA_REGION_INFO_SEGMENT = 0
# Flag mask. The value of this attribute is undefined if the value of
# ::HSA_REGION_INFO_SEGMENT is not ::HSA_REGION_SEGMENT_GLOBAL. The type of
# this attribute is uint32_t, a bit-field of ::hsa_region_global_flag_t
# values.
HSA_REGION_INFO_GLOBAL_FLAGS = 1
# Size of this region, in bytes. The type of this attribute is size_t.
HSA_REGION_INFO_SIZE = 2
# Maximum allocation size in this region, in bytes. Must not exceed the value
# of ::HSA_REGION_INFO_SIZE. The type of this attribute is size_t.
#
# If the region is in the global or readonly segments, this is the maximum
# size that the application can pass to ::hsa_memory_allocate. If the region
# is in the group segment, this is the maximum size (per work-group) that can
# be requested for a given kernel dispatch. If the region is in the private
# segment, this is the maximum size (per work-item) that can be requested for a
# specific kernel dispatch.
HSA_REGION_INFO_ALLOC_MAX_SIZE = 4
# Indicates whether memory in this region can be allocated using
# ::hsa_memory_allocate. The type of this attribute is bool.
#
# The value of this flag is always false for regions in the group and private
# segments.
HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED = 5
# Allocation granularity of buffers allocated by ::hsa_memory_allocate in
# this region. The size of a buffer allocated in this region is a multiple of
# the value of this attribute. The value of this attribute is only defined if
# ::HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED is true for this region. The type
# of this attribute is size_t.
HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE = 6
# Alignment of buffers allocated by ::hsa_memory_allocate in this region. The
# value of this attribute is only defined if
# ::HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED is true for this region, and must
# be a power of 2. The type of this attribute is size_t.
HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT = 7
# hsa_profile_t
HSA_PROFILE_BASE = 0
HSA_PROFILE_FULL = 1
# hsa_machine_model_t
HSA_MACHINE_MODEL_SMALL = 0
HSA_MACHINE_MODEL_LARGE = 1
# hsa_executable_symbol_info_t
# The kind of the symbol. The type of this attribute is ::hsa_symbol_kind_t.
HSA_EXECUTABLE_SYMBOL_INFO_TYPE = 0
# The length of the symbol name. The type of this attribute is uint32_t.
HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH = 1
# The name of the symbol. The type of this attribute is character array with
# the length equal to the value of ::HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH
# attribute
HSA_EXECUTABLE_SYMBOL_INFO_NAME = 2
# The length of the module name to which this symbol belongs if this symbol
# has module linkage, otherwise 0 is returned. The type of this attribute is
# uint32_t.
HSA_EXECUTABLE_SYMBOL_INFO_MODULE_NAME_LENGTH = 3
# The module name to which this symbol belongs if this symbol has module
# linkage, otherwise empty string is returned. The type of this attribute is
# character array with the length equal to the value of
# ::HSA_EXECUTABLE_SYMBOL_INFO_MODULE_NAME_LENGTH attribute.
HSA_EXECUTABLE_SYMBOL_INFO_MODULE_NAME = 4
# Agent associated with this symbol. If the symbol is a variable, the
# value of this attribute is only defined if
# ::HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ALLOCATION is
# ::HSA_VARIABLE_ALLOCATION_AGENT. The type of this attribute is hsa_agent_t.
HSA_EXECUTABLE_SYMBOL_INFO_AGENT = 20
# The address of the variable. The value of this attribute is undefined if
# the symbol is not a variable. The type of this attribute is uint64_t.
# If the executable's state is ::HSA_EXECUTABLE_STATE_UNFROZEN, then 0 is
# returned.
HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS = 21
# The linkage kind of the symbol. The type of this attribute is
# ::hsa_symbol_linkage_t.
HSA_EXECUTABLE_SYMBOL_INFO_LINKAGE = 5
# Indicates whether the symbol corresponds to a definition. The type of this
# attribute is bool.
HSA_EXECUTABLE_SYMBOL_INFO_IS_DEFINITION = 17
# The allocation kind of the variable. The value of this attribute is
# undefined if the symbol is not a variable. The type of this attribute is
# ::hsa_variable_allocation_t.
HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ALLOCATION = 6
# The segment kind of the variable. The value of this attribute is undefined
# if the symbol is not a variable. The type of this attribute is
# ::hsa_variable_segment_t.
HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SEGMENT = 7
# Alignment of the variable. The value of this attribute is undefined if
# the symbol is not a variable. The type of this attribute is uint32_t.
HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ALIGNMENT = 8
# Size of the variable. The value of this attribute is undefined if
# the symbol is not a variable. The type of this attribute is uint32_t.
#
# A value of 0 is returned if the variable is an external variable and has an
# unknown dimension.
HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE = 9
# Indicates whether the variable is constant. The value of this attribute is
# undefined if the symbol is not a variable. The type of this attribute is
# bool.
HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_IS_CONST = 10
# Kernel object handle, used in the kernel dispatch packet. The value of this
# attribute is undefined if the symbol is not a kernel. The type of this
# attribute is uint64_t.
#
# If the state of the executable is ::HSA_EXECUTABLE_STATE_UNFROZEN, then 0
# is returned.
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT = 22
# Size of kernarg segment memory that is required to hold the values of the
# kernel arguments, in bytes. The value of this attribute is undefined if the
# symbol is not a kernel. The type of this attribute is uint32_t.
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE = 11
# Alignment (in bytes) of the buffer used to pass arguments to the kernel,
# which is the maximum of 16 and the maximum alignment of any of the kernel
# arguments. The value of this attribute is undefined if the symbol is not a
# kernel. The type of this attribute is uint32_t.
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT = 12
# Size of static group segment memory required by the kernel (per
# work-group), in bytes. The value of this attribute is undefined
# if the symbol is not a kernel. The type of this attribute is uint32_t.
#
# The reported amount does not include any dynamically allocated group
# segment memory that may be requested by the application when a kernel is
# dispatched.
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE = 13
# Size of static private, spill, and arg segment memory required by
# this kernel (per work-item), in bytes. The value of this attribute is
# undefined if the symbol is not a kernel. The type of this attribute is
# uint32_t.
#
# If the value of ::HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK is
# true, the kernel may use more private memory than the reported value, and
# the application must add the dynamic call stack usage to @a
# private_segment_size when populating a kernel dispatch packet.
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE = 14
# Dynamic callstack flag. The value of this attribute is undefined if the
# symbol is not a kernel. The type of this attribute is bool.
#
# If this flag is set (the value is true), the kernel uses a dynamically
# sized call stack. This can happen if recursive calls, calls to indirect
# functions, or the HSAIL alloca instruction are present in the kernel.
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK = 15
# Indirect function object handle. The value of this attribute is undefined
# if the symbol is not an indirect function, or the associated agent does
# not support the Full Profile. The type of this attribute depends on the
# machine model: if machine model is small, then the type is uint32_t, if
# machine model is large, then the type is uint64_t.
#
# If the state of the executable is ::HSA_EXECUTABLE_STATE_UNFROZEN, then 0
# is returned.
HSA_EXECUTABLE_SYMBOL_INFO_INDIRECT_FUNCTION_OBJECT = 23
# Call convention of the indirect function. The value of this attribute is
# undefined if the symbol is not an indirect function, or the associated
# agent does not support the Full Profile. The type of this attribute is
# uint32_t.
HSA_EXECUTABLE_SYMBOL_INFO_INDIRECT_FUNCTION_CALL_CONVENTION = 16
# hsa_default_float_rounding_mode_t
# Use a default floating-point rounding mode specified elsewhere.
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT = 0
# Operations that specify the default floating-point mode are rounded to zero
# by default.
HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO = 1
# Operations that specify the default floating-point mode are rounded to the
# nearest representable number and that ties should be broken by selecting
# the value with an even least significant bit.
HSA_DEFAULT_FLOAT_ROUNDING_MODE_NEAR = 2
# hsa_code_object_type_t
HSA_CODE_OBJECT_TYPE_PROGRAM = 0
# hsa_executable_state_t
# Executable state, which allows the user to load code objects and define
# external variables. Variable addresses, kernel code handles, and
# indirect function code handles are not available in query operations until
# the executable is frozen (zero always returned).
HSA_EXECUTABLE_STATE_UNFROZEN = 0
# Executable state, which allows the user to query variable addresses,
# kernel code handles, and indirect function code handles using query
# operation. Loading new code objects, as well as defining external variables
# is not allowed in this state.
HSA_EXECUTABLE_STATE_FROZEN = 1
# hsa_kernel_dispatch_packet_setup_t
HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS = 0
# hsa_packet_header_t
HSA_PACKET_HEADER_TYPE = 0
HSA_PACKET_HEADER_BARRIER = 8
HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE = 9
HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE = 11
"""Enum values for HSA from the HSA extension header
Note that Python namespacing could be used to avoid the C-like
prefixing, but we choose to keep the same names as found in the C
enums, in order to match the documentation.
"""
# These enums are a direct translation of those found in:
# hsa_ext_amd.h from the ROCR-Runtime. For example:
# https://github.com/RadeonOpenCompute/ROCR-Runtime/blob/master/src/inc/hsa_ext_amd.h
# Comments relating to the values are largely wholesale copied.
import ctypes
#------------------------------------------------------------------------------
#
# Anonymous enum expressing that a memory pool is invalid
#
HSA_STATUS_ERROR_INVALID_MEMORY_POOL = 40
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#
# Agent attributes
#
# Enums of the type hsa_amd_agent_info_t
# Chip identifier. The type of this attribute is uint32_t.
HSA_AMD_AGENT_INFO_CHIP_ID = 0xA000
# Size of a cacheline in bytes. The type of this attribute is uint32_t.
HSA_AMD_AGENT_INFO_CACHELINE_SIZE = 0xA001
# The number of compute units available in the agent. The type of this
# attribute is uint32_t.
HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT = 0xA002
# The maximum clock frequency of the agent in MHz. The type of this
# attribute is uint32_t.
HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY = 0xA003
# Internal driver node identifier. The type of this attribute is uint32_t.
HSA_AMD_AGENT_INFO_DRIVER_NODE_ID = 0xA004
# Max number of watch points on memory address ranges to generate exception
# events when the watched addresses are accessed.
HSA_AMD_AGENT_INFO_MAX_ADDRESS_WATCH_POINTS = 0xA005
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#
# Region attributes
#
# Enums of the type hsa_amd_region_info_t
# Determine if host can access the region. The type of this attribute is bool.
HSA_AMD_REGION_INFO_HOST_ACCESSIBLE = 0xA000
# Base address of the region in flat address space.
HSA_AMD_REGION_INFO_BASE = 0xA001
# Memory Interface width, the return value type is uint32_t.
# This attribute is deprecated. Use HSA_AMD_AGENT_INFO_MEMORY_WIDTH.
HSA_AMD_REGION_INFO_BUS_WIDTH = 0xA002
# Max Memory Clock, the return value type is uint32_t.
# This attribute is deprecated. Use HSA_AMD_AGENT_INFO_MEMORY_MAX_FREQUENCY.
HSA_AMD_REGION_INFO_MAX_CLOCK_FREQUENCY = 0xA003
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#
# Coherency attributes of a fine grained region
#
# Enums of the type hsa_amd_coherency_type_t
# Coherent region.
HSA_AMD_COHERENCY_TYPE_COHERENT = 0
# Non coherent region.
HSA_AMD_COHERENCY_TYPE_NONCOHERENT = 1
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#
# Memory segments associated with a memory pool.
#
# Enums of the type hsa_amd_segment_t
# Global segment. Used to hold data that is shared by all agents.
HSA_AMD_SEGMENT_GLOBAL = 0
# Read-only segment. Used to hold data that remains constant during the
# execution of a kernel.
HSA_AMD_SEGMENT_READONLY = 1
# Private segment. Used to hold data that is local to a single work-item.
HSA_AMD_SEGMENT_PRIVATE = 2
# Group segment. Used to hold data that is shared by the work-items of a
# work-group.
HSA_AMD_SEGMENT_GROUP = 3
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#
# Memory pool global flags.
#
# Enums of the type hsa_amd_memory_pool_global_flag_t.
# The application can use allocations in the memory pool to store kernel
# arguments, and provide the values for the kernarg segment of
# a kernel dispatch.
HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT = 1
# Updates to memory in this pool conform to HSA memory consistency model.
# If this flag is set, then HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED
# must not be set.
HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED = 2
# Writes to memory in this pool can be performed by a single agent at a time.
HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED = 4
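# Example (a minimal sketch): the global flags form a bit-field, so a pool's
# properties are tested with bitwise AND on the uint32 value queried for
# HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS below. ``flags`` is assumed to hold
# that queried value:
#
#     is_kernarg = bool(flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT)
#     fine = bool(flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED)
#     coarse = bool(flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED)
#     assert not (fine and coarse)   # the two grains are mutually exclusive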
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#
# Memory pool features flags.
#
# Enums of the type hsa_amd_memory_pool_info_t.
# Segment where the memory pool resides. The type of this attribute is
# hsa_amd_segment_t.
HSA_AMD_MEMORY_POOL_INFO_SEGMENT = 0
# Flag mask. The value of this attribute is undefined if the value of
# HSA_AMD_MEMORY_POOL_INFO_SEGMENT is not HSA_AMD_SEGMENT_GLOBAL. The type
# of this attribute is uint32_t, a bit-field of
# hsa_amd_memory_pool_global_flag_t values.
HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS = 1
# Size of this pool, in bytes. The type of this attribute is size_t.
HSA_AMD_MEMORY_POOL_INFO_SIZE = 2
# Indicates whether memory in this pool can be allocated using
# hsa_amd_memory_pool_allocate. The type of this attribute is bool.
# The value of this flag is always false for memory pools in the group and
# private segments.
HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED = 5
# Allocation granularity of buffers allocated by hsa_amd_memory_pool_allocate
# in this memory pool. The size of a buffer allocated in this pool is a
# multiple of the value of this attribute. The value of this attribute is
# only defined if HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED is true for
# this pool. The type of this attribute is size_t.
HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE = 6
# Alignment of buffers allocated by hsa_amd_memory_pool_allocate in this
# pool. The value of this attribute is only defined if
# HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED is true for this pool, and
# must be a power of 2. The type of this attribute is size_t.
HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT = 7
# This memory_pool can be made directly accessible by all the agents in the
# system (hsa_amd_agent_memory_pool_get_info returns
# HSA_AMD_MEMORY_POOL_ACCESS_ALLOWED_BY_DEFAULT for all agents). The type of
# this attribute is bool.
HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL = 15
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#
# Type of accesses to a memory pool from a given agent.
#
# Enums of the type hsa_amd_memory_pool_access_t
# The agent cannot directly access any buffer in the memory pool.
HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED = 0
# The agent can directly access a buffer located in the pool; the application
# does not need to invoke hsa_amd_agents_allow_access.
HSA_AMD_MEMORY_POOL_ACCESS_ALLOWED_BY_DEFAULT = 1
# The agent can directly access a buffer located in the pool, but only if the
# application has previously requested access to that buffer using
# hsa_amd_agents_allow_access.
HSA_AMD_MEMORY_POOL_ACCESS_DISALLOWED_BY_DEFAULT = 2
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#
# Properties of the relationship between an agent and a memory pool.
#
# Enums of the type hsa_amd_link_info_type_t
# Hyper-transport bus type.
HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT = 0
# QPI bus type.
HSA_AMD_LINK_INFO_TYPE_QPI = 1
# PCIe bus type.
HSA_AMD_LINK_INFO_TYPE_PCIE = 2
# Infiniband bus type.
HSA_AMD_LINK_INFO_TYPE_INFINBAND = 3
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#
# Access to buffers located in the memory pool. The type of this attribute
# is hsa_amd_memory_pool_access_t.
#
# Enums of type hsa_amd_agent_memory_pool_info_t.
# An agent can always directly access buffers currently located in a memory
# pool that is associated (the memory_pool is one of the values returned by
# hsa_amd_agent_iterate_memory_pools on the agent) with that agent. If the
# buffer is currently located in a memory pool that is not associated with
# the agent, and the value returned by this function for the given
# combination of agent and memory pool is not
# HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED, the application still needs to
# invoke hsa_amd_agents_allow_access in order to gain direct access to the
# buffer.
# If the given agent can directly access buffers in the pool, the result is
# not HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED. If the memory pool is
# associated with the agent, or it is of fine-grained type, the result must
# not be
# HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED. If the memory pool is not
# associated with the agent, and does not reside in the global segment, the
# result must be HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED.
HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS = 0
# Number of links to hop when accessing the memory pool from the specified
# agent. The type of this attribute is uint32_t.
HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS = 1
# Details of each link hop when accessing the memory pool starting from the
# specified agent. The type of this attribute is an array of
# HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS elements, each of type
# hsa_amd_memory_pool_link_info_t.
HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO = 2
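# Example (sketch): these attributes are read through the C entry point
# hsa_amd_agent_memory_pool_get_info declared in hsa_ext_amd.h; ``hsa``,
# ``agent``, ``pool`` and ``buf`` below assume a ctypes binding along the
# lines of the one used elsewhere in this package:
#
#     access = ctypes.c_uint32()
#     hsa.hsa_amd_agent_memory_pool_get_info(
#         agent, pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS,
#         ctypes.byref(access))
#     if access.value == HSA_AMD_MEMORY_POOL_ACCESS_DISALLOWED_BY_DEFAULT:
#         hsa.hsa_amd_agents_allow_access(1, ctypes.byref(agent), None, buf)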
#------------------------------------------------------------------------------
class HsaDriverError(Exception):
pass
class HsaSupportError(ImportError):
pass
class HsaApiError(HsaDriverError):
def __init__(self, code, msg):
self.code = code
super(HsaApiError, self).__init__(msg)
class HsaWarning(UserWarning):
pass
class HsaKernelLaunchError(HsaDriverError):
pass
class HsaContextMismatchError(HsaDriverError):
def __init__(self, expect, got):
fmt = ("device array is associated with a different "
"context: expect {0} but got {1}")
msg = fmt.format(expect, got)
super(HsaContextMismatchError, self).__init__(msg)
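# Example (sketch): HsaApiError retains the raw HSA status code, so callers
# can branch on a specific status, e.g. the invalid-memory-pool status (40)
# from the extension enums. ``do_pool_allocation`` is a hypothetical call:
#
#     try:
#         do_pool_allocation()
#     except HsaApiError as e:
#         if e.code == 40:   # HSA_STATUS_ERROR_INVALID_MEMORY_POOL
#             raise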
import operator
from functools import reduce
from llvmlite.llvmpy.core import Type
import llvmlite.llvmpy.core as lc
import llvmlite.binding as ll
from llvmlite import ir
from numba import roc
from numba.core.imputils import Registry
from numba.core import types, cgutils
from numba.core.itanium_mangler import mangle_c, mangle, mangle_type
from numba.core.typing.npydecl import parse_dtype
from numba.roc import target
from numba.roc import stubs
from numba.roc import hlc
from numba.roc import enums
registry = Registry()
lower = registry.lower
_void_value = lc.Constant.null(lc.Type.pointer(lc.Type.int(8)))
# -----------------------------------------------------------------------------
def _declare_function(context, builder, name, sig, cargs,
mangler=mangle_c):
"""Insert declaration for a opencl builtin function.
Uses the Itanium mangler.
Args
----
context: target context
builder: llvm builder
name: str
symbol name
sig: signature
function signature of the symbol being declared
cargs: sequence of str
C type names for the arguments
mangler: a mangler function
function to use to mangle the symbol
"""
mod = builder.module
if sig.return_type == types.void:
llretty = lc.Type.void()
else:
llretty = context.get_value_type(sig.return_type)
llargs = [context.get_value_type(t) for t in sig.args]
fnty = Type.function(llretty, llargs)
mangled = mangler(name, cargs)
fn = mod.get_or_insert_function(fnty, mangled)
fn.calling_convention = target.CC_SPIR_FUNC
return fn
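# Example (sketch): for the builtins declared below, the Itanium mangler is
# expected to turn the C-style name and argument types into the SPIR symbol
# that is resolved against the device library, e.g.
#
#     mangle_c('get_global_id', ['unsigned int'])   # -> '_Z13get_global_idj'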
@lower(stubs.get_global_id, types.uint32)
def get_global_id_impl(context, builder, sig, args):
[dim] = args
get_global_id = _declare_function(context, builder, 'get_global_id', sig,
['unsigned int'])
res = builder.call(get_global_id, [dim])
return context.cast(builder, res, types.uintp, types.intp)
@lower(stubs.get_local_id, types.uint32)
def get_local_id_impl(context, builder, sig, args):
[dim] = args
get_local_id = _declare_function(context, builder, 'get_local_id', sig,
['unsigned int'])
res = builder.call(get_local_id, [dim])
return context.cast(builder, res, types.uintp, types.intp)
@lower(stubs.get_group_id, types.uint32)
def get_group_id_impl(context, builder, sig, args):
[dim] = args
get_group_id = _declare_function(context, builder, 'get_group_id', sig,
['unsigned int'])
res = builder.call(get_group_id, [dim])
return context.cast(builder, res, types.uintp, types.intp)
@lower(stubs.get_num_groups, types.uint32)
def get_num_groups_impl(context, builder, sig, args):
[dim] = args
get_num_groups = _declare_function(context, builder, 'get_num_groups', sig,
['unsigned int'])
res = builder.call(get_num_groups, [dim])
return context.cast(builder, res, types.uintp, types.intp)
@lower(stubs.get_work_dim)
def get_work_dim_impl(context, builder, sig, args):
get_work_dim = _declare_function(context, builder, 'get_work_dim', sig,
["void"])
res = builder.call(get_work_dim, [])
return res
@lower(stubs.get_global_size, types.uint32)
def get_global_size_impl(context, builder, sig, args):
[dim] = args
get_global_size = _declare_function(context, builder, 'get_global_size',
sig, ['unsigned int'])
res = builder.call(get_global_size, [dim])
return context.cast(builder, res, types.uintp, types.intp)
@lower(stubs.get_local_size, types.uint32)
def get_local_size_impl(context, builder, sig, args):
[dim] = args
get_local_size = _declare_function(context, builder, 'get_local_size',
sig, ['unsigned int'])
res = builder.call(get_local_size, [dim])
return context.cast(builder, res, types.uintp, types.intp)
@lower(stubs.barrier, types.uint32)
def barrier_one_arg_impl(context, builder, sig, args):
[flags] = args
barrier = _declare_function(context, builder, 'barrier', sig,
['unsigned int'])
builder.call(barrier, [flags])
return _void_value
@lower(stubs.barrier)
def barrier_no_arg_impl(context, builder, sig, args):
assert not args
sig = types.void(types.uint32)
barrier = _declare_function(context, builder, 'barrier', sig,
['unsigned int'])
flags = context.get_constant(types.uint32, enums.CLK_GLOBAL_MEM_FENCE)
builder.call(barrier, [flags])
return _void_value
@lower(stubs.mem_fence, types.uint32)
def mem_fence_impl(context, builder, sig, args):
[flags] = args
mem_fence = _declare_function(context, builder, 'mem_fence', sig,
['unsigned int'])
builder.call(mem_fence, [flags])
return _void_value
@lower(stubs.wavebarrier)
def wavebarrier_impl(context, builder, sig, args):
assert not args
fnty = Type.function(Type.void(), [])
fn = builder.module.declare_intrinsic('llvm.amdgcn.wave.barrier', fnty=fnty)
builder.call(fn, [])
return _void_value
@lower(stubs.activelanepermute_wavewidth,
types.Any, types.uint32, types.Any, types.bool_)
def activelanepermute_wavewidth_impl(context, builder, sig, args):
[src, laneid, identity, use_ident] = args
assert sig.args[0] == sig.args[2]
elem_type = sig.args[0]
bitwidth = elem_type.bitwidth
intbitwidth = Type.int(bitwidth)
i32 = Type.int(32)
i1 = Type.int(1)
name = "__hsail_activelanepermute_wavewidth_b{0}".format(bitwidth)
fnty = Type.function(intbitwidth, [intbitwidth, i32, intbitwidth, i1])
fn = builder.module.get_or_insert_function(fnty, name=name)
fn.calling_convention = target.CC_SPIR_FUNC
def cast(val):
return builder.bitcast(val, intbitwidth)
result = builder.call(fn, [cast(src), laneid, cast(identity), use_ident])
return builder.bitcast(result, context.get_value_type(elem_type))
def _gen_ds_permute(intrinsic_name):
def _impl(context, builder, sig, args):
"""
args are (index, src)
"""
assert sig.return_type == sig.args[1]
idx, src = args
i32 = Type.int(32)
fnty = Type.function(i32, [i32, i32])
fn = builder.module.declare_intrinsic(intrinsic_name, fnty=fnty)
        # The args are byte addressable; VGPRs are 4 bytes wide, so multiply
        # idx by 4. idx might be an int64; truncating to int32 is safe since
        # wavefront_size is never likely to overflow an int32.
idx = builder.trunc(idx, i32)
four = lc.Constant.int(i32, 4)
idx = builder.mul(idx, four)
# bit cast is so float32 works as packed i32, the return casts back
result = builder.call(fn, (idx, builder.bitcast(src, i32)))
return builder.bitcast(result, context.get_value_type(sig.return_type))
return _impl
lower(stubs.ds_permute, types.Any, types.Any)(_gen_ds_permute('llvm.amdgcn.ds.permute'))
lower(stubs.ds_bpermute, types.Any, types.Any)(_gen_ds_permute('llvm.amdgcn.ds.bpermute'))
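# Example (a minimal sketch, not part of this module): a kernel pulling each
# work-item's neighbour value via the backwards permute, assuming a
# wavefront of 64 lanes; the launch configuration is illustrative only:
#
#     @roc.jit
#     def rotate(io):
#         i = roc.get_global_id(0)
#         io[i] = roc.ds_bpermute((i + 1) % 64, io[i])
#
#     rotate[1, 64](arr)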
@lower(stubs.atomic.add, types.Array, types.intp, types.Any)
@lower(stubs.atomic.add, types.Array, types.UniTuple, types.Any)
@lower(stubs.atomic.add, types.Array, types.Tuple, types.Any)
def hsail_atomic_add_tuple(context, builder, sig, args):
aryty, indty, valty = sig.args
ary, inds, val = args
dtype = aryty.dtype
if indty == types.intp:
indices = [inds] # just a single integer
indty = [indty]
else:
indices = cgutils.unpack_tuple(builder, inds, count=len(indty))
indices = [context.cast(builder, i, t, types.intp)
for t, i in zip(indty, indices)]
if dtype != valty:
raise TypeError("expecting %s but got %s" % (dtype, valty))
if aryty.ndim != len(indty):
raise TypeError("indexing %d-D array with %d-D index" %
(aryty.ndim, len(indty)))
lary = context.make_array(aryty)(context, builder, ary)
ptr = cgutils.get_item_pointer(context, builder, aryty, lary, indices)
return builder.atomic_rmw("add", ptr, val, ordering='monotonic')
@lower(roc.shared.array, types.IntegerLiteral, types.Any)
def hsail_smem_alloc_array_integer(context, builder, sig, args):
length = sig.args[0].literal_value
dtype = parse_dtype(sig.args[1])
return _generic_array(context, builder, shape=(length,), dtype=dtype,
symbol_name='_hsapy_smem',
addrspace=target.SPIR_LOCAL_ADDRSPACE)
@lower(roc.shared.array, types.Tuple, types.Any)
@lower(roc.shared.array, types.UniTuple, types.Any)
def hsail_smem_alloc_array_tuple(context, builder, sig, args):
    shape = [s.literal_value for s in sig.args[0]]
dtype = parse_dtype(sig.args[1])
return _generic_array(context, builder, shape=shape, dtype=dtype,
symbol_name='_hsapy_smem',
addrspace=target.SPIR_LOCAL_ADDRSPACE)
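# Example (sketch): roc.shared.array allocates work-group local storage; a
# typical pattern stages data through it around a barrier. The work-group
# size of 64 and the float32 dtype are illustrative assumptions:
#
#     from numba import float32
#
#     @roc.jit
#     def reverse_block(arr):
#         tid = roc.get_local_id(0)
#         tmp = roc.shared.array(64, dtype=float32)
#         tmp[tid] = arr[tid]
#         roc.barrier()
#         arr[tid] = tmp[63 - tid]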
def _generic_array(context, builder, shape, dtype, symbol_name, addrspace):
elemcount = reduce(operator.mul, shape, 1)
lldtype = context.get_data_type(dtype)
laryty = Type.array(lldtype, elemcount)
if addrspace == target.SPIR_LOCAL_ADDRSPACE:
lmod = builder.module
# Create global variable in the requested address-space
gvmem = lmod.add_global_variable(laryty, symbol_name, addrspace)
if elemcount <= 0:
raise ValueError("array length <= 0")
else:
gvmem.linkage = lc.LINKAGE_INTERNAL
if dtype not in types.number_domain:
raise TypeError("unsupported type: %s" % dtype)
# Convert to generic address-space
dataptr = context.addrspacecast(builder, gvmem,
target.SPIR_GENERIC_ADDRSPACE)
else:
raise NotImplementedError("addrspace {addrspace}".format(**locals()))
return _make_array(context, builder, dataptr, dtype, shape)
def _make_array(context, builder, dataptr, dtype, shape, layout='C'):
ndim = len(shape)
# Create array object
aryty = types.Array(dtype=dtype, ndim=ndim, layout='C')
ary = context.make_array(aryty)(context, builder)
targetdata = _get_target_data(context)
lldtype = context.get_data_type(dtype)
itemsize = lldtype.get_abi_size(targetdata)
# Compute strides
rstrides = [itemsize]
    for lastsize in reversed(shape[1:]):
        rstrides.append(lastsize * rstrides[-1])
    strides = list(reversed(rstrides))
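    # Worked example: for a C-contiguous (3, 4) float32 array the loop above
    # yields rstrides = [4, 16] and hence strides = (16, 4): moving one row
    # advances 4 elements * 4 bytes = 16 bytes, moving one column 4 bytes.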
kshape = [context.get_constant(types.intp, s) for s in shape]
kstrides = [context.get_constant(types.intp, s) for s in strides]
context.populate_array(ary,
data=builder.bitcast(dataptr, ary.data.type),
shape=kshape,
strides=kstrides,
itemsize=context.get_constant(types.intp, itemsize),
meminfo=None)
return ary._getvalue()
def _get_target_data(context):
return ll.create_target_data(hlc.DATALAYOUT[context.address_size])
#### Additional initialization code ######
def _initialize_ufunc():
from numba.np.ufunc import Vectorize
def init_vectorize():
from numba.roc.vectorizers import HsaVectorize
return HsaVectorize
Vectorize.target_registry.ondemand['roc'] = init_vectorize
def _initialize_gufunc():
from numba.np.ufunc import GUVectorize
def init_guvectorize():
from numba.roc.vectorizers import HsaGUFuncVectorize
return HsaGUFuncVectorize
GUVectorize.target_registry.ondemand['roc'] = init_guvectorize
_initialize_ufunc()
_initialize_gufunc()
import math
from numba.core import types, utils
from numba.core.typing.templates import (AttributeTemplate, ConcreteTemplate,
signature, Registry)
registry = Registry()
builtin_attr = registry.register_attr
infer_global = registry.register_global
@builtin_attr
class MathModuleAttribute(AttributeTemplate):
key = types.Module(math)
def resolve_fabs(self, mod):
return types.Function(Math_fabs)
def resolve_exp(self, mod):
return types.Function(Math_exp)
def resolve_expm1(self, mod):
return types.Function(Math_expm1)
def resolve_sqrt(self, mod):
return types.Function(Math_sqrt)
def resolve_log(self, mod):
return types.Function(Math_log)
def resolve_log1p(self, mod):
return types.Function(Math_log1p)
def resolve_log10(self, mod):
return types.Function(Math_log10)
def resolve_sin(self, mod):
return types.Function(Math_sin)
def resolve_cos(self, mod):
return types.Function(Math_cos)
def resolve_tan(self, mod):
return types.Function(Math_tan)
def resolve_sinh(self, mod):
return types.Function(Math_sinh)
def resolve_cosh(self, mod):
return types.Function(Math_cosh)
def resolve_tanh(self, mod):
return types.Function(Math_tanh)
def resolve_asin(self, mod):
return types.Function(Math_asin)
def resolve_acos(self, mod):
return types.Function(Math_acos)
def resolve_atan(self, mod):
return types.Function(Math_atan)
def resolve_atan2(self, mod):
return types.Function(Math_atan2)
def resolve_asinh(self, mod):
return types.Function(Math_asinh)
def resolve_acosh(self, mod):
return types.Function(Math_acosh)
def resolve_atanh(self, mod):
return types.Function(Math_atanh)
def resolve_pi(self, mod):
return types.float64
def resolve_e(self, mod):
return types.float64
def resolve_floor(self, mod):
return types.Function(Math_floor)
def resolve_ceil(self, mod):
return types.Function(Math_ceil)
def resolve_trunc(self, mod):
return types.Function(Math_trunc)
def resolve_isnan(self, mod):
return types.Function(Math_isnan)
def resolve_isinf(self, mod):
return types.Function(Math_isinf)
def resolve_degrees(self, mod):
return types.Function(Math_degrees)
def resolve_radians(self, mod):
return types.Function(Math_radians)
# def resolve_hypot(self, mod):
# return types.Function(Math_hypot)
def resolve_copysign(self, mod):
return types.Function(Math_copysign)
def resolve_fmod(self, mod):
return types.Function(Math_fmod)
def resolve_pow(self, mod):
return types.Function(Math_pow)
def resolve_erf(self, mod):
return types.Function(Math_erf)
def resolve_erfc(self, mod):
return types.Function(Math_erfc)
def resolve_gamma(self, mod):
return types.Function(Math_gamma)
def resolve_lgamma(self, mod):
return types.Function(Math_lgamma)
class Math_unary(ConcreteTemplate):
cases = [
signature(types.float64, types.int64),
signature(types.float64, types.uint64),
signature(types.float32, types.float32),
signature(types.float64, types.float64),
]
class Math_fabs(Math_unary):
key = math.fabs
class Math_exp(Math_unary):
key = math.exp
class Math_expm1(Math_unary):
key = math.expm1
class Math_sqrt(Math_unary):
key = math.sqrt
class Math_log(Math_unary):
key = math.log
class Math_log1p(Math_unary):
key = math.log1p
class Math_log10(Math_unary):
key = math.log10
class Math_sin(Math_unary):
key = math.sin
class Math_cos(Math_unary):
key = math.cos
class Math_tan(Math_unary):
key = math.tan
class Math_sinh(Math_unary):
key = math.sinh
class Math_cosh(Math_unary):
key = math.cosh
class Math_tanh(Math_unary):
key = math.tanh
class Math_asin(Math_unary):
key = math.asin
class Math_acos(Math_unary):
key = math.acos
class Math_atan(Math_unary):
key = math.atan
class Math_atan2(ConcreteTemplate):
key = math.atan2
cases = [
signature(types.float64, types.int64, types.int64),
signature(types.float64, types.uint64, types.uint64),
signature(types.float32, types.float32, types.float32),
signature(types.float64, types.float64, types.float64),
]
class Math_asinh(Math_unary):
key = math.asinh
class Math_acosh(Math_unary):
key = math.acosh
class Math_atanh(Math_unary):
key = math.atanh
class Math_floor(Math_unary):
key = math.floor
class Math_ceil(Math_unary):
key = math.ceil
class Math_trunc(Math_unary):
key = math.trunc
class Math_radians(Math_unary):
key = math.radians
class Math_degrees(Math_unary):
key = math.degrees
# class Math_hypot(ConcreteTemplate):
# key = math.hypot
# cases = [
# signature(types.float64, types.int64, types.int64),
# signature(types.float64, types.uint64, types.uint64),
# signature(types.float32, types.float32, types.float32),
# signature(types.float64, types.float64, types.float64),
# ]
class Math_erf(Math_unary):
key = math.erf
class Math_erfc(Math_unary):
key = math.erfc
class Math_gamma(Math_unary):
key = math.gamma
class Math_lgamma(Math_unary):
key = math.lgamma
class Math_binary(ConcreteTemplate):
cases = [
signature(types.float32, types.float32, types.float32),
signature(types.float64, types.float64, types.float64),
]
class Math_copysign(Math_binary):
key = math.copysign
class Math_fmod(Math_binary):
key = math.fmod
class Math_pow(ConcreteTemplate):
key = math.pow
cases = [
signature(types.float32, types.float32, types.float32),
signature(types.float64, types.float64, types.float64),
signature(types.float32, types.float32, types.int32),
signature(types.float64, types.float64, types.int32),
]
class Math_isnan(ConcreteTemplate):
key = math.isnan
cases = [
signature(types.boolean, types.int64),
signature(types.boolean, types.uint64),
signature(types.boolean, types.float32),
signature(types.boolean, types.float64),
]
class Math_isinf(ConcreteTemplate):
key = math.isinf
cases = [
signature(types.boolean, types.int64),
signature(types.boolean, types.uint64),
signature(types.boolean, types.float32),
signature(types.boolean, types.float64),
]
infer_global(math, types.Module(math))
infer_global(math.fabs, types.Function(Math_fabs))
infer_global(math.exp, types.Function(Math_exp))
infer_global(math.expm1, types.Function(Math_expm1))
infer_global(math.sqrt, types.Function(Math_sqrt))
infer_global(math.log, types.Function(Math_log))
infer_global(math.log1p, types.Function(Math_log1p))
infer_global(math.log10, types.Function(Math_log10))
infer_global(math.sin, types.Function(Math_sin))
infer_global(math.cos, types.Function(Math_cos))
infer_global(math.tan, types.Function(Math_tan))
infer_global(math.sinh, types.Function(Math_sinh))
infer_global(math.cosh, types.Function(Math_cosh))
infer_global(math.tanh, types.Function(Math_tanh))
infer_global(math.asin, types.Function(Math_asin))
infer_global(math.acos, types.Function(Math_acos))
infer_global(math.atan, types.Function(Math_atan))
infer_global(math.atan2, types.Function(Math_atan2))
infer_global(math.asinh, types.Function(Math_asinh))
infer_global(math.acosh, types.Function(Math_acosh))
infer_global(math.atanh, types.Function(Math_atanh))
# infer_global(math.hypot, types.Function(Math_hypot))
infer_global(math.floor, types.Function(Math_floor))
infer_global(math.ceil, types.Function(Math_ceil))
infer_global(math.trunc, types.Function(Math_trunc))
infer_global(math.isnan, types.Function(Math_isnan))
infer_global(math.isinf, types.Function(Math_isinf))
infer_global(math.degrees, types.Function(Math_degrees))
infer_global(math.radians, types.Function(Math_radians))
infer_global(math.copysign, types.Function(Math_copysign))
infer_global(math.fmod, types.Function(Math_fmod))
infer_global(math.pow, types.Function(Math_pow))
infer_global(math.erf, types.Function(Math_erf))
infer_global(math.erfc, types.Function(Math_erfc))
infer_global(math.gamma, types.Function(Math_gamma))
infer_global(math.lgamma, types.Function(Math_lgamma))
import math
import warnings
from numba.core.imputils import Registry
from numba.core import types
from numba.core.itanium_mangler import mangle
from .hsaimpl import _declare_function
registry = Registry()
lower = registry.lower
# -----------------------------------------------------------------------------
_unary_b_f = types.int32(types.float32)
_unary_b_d = types.int32(types.float64)
_unary_f_f = types.float32(types.float32)
_unary_d_d = types.float64(types.float64)
_binary_f_ff = types.float32(types.float32, types.float32)
_binary_d_dd = types.float64(types.float64, types.float64)
function_descriptors = {
'isnan': (_unary_b_f, _unary_b_d),
'isinf': (_unary_b_f, _unary_b_d),
'ceil': (_unary_f_f, _unary_d_d),
'floor': (_unary_f_f, _unary_d_d),
'fabs': (_unary_f_f, _unary_d_d),
'sqrt': (_unary_f_f, _unary_d_d),
'exp': (_unary_f_f, _unary_d_d),
'expm1': (_unary_f_f, _unary_d_d),
'log': (_unary_f_f, _unary_d_d),
'log10': (_unary_f_f, _unary_d_d),
'log1p': (_unary_f_f, _unary_d_d),
'sin': (_unary_f_f, _unary_d_d),
'cos': (_unary_f_f, _unary_d_d),
'tan': (_unary_f_f, _unary_d_d),
'asin': (_unary_f_f, _unary_d_d),
'acos': (_unary_f_f, _unary_d_d),
'atan': (_unary_f_f, _unary_d_d),
'sinh': (_unary_f_f, _unary_d_d),
'cosh': (_unary_f_f, _unary_d_d),
'tanh': (_unary_f_f, _unary_d_d),
'asinh': (_unary_f_f, _unary_d_d),
'acosh': (_unary_f_f, _unary_d_d),
'atanh': (_unary_f_f, _unary_d_d),
'copysign': (_binary_f_ff, _binary_d_dd),
'atan2': (_binary_f_ff, _binary_d_dd),
'pow': (_binary_f_ff, _binary_d_dd),
'fmod': (_binary_f_ff, _binary_d_dd),
'erf': (_unary_f_f, _unary_d_d),
'erfc': (_unary_f_f, _unary_d_d),
'gamma': (_unary_f_f, _unary_d_d),
'lgamma': (_unary_f_f, _unary_d_d),
# unsupported functions listed in the math module documentation:
# frexp, ldexp, trunc, modf, factorial, fsum
}
# Some functions are named differently by the underlying math library
# than in the Python math module.
_lib_counterpart = {
'gamma': 'tgamma'
}
def _mk_fn_decl(name, decl_sig):
sym = _lib_counterpart.get(name, name)
def core(context, builder, sig, args):
fn = _declare_function(context, builder, sym, decl_sig, decl_sig.args,
mangler=mangle)
res = builder.call(fn, args)
return context.cast(builder, res, decl_sig.return_type, sig.return_type)
core.__name__ = name
return core
_supported = ['sin', 'cos', 'tan', 'asin', 'acos', 'atan', 'atan2', 'sinh',
'cosh', 'tanh', 'asinh', 'acosh', 'atanh', 'isnan', 'isinf',
'ceil', 'floor', 'fabs', 'sqrt', 'exp', 'expm1', 'log',
'log10', 'log1p', 'copysign', 'pow', 'fmod', 'erf', 'erfc',
'gamma', 'lgamma',
]
for name in _supported:
sigs = function_descriptors.get(name)
if sigs is None:
warnings.warn("HSA - failed to register '{0}'".format(name))
continue
try:
# only symbols present in the math module
key = getattr(math, name)
except AttributeError:
continue
for sig in sigs:
fn = _mk_fn_decl(name, sig)
lower(key, *sig.args)(fn)
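# Example (sketch): with the registrations above in place, the supported
# math functions can be used directly in device code and lower to the
# OpenCL library symbols, e.g.
#
#     import math
#
#     @roc.jit
#     def apply_sigmoid(x, out):
#         i = roc.get_global_id(0)
#         if i < x.size:
#             out[i] = 1.0 / (1.0 + math.exp(-x[i]))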
from .service import Service
from .threadlocal import TLStack
"""
Implement background services for the application.
This is implemented as a cooperative concurrent task.
"""
import functools
class Service(object):
def __init__(self, name="unnamed", arg=None):
self.name = name
self.enabled = True
self.arg = arg
self._task = self.process(self.arg)
next(self._task)
def service(self):
"""
        Request servicing from the task.
        Servicing is skipped if it is disabled through the "enabled"
        attribute. While the task is executing, the service is disabled to
        avoid recursion.
"""
if self.enabled:
enable = self.enabled
try:
# Prevent recursion
self.enabled = False
next(self._task)
finally:
self.enabled = enable
def process(self, arg):
"""
        Override this to implement the service task.
This must be a generator.
Use `yield` to return control.
"""
raise NotImplementedError
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
self.service()
def after(self, fn):
"""
A decorator for a function. Service is triggered on return.
"""
@functools.wraps(fn)
def wrap(*args, **kws):
with self:
return fn(*args, **kws)
return wrap
# -----------------------------------------------------------------------------
# The rest are for testing
class HelloService(Service):
def process(self, arg):
count = 0
yield
while True:
print("Hello", count)
count += 1
yield
def test():
serv = HelloService("my.hello")
print("1")
serv.service()
print("2")
serv.service()
with serv:
print("3")
@serv.after
def nested():
print("4")
nested()
if __name__ == '__main__':
test()
"""
Implements:
- Threadlocal stack
"""
import threading
class TLStack(object):
def __init__(self):
self.local = threading.local()
@property
def stack(self):
try:
# Retrieve thread local stack
return self.local.stack
except AttributeError:
# Initialize stack for the thread
self.local.stack = []
return self.local.stack
def push(self, item):
self.stack.append(item)
def pop(self):
return self.stack.pop()
@property
def top(self):
return self.stack[-1]
@property
def is_empty(self):
return not self.stack
def __bool__(self):
return not self.is_empty
def __nonzero__(self):
return self.__bool__()
def __len__(self):
return len(self.stack)
def clear(self):
self.__init__()
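# Example (sketch): each thread observes its own independent stack, so
# pushes in one thread never leak into another:
#
#     tls = TLStack()
#     tls.push('ctx')
#     assert tls.top == 'ctx' and len(tls) == 1
#     tls.pop()
#     assert tls.is_empty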
from numba.core import types, typing, ir
_stub_error = NotImplementedError("This is a stub.")
def get_global_id(*args, **kargs):
"""
OpenCL get_global_id()
"""
raise _stub_error
def get_local_id(*args, **kargs):
"""
OpenCL get_local_id()
"""
raise _stub_error
def get_global_size(*args, **kargs):
"""
OpenCL get_global_size()
"""
raise _stub_error
def get_local_size(*args, **kargs):
"""
OpenCL get_local_size()
"""
raise _stub_error
def get_group_id(*args, **kargs):
"""
OpenCL get_group_id()
"""
raise _stub_error
def get_num_groups(*args, **kargs):
"""
OpenCL get_num_groups()
"""
raise _stub_error
def get_work_dim(*args, **kargs):
"""
OpenCL get_work_dim()
"""
raise _stub_error
def barrier(*args, **kargs):
"""
OpenCL barrier()
Example:
# workgroup barrier + local memory fence
hsa.barrier(hsa.CLK_LOCAL_MEM_FENCE)
# workgroup barrier + global memory fence
hsa.barrier(hsa.CLK_GLOBAL_MEM_FENCE)
# workgroup barrier + global memory fence
hsa.barrier()
"""
raise _stub_error
def mem_fence(*args, **kargs):
"""
OpenCL mem_fence()
Example:
# local memory fence
hsa.mem_fence(hsa.CLK_LOCAL_MEM_FENCE)
# global memory fence
hsa.mem_fence(hsa.CLK_GLOBAL_MEM_FENCE)
"""
raise _stub_error
def wavebarrier():
"""
HSAIL wavebarrier
"""
raise _stub_error
def activelanepermute_wavewidth(src, laneid, identity, useidentity):
"""
HSAIL activelanepermute_wavewidth_*
"""
raise _stub_error
def ds_permute(src_lane, dest_lane):
"""
AMDGCN Data Share intrinsic forwards permute (push semantics)
"""
raise _stub_error
def ds_bpermute(src_lane, dest_lane):
"""
AMDGCN Data Share intrinsic backwards permute (pull semantics)
"""
raise _stub_error
class Stub(object):
"""A stub object to represent special objects which is meaningless
outside the context of HSA-python.
"""
    _description_ = '<hsa special value>'
__slots__ = () # don't allocate __dict__
def __new__(cls):
raise NotImplementedError("%s is not instantiable" % cls)
def __repr__(self):
return self._description_
class shared(Stub):
"""shared namespace
"""
_description_ = '<shared>'
def array(shape, dtype):
"""shared.array(shape, dtype)
Allocate a shared memory array.
"""
#-------------------------------------------------------------------------------
# atomic
class atomic(Stub):
"""atomic namespace
"""
_description_ = '<atomic>'
class add(Stub):
"""add(ary, idx, val)
Perform atomic ary[idx] += val
"""
import re
from llvmlite.llvmpy import core as lc
from llvmlite import ir as llvmir
from llvmlite import binding as ll
from numba.core import typing, types, utils, datamodel, cgutils
from numba.core.utils import cached_property
from numba.core.base import BaseContext
from numba.core.callconv import MinimalCallConv
from numba.roc import codegen
from .hlc import DATALAYOUT
CC_SPIR_KERNEL = "spir_kernel"
CC_SPIR_FUNC = ""
# -----------------------------------------------------------------------------
# Typing
class HSATypingContext(typing.BaseContext):
def load_additional_registries(self):
from . import hsadecl, mathdecl
self.install_registry(hsadecl.registry)
self.install_registry(mathdecl.registry)
# -----------------------------------------------------------------------------
# Implementation
VALID_CHARS = re.compile(r'[^a-z0-9]', re.I)
# Address spaces
SPIR_GENERIC_ADDRSPACE = 0
SPIR_GLOBAL_ADDRSPACE = 1
SPIR_REGION_ADDRSPACE = 2
SPIR_CONSTANT_ADDRSPACE = 4
SPIR_LOCAL_ADDRSPACE = 3
SPIR_PRIVATE_ADDRSPACE = 5
SPIR_CONSTANT_32BIT_ADDRSPACE = 6
SPIR_VERSION = (2, 0)
class GenericPointerModel(datamodel.PrimitiveModel):
def __init__(self, dmm, fe_type):
adrsp = SPIR_GENERIC_ADDRSPACE
be_type = dmm.lookup(fe_type.dtype).get_data_type().as_pointer(adrsp)
super(GenericPointerModel, self).__init__(dmm, fe_type, be_type)
def _init_data_model_manager():
dmm = datamodel.default_manager.copy()
dmm.register(types.CPointer, GenericPointerModel)
return dmm
hsa_data_model_manager = _init_data_model_manager()
class HSATargetContext(BaseContext):
implement_powi_as_math_call = True
generic_addrspace = SPIR_GENERIC_ADDRSPACE
def init(self):
self._internal_codegen = codegen.JITHSACodegen("numba.hsa.jit")
self._target_data = \
ll.create_target_data(DATALAYOUT[utils.MACHINE_BITS])
# Override data model manager
self.data_model_manager = hsa_data_model_manager
def load_additional_registries(self):
from . import hsaimpl, mathimpl
self.insert_func_defn(hsaimpl.registry.functions)
self.insert_func_defn(mathimpl.registry.functions)
@cached_property
def call_conv(self):
return HSACallConv(self)
def codegen(self):
return self._internal_codegen
@property
def target_data(self):
return self._target_data
def mangler(self, name, argtypes):
def repl(m):
ch = m.group(0)
return "_%X_" % ord(ch)
qualified = name + '.' + '.'.join(str(a) for a in argtypes)
mangled = VALID_CHARS.sub(repl, qualified)
return 'hsapy_devfn_' + mangled
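    # Example: for name 'foo.bar' and argtypes (int64,), ``qualified`` is
    # 'foo.bar.int64'; each character outside [a-zA-Z0-9] (here the dots,
    # 0x2E) becomes '_2E_', giving 'hsapy_devfn_foo_2E_bar_2E_int64'.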
def prepare_hsa_kernel(self, func, argtypes):
module = func.module
func.linkage = 'linkonce_odr'
module.data_layout = DATALAYOUT[self.address_size]
wrapper = self.generate_kernel_wrapper(func, argtypes)
return wrapper
def mark_hsa_device(self, func):
# Adapt to SPIR
# module = func.module
func.calling_convention = CC_SPIR_FUNC
func.linkage = 'linkonce_odr'
return func
def generate_kernel_wrapper(self, func, argtypes):
module = func.module
arginfo = self.get_arg_packer(argtypes)
def sub_gen_with_global(lty):
if isinstance(lty, llvmir.PointerType):
return (lty.pointee.as_pointer(SPIR_GLOBAL_ADDRSPACE),
lty.addrspace)
return lty, None
if len(arginfo.argument_types) > 0:
llargtys, changed = zip(*map(sub_gen_with_global,
arginfo.argument_types))
else:
llargtys = changed = ()
wrapperfnty = lc.Type.function(lc.Type.void(), llargtys)
wrapper_module = self.create_module("hsa.kernel.wrapper")
wrappername = 'hsaPy_{name}'.format(name=func.name)
argtys = list(arginfo.argument_types)
fnty = lc.Type.function(lc.Type.int(),
[self.call_conv.get_return_type(
types.pyobject)] + argtys)
func = wrapper_module.add_function(fnty, name=func.name)
func.calling_convention = CC_SPIR_FUNC
wrapper = wrapper_module.add_function(wrapperfnty, name=wrappername)
builder = lc.Builder(wrapper.append_basic_block(''))
# Adjust address space of each kernel argument
fixed_args = []
for av, adrsp in zip(wrapper.args, changed):
if adrsp is not None:
casted = self.addrspacecast(builder, av, adrsp)
fixed_args.append(casted)
else:
fixed_args.append(av)
callargs = arginfo.from_arguments(builder, fixed_args)
# XXX handle error status
status, _ = self.call_conv.call_function(builder, func, types.void,
argtypes, callargs)
builder.ret_void()
set_hsa_kernel(wrapper)
# Link
module.link_in(ll.parse_assembly(str(wrapper_module)))
        # Enable inlining, which is essential because addrspacecast 1->0 is
        # illegal; inlining will optimize the addrspacecast out.
func.linkage = 'internal'
wrapper = module.get_function(wrapper.name)
module.get_function(func.name).linkage = 'internal'
return wrapper
def declare_function(self, module, fndesc):
ret = super(HSATargetContext, self).declare_function(module, fndesc)
# XXX: Refactor fndesc instead of this special case
if fndesc.llvm_func_name.startswith('hsapy_devfn'):
ret.calling_convention = CC_SPIR_FUNC
return ret
def make_constant_array(self, builder, typ, ary):
"""
Return dummy value.
"""
#
# a = self.make_array(typ)(self, builder)
# return a._getvalue()
raise NotImplementedError
def addrspacecast(self, builder, src, addrspace):
"""
Handle addrspacecast
"""
ptras = llvmir.PointerType(src.type.pointee, addrspace=addrspace)
return builder.addrspacecast(src, ptras)
def set_hsa_kernel(fn):
"""
Ensure `fn` is usable as a SPIR kernel.
- Fix calling convention
- Add metadata
"""
mod = fn.module
# Set nounwind
# fn.add_attribute(lc.ATTR_NO_UNWIND)
# Set SPIR kernel calling convention
fn.calling_convention = CC_SPIR_KERNEL
# Mark kernels
ocl_kernels = mod.get_or_insert_named_metadata("opencl.kernels")
ocl_kernels.add(lc.MetaData.get(mod, [fn,
gen_arg_addrspace_md(fn),
gen_arg_access_qual_md(fn),
gen_arg_type(fn),
gen_arg_type_qual(fn),
gen_arg_base_type(fn)]))
# SPIR version 2.0
make_constant = lambda x: lc.Constant.int(lc.Type.int(), x)
spir_version_constant = [make_constant(x) for x in SPIR_VERSION]
spir_version = mod.get_or_insert_named_metadata("opencl.spir.version")
if not spir_version.operands:
spir_version.add(lc.MetaData.get(mod, spir_version_constant))
ocl_version = mod.get_or_insert_named_metadata("opencl.ocl.version")
if not ocl_version.operands:
ocl_version.add(lc.MetaData.get(mod, spir_version_constant))
## The following metadata does not seem to be necessary
# Other metadata
# empty_md = lc.MetaData.get(mod, ())
# others = ["opencl.used.extensions",
# "opencl.used.optional.core.features",
# "opencl.compiler.options"]cat
#
# for name in others:
# nmd = mod.get_or_insert_named_metadata(name)
# if not nmd.operands:
# nmd.add(empty_md)
def gen_arg_addrspace_md(fn):
"""
Generate kernel_arg_addr_space metadata
"""
mod = fn.module
fnty = fn.type.pointee
codes = []
for a in fnty.args:
if cgutils.is_pointer(a):
codes.append(SPIR_GLOBAL_ADDRSPACE)
else:
codes.append(SPIR_PRIVATE_ADDRSPACE)
consts = [lc.Constant.int(lc.Type.int(), x) for x in codes]
name = lc.MetaDataString.get(mod, "kernel_arg_addr_space")
return lc.MetaData.get(mod, [name] + consts)
def gen_arg_access_qual_md(fn):
"""
Generate kernel_arg_access_qual metadata
"""
mod = fn.module
consts = [lc.MetaDataString.get(mod, "none")] * len(fn.args)
name = lc.MetaDataString.get(mod, "kernel_arg_access_qual")
return lc.MetaData.get(mod, [name] + consts)
def gen_arg_type(fn):
"""
Generate kernel_arg_type metadata
"""
mod = fn.module
fnty = fn.type.pointee
consts = [lc.MetaDataString.get(mod, str(a)) for a in fnty.args]
name = lc.MetaDataString.get(mod, "kernel_arg_type")
return lc.MetaData.get(mod, [name] + consts)
def gen_arg_type_qual(fn):
"""
Generate kernel_arg_type_qual metadata
"""
mod = fn.module
fnty = fn.type.pointee
consts = [lc.MetaDataString.get(mod, "") for _ in fnty.args]
name = lc.MetaDataString.get(mod, "kernel_arg_type_qual")
return lc.MetaData.get(mod, [name] + consts)
def gen_arg_base_type(fn):
"""
Generate kernel_arg_base_type metadata
"""
mod = fn.module
fnty = fn.type.pointee
consts = [lc.MetaDataString.get(mod, str(a)) for a in fnty.args]
name = lc.MetaDataString.get(mod, "kernel_arg_base_type")
return lc.MetaData.get(mod, [name] + consts)
class HSACallConv(MinimalCallConv):
def call_function(self, builder, callee, resty, argtys, args, env=None):
"""
Call the Numba-compiled *callee*.
"""
assert env is None
retty = callee.args[0].type.pointee
retvaltmp = cgutils.alloca_once(builder, retty)
# initialize return value
builder.store(cgutils.get_null_value(retty), retvaltmp)
arginfo = self.context.get_arg_packer(argtys)
args = arginfo.as_arguments(builder, args)
realargs = [retvaltmp] + list(args)
code = builder.call(callee, realargs)
status = self._get_return_status(builder, code)
retval = builder.load(retvaltmp)
out = self.context.get_returned_value(builder, resty, retval)
return status, out
from numba.testing import SerialSuite
from numba.testing import load_testsuite
from numba import roc
from os.path import dirname, join
def load_tests(loader, tests, pattern):
suite = SerialSuite()
this_dir = dirname(__file__)
if roc.is_available():
suite.addTests(load_testsuite(loader, join(this_dir, 'hsadrv')))
suite.addTests(load_testsuite(loader, join(this_dir, 'hsapy')))
else:
print("skipped HSA tests")
return suite
from numba.testing import SerialSuite
from numba.testing import load_testsuite
import os
def load_tests(loader, tests, pattern):
return SerialSuite(load_testsuite(loader, os.path.dirname(__file__)))
import numpy as np
from numba import roc
import unittest
from numba.roc.hsadrv.driver import dgpu_present
@unittest.skipUnless(dgpu_present, 'test only on dGPU system')
class TestAsync(unittest.TestCase):
def test_coarsegrain_array(self):
arr = roc.coarsegrain_array(shape=1024, dtype=np.float32)
self.assertEqual(arr.size, 1024)
arr[:] = expect = np.arange(arr.size)
np.testing.assert_allclose(arr, expect)
def test_async_copy_to_device(self):
arr = np.arange(1024)
devarr = roc.to_device(arr)
# allocate pinned array equivalent
hostarr = roc.coarsegrain_array(shape=arr.shape, dtype=arr.dtype)
hostarr[:] = arr + 100
stream = roc.stream()
ct = len(stream._signals)
devarr.copy_to_device(hostarr, stream=stream)
self.assertEqual(ct + 1, len(stream._signals),
"no new async signal")
# implicit synchronization
got = devarr.copy_to_host()
self.assertEqual(0, len(stream._signals),
"missing implicit synchronization")
np.testing.assert_equal(hostarr, got)
def test_async_copy_to_device_and_back(self):
arr = np.arange(1024)
hostarr = roc.coarsegrain_array(shape=arr.shape, dtype=arr.dtype)
gotarr = roc.coarsegrain_array(shape=arr.shape, dtype=arr.dtype)
stream = roc.stream()
ct = len(stream._signals)
devarr = roc.to_device(hostarr, stream=stream)
self.assertEqual(ct + 1, len(stream._signals))
devarr.copy_to_host(gotarr, stream=stream)
self.assertEqual(ct + 2, len(stream._signals))
stream.synchronize()
self.assertEqual(0, len(stream._signals))
np.testing.assert_equal(hostarr, gotarr)
if __name__ == '__main__':
unittest.main()
import ctypes
import os
import threading
import numpy as np
import unittest
from numba.roc.hsadrv.driver import hsa, Queue, Program, Executable,\
BrigModule, Context, dgpu_present
from numba.roc.hsadrv.driver import hsa as roc
import numba.roc.api as hsaapi
from numba import float32, float64, vectorize
from numba.roc.hsadrv import drvapi
from numba.roc.hsadrv import enums
from numba.roc.hsadrv import enums_ext
from numba.core import config
try:
import queue
except ImportError:
import Queue as queue
class TestLowLevelApi(unittest.TestCase):
"""This test checks that all the functions defined in drvapi
bind properly using ctypes."""
def test_functions_available(self):
missing_functions = []
for fname in drvapi.API_PROTOTYPES.keys():
try:
getattr(hsa, fname)
except Exception as e:
missing_functions.append("'{0}': {1}".format(fname, str(e)))
self.assertEqual(len(missing_functions), 0,
msg='\n'.join(missing_functions))
class TestAgents(unittest.TestCase):
def test_agents_init(self):
self.assertGreater(len(roc.agents), 0)
def test_agents_create_queue_single(self):
for agent in roc.agents:
if agent.is_component:
queue = agent.create_queue_single(2 ** 5)
self.assertIsInstance(queue, Queue)
def test_agents_create_queue_multi(self):
for agent in roc.agents:
if agent.is_component:
queue = agent.create_queue_multi(2 ** 5)
self.assertIsInstance(queue, Queue)
def test_agent_wavebits(self):
for agent in roc.agents:
if agent.is_component:
if agent.name.decode() in ['gfx803', 'gfx900']:
self.assertEqual(agent.wavebits, 6)
class _TestBase(unittest.TestCase):
def setUp(self):
self.gpu = [a for a in roc.agents if a.is_component][0]
self.cpu = [a for a in roc.agents if not a.is_component][0]
self.queue = self.gpu.create_queue_multi(self.gpu.queue_max_size)
def tearDown(self):
del self.queue
del self.gpu
del self.cpu
def get_brig_file():
    path = '/opt/rocm/hsa/sample/vector_copy_full.brig'
assert os.path.isfile(path)
return path
def _check_example_file():
try:
get_brig_file()
except Exception:
return False
return True
has_brig_example = _check_example_file()
@unittest.skipUnless(has_brig_example, "Brig example not found")
class TestBrigModule(unittest.TestCase):
def test_from_file(self):
brig_file = get_brig_file()
brig_module = BrigModule.from_file(brig_file)
self.assertGreater(len(brig_module), 0)
@unittest.skipUnless(has_brig_example, "Brig example not found")
class TestProgram(_TestBase):
def test_create_program(self):
brig_file = get_brig_file()
symbol = '&__vector_copy_kernel'
brig_module = BrigModule.from_file(brig_file)
program = Program()
program.add_module(brig_module)
code = program.finalize(self.gpu.isa)
ex = Executable()
ex.load(self.gpu, code)
ex.freeze()
sym = ex.get_symbol(self.gpu, symbol)
self.assertGreater(sym.kernarg_segment_size, 0)
class TestMemory(_TestBase):
def test_region_list(self):
self.assertGreater(len(self.gpu.regions.globals), 0)
self.assertGreater(len(self.gpu.regions.groups), 0)
        # The following may be empty
# print(self.gpu.regions.privates)
# print(self.gpu.regions.readonlys)
def test_register(self):
src = np.random.random(1024).astype(np.float32)
roc.hsa_memory_register(src.ctypes.data, src.nbytes)
roc.hsa_memory_deregister(src.ctypes.data, src.nbytes)
def test_allocate(self):
regions = self.gpu.regions
# More than one region
self.assertGreater(len(regions), 0)
# Find kernel argument regions
kernarg_regions = list()
for r in regions:
if r.supports(enums.HSA_REGION_GLOBAL_FLAG_KERNARG):
kernarg_regions.append(r)
self.assertGreater(len(kernarg_regions), 0)
# Test allocating at the kernel argument region
kernarg_region = kernarg_regions[0]
nelem = 10
ptr = kernarg_region.allocate(ctypes.sizeof(ctypes.c_float) * nelem)
self.assertNotEqual(ctypes.addressof(ptr), 0,
"pointer must not be NULL")
# Test writing to it
src = np.random.random(nelem).astype(np.float32)
ctypes.memmove(ptr, src.ctypes.data, src.nbytes)
ref = (ctypes.c_float * nelem).from_address(ptr.value)
for i in range(src.size):
self.assertEqual(ref[i], src[i])
roc.hsa_memory_free(ptr)
@unittest.skipUnless(dgpu_present, "dGPU only")
def test_coarse_grained_allocate(self):
"""
Tests the coarse grained allocation works on a dGPU.
It performs a data copying round trip via:
memory
|
HSA cpu memory
|
HSA dGPU host accessible memory <---|
| |
HSA dGPU memory --------------------|
"""
gpu_regions = self.gpu.regions
gpu_only_coarse_regions = list()
gpu_host_accessible_coarse_regions = list()
for r in gpu_regions:
if r.supports(enums.HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED):
if r.host_accessible:
gpu_host_accessible_coarse_regions.append(r)
else:
gpu_only_coarse_regions.append(r)
# check we have 1+ coarse gpu region(s) of each type
self.assertGreater(len(gpu_only_coarse_regions), 0)
self.assertGreater(len(gpu_host_accessible_coarse_regions), 0)
cpu_regions = self.cpu.regions
cpu_coarse_regions = list()
for r in cpu_regions:
if r.supports(enums.HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED):
cpu_coarse_regions.append(r)
# check we have 1+ coarse cpu region(s)
self.assertGreater(len(cpu_coarse_regions), 0)
# ten elements of data used
nelem = 10
# allocation
cpu_region = cpu_coarse_regions[0]
cpu_ptr = cpu_region.allocate(ctypes.sizeof(ctypes.c_float) * nelem)
self.assertNotEqual(ctypes.addressof(cpu_ptr), 0,
"pointer must not be NULL")
gpu_only_region = gpu_only_coarse_regions[0]
gpu_only_ptr = gpu_only_region.allocate(ctypes.sizeof(ctypes.c_float) *
nelem)
self.assertNotEqual(ctypes.addressof(gpu_only_ptr), 0,
"pointer must not be NULL")
gpu_host_accessible_region = gpu_host_accessible_coarse_regions[0]
gpu_host_accessible_ptr = gpu_host_accessible_region.allocate(
ctypes.sizeof(ctypes.c_float) * nelem)
self.assertNotEqual(ctypes.addressof(gpu_host_accessible_ptr), 0,
"pointer must not be NULL")
# Test writing to allocated area
src = np.random.random(nelem).astype(np.float32)
roc.hsa_memory_copy(cpu_ptr, src.ctypes.data, src.nbytes)
roc.hsa_memory_copy(gpu_host_accessible_ptr, cpu_ptr, src.nbytes)
roc.hsa_memory_copy(gpu_only_ptr, gpu_host_accessible_ptr, src.nbytes)
# check write is correct
cpu_ref = (ctypes.c_float * nelem).from_address(cpu_ptr.value)
for i in range(src.size):
self.assertEqual(cpu_ref[i], src[i])
        gpu_ha_ref = (ctypes.c_float * nelem).from_address(
            gpu_host_accessible_ptr.value)
for i in range(src.size):
self.assertEqual(gpu_ha_ref[i], src[i])
# zero out host accessible GPU memory and CPU memory
z0 = np.zeros(nelem).astype(np.float32)
roc.hsa_memory_copy(cpu_ptr, z0.ctypes.data, z0.nbytes)
roc.hsa_memory_copy(gpu_host_accessible_ptr, cpu_ptr, z0.nbytes)
# check zeroing is correct
for i in range(z0.size):
self.assertEqual(cpu_ref[i], z0[i])
for i in range(z0.size):
self.assertEqual(gpu_ha_ref[i], z0[i])
# copy back the data from the GPU
roc.hsa_memory_copy(gpu_host_accessible_ptr, gpu_only_ptr, src.nbytes)
# check the copy back is ok
for i in range(src.size):
self.assertEqual(gpu_ha_ref[i], src[i])
# free
roc.hsa_memory_free(cpu_ptr)
roc.hsa_memory_free(gpu_only_ptr)
roc.hsa_memory_free(gpu_host_accessible_ptr)
@unittest.skipUnless(has_brig_example, "Brig example not found")
@unittest.skipUnless(dgpu_present, "dGPU only")
@unittest.skip("Permanently skip? HSA spec violation causes corruption")
def test_coarse_grained_kernel_execution(self):
"""
This tests the execution of a kernel on a dGPU using coarse memory
regions for the buffers.
        NOTE: this code violates the HSA spec in that it uses a coarse region
        for kernargs; this is a performance hack.
"""
        from numba.roc.hsadrv.driver import (BrigModule, Program, hsa,
                                             Executable)
# get a brig file
brig_file = get_brig_file()
brig_module = BrigModule.from_file(brig_file)
self.assertGreater(len(brig_module), 0)
# use existing GPU regions for computation space
gpu_regions = self.gpu.regions
gpu_only_coarse_regions = list()
gpu_host_accessible_coarse_regions = list()
for r in gpu_regions:
if r.supports(enums.HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED):
if r.host_accessible:
gpu_host_accessible_coarse_regions.append(r)
else:
gpu_only_coarse_regions.append(r)
# check we have 1+ coarse gpu region(s) of each type
self.assertGreater(len(gpu_only_coarse_regions), 0)
self.assertGreater(len(gpu_host_accessible_coarse_regions), 0)
# Compilation phase:
        # FIXME: this is dubious; it assumes the launching agent is indexed
        # first
agent = roc.components[0]
prog = Program()
prog.add_module(brig_module)
# get kernel and load
code = prog.finalize(agent.isa)
ex = Executable()
ex.load(agent, code)
ex.freeze()
# extract symbols
sym = ex.get_symbol(agent, "&__vector_copy_kernel")
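        # kernel_object is the code handle written into the dispatch packet;
        # kernarg_segment_size is the argument buffer size the kernel expects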
self.assertNotEqual(sym.kernel_object, 0)
self.assertGreater(sym.kernarg_segment_size, 0)
# attempt kernel execution
# Do memory allocations
# allocate and initialise memory
nelem = 1024 * 1024
src = np.random.random(nelem).astype(np.float32)
z0 = np.zeros_like(src)
# alloc host accessible memory
nbytes = ctypes.sizeof(ctypes.c_float) * nelem
gpu_host_accessible_region = gpu_host_accessible_coarse_regions[0]
host_in_ptr = gpu_host_accessible_region.allocate(nbytes)
self.assertNotEqual(host_in_ptr.value, None,
"pointer must not be NULL")
host_out_ptr = gpu_host_accessible_region.allocate(nbytes)
self.assertNotEqual(host_out_ptr.value, None,
"pointer must not be NULL")
# init mem with data
roc.hsa_memory_copy(host_in_ptr, src.ctypes.data, src.nbytes)
roc.hsa_memory_copy(host_out_ptr, z0.ctypes.data, z0.nbytes)
# alloc gpu only memory
gpu_only_region = gpu_only_coarse_regions[0]
gpu_in_ptr = gpu_only_region.allocate(nbytes)
self.assertNotEqual(gpu_in_ptr.value, None, "pointer must not be NULL")
gpu_out_ptr = gpu_only_region.allocate(nbytes)
self.assertNotEqual(gpu_out_ptr.value, None,
"pointer must not be NULL")
# copy memory from host accessible location to gpu only
roc.hsa_memory_copy(gpu_in_ptr, host_in_ptr, src.nbytes)
# Do kernargs
# Find a coarse region (for better performance on dGPU) in which
# to place kernargs. NOTE: This violates the HSA spec
kernarg_regions = list()
for r in gpu_host_accessible_coarse_regions:
# NOTE: VIOLATION
if r.supports(enums.HSA_REGION_GLOBAL_FLAG_KERNARG):
kernarg_regions.append(r)
self.assertGreater(len(kernarg_regions), 0)
# use first region for args
kernarg_region = kernarg_regions[0]
kernarg_ptr = kernarg_region.allocate(
2 * ctypes.sizeof(ctypes.c_void_p))
        self.assertNotEqual(kernarg_ptr.value, None,
                            "pointer must not be NULL")
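        # the vector copy kernel takes two pointer arguments (src, dst),
        # hence two pointer-sized slots in the kernarg buffer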
# wire in gpu memory
argref = (2 * ctypes.c_size_t).from_address(kernarg_ptr.value)
argref[0] = gpu_in_ptr.value
argref[1] = gpu_out_ptr.value
        # signal (created here but unused; see the dispatch call below)
        sig = roc.create_signal(1)
# create queue and dispatch job
queue = agent.create_queue_single(32)
        queue.dispatch(sym, kernarg_ptr, workgroup_size=(256, 1, 1),
                       grid_size=(nelem, 1, 1), signal=None)
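        # with signal=None, dispatch() appears to run synchronously: the
        # driver creates its own completion signal and waits on it before
        # returning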
# copy result back to host accessible memory to check
roc.hsa_memory_copy(host_out_ptr, gpu_out_ptr, src.nbytes)
# check the data is recovered
ref = (nelem * ctypes.c_float).from_address(host_out_ptr.value)
np.testing.assert_equal(ref, src)
# free
roc.hsa_memory_free(host_in_ptr)
roc.hsa_memory_free(host_out_ptr)
roc.hsa_memory_free(gpu_in_ptr)
roc.hsa_memory_free(gpu_out_ptr)
class TestContext(_TestBase):
"""Tests the Context class behaviour is correct."""
def test_memalloc(self):
"""
        Tests Context.memalloc() for a given, in the parlance of HSA,
`component`. Testing includes specialisations for the supported
components of dGPUs and APUs.
"""
n = 10 # things to alloc
nbytes = ctypes.sizeof(ctypes.c_double) * n
# run if a dGPU is present
if dgpu_present:
# find a host accessible region
dGPU_agent = self.gpu
CPU_agent = self.cpu
gpu_ctx = Context(dGPU_agent)
gpu_only_mem = gpu_ctx.memalloc(nbytes, hostAccessible=False)
ha_mem = gpu_ctx.memalloc(nbytes, hostAccessible=True)
# on dGPU systems, all host mem is host accessible
cpu_ctx = Context(CPU_agent)
cpu_mem = cpu_ctx.memalloc(nbytes, hostAccessible=True)
# Test writing to allocated area
src = np.random.random(n).astype(np.float64)
roc.hsa_memory_copy(cpu_mem.device_pointer, src.ctypes.data, src.nbytes)
roc.hsa_memory_copy(ha_mem.device_pointer, cpu_mem.device_pointer, src.nbytes)
roc.hsa_memory_copy(gpu_only_mem.device_pointer, ha_mem.device_pointer, src.nbytes)
# clear
z0 = np.zeros_like(src)
roc.hsa_memory_copy(ha_mem.device_pointer, z0.ctypes.data, z0.nbytes)
ref = (n * ctypes.c_double).from_address(ha_mem.device_pointer.value)
for k in range(n):
self.assertEqual(ref[k], 0)
# copy back from dGPU
roc.hsa_memory_copy(ha_mem.device_pointer, gpu_only_mem.device_pointer, src.nbytes)
for k in range(n):
self.assertEqual(ref[k], src[k])
        else:  # TODO: write APU variant
            pass
def check_mempools(self, agent, has_fine_grain=True):
# get allocation-allowed pools
mp_alloc_list = [mp for mp in agent.mempools if mp.alloc_allowed]
mpdct = {'global': [], 'readonly': [], 'private': [], 'group': []}
for mp in mp_alloc_list:
mpdct[mp.kind].append(mp)
# only globals are allocation-allowed
if has_fine_grain:
self.assertEqual(len(mpdct['global']), 2)
else:
self.assertEqual(len(mpdct['global']), 1)
self.assertEqual(len(mpdct['readonly']), 0)
self.assertEqual(len(mpdct['private']), 0)
self.assertEqual(len(mpdct['group']), 0)
self.assertEqual(len(agent.mempools.globals), len(mpdct['global']))
# the global-pools are coarse-grain and fine-grain pools
glbs = mpdct['global']
coarsegrain = None
finegrain = None
for gmp in glbs:
if gmp.supports(enums_ext.HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED):
coarsegrain = gmp
if gmp.supports(enums_ext.HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED):
finegrain = gmp
self.assertIsNotNone(coarsegrain)
if has_fine_grain:
self.assertIsNotNone(finegrain)
else:
self.assertIsNone(finegrain)
self.assertIsNot(coarsegrain, finegrain)
def test_cpu_mempool_property(self):
self.check_mempools(self.cpu)
@unittest.skipUnless(dgpu_present, "dGPU only")
def test_gpu_mempool_property(self):
self.check_mempools(self.gpu, has_fine_grain=False)
@unittest.skipUnless(dgpu_present, "dGPU only")
def test_mempool(self):
n = 10 # things to alloc
nbytes = ctypes.sizeof(ctypes.c_double) * n
dGPU_agent = self.gpu
CPU_agent = self.cpu
# allocate a GPU memory pool
gpu_ctx = Context(dGPU_agent)
gpu_only_mem = gpu_ctx.mempoolalloc(nbytes)
# allocate a CPU memory pool, allow the GPU access to it
cpu_ctx = Context(CPU_agent)
cpu_mem = cpu_ctx.mempoolalloc(nbytes, allow_access_to=[gpu_ctx.agent])
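        # allow_access_to grants the listed agents access to the allocation
        # (presumably via hsa_amd_agents_allow_access)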
        # Test writing to allocated area
src = np.random.random(n).astype(np.float64)
roc.hsa_memory_copy(cpu_mem.device_pointer, src.ctypes.data, src.nbytes)
roc.hsa_memory_copy(gpu_only_mem.device_pointer, cpu_mem.device_pointer, src.nbytes)
# clear
z0 = np.zeros_like(src)
roc.hsa_memory_copy(cpu_mem.device_pointer, z0.ctypes.data, z0.nbytes)
ref = (n * ctypes.c_double).from_address(cpu_mem.device_pointer.value)
for k in range(n):
self.assertEqual(ref[k], 0)
# copy back from dGPU
roc.hsa_memory_copy(cpu_mem.device_pointer, gpu_only_mem.device_pointer, src.nbytes)
for k in range(n):
self.assertEqual(ref[k], src[k])
def check_mempool_with_flags(self, finegrain):
dGPU_agent = self.gpu
gpu_ctx = Context(dGPU_agent)
CPU_agent = self.cpu
cpu_ctx = Context(CPU_agent)
        # get a mempool allocation with the requested flags
        cpu_ctx.mempoolalloc(1024, allow_access_to=[gpu_ctx._agent],
                             finegrain=finegrain)
@unittest.skipUnless(dgpu_present, 'dGPU only')
def test_mempool_finegrained(self):
self.check_mempool_with_flags(finegrain=True)
@unittest.skipUnless(dgpu_present, 'dGPU only')
def test_mempool_coarsegrained(self):
self.check_mempool_with_flags(finegrain=False)
@unittest.skipUnless(dgpu_present, 'dGPU only')
def test_mempool_amd_example(self):
dGPU_agent = self.gpu
gpu_ctx = Context(dGPU_agent)
CPU_agent = self.cpu
cpu_ctx = Context(CPU_agent)
kNumInt = 1024
kSize = kNumInt * ctypes.sizeof(ctypes.c_int)
dependent_signal = roc.create_signal(0)
completion_signal = roc.create_signal(0)
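        # hsa_amd_memory_async_copy semantics: a copy starts only once all of
        # its dependent signals read 0, and the runtime decrements the copy's
        # completion signal when the transfer finishes; both signals are
        # stored to 1 before their copies are enqueued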
        # allocate host src and dst, allow GPU access
flags = dict(allow_access_to=[gpu_ctx.agent], finegrain=False)
host_src = cpu_ctx.mempoolalloc(kSize, **flags)
host_dst = cpu_ctx.mempoolalloc(kSize, **flags)
        # the AMD sample this test is based on loops `i` over the available
        # GPUs; here a single iteration is used
        i = 0
# get gpu local pool
local_memory = gpu_ctx.mempoolalloc(kSize)
host_src_view = (kNumInt * ctypes.c_int).from_address(host_src.device_pointer.value)
host_dst_view = (kNumInt * ctypes.c_int).from_address(host_dst.device_pointer.value)
host_src_view[:] = i + 2016 + np.arange(0, kNumInt, dtype=np.int32)
host_dst_view[:] = np.zeros(kNumInt, dtype=np.int32)
# print("GPU: %s"%gpu_ctx._agent.name)
# print("CPU: %s"%cpu_ctx._agent.name)
        roc.hsa_signal_store_relaxed(completion_signal, 1)
q = queue.Queue()
class validatorThread(threading.Thread):
def run(self):
val = roc.hsa_signal_wait_acquire(
completion_signal,
enums.HSA_SIGNAL_CONDITION_EQ,
0,
ctypes.c_uint64(-1),
enums.HSA_WAIT_STATE_ACTIVE)
q.put(val) # wait_res
        # this could instead be a method call on the signal object itself:
        # dependent_signal.store_relaxed(1)
        roc.hsa_signal_store_relaxed(dependent_signal, 1)
h2l_start = threading.Semaphore(value=0)
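        # the semaphore ensures the l2h thread enqueues its dependent copy
        # before the h2l thread submits the copy that satisfies the dependency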
class l2hThread(threading.Thread):
def run(self):
dep_signal = drvapi.hsa_signal_t(dependent_signal._id)
roc.hsa_amd_memory_async_copy(host_dst.device_pointer.value,
cpu_ctx._agent._id,
local_memory.device_pointer.value,
gpu_ctx._agent._id, kSize, 1,
ctypes.byref(dep_signal),
completion_signal)
h2l_start.release() # signal h2l to start
class h2lThread(threading.Thread):
def run(self):
h2l_start.acquire() # to wait until l2h thread has started
roc.hsa_amd_memory_async_copy(local_memory.device_pointer.value,
gpu_ctx._agent._id,
host_src.device_pointer.value,
cpu_ctx._agent._id, kSize, 0,
None,
dependent_signal)
        timeout = 10  # seconds
        # init thread instances
validator = validatorThread()
l2h = l2hThread()
h2l = h2lThread()
# run them
validator.start()
l2h.start()
h2l.start()
# join
l2h.join(timeout)
h2l.join(timeout)
validator.join(timeout)
# verify
wait_res = q.get()
self.assertEqual(wait_res, 0)
np.testing.assert_allclose(host_dst_view, host_src_view)
@unittest.skipUnless(dgpu_present, "dGPU only")
def test_to_device_to_host(self):
"""
Tests .to_device() and .copy_to_host()
"""
n = 10
data = np.zeros(n)
output = np.zeros(n)
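        # a roc-targeted vectorize ufunc; calling it with device arrays
        # should keep the computation on the dGPU without extra host copies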
@vectorize("float64(float64)", target='roc')
def func(x):
return x + 1
        in_device = hsaapi.to_device(data)
        out_device = hsaapi.to_device(output)
        func(in_device, out=out_device)
host_output = out_device.copy_to_host()
np.testing.assert_equal(np.ones(n), host_output)
if __name__ == '__main__':
unittest.main()