Commit 5c70ef66 authored by dugupeiwen's avatar dugupeiwen
Browse files

update roc

parent 1fb0017a
"""
A HSA dGPU backed ND Array is recognized by checking the __hsa_memory__
attribute on the object. If it exists and evaluate to True, it must define
shape, strides, dtype and size attributes similar to a NumPy ndarray.
"""
import warnings
import math
import copy
import weakref
from ctypes import c_void_p
import numpy as np
from numba.roc.hsadrv import driver as _driver
from numba.roc.hsadrv import devices
from numba.core import types
from .error import HsaContextMismatchError
from numba.misc import dummyarray
from numba.np import numpy_support
def is_hsa_ndarray(obj):
    "Check if an object is a HSA ndarray"
    return getattr(obj, '__hsa_ndarray__', False)


def verify_hsa_ndarray_interface(obj):
    "Verify the HSA ndarray interface for an obj"
    require_hsa_ndarray(obj)

    def _check(name, expected_type):
        # The attribute must both exist and carry the expected type.
        if not hasattr(obj, name):
            raise AttributeError(name)
        if not isinstance(getattr(obj, name), expected_type):
            raise AttributeError('%s must be of type %s'
                                 % (name, expected_type))

    _check('shape', tuple)
    _check('strides', tuple)
    _check('dtype', np.dtype)
    _check('size', int)


def require_hsa_ndarray(obj):
    "Raises ValueError if is_hsa_ndarray(obj) evaluates False"
    if is_hsa_ndarray(obj):
        return
    raise ValueError('require an hsa ndarray object')
class DeviceNDArrayBase(object):
    """Base class for an on dGPU NDArray representation cf. numpy.ndarray
    """
    __hsa_memory__ = True   # marks instances as HSA-addressable memory
    __hsa_ndarray__ = True  # There must be dgpu_data attribute as a result

    def __init__(self, shape, strides, dtype, dgpu_data=None):
        """
        Args
        ----
        shape
            array shape.
        strides
            array strides.
        dtype
            data type as numpy.dtype.
        dgpu_data
            user provided device memory for the ndarray data buffer

        Raises
        ------
        ValueError
            If ``strides`` does not match ``shape`` in length, or if a
            supplied ``dgpu_data`` lacks a ``_hsa_memsize_`` attribute.
        """
        # Accept bare ints as a 1-D convenience.
        if isinstance(shape, int):
            shape = (shape,)
        if isinstance(strides, int):
            strides = (strides,)
        self.ndim = len(shape)
        if len(strides) != self.ndim:
            raise ValueError('strides not match ndim')
        # Layout bookkeeping helper (contiguity, extents) — holds no storage.
        self._dummy = dummyarray.Array.from_desc(0, shape, strides,
                                                 dtype.itemsize)
        self.shape = tuple(shape)
        self.strides = tuple(strides)
        self.dtype = np.dtype(dtype)
        self.size = int(np.prod(self.shape))
        # prepare dgpu memory
        if self.size > 0:
            if dgpu_data is None:
                # Deferred import avoids a circular import at module load.
                from numba.roc.api import _memory_size_from_info
                self.alloc_size = _memory_size_from_info(self.shape,
                                                         self.strides,
                                                         self.dtype.itemsize)
                # find a coarse region on the dGPU
                dgpu_data = devices.get_context().mempoolalloc(self.alloc_size)
            else:  # we have some preallocated dgpu_memory
                sz = getattr(dgpu_data, '_hsa_memsize_', None)
                if sz is None:
                    # Fixed message: previously read "as no" instead of
                    # "has no".
                    raise ValueError('dgpu_data has no _hsa_memsize_ '
                                     'attribute')
                assert sz >= 0
                self.alloc_size = sz
        else:
            # Zero-sized array: no device allocation is required.
            dgpu_data = None
            self.alloc_size = 0
        self.dgpu_data = dgpu_data

    @property
    def _context(self):
        # Context of the device memory backing this array.
        return self.dgpu_data.context

    @property
    def _numba_type_(self):
        """
        Magic attribute expected by Numba to get the numba type that
        represents this object.
        """
        dtype = numpy_support.from_dtype(self.dtype)
        return types.Array(dtype, self.ndim, 'A')

    @property
    def device_ctypes_pointer(self):
        """Returns the ctypes pointer to the GPU data buffer
        """
        if self.dgpu_data is None:
            # NULL pointer for unallocated (zero-sized) arrays.
            return c_void_p(0)
        else:
            return self.dgpu_data.device_ctypes_pointer

    def copy_to_device(self, ary, stream=None, context=None):
        """Copy `ary` to `self`.

        If `ary` is a HSA memory, perform a device-to-device transfer.
        Otherwise, perform a host-to-device transfer.
        If `stream` is a stream object, an async copy is used.
        """
        if ary.size == 0:
            # Nothing to do
            return

        if context is not None:
            if self.dgpu_data is not None:
                # Guard against copying across mismatched device contexts.
                expect, got = self._context, context
                if expect.unproxy != got.unproxy:
                    raise HsaContextMismatchError(expect=expect, got=got)
        else:
            context = self._context

        # TODO: Worry about multiple dGPUs
        #if _driver.is_device_memory(ary):
        #    sz = min(self.alloc_size, ary.alloc_size)
        #    _driver.device_to_device(self, ary, sz)
        #else:
        #    sz = min(_driver.host_memory_size(ary), self.alloc_size)
        sz = self.alloc_size

        # host_to_dGPU(context, dst, src, size):
        if stream is None:
            # Synchronous path: flush pending work before the blocking copy.
            _driver.hsa.implicit_sync()
            if isinstance(ary, DeviceNDArray):
                _driver.dGPU_to_dGPU(self._context, self, ary, sz)
            else:
                _driver.host_to_dGPU(self._context, self, ary, sz)
        else:
            # Asynchronous path: the copy is enqueued on `stream`.
            if isinstance(ary, DeviceNDArray):
                _driver.async_dGPU_to_dGPU(dst_ctx=self._context,
                                           src_ctx=ary._context,
                                           dst=self, src=ary, size=sz,
                                           stream=stream)
            else:
                _driver.async_host_to_dGPU(dst_ctx=self._context,
                                           src_ctx=devices.get_cpu_context(),
                                           dst=self, src=ary, size=sz,
                                           stream=stream)

    def copy_to_host(self, ary=None, stream=None):
        """Copy ``self`` to ``ary`` or create a new Numpy ndarray
        if ``ary`` is ``None``.

        The transfer is synchronous: the function returns after the copy
        is finished.

        Always returns the host array.

        Example::

            import numpy as np
            from numba import hsa

            arr = np.arange(1000)
            d_arr = hsa.to_device(arr)

            my_kernel[100, 100](d_arr)

            result_array = d_arr.copy_to_host()
        """
        if ary is None:  # destination does not exist
            # Raw byte buffer; it is reinterpreted with the proper dtype
            # and strides after the transfer below.
            hostary = np.empty(shape=self.alloc_size, dtype=np.byte)
        else:  # destination does exist, it's `ary`, check it
            if ary.dtype != self.dtype:
                raise TypeError('incompatible dtype')

            if ary.shape != self.shape:
                # Permit () <-> (1,) scalar-shape mismatches only.
                scalshapes = (), (1,)
                if not (ary.shape in scalshapes and self.shape in scalshapes):
                    raise TypeError('incompatible shape; device %s; host %s' %
                                    (self.shape, ary.shape))
            if ary.strides != self.strides:
                # Matching scalar leniency for strides.
                scalstrides = (), (self.dtype.itemsize,)
                if not (ary.strides in scalstrides and
                        self.strides in scalstrides):
                    raise TypeError('incompatible strides; device %s; host %s'
                                    % (self.strides, ary.strides))
            hostary = ary  # this is supposed to be a ptr for writing

        # a location for the data exists as `hostary`
        assert self.alloc_size >= 0, "Negative memory size"
        context = self._context

        # copy the data from the device to the hostary
        if self.alloc_size != 0:
            sz = self.alloc_size
            if stream is None:
                _driver.hsa.implicit_sync()
                _driver.dGPU_to_host(context, hostary, self, sz)
            else:
                _driver.async_dGPU_to_host(dst_ctx=devices.get_cpu_context(),
                                           src_ctx=self._context,
                                           dst=hostary, src=self,
                                           size=sz, stream=stream)

        # if the location for the data was originally None
        # then create a new ndarray and plumb in the new memory
        if ary is None:
            if self.size == 0:
                hostary = np.ndarray(shape=self.shape, dtype=self.dtype,
                                     buffer=hostary)
            else:
                hostary = np.ndarray(shape=self.shape, dtype=self.dtype,
                                     strides=self.strides, buffer=hostary)
        else:  # else hostary points to ary and now has the right memory
            hostary = ary
        return hostary

    def as_hsa_arg(self):
        """Returns a device memory object that is used as the argument.
        """
        return self.dgpu_data
class DeviceNDArray(DeviceNDArrayBase):
    '''
    An on-dGPU array type
    '''
    def is_f_contiguous(self):
        '''
        Return true if the array is Fortran-contiguous.
        '''
        return self._dummy.is_f_contig

    def is_c_contiguous(self):
        '''
        Return true if the array is C-contiguous.
        '''
        return self._dummy.is_c_contig

    def reshape(self, *newshape, **kws):
        """
        Reshape the array without changing its contents, similarly to
        :meth:`numpy.ndarray.reshape`. Example::

            d_arr = d_arr.reshape(20, 50, order='F')
        """
        if len(newshape) == 1 and isinstance(newshape[0], (tuple, list)):
            newshape = newshape[0]

        cls = type(self)
        if newshape == self.shape:
            # Same shape requested: alias the existing device buffer.
            return cls(shape=self.shape, strides=self.strides,
                       dtype=self.dtype, dgpu_data=self.dgpu_data)

        newarr, extents = self._dummy.reshape(*newshape, **kws)
        if extents != [self._dummy.extent]:
            raise NotImplementedError("operation requires copying")
        return cls(shape=newarr.shape, strides=newarr.strides,
                   dtype=self.dtype, dgpu_data=self.dgpu_data)

    def ravel(self, order='C'):
        '''
        Flatten the array without changing its contents, similar to
        :meth:`numpy.ndarray.ravel`.
        '''
        newarr, extents = self._dummy.ravel(order=order)
        if extents != [self._dummy.extent]:
            raise NotImplementedError("operation requires copying")
        cls = type(self)
        return cls(shape=newarr.shape, strides=newarr.strides,
                   dtype=self.dtype, dgpu_data=self.dgpu_data)
class HostArray(np.ndarray):
    """ndarray subclass flagged as HSA memory (``__hsa_memory__``) so host
    buffers can be passed where device memory objects are expected."""
    __hsa_memory__ = True

    @property
    def device_ctypes_pointer(self):
        # The host buffer address, exposed as a ctypes void pointer.
        pointer = self.ctypes.data_as(c_void_p)
        return pointer
def from_array_like(ary, dgpu_data=None):
    "Create a DeviceNDArray object that is like ary."
    # Promote 0-d (scalar) arrays to 1-d, length-1 arrays.
    src = ary.reshape(1) if ary.ndim == 0 else ary
    return DeviceNDArray(src.shape, src.strides, src.dtype,
                         dgpu_data=dgpu_data)
errmsg_contiguous_buffer = ("Array contains non-contiguous buffer and cannot "
"be transferred as a single memory region. Please "
"ensure contiguous buffer with numpy "
".ascontiguousarray()")
def _single_buffer(ary):
i = np.argmax(ary.strides)
size = ary.strides[i] * ary.shape[i]
return size == ary.nbytes
def sentry_contiguous(ary):
    """Raise ValueError unless *ary* can be transferred as one memory region.

    Contiguous (C or F) arrays pass silently; arrays broadcast along the
    leading axis are checked on their inner view; otherwise the array must
    occupy a single buffer.
    """
    flags = ary.flags
    if flags['C_CONTIGUOUS'] or flags['F_CONTIGUOUS']:
        return
    if ary.strides[0] == 0:
        # Broadcasted, ensure inner contiguous
        return sentry_contiguous(ary[0])
    if _single_buffer(ary):
        return True
    raise ValueError(errmsg_contiguous_buffer)
def auto_device(obj, context, stream=None, copy=True):
    """
    Create a DeviceArray like obj and optionally copy data from
    host to device. If obj already represents device memory, it is returned and
    no copy is made.
    """
    if _driver.is_device_memory(obj):
        # Already resident on the dGPU; hand it back untouched.
        return obj, False

    # Host data: verify it is transferable, then mirror it on the device.
    sentry_contiguous(obj)
    devobj = from_array_like(obj)
    if copy:
        devobj.copy_to_device(obj, stream=stream, context=context)
    return devobj, True
"""
Expose each GPU device directly
"""
import functools
from .driver import hsa as driver, Context as _Context
from numba.roc import servicelib
class _culist(object):
"""A thread local list of GPU instances
"""
def __init__(self):
self._lst = None
@property
def _gpus(self):
if not self._lst:
self._lst = self._init_gpus()
return self._lst
def _init_gpus(self):
gpus = []
for com in driver.components:
gpus.append(CU(com))
return gpus
def __getitem__(self, item):
return self._gpus[item]
def append(self, item):
return self._gpus.append(item)
def __len__(self):
return len(self._gpus)
def __nonzero__(self):
return bool(self._gpus)
def __iter__(self):
return iter(self._gpus)
__bool__ = __nonzero__
def reset(self):
for gpu in self:
gpu.reset()
@property
def current(self):
"""Get the current GPU object associated with the thread
"""
return _custack.top
# Module-level singleton holding the thread-local GPU list; the class is
# deleted immediately so no second instance can be created.
cus = _culist()
del _culist
class CU(object):
    """Proxy for a compute unit (GPU) that lazily owns an HSA context and
    can be used as a context manager to make itself the current device."""

    def __init__(self, cu):
        self._cu = cu
        self._context = None

    def __getattr__(self, key):
        """Redirect to self._gpu
        """
        # Private names are never proxied to the wrapped handle.
        if not key.startswith('_'):
            return getattr(self._cu, key)
        raise AttributeError(key)

    def __repr__(self):
        return repr(self._cu)

    def associate_context(self):
        """Associate the context of this GPU to the running thread
        """
        if self._context is None:
            # No context was created for this GPU yet; make one now.
            self._context = self._cu.create_context()
        return self._context

    def __enter__(self):
        self.associate_context()
        _custack.push(self)

    def __exit__(self, exc_type, exc_val, exc_tb):
        assert _get_device() is self
        self._context.pop()
        _custack.pop()

    def reset(self):
        if not self._context:
            return
        self._context.reset()
        self._context = None
# Lazily created context for the host CPU agent (singleton).
_cpu_context = None


def get_cpu_context():
    """Return the process-wide CPU context, creating it on first use."""
    global _cpu_context
    if _cpu_context is None:
        # The CPU is the agent that is not a component (i.e. not a GPU).
        cpu_agents = [agent for agent in driver.agents
                      if not agent.is_component]
        _cpu_context = _Context(cpu_agents[0])
    return _cpu_context
def get_gpu(i):
    """Return the i-th GPU wrapper from the GPU list."""
    return cus[i]
def get_num_gpus():
    """Return the number of GPUs (HSA components) available."""
    return len(cus)
_custack = servicelib.TLStack()
def _get_device(devnum=0):
    """Return the active device, activating device *devnum* first when no
    device has been pushed on this thread yet."""
    if _custack:
        return _custack.top
    _custack.push(get_gpu(devnum))
    return _custack.top
def get_context(devnum=0):
    """Get the current device or use a device by device number, and
    return the HSA context.
    """
    device = _get_device(devnum=devnum)
    return device.associate_context()
def get_all_contexts():
    """Return the HSA context of every available GPU, in device order."""
    return list(map(get_context, range(get_num_gpus())))
def require_context(fn):
    """
    A decorator to ensure a context for the HSA subsystem
    """
    @functools.wraps(fn)
    def wrapper(*args, **kws):
        # Trigger lazy context creation for the current thread first.
        get_context()
        return fn(*args, **kws)
    return wrapper
def reset():
    """Reset every GPU (releasing its context) and clear the device stack."""
    cus.reset()
    _custack.clear()
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
"""Enum values for HSA from the HSA extension header
Note that Python namespacing could be used to avoid the C-like
prefixing, but we choose to keep the same names as found in the C
enums, in order to match the documentation.
"""
# These enums are a direct translation of those found in:
# hsa_ext_amd.h from the ROCR-Runtime. For example:
# https://github.com/RadeonOpenCompute/ROCR-Runtime/blob/master/src/inc/hsa_ext_amd.h
# Comments relating to the values are largely wholesale copied.
import ctypes
#------------------------------------------------------------------------------
#
# Anonymous enum expressing that a memory pool is invalid
#
HSA_STATUS_ERROR_INVALID_MEMORY_POOL = 40
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#
# Agent attributes
#
# Enums of the type hsa_amd_agent_info_t
# Chip identifier. The type of this attribute is uint32_t.
HSA_AMD_AGENT_INFO_CHIP_ID = 0xA000
# Size of a cacheline in bytes. The type of this attribute is uint32_t.
HSA_AMD_AGENT_INFO_CACHELINE_SIZE = 0xA001
# The number of compute unit available in the agent. The type of this
# attribute is uint32_t.
HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT = 0xA002
# The maximum clock frequency of the agent in MHz. The type of this
# attribute is uint32_t.
HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY = 0xA003
# Internay driver node identifier. The type of this attribute is uint32_t.
HSA_AMD_AGENT_INFO_DRIVER_NODE_ID = 0xA004
# Max number of watch points on memory address ranges to generate exception
# events when the watched addresses are accessed.
HSA_AMD_AGENT_INFO_MAX_ADDRESS_WATCH_POINTS = 0xA005
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#
# Region attributes
#
# Enums of the type hsa_amd_region_info_t
# Determine if host can access the region. The type of this attribute is bool.
HSA_AMD_REGION_INFO_HOST_ACCESSIBLE = 0xA000
# Base address of the region in flat address space.
HSA_AMD_REGION_INFO_BASE = 0xA001
# Memory Interface width, the return value type is uint32_t.
# This attribute is deprecated. Use HSA_AMD_AGENT_INFO_MEMORY_WIDTH.
HSA_AMD_REGION_INFO_BUS_WIDTH = 0xA002
# Max Memory Clock, the return value type is uint32_t.
# This attribute is deprecated. Use HSA_AMD_AGENT_INFO_MEMORY_MAX_FREQUENCY.
HSA_AMD_REGION_INFO_MAX_CLOCK_FREQUENCY = 0xA003
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#
# Coherency attributes of a fine grained region
#
# Enums of the type hsa_amd_coherency_type_t
# Coherent region.
HSA_AMD_COHERENCY_TYPE_COHERENT = 0
# Non coherent region.
HSA_AMD_COHERENCY_TYPE_NONCOHERENT = 1
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#
# Memory segments associated with a memory pool.
#
# Enums of the type hsa_amd_segment_t
# Global segment. Used to hold data that is shared by all agents.
HSA_AMD_SEGMENT_GLOBAL = 0
# Read-only segment. Used to hold data that remains constant during the
# execution of a kernel.
HSA_AMD_SEGMENT_READONLY = 1
# Private segment. Used to hold data that is local to a single work-item.
HSA_AMD_SEGMENT_PRIVATE = 2
# Group segment. Used to hold data that is shared by the work-items of a
# work-group.
HSA_AMD_SEGMENT_GROUP = 3
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#
# Memory pool global flags.
#
# Enums of the type hsa_amd_memory_pool_global_flag_t.
# The application can use allocations in the memory pool to store kernel
# arguments, and provide the values for the kernarg segment of
# a kernel dispatch.
HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT = 1
# Updates to memory in this pool conform to HSA memory consistency model.
# If this flag is set, then HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED
# must not be set.
HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED = 2
# Writes to memory in this pool can be performed by a single agent at a time.
HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED = 4
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#
# Memory pool features flags.
#
# Enums of the type hsa_amd_memory_pool_info_t.
# Segment where the memory pool resides. The type of this attribute is
# hsa_amd_segment_t.
HSA_AMD_MEMORY_POOL_INFO_SEGMENT = 0
# Flag mask. The value of this attribute is undefined if the value of
# HSA_AMD_MEMORY_POOL_INFO_SEGMENT is not HSA_AMD_SEGMENT_GLOBAL. The type
# of this attribute is uint32_t, a bit-field of
# hsa_amd_memory_pool_global_flag_t values.
HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS = 1
# Size of this pool, in bytes. The type of this attribute is size_t.
HSA_AMD_MEMORY_POOL_INFO_SIZE = 2
# Indicates whether memory in this pool can be allocated using
# hsa_amd_memory_pool_allocate. The type of this attribute is bool.
# The value of this flag is always false for memory pools in the group and
# private segments.
HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED = 5
# Allocation granularity of buffers allocated by hsa_amd_memory_pool_allocate
# in this memory pool. The size of a buffer allocated in this pool is a
# multiple of the value of this attribute. The value of this attribute is
# only defined if HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED is true for
# this pool. The type of this attribute is size_t.
HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE = 6
# Alignment of buffers allocated by hsa_amd_memory_pool_allocate in this
# pool. The value of this attribute is only defined if
# HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED is true for this pool, and
# must be a power of 2. The type of this attribute is size_t.
HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT = 7
# This memory_pool can be made directly accessible by all the agents in the
# system (hsa_amd_agent_memory_pool_get_info returns
# HSA_AMD_MEMORY_POOL_ACCESS_ALLOWED_BY_DEFAULT for all agents). The type of
# this attribute is bool.
HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL = 15
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#
# Type of accesses to a memory pool from a given agent.
#
# Enums of the type hsa_amd_memory_pool_access_t
# The agent cannot directly access any buffer in the memory pool.
HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED = 0
# The agent can directly access a buffer located in the pool; the application
# does not need to invoke hsa_amd_agents_allow_access.
HSA_AMD_MEMORY_POOL_ACCESS_ALLOWED_BY_DEFAULT = 1
# The agent can directly access a buffer located in the pool, but only if the
# application has previously requested access to that buffer using
# hsa_amd_agents_allow_access.
HSA_AMD_MEMORY_POOL_ACCESS_DISALLOWED_BY_DEFAULT = 2
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#
# Properties of the relationship between an agent a memory pool.
#
# Enums of the type hsa_amd_link_info_type_t
# Hyper-transport bus type.
HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT = 0
# QPI bus type.
HSA_AMD_LINK_INFO_TYPE_QPI = 1
# PCIe bus type.
HSA_AMD_LINK_INFO_TYPE_PCIE = 2
# Infiniband bus type.
HSA_AMD_LINK_INFO_TYPE_INFINBAND = 3
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#
# Access to buffers located in the memory pool. The type of this attribute
# is hsa_amd_memory_pool_access_t.
#
# Enums of type hsa_amd_agent_memory_pool_info_t.
# An agent can always directly access buffers currently located in a memory
# pool that is associated (the memory_pool is one of the values returned by
# hsa_amd_agent_iterate_memory_pools on the agent) with that agent. If the
# buffer is currently located in a memory pool that is not associated with
# the agent, and the value returned by this function for the given
# combination of agent and memory pool is not
# HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED, the application still needs to
# invoke hsa_amd_agents_allow_access in order to gain direct access to the
# buffer.
# If the given agent can directly access buffers the pool, the result is not
# HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED. If the memory pool is associated
# with the agent, or it is of fined-grained type, the result must not be
# HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED. If the memory pool is not
# associated with the agent, and does not reside in the global segment, the
# result must be HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED.
HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS = 0
# Number of links to hop when accessing the memory pool from the specified
# agent. The type of this attribute is uint32_t.
HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS = 1
# Details of each link hop when accessing the memory pool starting from the
# specified agent. The type of this attribute is an array size of
# HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS with each element containing
# hsa_amd_memory_pool_link_info_t.
HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO = 2
#------------------------------------------------------------------------------
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
from .service import Service
from .threadlocal import TLStack
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment