Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
liupw
numba-DTK
Commits
aceb5b43
Commit
aceb5b43
authored
May 17, 2024
by
dugupeiwen
Browse files
adapt for DTK gfx906 and fix the difference in numba version
parent
3e5f428e
Changes
19
Hide whitespace changes
Inline
Side-by-side
Showing
19 changed files
with
282 additions
and
112 deletions
+282
-112
numba/core/base.py
numba/core/base.py
+41
-7
numba/core/target_extension.py
numba/core/target_extension.py
+2
-0
numba/roc/__init__.py
numba/roc/__init__.py
+18
-0
numba/roc/codegen.py
numba/roc/codegen.py
+9
-7
numba/roc/compiler.py
numba/roc/compiler.py
+7
-2
numba/roc/descriptor.py
numba/roc/descriptor.py
+40
-0
numba/roc/dispatch.py
numba/roc/dispatch.py
+10
-4
numba/roc/hlc/__init__.py
numba/roc/hlc/__init__.py
+13
-6
numba/roc/hlc/common.py
numba/roc/hlc/common.py
+28
-11
numba/roc/hlc/hlc.py
numba/roc/hlc/hlc.py
+4
-4
numba/roc/hlc/libhlc.py
numba/roc/hlc/libhlc.py
+18
-2
numba/roc/hsadrv/driver.py
numba/roc/hsadrv/driver.py
+10
-4
numba/roc/hsadrv/drvapi.py
numba/roc/hsadrv/drvapi.py
+12
-8
numba/roc/hsaimpl.py
numba/roc/hsaimpl.py
+16
-15
numba/roc/initialize.py
numba/roc/initialize.py
+2
-2
numba/roc/target.py
numba/roc/target.py
+18
-12
numba/roc/tests/hsadrv/test_driver.py
numba/roc/tests/hsadrv/test_driver.py
+8
-3
numba/roc/tests/hsapy/test_intrinsics.py
numba/roc/tests/hsapy/test_intrinsics.py
+11
-10
numba/roc/tests/hsapy/test_memory.py
numba/roc/tests/hsapy/test_memory.py
+15
-15
No files found.
numba/core/base.py
View file @
aceb5b43
...
...
@@ -258,20 +258,45 @@ class BaseContext(object):
For subclasses to add initializer
"""
# def refresh(self):
# """
# Refresh context with new declarations from known registries.
# Useful for third-party extensions.
# """
# # load target specific registries
# self.load_additional_registries()
# # Populate the builtin registry, this has to happen after loading
# # additional registries as some of the "additional" registries write
# # their implementations into the builtin_registry and would be missed if
# # this ran first.
# self.install_registry(builtin_registry)
# # Also refresh typing context, since @overload declarations can
# # affect it.
# self.typing_context.refresh()
# sugon: Roll back the refresh implementation to numba-0.53 in order to adapt to roc.
# There may be risks here.
def refresh(self):
    """
    Reload the registries known to this context and refresh typing.

    Useful for third-party extensions that add new declarations after
    the context has been created.
    """
    # Load target-specific registries.
    self.load_additional_registries()

    # sugon: adapted for numba53-roc; this path can have bugs.
    # Populate the built-in registry: the modules below are imported for
    # that purpose, none of the imported names is used afterwards.
    from numba.cpython import (
        slicing, tupleobj, enumimpl, hashing, heapq, iterators, numbers,
        rangeobj,
    )
    from numba.core import optional
    from numba.misc import gdb_hook, literal
    from numba.np import linalg, polynomial, arraymath

    try:
        from numba.np import npdatetime
    except NotImplementedError:
        # Carry on without npdatetime when importing it is not implemented.
        pass

    # Install the builtin registry only after the additional registries
    # have been loaded: some of them write their implementations into
    # builtin_registry and would be missed if this ran first.
    self.install_registry(builtin_registry)

    # Second pass over the additional registries, now that the builtin
    # registry has been installed.
    self.load_additional_registries()

    # Also refresh the typing context, since @overload declarations can
    # affect it.
    self.typing_context.refresh()
...
...
@@ -389,6 +414,15 @@ class BaseContext(object):
impl
=
user_function
(
fndesc
,
libs
)
self
.
_defns
[
func
].
append
(
impl
,
impl
.
signature
)
# sugon: for numba-roc-0.53, restore the add_user_function method.
# Version 0.56.1: PR `#7865 <https://github.com/numba/numba/pull/7865>`_: Remove add_user_function
def add_user_function(self, func, fndesc, libs=()):
    """
    Append an implementation built from *fndesc* to the definitions of *func*.

    Parameters
    ----------
    func : key already present in ``self._defns``
    fndesc : function descriptor handed to ``user_function``
    libs : libraries handed to ``user_function`` (default: empty tuple)

    Raises
    ------
    KeyError
        If *func* has no entry in ``self._defns``.
    """
    # Guard: only functions that are already registered may be extended.
    if func not in self._defns:
        raise KeyError(
            "{func} is not a registered user function".format(func=func)
        )

    lowered = user_function(fndesc, libs)
    self._defns[func].append(lowered, lowered.signature)
def
insert_generator
(
self
,
genty
,
gendesc
,
libs
=
()):
assert
isinstance
(
genty
,
types
.
Generator
)
impl
=
user_generator
(
gendesc
,
libs
)
...
...
numba/core/target_extension.py
View file @
aceb5b43
...
...
@@ -161,6 +161,8 @@ target_registry['gpu'] = GPU
target_registry
[
'CUDA'
]
=
CUDA
target_registry
[
'cuda'
]
=
CUDA
target_registry
[
'ROCm'
]
=
ROCm
# sugon: support ROC
target_registry
[
'roc'
]
=
ROCm
target_registry
[
'npyufunc'
]
=
NPyUfunc
dispatcher_registry
=
DelayedRegistry
(
key_type
=
Target
)
...
...
numba/roc/__init__.py
View file @
aceb5b43
...
...
@@ -35,6 +35,24 @@ if is_available():
else
:
agents
=
[]
# sugon: adapted for numba-0.58, modelled on numba/cuda/initialize.py; this should be moved to numba/roc/initialize.py.
# TODO: support ROCmDispatcher completely.
def initialize_all():
    """
    Register the ROC ``jit`` decorator and a dispatcher class for the
    "ROCm" target in numba's ``jit_registry`` and ``dispatcher_registry``.

    All imports are local to the function.
    """
    from numba.roc.decorators import jit
    from numba.core import dispatcher
    from numba.roc.descriptor import HSATargetDesc
    from numba.core.target_extension import (
        target_registry,
        dispatcher_registry,
        jit_registry,
    )

    class ROCmDispatcher(dispatcher.Dispatcher):
        # Target descriptor shared by every instance of this dispatcher.
        targetdescr = HSATargetDesc('ROCm')

    # Both registries are keyed by the target class registered as "ROCm".
    rocm = target_registry["ROCm"]
    jit_registry[rocm] = jit
    dispatcher_registry[rocm] = ROCmDispatcher
initialize_all
()
def
test
(
*
args
,
**
kwargs
):
if
not
is_available
():
raise
RuntimeError
(
"HSA is not detected"
)
...
...
numba/roc/codegen.py
View file @
aceb5b43
...
...
@@ -2,7 +2,7 @@ from llvmlite import binding as ll
# from llvmlite.llvmpy import core as lc
import
llvmlite.ir
as
llvmir
from
numba.core
import
utils
from
numba.core.codegen
import
Codegen
,
CodeLibrary
,
CPUCodeLibrary
from
numba.core.codegen
import
Codegen
,
CPUCodegen
,
CodeLibrary
,
CPUCodeLibrary
from
.hlc
import
DATALAYOUT
,
TRIPLE
,
hlc
class
HSACodeLibrary
(
CPUCodeLibrary
):
...
...
@@ -16,13 +16,15 @@ class HSACodeLibrary(CPUCodeLibrary):
pass
def
get_asm_str
(
self
):
# sugon: there is a bug here, so the ASM code is not printed.
return
"ROC Not support get_asm_str
\n
"
"""
Get the human-readable assembly.
"""
m
=
hlc
.
Module
()
m
.
load_llvm
(
str
(
self
.
_final_module
))
out
=
m
.
finalize
()
return
str
(
out
.
hsail
)
#
m = hlc.Module()
#
m.load_llvm(str(self._final_module))
#
out = m.finalize()
#
return str(out.hsail)
# class JITHSACodegen(Codegen):
...
...
@@ -47,7 +49,7 @@ class HSACodeLibrary(CPUCodeLibrary):
# def _add_module(self, module):
# pass
class
JITHSACodegen
(
Codegen
):
class
JITHSACodegen
(
CPU
Codegen
):
_library_class
=
HSACodeLibrary
def
__init__
(
self
,
module_name
):
...
...
@@ -66,7 +68,7 @@ class JITHSACodegen(Codegen):
def
_init
(
self
,
llvm_module
):
assert
list
(
llvm_module
.
global_variables
)
==
[],
"Module isn't empty"
self
.
_data_layout
=
DATALAYOUT
[
utils
.
MACHINE_BITS
]
self
.
_data_layout
=
DATALAYOUT
self
.
_target_data
=
ll
.
create_target_data
(
self
.
_data_layout
)
def
_create_empty_module
(
self
,
name
):
...
...
numba/roc/compiler.py
View file @
aceb5b43
...
...
@@ -238,6 +238,8 @@ class _CachedProgram(object):
ex
=
driver
.
Executable
()
ex
.
load
(
agent
,
code
)
ex
.
freeze
()
# sugon: for ROCm 4.0 and later, the ".kd" suffix must be appended to the kernel symbol explicitly.
symbol
=
symbol
+
".kd"
symobj
=
ex
.
get_symbol
(
agent
,
symbol
)
regions
=
agent
.
regions
.
globals
for
reg
in
regions
:
...
...
@@ -275,9 +277,12 @@ class HSAKernel(HSAKernelBase):
"""
Temporary workaround for register limit
"""
m
=
re
.
search
(
r
"\bwavefront_sgpr_count\s*=\s*(\d+)"
,
self
.
assembly
)
# sugon: the metadata format has changed.
# m = re.search(r"\bwavefront_sgpr_count\s*=\s*(\d+)", self.assembly)
m
=
re
.
search
(
r
"\.sgpr_count:\s+(\d+)"
,
self
.
assembly
)
self
.
_wavefront_sgpr_count
=
int
(
m
.
group
(
1
))
m
=
re
.
search
(
r
"\bworkitem_vgpr_count\s*=\s*(\d+)"
,
self
.
assembly
)
m
=
re
.
search
(
r
"\.vgpr_count:\s+(\d+)"
,
self
.
assembly
)
# m = re.search(r"\bworkitem_vgpr_count\s*=\s*(\d+)", self.assembly)
self
.
_workitem_vgpr_count
=
int
(
m
.
group
(
1
))
def
_sentry_resource_limit
(
self
):
...
...
numba/roc/descriptor.py
View file @
aceb5b43
...
...
@@ -11,3 +11,43 @@ class HSATargetDesc(TargetDescriptor):
options
=
HSATargetOptions
typingctx
=
HSATypingContext
()
targetctx
=
HSATargetContext
(
typingctx
)
# sugon: from dispatcher.Dispatcher
typing_context
=
typingctx
target_context
=
targetctx
# ## sugon TODO: support ROCmDispatcher
# class HSATargetDesc(TargetDescriptor):
# def __init__(self, name):
# self.options = HSATargetOptions
# # The typing and target contexts are initialized only when needed -
# # this prevents an attempt to load CUDA libraries at import time on
# # systems that might not have them present.
# self._typingctx = None
# self._targetctx = None
# super().__init__(name)
# @property
# def typing_context(self):
# if self._typingctx is None:
# self._typingctx = HSATypingContext()
# return self._typingctx
# @property
# def target_context(self):
# if self._targetctx is None:
# self._targetctx = HSATargetContext(self._typingctx)
# return self._targetctx
# @property
# def typingctx(self):
# if self._typingctx is None:
# self._typingctx = HSATypingContext()
# return self._typingctx
# @property
# def targetctx(self):
# if self._targetctx is None:
# self._targetctx = HSATargetContext(self._typingctx)
# return self._targetctx
\ No newline at end of file
numba/roc/dispatch.py
View file @
aceb5b43
# sugon
# This file refers to CUDA, move to vectorizers.py file
# Following CUDA's dispatcher.py in numba-0.58, this file should be turned into a kernel-scheduling module.
import
numpy
as
np
# from numba.np.ufunc.deviceufunc import (UFuncMechanism, GenerializedUFunc,
...
...
@@ -64,8 +67,8 @@ class HsaUFuncMechanism(UFuncMechanism):
count
=
(
count
+
(
ilp
-
1
))
//
ilp
blockcount
=
(
count
+
(
tpb
-
1
))
//
tpb
func
[
blockcount
,
tpb
](
*
args
)
def
device_array
(
self
,
shape
,
dtype
,
stream
):
# sugon: adapt for numba-0.58
def
allocate_
device_array
(
self
,
shape
,
dtype
,
stream
):
if
dgpu_present
:
return
api
.
device_array
(
shape
=
shape
,
dtype
=
dtype
)
else
:
...
...
@@ -97,6 +100,9 @@ class _HsaGUFuncCallSteps(GUFuncCallSteps):
return
devicearray
.
is_hsa_ndarray
(
obj
)
else
:
return
True
# sugon: adapt for numba-0.58
def as_device_array(self, obj):
    """
    Placeholder added for numba-0.58; does nothing with *obj*.

    NOTE(review): this is a stub that always returns None -- confirm
    whether callers expect a device array back.
    """
    return None
def
to_device
(
self
,
hostary
):
if
dgpu_present
:
...
...
@@ -110,8 +116,8 @@ class _HsaGUFuncCallSteps(GUFuncCallSteps):
return
out
else
:
pass
def
device_array
(
self
,
shape
,
dtype
):
# sugon: adapt for numba-0.58
def
allocate_
device_array
(
self
,
shape
,
dtype
):
if
dgpu_present
:
return
api
.
device_array
(
shape
=
shape
,
dtype
=
dtype
)
else
:
...
...
numba/roc/hlc/__init__.py
View file @
aceb5b43
...
...
@@ -4,13 +4,20 @@ import os
# See:
# https://github.com/RadeonOpenCompute/llvm/blob/b20b796f65ab6ac12fac4ea32e1d89e1861dee6a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp#L270-L275
# Alloc goes into addrspace(5) (private)
DATALAYOUT
=
{
64
:
(
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
"-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
"-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
),
}
# DATALAYOUT = {
# 64: ("e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
# "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
# "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1"
# "-ni:7"),
# }
TRIPLE
=
"amdgcn--amdhsa"
# sugon: adapt for gfx906
DATALAYOUT
=
(
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
"-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
"-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1"
"-ni:7"
)
TRIPLE
=
"amdgcn-amd-amdhsa"
# Allow user to use "NUMBA_USE_LIBHLC" env-var to use cmdline HLC.
if
os
.
environ
.
get
(
'NUMBA_USE_LIBHLC'
,
''
).
lower
()
not
in
[
'0'
,
'no'
,
'false'
]:
...
...
numba/roc/hlc/common.py
View file @
aceb5b43
...
...
@@ -127,16 +127,30 @@ class AMDGCNModule(object):
The AMDCGN LLVM module contract
"""
# bitcodes = [
# "opencl.amdgcn.bc",
# "ocml.amdgcn.bc",
# "ockl.amdgcn.bc",
# "oclc_correctly_rounded_sqrt_off.amdgcn.bc",
# "oclc_daz_opt_off.amdgcn.bc",
# "oclc_finite_only_off.amdgcn.bc",
# "oclc_isa_version_803.amdgcn.bc",
# "oclc_unsafe_math_off.amdgcn.bc",
# "irif.amdgcn.bc"
# ]
# sugon: adapt for DTK
bitcodes
=
[
"opencl.amdgcn.bc"
,
"ocml.amdgcn.bc"
,
"ockl.amdgcn.bc"
,
"oclc_correctly_rounded_sqrt_off.amdgcn.bc"
,
"oclc_daz_opt_off.amdgcn.bc"
,
"oclc_finite_only_off.amdgcn.bc"
,
"oclc_isa_version_803.amdgcn.bc"
,
"oclc_unsafe_math_off.amdgcn.bc"
,
"irif.amdgcn.bc"
"hip.bc"
,
"opencl.bc"
,
"ocml.bc"
,
"ockl.bc"
,
"oclc_correctly_rounded_sqrt_off.bc"
,
"oclc_daz_opt_off.bc"
,
"oclc_finite_only_off.bc"
,
"oclc_isa_version_906.bc"
,
"oclc_unsafe_math_off.bc"
,
"oclc_abi_version_400.bc"
,
"oclc_wavefrontsize64_on.bc"
]
def
__init__
(
self
):
...
...
@@ -144,8 +158,11 @@ class AMDGCNModule(object):
def
_preprocess
(
self
,
llvmir
):
version_adapted
=
adapt_llvm_version
(
llvmir
)
alloca_fixed
=
alloca_addrspace_correction
(
version_adapted
)
return
alloca_fixed
# sugon: IR-level address-space conversion is not supported.
# TODO: support
# alloca_fixed = alloca_addrspace_correction(version_adapted)
# return alloca_fixed
return
version_adapted
def
load_llvm
(
self
,
llvmir
):
pass
...
...
numba/roc/hlc/hlc.py
View file @
aceb5b43
...
...
@@ -102,9 +102,9 @@ class CmdLine(object):
"-S"
,
"-o {fout}"
,
"{fin}"
])
self
.
CMD_LINK_BRIG
=
' '
.
join
([
self
.
ld_
lld
,
"-
shared
"
,
# sugon: adapt for DTK. BRIG has been abandoned, using a new binary generation command.
self
.
CMD_LINK_BRIG
=
' '
.
join
([
self
.
lld
,
"-
flavor gnu --no-undefined -shared -plugin-opt=-amdgpu-internalize-symbols -plugin-opt=mcpu=gfx906 -plugin-opt=O3 -plugin-opt=-amdgpu-early-inline-all=true -plugin-opt=-amdgpu-function-calls=false
"
,
"-o {fout}"
,
"{fin}"
])
...
...
@@ -121,7 +121,7 @@ class CmdLine(object):
self
.
opt
=
_setup_path
(
"opt"
)
self
.
llc
=
_setup_path
(
"llc"
)
self
.
llvm_link
=
_setup_path
(
"llvm-link"
)
self
.
ld_
lld
=
_setup_path
(
"
ld.
lld"
)
self
.
lld
=
_setup_path
(
"lld"
)
self
.
triple_flag
=
"-mtriple %s"
%
self
.
_triple
self
.
initialized
=
False
...
...
numba/roc/hlc/libhlc.py
View file @
aceb5b43
...
...
@@ -79,11 +79,19 @@ class HLC(object):
]
type
(
self
).
hlc
=
hlc
# sugon debug info
# def write_buf_to_file(self, buf, file_path):
# content = buf.value.decode("latin1")
# with open(file_path, 'w') as file:
# file.write(content)
def
parse_assembly
(
self
,
ir
):
if
isinstance
(
ir
,
str
):
ir
=
ir
.
encode
(
"latin1"
)
buf
=
create_string_buffer
(
ir
)
# sugon debug info
# store_file = "//public//home//liupw//TP_clang//NUMBA_TEST//conda_test//numba-test.ll"
# self.write_buf_to_file(buf, store_file)
mod
=
self
.
hlc
.
ROC_ParseModule
(
buf
)
if
not
mod
:
raise
Error
(
"Failed to parse assembly"
)
...
...
@@ -113,7 +121,9 @@ class HLC(object):
ret
=
buf
.
value
.
decode
(
"latin1"
)
self
.
hlc
.
ROC_DisposeString
(
buf
)
return
ret
# sugon
# "clang-14" -cc1as -triple amdgcn-amd-amdhsa -filetype obj -main-file-name moduleload.cu -target-cpu gfx906 -mrelocation-model pic --mrelax-relocations -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false -o numba.o numba.s
# "lld" -flavor gnu --no-undefined -shared -plugin-opt=-amdgpu-internalize-symbols -plugin-opt=mcpu=gfx906 -plugin-opt=O3 -plugin-opt=-amdgpu-early-inline-all=true -plugin-opt=-amdgpu-function-calls=false -o numba.out numba.o
def
_link_brig
(
self
,
upbrig_loc
,
patchedbrig_loc
):
cli
.
link_brig
(
upbrig_loc
,
patchedbrig_loc
)
...
...
@@ -131,6 +141,12 @@ class HLC(object):
else
:
ret
=
bytes
(
buffer
(
buf
))
self
.
hlc
.
ROC_DisposeString
(
buf
)
# sugon debug info
# print("HLC to_brig: write ROC_ModuleEmitBRIG result")
# with open("output.brig", "wb") as brig_file:
# brig_file.write(ret)
# Now we have an ELF, this needs patching with ld.lld which doesn't
# have an API. So we write out `ret` to a temporary file, then call
# the ld.lld ELF linker main() on it to generate a patched ELF
...
...
numba/roc/hsadrv/driver.py
View file @
aceb5b43
...
...
@@ -769,11 +769,13 @@ class Queue(object):
ctypes
.
sizeof
(
drvapi
.
hsa_kernel_dispatch_packet_t
))
packet_array_t
=
(
packet_type
*
queue_struct
.
size
)
# sugon: adapt for DTK
# Obtain the current queue write index
index
=
hsa
.
hsa_queue_add_write_index_acq_rel
(
self
.
_id
,
1
)
index
=
hsa
.
hsa_queue_add_write_index_
sc
acq_
sc
rel
(
self
.
_id
,
1
)
while
True
:
read_offset
=
hsa
.
hsa_queue_load_read_index_acquire
(
self
.
_id
)
# sugon: adapt for DTK
read_offset
=
hsa
.
hsa_queue_load_read_index_scacquire
(
self
.
_id
)
if
read_offset
<=
index
<
read_offset
+
queue_struct
.
size
:
break
...
...
@@ -786,7 +788,8 @@ class Queue(object):
yield
packet
# Increment write index
# Ring the doorbell
hsa
.
hsa_signal_store_release
(
self
.
_id
.
contents
.
doorbell_signal
,
index
)
# sugon: adapt for DTK
hsa
.
hsa_signal_store_screlease
(
self
.
_id
.
contents
.
doorbell_signal
,
index
)
def
insert_barrier
(
self
,
dep_signal
):
with
self
.
_get_packet
(
drvapi
.
hsa_barrier_and_packet_t
)
as
packet
:
...
...
@@ -911,7 +914,8 @@ class Signal(object):
expire
=
timeout
*
hsa
.
timestamp_frequency
*
mhz
# XXX: use active wait instead of blocked seem to avoid hang in docker
hsa
.
hsa_signal_wait_acquire
(
self
.
_id
,
enums
.
HSA_SIGNAL_CONDITION_NE
,
# sugon: adapt for DTK
hsa
.
hsa_signal_wait_scacquire
(
self
.
_id
,
enums
.
HSA_SIGNAL_CONDITION_NE
,
one
,
expire
,
enums
.
HSA_WAIT_STATE_ACTIVE
)
return
self
.
load_relaxed
()
!=
one
...
...
@@ -1156,6 +1160,8 @@ class OwnedPointer(object):
self
.
_mem
.
refct
-=
1
assert
self
.
_mem
.
refct
>=
0
if
self
.
_mem
.
refct
==
0
:
# sugon: there is a bug here: free() raises an exception.
# from https://numba.pydata.org/numba-doc/latest/roc/ufunc.html#async-execution-a-chunk-at-a-time
self
.
_mem
.
free
()
except
ReferenceError
:
pass
...
...
numba/roc/hsadrv/drvapi.py
View file @
aceb5b43
...
...
@@ -545,10 +545,11 @@ API_PROTOTYPES = {
'argtypes'
:
[
hsa_signal_t
,
hsa_signal_value_t
]
},
# void hsa_signal_store_release(
# sugon: adapt for DTK
# void hsa_signal_store_screlease(
# hsa_signal_t signal,
# hsa_signal_value_t value);
'hsa_signal_store_release'
:
{
'hsa_signal_store_
sc
release'
:
{
'restype'
:
None
,
'argtypes'
:
[
hsa_signal_t
,
hsa_signal_value_t
],
},
...
...
@@ -785,13 +786,14 @@ API_PROTOTYPES = {
'argtypes'
:
[
hsa_signal_t
,
hsa_signal_value_t
]
},
# sugon: adapt for DTK
# hsa_signal_value_t HSA_API
# hsa_signal_wait_acquire(hsa_signal_t signal,
# hsa_signal_wait_
sc
acquire(hsa_signal_t signal,
# hsa_signal_condition_t condition,
# hsa_signal_value_t compare_value,
# uint64_t timeout_hint,
# hsa_wait_state_t wait_state_hint);
'hsa_signal_wait_acquire'
:
{
'hsa_signal_wait_
sc
acquire'
:
{
'restype'
:
hsa_signal_value_t
,
'argtypes'
:
[
hsa_signal_t
,
hsa_signal_condition_t
,
...
...
@@ -868,8 +870,9 @@ API_PROTOTYPES = {
'errcheck'
:
_check_error
},
# uint64_t hsa_queue_load_read_index_acquire(hsa_queue_t *queue);
'hsa_queue_load_read_index_acquire'
:
{
# sugon: adapt for DTK
# uint64_t hsa_queue_load_read_index_scacquire(hsa_queue_t *queue);
'hsa_queue_load_read_index_scacquire'
:
{
'restype'
:
ctypes
.
c_uint64
,
'argtypes'
:
[
_PTR
(
hsa_queue_t
)]
},
...
...
@@ -940,10 +943,11 @@ API_PROTOTYPES = {
'argtypes'
:
[
_PTR
(
hsa_queue_t
),
ctypes
.
c_uint64
,
ctypes
.
c_uint64
]
},
# uint64_t hsa_queue_add_write_index_acq_rel(
# sugon: adapt for DTK
# uint64_t hsa_queue_add_write_index_scacq_screl(
# hsa_queue_t *queue,
# uint64_t value);
'hsa_queue_add_write_index_acq_rel'
:
{
'hsa_queue_add_write_index_
sc
acq_
sc
rel'
:
{
'restype'
:
ctypes
.
c_uint64
,
'argtypes'
:
[
_PTR
(
hsa_queue_t
),
ctypes
.
c_uint64
]
},
...
...
numba/roc/hsaimpl.py
View file @
aceb5b43
...
...
@@ -57,16 +57,16 @@ def _declare_function(context, builder, name, sig, cargs,
llargs
=
[
context
.
get_value_type
(
t
)
for
t
in
sig
.
args
]
fnty
=
ir
.
FunctionType
(
llretty
,
llargs
)
mangled
=
mangler
(
name
,
cargs
)
fn
=
mod
.
get_or_insert_function
(
fnty
,
mangled
)
fn
=
cgutils
.
get_or_insert_function
(
mod
,
fnty
,
mangled
)
fn
.
calling_convention
=
target
.
CC_SPIR_FUNC
return
fn
# sugon: 'types.uint32' must be used here, not the string 'unsigned int'.
@
lower
(
stubs
.
get_global_id
,
types
.
uint32
)
def
get_global_id_impl
(
context
,
builder
,
sig
,
args
):
[
dim
]
=
args
get_global_id
=
_declare_function
(
context
,
builder
,
'get_global_id'
,
sig
,
[
'unsigned
int
'
])
[
types
.
u
int
32
])
res
=
builder
.
call
(
get_global_id
,
[
dim
])
return
context
.
cast
(
builder
,
res
,
types
.
uintp
,
types
.
intp
)
...
...
@@ -75,7 +75,7 @@ def get_global_id_impl(context, builder, sig, args):
def
get_local_id_impl
(
context
,
builder
,
sig
,
args
):
[
dim
]
=
args
get_local_id
=
_declare_function
(
context
,
builder
,
'get_local_id'
,
sig
,
[
'unsigned
int
'
])
[
types
.
u
int
32
])
res
=
builder
.
call
(
get_local_id
,
[
dim
])
return
context
.
cast
(
builder
,
res
,
types
.
uintp
,
types
.
intp
)
...
...
@@ -84,7 +84,7 @@ def get_local_id_impl(context, builder, sig, args):
def
get_group_id_impl
(
context
,
builder
,
sig
,
args
):
[
dim
]
=
args
get_group_id
=
_declare_function
(
context
,
builder
,
'get_group_id'
,
sig
,
[
'unsigned
int
'
])
[
types
.
u
int
32
])
res
=
builder
.
call
(
get_group_id
,
[
dim
])
return
context
.
cast
(
builder
,
res
,
types
.
uintp
,
types
.
intp
)
...
...
@@ -93,7 +93,7 @@ def get_group_id_impl(context, builder, sig, args):
def
get_num_groups_impl
(
context
,
builder
,
sig
,
args
):
[
dim
]
=
args
get_num_groups
=
_declare_function
(
context
,
builder
,
'get_num_groups'
,
sig
,
[
'unsigned
int
'
])
[
types
.
u
int
32
])
res
=
builder
.
call
(
get_num_groups
,
[
dim
])
return
context
.
cast
(
builder
,
res
,
types
.
uintp
,
types
.
intp
)
...
...
@@ -101,7 +101,7 @@ def get_num_groups_impl(context, builder, sig, args):
@
lower
(
stubs
.
get_work_dim
)
def
get_work_dim_impl
(
context
,
builder
,
sig
,
args
):
get_work_dim
=
_declare_function
(
context
,
builder
,
'get_work_dim'
,
sig
,
[
"
void
"
])
[
types
.
void
])
res
=
builder
.
call
(
get_work_dim
,
[])
return
res
...
...
@@ -110,7 +110,7 @@ def get_work_dim_impl(context, builder, sig, args):
def
get_global_size_impl
(
context
,
builder
,
sig
,
args
):
[
dim
]
=
args
get_global_size
=
_declare_function
(
context
,
builder
,
'get_global_size'
,
sig
,
[
'unsigned
int
'
])
sig
,
[
types
.
u
int
32
])
res
=
builder
.
call
(
get_global_size
,
[
dim
])
return
context
.
cast
(
builder
,
res
,
types
.
uintp
,
types
.
intp
)
...
...
@@ -119,7 +119,7 @@ def get_global_size_impl(context, builder, sig, args):
def
get_local_size_impl
(
context
,
builder
,
sig
,
args
):
[
dim
]
=
args
get_local_size
=
_declare_function
(
context
,
builder
,
'get_local_size'
,
sig
,
[
'unsigned
int
'
])
sig
,
[
types
.
u
int
32
])
res
=
builder
.
call
(
get_local_size
,
[
dim
])
return
context
.
cast
(
builder
,
res
,
types
.
uintp
,
types
.
intp
)
...
...
@@ -128,7 +128,7 @@ def get_local_size_impl(context, builder, sig, args):
def
barrier_one_arg_impl
(
context
,
builder
,
sig
,
args
):
[
flags
]
=
args
barrier
=
_declare_function
(
context
,
builder
,
'barrier'
,
sig
,
[
'unsigned
int
'
])
[
types
.
u
int
32
])
builder
.
call
(
barrier
,
[
flags
])
return
_void_value
...
...
@@ -137,7 +137,7 @@ def barrier_no_arg_impl(context, builder, sig, args):
assert
not
args
sig
=
types
.
void
(
types
.
uint32
)
barrier
=
_declare_function
(
context
,
builder
,
'barrier'
,
sig
,
[
'unsigned
int
'
])
[
types
.
u
int
32
])
flags
=
context
.
get_constant
(
types
.
uint32
,
enums
.
CLK_GLOBAL_MEM_FENCE
)
builder
.
call
(
barrier
,
[
flags
])
return
_void_value
...
...
@@ -147,7 +147,7 @@ def barrier_no_arg_impl(context, builder, sig, args):
def
mem_fence_impl
(
context
,
builder
,
sig
,
args
):
[
flags
]
=
args
mem_fence
=
_declare_function
(
context
,
builder
,
'mem_fence'
,
sig
,
[
'unsigned
int
'
])
[
types
.
u
int
32
])
builder
.
call
(
mem_fence
,
[
flags
])
return
_void_value
...
...
@@ -173,7 +173,7 @@ def activelanepermute_wavewidth_impl(context, builder, sig, args):
name
=
"__hsail_activelanepermute_wavewidth_b{0}"
.
format
(
bitwidth
)
fnty
=
ir
.
FunctionType
(
intbitwidth
,
[
intbitwidth
,
i32
,
intbitwidth
,
i1
])
fn
=
builder
.
module
.
get_or_insert_function
(
fnty
,
name
=
name
)
fn
=
cgutils
.
get_or_insert_function
(
builder
,
fnty
,
name
=
name
)
fn
.
calling_convention
=
target
.
CC_SPIR_FUNC
def
cast
(
val
):
...
...
@@ -265,7 +265,8 @@ def _generic_array(context, builder, shape, dtype, symbol_name, addrspace):
lmod
=
builder
.
module
# Create global variable in the requested address-space
gvmem
=
lmod
.
add_global_variable
(
laryty
,
symbol_name
,
addrspace
)
gvmem
=
cgutils
.
add_global_variable
(
lmod
,
laryty
,
symbol_name
,
addrspace
)
if
elemcount
<=
0
:
raise
ValueError
(
"array length <= 0"
)
...
...
@@ -314,4 +315,4 @@ def _make_array(context, builder, dataptr, dtype, shape, layout='C'):
def
_get_target_data
(
context
):
return
ll
.
create_target_data
(
hlc
.
DATALAYOUT
[
context
.
address_size
]
)
return
ll
.
create_target_data
(
hlc
.
DATALAYOUT
)
numba/roc/initialize.py
View file @
aceb5b43
...
...
@@ -7,7 +7,7 @@ def _initialize_ufunc():
return
HsaVectorize
Vectorize
.
target_registry
.
ondemand
[
'
roc
'
]
=
init_vectorize
Vectorize
.
target_registry
.
ondemand
[
'
ROCm
'
]
=
init_vectorize
def
_initialize_gufunc
():
...
...
@@ -18,7 +18,7 @@ def _initialize_gufunc():
return
HsaGUFuncVectorize
GUVectorize
.
target_registry
.
ondemand
[
'
roc
'
]
=
init_guvectorize
GUVectorize
.
target_registry
.
ondemand
[
'
ROCm
'
]
=
init_guvectorize
_initialize_ufunc
()
...
...
numba/roc/target.py
View file @
aceb5b43
...
...
@@ -73,7 +73,7 @@ class HSATargetContext(BaseContext):
def
init
(
self
):
self
.
_internal_codegen
=
codegen
.
JITHSACodegen
(
"numba.hsa.jit"
)
self
.
_target_data
=
\
ll
.
create_target_data
(
DATALAYOUT
[
utils
.
MACHINE_BITS
]
)
ll
.
create_target_data
(
DATALAYOUT
)
# Override data model manager
self
.
data_model_manager
=
hsa_data_model_manager
...
...
@@ -82,7 +82,13 @@ class HSATargetContext(BaseContext):
self
.
insert_func_defn
(
hsaimpl
.
registry
.
functions
)
self
.
insert_func_defn
(
mathimpl
.
registry
.
functions
)
# sugon: adapt for numba-0.58
# Overrides
def create_module(self, name):
    """
    Return the module produced by the internal codegen's
    ``_create_empty_module`` for *name*.
    """
    codegen = self._internal_codegen
    return codegen._create_empty_module(name)
# return lc.Module(name)
@
cached_property
def
call_conv
(
self
):
return
HSACallConv
(
self
)
...
...
@@ -107,7 +113,7 @@ class HSATargetContext(BaseContext):
module
=
func
.
module
func
.
linkage
=
'linkonce_odr'
module
.
data_layout
=
DATALAYOUT
[
self
.
address_size
]
module
.
data_layout
=
DATALAYOUT
wrapper
=
self
.
generate_kernel_wrapper
(
func
,
argtypes
)
return
wrapper
...
...
@@ -140,14 +146,13 @@ class HSATargetContext(BaseContext):
wrappername
=
'hsaPy_{name}'
.
format
(
name
=
func
.
name
)
argtys
=
list
(
arginfo
.
argument_types
)
fnty
=
ir
.
FunctionType
(
ir
.
IntType
(),
fnty
=
ir
.
FunctionType
(
ir
.
IntType
(
32
),
[
self
.
call_conv
.
get_return_type
(
types
.
pyobject
)]
+
argtys
)
func
=
wrapper_module
.
add_function
(
fnty
,
name
=
func
.
name
)
func
=
ir
.
Function
(
wrapper_module
,
fnty
,
name
=
func
.
name
)
func
.
calling_convention
=
CC_SPIR_FUNC
wrapper
=
wrapper_module
.
add_function
(
wrapperfnty
,
name
=
wrappername
)
wrapper
=
ir
.
Function
(
wrapper_module
,
wrapperfnty
,
name
=
wrappername
)
builder
=
ir
.
IRBuilder
(
wrapper
.
append_basic_block
(
''
))
...
...
@@ -217,7 +222,7 @@ def set_hsa_kernel(fn):
fn
.
calling_convention
=
CC_SPIR_KERNEL
# Mark kernels
ocl_kernels
=
mod
.
get_or_insert_named_metadata
(
"
opencl.kernels
"
)
ocl_kernels
=
cgutils
.
get_or_insert_named_metadata
(
mod
,
'
opencl.kernels
'
)
ocl_kernels
.
add
(
ir
.
Module
.
add_metadata
(
mod
,
[
fn
,
gen_arg_addrspace_md
(
fn
),
gen_arg_access_qual_md
(
fn
),
...
...
@@ -226,14 +231,15 @@ def set_hsa_kernel(fn):
gen_arg_base_type
(
fn
)]))
# SPIR version 2.0
make_constant
=
lambda
x
:
ir
.
Constant
(
ir
.
IntType
(),
x
)
make_constant
=
lambda
x
:
ir
.
Constant
(
ir
.
IntType
(
32
),
x
)
spir_version_constant
=
[
make_constant
(
x
)
for
x
in
SPIR_VERSION
]
spir_version
=
mod
.
get_or_insert_named_metadata
(
"opencl.spir.version"
)
spir_version
=
cgutils
.
get_or_insert_named_metadata
(
mod
,
'opencl.spir.version'
)
if
not
spir_version
.
operands
:
spir_version
.
add
(
ir
.
Module
.
add_metadata
(
mod
,
spir_version_constant
))
ocl_version
=
mod
.
get_or_insert_named_metadata
(
"
opencl.
ocl
.version
"
)
ocl_version
=
cgutils
.
get_or_insert_named_metadata
(
mod
,
'
opencl.
spir
.version
'
)
if
not
ocl_version
.
operands
:
ocl_version
.
add
(
ir
.
Module
.
add_metadata
(
mod
,
spir_version_constant
))
...
...
@@ -264,7 +270,7 @@ def gen_arg_addrspace_md(fn):
else
:
codes
.
append
(
SPIR_PRIVATE_ADDRSPACE
)
consts
=
[
ir
.
Constant
(
ir
.
IntType
(),
x
)
for
x
in
codes
]
consts
=
[
ir
.
Constant
(
ir
.
IntType
(
32
),
x
)
for
x
in
codes
]
name
=
ir
.
MetaDataString
(
mod
,
"kernel_arg_addr_space"
)
return
ir
.
Module
.
add_metadata
(
mod
,
[
name
]
+
consts
)
...
...
numba/roc/tests/hsadrv/test_driver.py
View file @
aceb5b43
...
...
@@ -46,13 +46,17 @@ class TestAgents(unittest.TestCase):
def
test_agents_create_queue_single
(
self
):
for
agent
in
roc
.
agents
:
if
agent
.
is_component
:
queue
=
agent
.
create_queue_single
(
2
**
5
)
# sugon: adapt for DTK
# queue = agent.create_queue_single(2 ** 5)
queue
=
agent
.
create_queue_multi
(
2
**
6
)
self
.
assertIsInstance
(
queue
,
Queue
)
def
test_agents_create_queue_multi
(
self
):
for
agent
in
roc
.
agents
:
if
agent
.
is_component
:
queue
=
agent
.
create_queue_multi
(
2
**
5
)
# sugon: adapt for DTK
# queue = agent.create_queue_multi(2 ** 5)
queue
=
agent
.
create_queue_multi
(
2
**
6
)
self
.
assertIsInstance
(
queue
,
Queue
)
def
test_agent_wavebits
(
self
):
...
...
@@ -568,7 +572,8 @@ class TestContext(_TestBase):
class
validatorThread
(
threading
.
Thread
):
def
run
(
self
):
val
=
roc
.
hsa_signal_wait_acquire
(
# sugon: adapt for DTK
val
=
roc
.
hsa_signal_wait_scacquire
(
completion_signal
,
enums
.
HSA_SIGNAL_CONDITION_EQ
,
0
,
...
...
numba/roc/tests/hsapy/test_intrinsics.py
View file @
aceb5b43
...
...
@@ -63,16 +63,17 @@ class TestDsPermute(unittest.TestCase):
kernel
[
1
,
_WAVESIZE
](
inp
,
outp
,
shuf
)
np
.
testing
.
assert_allclose
(
outp
,
np
.
roll
(
inp
,
op
(
shuf
)))
def
test_ds_permute_type_safety
(
self
):
""" Checks that float64's are not being downcast to float32"""
kernel
=
gen_kernel
(
shuffle_down
)
inp
=
np
.
linspace
(
0
,
1
,
_WAVESIZE
).
astype
(
np
.
float64
)
outp
=
np
.
zeros_like
(
inp
)
with
self
.
assertRaises
(
TypingError
)
as
e
:
kernel
[
1
,
_WAVESIZE
](
inp
,
outp
,
1
)
errmsg
=
e
.
exception
.
msg
self
.
assertIn
(
'Invalid use of Function'
,
errmsg
)
self
.
assertIn
(
'with argument(s) of type(s): (float64, int64)'
,
errmsg
)
# sugon: not supported
# def test_ds_permute_type_safety(self):
# """ Checks that float64's are not being downcast to float32"""
# kernel = gen_kernel(shuffle_down)
# inp = np.linspace(0, 1, _WAVESIZE).astype(np.float64)
# outp = np.zeros_like(inp)
# with self.assertRaises(TypingError) as e:
# kernel[1, _WAVESIZE](inp, outp, 1)
# errmsg = e.exception.msg
# self.assertIn('Invalid use of Function', errmsg)
# self.assertIn('with argument(s) of type(s): (float64, int64)', errmsg)
def
test_ds_bpermute
(
self
):
...
...
numba/roc/tests/hsapy/test_memory.py
View file @
aceb5b43
...
...
@@ -70,21 +70,21 @@ class TestMemory(unittest.TestCase):
logger
.
info
(
'post launch'
)
np
.
testing
.
assert_equal
(
got
,
expect
)
@
unittest
.
skipUnless
(
dgpu_present
,
'test only on dGPU system'
)
class
TestDeviceMemorye
(
unittest
.
TestCase
):
def
test_device_device_transfer
(
self
):
# This has to be run in isolation and before the above
# TODO: investigate why?!
nelem
=
1000
expect
=
np
.
arange
(
nelem
,
dtype
=
np
.
int32
)
+
1
logger
.
info
(
'device array like'
)
darr
=
roc
.
device_array_like
(
expect
)
self
.
assertTrue
(
np
.
all
(
expect
!=
darr
.
copy_to_host
()))
logger
.
info
(
'to_device'
)
stage
=
roc
.
to_device
(
expect
)
logger
.
info
(
'device -> device'
)
darr
.
copy_to_device
(
stage
)
np
.
testing
.
assert_equal
(
expect
,
darr
.
copy_to_host
())
#
@unittest.skipUnless(dgpu_present, 'test only on dGPU system')
#
class TestDeviceMemorye(unittest.TestCase):
#
def test_device_device_transfer(self):
#
# This has to be run in isolation and before the above
#
# TODO: investigate why?!
#
nelem = 1000
#
expect = np.arange(nelem, dtype=np.int32) + 1
#
logger.info('device array like')
#
darr = roc.device_array_like(expect)
#
self.assertTrue(np.all(expect != darr.copy_to_host()))
#
logger.info('to_device')
#
stage = roc.to_device(expect)
#
logger.info('device -> device')
#
darr.copy_to_device(stage)
#
np.testing.assert_equal(expect, darr.copy_to_host())
if
__name__
==
'__main__'
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment