Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
liupw
numba-DTK
Commits
aceb5b43
Commit
aceb5b43
authored
May 17, 2024
by
dugupeiwen
Browse files
adapt for DTK gfx906 and fix the difference in numba version
parent
3e5f428e
Changes
19
Hide whitespace changes
Inline
Side-by-side
Showing
19 changed files
with
282 additions
and
112 deletions
+282
-112
numba/core/base.py
numba/core/base.py
+41
-7
numba/core/target_extension.py
numba/core/target_extension.py
+2
-0
numba/roc/__init__.py
numba/roc/__init__.py
+18
-0
numba/roc/codegen.py
numba/roc/codegen.py
+9
-7
numba/roc/compiler.py
numba/roc/compiler.py
+7
-2
numba/roc/descriptor.py
numba/roc/descriptor.py
+40
-0
numba/roc/dispatch.py
numba/roc/dispatch.py
+10
-4
numba/roc/hlc/__init__.py
numba/roc/hlc/__init__.py
+13
-6
numba/roc/hlc/common.py
numba/roc/hlc/common.py
+28
-11
numba/roc/hlc/hlc.py
numba/roc/hlc/hlc.py
+4
-4
numba/roc/hlc/libhlc.py
numba/roc/hlc/libhlc.py
+18
-2
numba/roc/hsadrv/driver.py
numba/roc/hsadrv/driver.py
+10
-4
numba/roc/hsadrv/drvapi.py
numba/roc/hsadrv/drvapi.py
+12
-8
numba/roc/hsaimpl.py
numba/roc/hsaimpl.py
+16
-15
numba/roc/initialize.py
numba/roc/initialize.py
+2
-2
numba/roc/target.py
numba/roc/target.py
+18
-12
numba/roc/tests/hsadrv/test_driver.py
numba/roc/tests/hsadrv/test_driver.py
+8
-3
numba/roc/tests/hsapy/test_intrinsics.py
numba/roc/tests/hsapy/test_intrinsics.py
+11
-10
numba/roc/tests/hsapy/test_memory.py
numba/roc/tests/hsapy/test_memory.py
+15
-15
No files found.
numba/core/base.py
View file @
aceb5b43
...
@@ -258,20 +258,45 @@ class BaseContext(object):
...
@@ -258,20 +258,45 @@ class BaseContext(object):
For subclasses to add initializer
For subclasses to add initializer
"""
"""
# def refresh(self):
# """
# Refresh context with new declarations from known registries.
# Useful for third-party extensions.
# """
# # load target specific registries
# self.load_additional_registries()
# # Populate the builtin registry, this has to happen after loading
# # additional registries as some of the "additional" registries write
# # their implementations into the builtin_registry and would be missed if
# # this ran first.
# self.install_registry(builtin_registry)
# # Also refresh typing context, since @overload declarations can
# # affect it.
# self.typing_context.refresh()
# sugon: Roll back the refresh implementation to numba-0.53 in order to adapt to roc.
# There may be risks here.
def
refresh
(
self
):
def
refresh
(
self
):
"""
"""
Refresh context with new declarations from known registries.
Refresh context with new declarations from known registries.
Useful for third-party extensions.
Useful for third-party extensions.
"""
"""
# load target specific registries
# sugon: apapt for numba53-roc, can have bugs.
self
.
load_additional_registries
()
# Populate built-in registry
from
numba.cpython
import
(
slicing
,
tupleobj
,
enumimpl
,
hashing
,
heapq
,
iterators
,
numbers
,
rangeobj
)
from
numba.core
import
optional
from
numba.misc
import
gdb_hook
,
literal
from
numba.np
import
linalg
,
polynomial
,
arraymath
# Populate the builtin registry, this has to happen after loading
try
:
# additional registries as some of the "additional" registries writ
e
from
numba.np
import
npdatetim
e
# their implementations into the builtin_registry and would be missed if
except
NotImplementedError
:
# this ran first.
pass
self
.
install_registry
(
builtin_registry
)
self
.
install_registry
(
builtin_registry
)
self
.
load_additional_registries
()
# Also refresh typing context, since @overload declarations can
# Also refresh typing context, since @overload declarations can
# affect it.
# affect it.
self
.
typing_context
.
refresh
()
self
.
typing_context
.
refresh
()
...
@@ -389,6 +414,15 @@ class BaseContext(object):
...
@@ -389,6 +414,15 @@ class BaseContext(object):
impl
=
user_function
(
fndesc
,
libs
)
impl
=
user_function
(
fndesc
,
libs
)
self
.
_defns
[
func
].
append
(
impl
,
impl
.
signature
)
self
.
_defns
[
func
].
append
(
impl
,
impl
.
signature
)
# sugon: for numba-roc-0.53, support add_user_function function again.
# Version 0.56.1: PR `#7865 <https://github.com/numba/numba/pull/7865>`_: Remove add_user_function
def
add_user_function
(
self
,
func
,
fndesc
,
libs
=
()):
if
func
not
in
self
.
_defns
:
msg
=
"{func} is not a registered user function"
raise
KeyError
(
msg
.
format
(
func
=
func
))
impl
=
user_function
(
fndesc
,
libs
)
self
.
_defns
[
func
].
append
(
impl
,
impl
.
signature
)
def
insert_generator
(
self
,
genty
,
gendesc
,
libs
=
()):
def
insert_generator
(
self
,
genty
,
gendesc
,
libs
=
()):
assert
isinstance
(
genty
,
types
.
Generator
)
assert
isinstance
(
genty
,
types
.
Generator
)
impl
=
user_generator
(
gendesc
,
libs
)
impl
=
user_generator
(
gendesc
,
libs
)
...
...
numba/core/target_extension.py
View file @
aceb5b43
...
@@ -161,6 +161,8 @@ target_registry['gpu'] = GPU
...
@@ -161,6 +161,8 @@ target_registry['gpu'] = GPU
target_registry
[
'CUDA'
]
=
CUDA
target_registry
[
'CUDA'
]
=
CUDA
target_registry
[
'cuda'
]
=
CUDA
target_registry
[
'cuda'
]
=
CUDA
target_registry
[
'ROCm'
]
=
ROCm
target_registry
[
'ROCm'
]
=
ROCm
# sugon: support ROC
target_registry
[
'roc'
]
=
ROCm
target_registry
[
'npyufunc'
]
=
NPyUfunc
target_registry
[
'npyufunc'
]
=
NPyUfunc
dispatcher_registry
=
DelayedRegistry
(
key_type
=
Target
)
dispatcher_registry
=
DelayedRegistry
(
key_type
=
Target
)
...
...
numba/roc/__init__.py
View file @
aceb5b43
...
@@ -35,6 +35,24 @@ if is_available():
...
@@ -35,6 +35,24 @@ if is_available():
else
:
else
:
agents
=
[]
agents
=
[]
# sugon: adapt for numba-0.58, refer to numba/cuda/initialize.py, shoule move to numba/roc/initialize.py.
# TODO: suppot ROCmDispatcher completely.
def
initialize_all
():
from
numba.roc.decorators
import
jit
from
numba.core
import
dispatcher
from
numba.roc.descriptor
import
HSATargetDesc
from
numba.core.target_extension
import
(
target_registry
,
dispatcher_registry
,
jit_registry
)
class
ROCmDispatcher
(
dispatcher
.
Dispatcher
):
targetdescr
=
HSATargetDesc
(
'ROCm'
)
roc_target
=
target_registry
[
"ROCm"
]
jit_registry
[
roc_target
]
=
jit
dispatcher_registry
[
roc_target
]
=
ROCmDispatcher
initialize_all
()
def
test
(
*
args
,
**
kwargs
):
def
test
(
*
args
,
**
kwargs
):
if
not
is_available
():
if
not
is_available
():
raise
RuntimeError
(
"HSA is not detected"
)
raise
RuntimeError
(
"HSA is not detected"
)
...
...
numba/roc/codegen.py
View file @
aceb5b43
...
@@ -2,7 +2,7 @@ from llvmlite import binding as ll
...
@@ -2,7 +2,7 @@ from llvmlite import binding as ll
# from llvmlite.llvmpy import core as lc
# from llvmlite.llvmpy import core as lc
import
llvmlite.ir
as
llvmir
import
llvmlite.ir
as
llvmir
from
numba.core
import
utils
from
numba.core
import
utils
from
numba.core.codegen
import
Codegen
,
CodeLibrary
,
CPUCodeLibrary
from
numba.core.codegen
import
Codegen
,
CPUCodegen
,
CodeLibrary
,
CPUCodeLibrary
from
.hlc
import
DATALAYOUT
,
TRIPLE
,
hlc
from
.hlc
import
DATALAYOUT
,
TRIPLE
,
hlc
class
HSACodeLibrary
(
CPUCodeLibrary
):
class
HSACodeLibrary
(
CPUCodeLibrary
):
...
@@ -16,13 +16,15 @@ class HSACodeLibrary(CPUCodeLibrary):
...
@@ -16,13 +16,15 @@ class HSACodeLibrary(CPUCodeLibrary):
pass
pass
def
get_asm_str
(
self
):
def
get_asm_str
(
self
):
# sugon: there has a bug. Don't print ASM code.
return
"ROC Not support get_asm_str
\n
"
"""
"""
Get the human-readable assembly.
Get the human-readable assembly.
"""
"""
m
=
hlc
.
Module
()
#
m = hlc.Module()
m
.
load_llvm
(
str
(
self
.
_final_module
))
#
m.load_llvm(str(self._final_module))
out
=
m
.
finalize
()
#
out = m.finalize()
return
str
(
out
.
hsail
)
#
return str(out.hsail)
# class JITHSACodegen(Codegen):
# class JITHSACodegen(Codegen):
...
@@ -47,7 +49,7 @@ class HSACodeLibrary(CPUCodeLibrary):
...
@@ -47,7 +49,7 @@ class HSACodeLibrary(CPUCodeLibrary):
# def _add_module(self, module):
# def _add_module(self, module):
# pass
# pass
class
JITHSACodegen
(
Codegen
):
class
JITHSACodegen
(
CPU
Codegen
):
_library_class
=
HSACodeLibrary
_library_class
=
HSACodeLibrary
def
__init__
(
self
,
module_name
):
def
__init__
(
self
,
module_name
):
...
@@ -66,7 +68,7 @@ class JITHSACodegen(Codegen):
...
@@ -66,7 +68,7 @@ class JITHSACodegen(Codegen):
def
_init
(
self
,
llvm_module
):
def
_init
(
self
,
llvm_module
):
assert
list
(
llvm_module
.
global_variables
)
==
[],
"Module isn't empty"
assert
list
(
llvm_module
.
global_variables
)
==
[],
"Module isn't empty"
self
.
_data_layout
=
DATALAYOUT
[
utils
.
MACHINE_BITS
]
self
.
_data_layout
=
DATALAYOUT
self
.
_target_data
=
ll
.
create_target_data
(
self
.
_data_layout
)
self
.
_target_data
=
ll
.
create_target_data
(
self
.
_data_layout
)
def
_create_empty_module
(
self
,
name
):
def
_create_empty_module
(
self
,
name
):
...
...
numba/roc/compiler.py
View file @
aceb5b43
...
@@ -238,6 +238,8 @@ class _CachedProgram(object):
...
@@ -238,6 +238,8 @@ class _CachedProgram(object):
ex
=
driver
.
Executable
()
ex
=
driver
.
Executable
()
ex
.
load
(
agent
,
code
)
ex
.
load
(
agent
,
code
)
ex
.
freeze
()
ex
.
freeze
()
# sugon: for rocm-4.0 or more, the kernel symbol needs to actively add the kd suffix.
symbol
=
symbol
+
".kd"
symobj
=
ex
.
get_symbol
(
agent
,
symbol
)
symobj
=
ex
.
get_symbol
(
agent
,
symbol
)
regions
=
agent
.
regions
.
globals
regions
=
agent
.
regions
.
globals
for
reg
in
regions
:
for
reg
in
regions
:
...
@@ -275,9 +277,12 @@ class HSAKernel(HSAKernelBase):
...
@@ -275,9 +277,12 @@ class HSAKernel(HSAKernelBase):
"""
"""
Temporary workaround for register limit
Temporary workaround for register limit
"""
"""
m
=
re
.
search
(
r
"\bwavefront_sgpr_count\s*=\s*(\d+)"
,
self
.
assembly
)
# sugon: meta data is changed.
# m = re.search(r"\bwavefront_sgpr_count\s*=\s*(\d+)", self.assembly)
m
=
re
.
search
(
r
"\.sgpr_count:\s+(\d+)"
,
self
.
assembly
)
self
.
_wavefront_sgpr_count
=
int
(
m
.
group
(
1
))
self
.
_wavefront_sgpr_count
=
int
(
m
.
group
(
1
))
m
=
re
.
search
(
r
"\bworkitem_vgpr_count\s*=\s*(\d+)"
,
self
.
assembly
)
m
=
re
.
search
(
r
"\.vgpr_count:\s+(\d+)"
,
self
.
assembly
)
# m = re.search(r"\bworkitem_vgpr_count\s*=\s*(\d+)", self.assembly)
self
.
_workitem_vgpr_count
=
int
(
m
.
group
(
1
))
self
.
_workitem_vgpr_count
=
int
(
m
.
group
(
1
))
def
_sentry_resource_limit
(
self
):
def
_sentry_resource_limit
(
self
):
...
...
numba/roc/descriptor.py
View file @
aceb5b43
...
@@ -11,3 +11,43 @@ class HSATargetDesc(TargetDescriptor):
...
@@ -11,3 +11,43 @@ class HSATargetDesc(TargetDescriptor):
options
=
HSATargetOptions
options
=
HSATargetOptions
typingctx
=
HSATypingContext
()
typingctx
=
HSATypingContext
()
targetctx
=
HSATargetContext
(
typingctx
)
targetctx
=
HSATargetContext
(
typingctx
)
# sugon: from dispatcher.Dispatcher
typing_context
=
typingctx
target_context
=
targetctx
# ## sugon TODO: support ROCmDispatcher
# class HSATargetDesc(TargetDescriptor):
# def __init__(self, name):
# self.options = HSATargetOptions
# # The typing and target contexts are initialized only when needed -
# # this prevents an attempt to load CUDA libraries at import time on
# # systems that might not have them present.
# self._typingctx = None
# self._targetctx = None
# super().__init__(name)
# @property
# def typing_context(self):
# if self._typingctx is None:
# self._typingctx = HSATypingContext()
# return self._typingctx
# @property
# def target_context(self):
# if self._targetctx is None:
# self._targetctx = HSATargetContext(self._typingctx)
# return self._targetctx
# @property
# def typingctx(self):
# if self._typingctx is None:
# self._typingctx = HSATypingContext()
# return self._typingctx
# @property
# def targetctx(self):
# if self._targetctx is None:
# self._targetctx = HSATargetContext(self._typingctx)
# return self._targetctx
\ No newline at end of file
numba/roc/dispatch.py
View file @
aceb5b43
# sugon
# This file refers to CUDA, move to vectorizers.py file
# Refer to CUDA dispatcher.py for numba-0.58, this file should be transformed into a kernel scheduling
import
numpy
as
np
import
numpy
as
np
# from numba.np.ufunc.deviceufunc import (UFuncMechanism, GenerializedUFunc,
# from numba.np.ufunc.deviceufunc import (UFuncMechanism, GenerializedUFunc,
...
@@ -64,8 +67,8 @@ class HsaUFuncMechanism(UFuncMechanism):
...
@@ -64,8 +67,8 @@ class HsaUFuncMechanism(UFuncMechanism):
count
=
(
count
+
(
ilp
-
1
))
//
ilp
count
=
(
count
+
(
ilp
-
1
))
//
ilp
blockcount
=
(
count
+
(
tpb
-
1
))
//
tpb
blockcount
=
(
count
+
(
tpb
-
1
))
//
tpb
func
[
blockcount
,
tpb
](
*
args
)
func
[
blockcount
,
tpb
](
*
args
)
# sugon: adapt for numba-0.58
def
device_array
(
self
,
shape
,
dtype
,
stream
):
def
allocate_
device_array
(
self
,
shape
,
dtype
,
stream
):
if
dgpu_present
:
if
dgpu_present
:
return
api
.
device_array
(
shape
=
shape
,
dtype
=
dtype
)
return
api
.
device_array
(
shape
=
shape
,
dtype
=
dtype
)
else
:
else
:
...
@@ -97,6 +100,9 @@ class _HsaGUFuncCallSteps(GUFuncCallSteps):
...
@@ -97,6 +100,9 @@ class _HsaGUFuncCallSteps(GUFuncCallSteps):
return
devicearray
.
is_hsa_ndarray
(
obj
)
return
devicearray
.
is_hsa_ndarray
(
obj
)
else
:
else
:
return
True
return
True
# sugon: adapt for numba-0.58
def
as_device_array
(
self
,
obj
):
pass
def
to_device
(
self
,
hostary
):
def
to_device
(
self
,
hostary
):
if
dgpu_present
:
if
dgpu_present
:
...
@@ -110,8 +116,8 @@ class _HsaGUFuncCallSteps(GUFuncCallSteps):
...
@@ -110,8 +116,8 @@ class _HsaGUFuncCallSteps(GUFuncCallSteps):
return
out
return
out
else
:
else
:
pass
pass
# sugon: adapt for numba-58
def
device_array
(
self
,
shape
,
dtype
):
def
allocate_
device_array
(
self
,
shape
,
dtype
):
if
dgpu_present
:
if
dgpu_present
:
return
api
.
device_array
(
shape
=
shape
,
dtype
=
dtype
)
return
api
.
device_array
(
shape
=
shape
,
dtype
=
dtype
)
else
:
else
:
...
...
numba/roc/hlc/__init__.py
View file @
aceb5b43
...
@@ -4,13 +4,20 @@ import os
...
@@ -4,13 +4,20 @@ import os
# See:
# See:
# https://github.com/RadeonOpenCompute/llvm/blob/b20b796f65ab6ac12fac4ea32e1d89e1861dee6a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp#L270-L275
# https://github.com/RadeonOpenCompute/llvm/blob/b20b796f65ab6ac12fac4ea32e1d89e1861dee6a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp#L270-L275
# Alloc goes into addrspace(5) (private)
# Alloc goes into addrspace(5) (private)
DATALAYOUT
=
{
# DATALAYOUT = {
64
:
(
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
# 64: ("e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
"-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
# "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
"-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
),
# "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1"
}
# "-ni:7"),
# }
TRIPLE
=
"amdgcn--amdhsa"
# sugon: adapt for gfx906
DATALAYOUT
=
(
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
"-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
"-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1"
"-ni:7"
)
TRIPLE
=
"amdgcn-amd-amdhsa"
# Allow user to use "NUMBA_USE_LIBHLC" env-var to use cmdline HLC.
# Allow user to use "NUMBA_USE_LIBHLC" env-var to use cmdline HLC.
if
os
.
environ
.
get
(
'NUMBA_USE_LIBHLC'
,
''
).
lower
()
not
in
[
'0'
,
'no'
,
'false'
]:
if
os
.
environ
.
get
(
'NUMBA_USE_LIBHLC'
,
''
).
lower
()
not
in
[
'0'
,
'no'
,
'false'
]:
...
...
numba/roc/hlc/common.py
View file @
aceb5b43
...
@@ -127,16 +127,30 @@ class AMDGCNModule(object):
...
@@ -127,16 +127,30 @@ class AMDGCNModule(object):
The AMDCGN LLVM module contract
The AMDCGN LLVM module contract
"""
"""
# bitcodes = [
# "opencl.amdgcn.bc",
# "ocml.amdgcn.bc",
# "ockl.amdgcn.bc",
# "oclc_correctly_rounded_sqrt_off.amdgcn.bc",
# "oclc_daz_opt_off.amdgcn.bc",
# "oclc_finite_only_off.amdgcn.bc",
# "oclc_isa_version_803.amdgcn.bc",
# "oclc_unsafe_math_off.amdgcn.bc",
# "irif.amdgcn.bc"
# ]
# sugon: adapt for DTK
bitcodes
=
[
bitcodes
=
[
"opencl.amdgcn.bc"
,
"hip.bc"
,
"ocml.amdgcn.bc"
,
"opencl.bc"
,
"ockl.amdgcn.bc"
,
"ocml.bc"
,
"oclc_correctly_rounded_sqrt_off.amdgcn.bc"
,
"ockl.bc"
,
"oclc_daz_opt_off.amdgcn.bc"
,
"oclc_correctly_rounded_sqrt_off.bc"
,
"oclc_finite_only_off.amdgcn.bc"
,
"oclc_daz_opt_off.bc"
,
"oclc_isa_version_803.amdgcn.bc"
,
"oclc_finite_only_off.bc"
,
"oclc_unsafe_math_off.amdgcn.bc"
,
"oclc_isa_version_906.bc"
,
"irif.amdgcn.bc"
"oclc_unsafe_math_off.bc"
,
"oclc_abi_version_400.bc"
,
"oclc_wavefrontsize64_on.bc"
]
]
def
__init__
(
self
):
def
__init__
(
self
):
...
@@ -144,8 +158,11 @@ class AMDGCNModule(object):
...
@@ -144,8 +158,11 @@ class AMDGCNModule(object):
def
_preprocess
(
self
,
llvmir
):
def
_preprocess
(
self
,
llvmir
):
version_adapted
=
adapt_llvm_version
(
llvmir
)
version_adapted
=
adapt_llvm_version
(
llvmir
)
alloca_fixed
=
alloca_addrspace_correction
(
version_adapted
)
# sugon: IR -level address space conversion, not support.
return
alloca_fixed
# TODO: support
# alloca_fixed = alloca_addrspace_correction(version_adapted)
# return alloca_fixed
return
version_adapted
def
load_llvm
(
self
,
llvmir
):
def
load_llvm
(
self
,
llvmir
):
pass
pass
...
...
numba/roc/hlc/hlc.py
View file @
aceb5b43
...
@@ -102,9 +102,9 @@ class CmdLine(object):
...
@@ -102,9 +102,9 @@ class CmdLine(object):
"-S"
,
"-S"
,
"-o {fout}"
,
"-o {fout}"
,
"{fin}"
])
"{fin}"
])
# sugon: adapt for DTK. BRIG has been abandoned, using a new binary generation command.
self
.
CMD_LINK_BRIG
=
' '
.
join
([
self
.
ld_
lld
,
self
.
CMD_LINK_BRIG
=
' '
.
join
([
self
.
lld
,
"-
shared
"
,
"-
flavor gnu --no-undefined -shared -plugin-opt=-amdgpu-internalize-symbols -plugin-opt=mcpu=gfx906 -plugin-opt=O3 -plugin-opt=-amdgpu-early-inline-all=true -plugin-opt=-amdgpu-function-calls=false
"
,
"-o {fout}"
,
"-o {fout}"
,
"{fin}"
])
"{fin}"
])
...
@@ -121,7 +121,7 @@ class CmdLine(object):
...
@@ -121,7 +121,7 @@ class CmdLine(object):
self
.
opt
=
_setup_path
(
"opt"
)
self
.
opt
=
_setup_path
(
"opt"
)
self
.
llc
=
_setup_path
(
"llc"
)
self
.
llc
=
_setup_path
(
"llc"
)
self
.
llvm_link
=
_setup_path
(
"llvm-link"
)
self
.
llvm_link
=
_setup_path
(
"llvm-link"
)
self
.
ld_
lld
=
_setup_path
(
"
ld.
lld"
)
self
.
lld
=
_setup_path
(
"lld"
)
self
.
triple_flag
=
"-mtriple %s"
%
self
.
_triple
self
.
triple_flag
=
"-mtriple %s"
%
self
.
_triple
self
.
initialized
=
False
self
.
initialized
=
False
...
...
numba/roc/hlc/libhlc.py
View file @
aceb5b43
...
@@ -79,11 +79,19 @@ class HLC(object):
...
@@ -79,11 +79,19 @@ class HLC(object):
]
]
type
(
self
).
hlc
=
hlc
type
(
self
).
hlc
=
hlc
# sugon debug info
# def write_buf_to_file(self, buf, file_path):
# content = buf.value.decode("latin1")
# with open(file_path, 'w') as file:
# file.write(content)
def
parse_assembly
(
self
,
ir
):
def
parse_assembly
(
self
,
ir
):
if
isinstance
(
ir
,
str
):
if
isinstance
(
ir
,
str
):
ir
=
ir
.
encode
(
"latin1"
)
ir
=
ir
.
encode
(
"latin1"
)
buf
=
create_string_buffer
(
ir
)
buf
=
create_string_buffer
(
ir
)
# sugon debug info
# store_file = "//public//home//liupw//TP_clang//NUMBA_TEST//conda_test//numba-test.ll"
# self.write_buf_to_file(buf, store_file)
mod
=
self
.
hlc
.
ROC_ParseModule
(
buf
)
mod
=
self
.
hlc
.
ROC_ParseModule
(
buf
)
if
not
mod
:
if
not
mod
:
raise
Error
(
"Failed to parse assembly"
)
raise
Error
(
"Failed to parse assembly"
)
...
@@ -113,7 +121,9 @@ class HLC(object):
...
@@ -113,7 +121,9 @@ class HLC(object):
ret
=
buf
.
value
.
decode
(
"latin1"
)
ret
=
buf
.
value
.
decode
(
"latin1"
)
self
.
hlc
.
ROC_DisposeString
(
buf
)
self
.
hlc
.
ROC_DisposeString
(
buf
)
return
ret
return
ret
# sugon
# "clang-14" -cc1as -triple amdgcn-amd-amdhsa -filetype obj -main-file-name moduleload.cu -target-cpu gfx906 -mrelocation-model pic --mrelax-relocations -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false -o numba.o numba.s
# "lld" -flavor gnu --no-undefined -shared -plugin-opt=-amdgpu-internalize-symbols -plugin-opt=mcpu=gfx906 -plugin-opt=O3 -plugin-opt=-amdgpu-early-inline-all=true -plugin-opt=-amdgpu-function-calls=false -o numba.out numba.o
def
_link_brig
(
self
,
upbrig_loc
,
patchedbrig_loc
):
def
_link_brig
(
self
,
upbrig_loc
,
patchedbrig_loc
):
cli
.
link_brig
(
upbrig_loc
,
patchedbrig_loc
)
cli
.
link_brig
(
upbrig_loc
,
patchedbrig_loc
)
...
@@ -131,6 +141,12 @@ class HLC(object):
...
@@ -131,6 +141,12 @@ class HLC(object):
else
:
else
:
ret
=
bytes
(
buffer
(
buf
))
ret
=
bytes
(
buffer
(
buf
))
self
.
hlc
.
ROC_DisposeString
(
buf
)
self
.
hlc
.
ROC_DisposeString
(
buf
)
# sugon debug info
# print("HLC to_brig: write ROC_ModuleEmitBRIG result")
# with open("output.brig", "wb") as brig_file:
# brig_file.write(ret)
# Now we have an ELF, this needs patching with ld.lld which doesn't
# Now we have an ELF, this needs patching with ld.lld which doesn't
# have an API. So we write out `ret` to a temporary file, then call
# have an API. So we write out `ret` to a temporary file, then call
# the ld.lld ELF linker main() on it to generate a patched ELF
# the ld.lld ELF linker main() on it to generate a patched ELF
...
...
numba/roc/hsadrv/driver.py
View file @
aceb5b43
...
@@ -769,11 +769,13 @@ class Queue(object):
...
@@ -769,11 +769,13 @@ class Queue(object):
ctypes
.
sizeof
(
drvapi
.
hsa_kernel_dispatch_packet_t
))
ctypes
.
sizeof
(
drvapi
.
hsa_kernel_dispatch_packet_t
))
packet_array_t
=
(
packet_type
*
queue_struct
.
size
)
packet_array_t
=
(
packet_type
*
queue_struct
.
size
)
# sugon: adapt for DTK
# Obtain the current queue write index
# Obtain the current queue write index
index
=
hsa
.
hsa_queue_add_write_index_acq_rel
(
self
.
_id
,
1
)
index
=
hsa
.
hsa_queue_add_write_index_
sc
acq_
sc
rel
(
self
.
_id
,
1
)
while
True
:
while
True
:
read_offset
=
hsa
.
hsa_queue_load_read_index_acquire
(
self
.
_id
)
# sugon: adapt for DTK
read_offset
=
hsa
.
hsa_queue_load_read_index_scacquire
(
self
.
_id
)
if
read_offset
<=
index
<
read_offset
+
queue_struct
.
size
:
if
read_offset
<=
index
<
read_offset
+
queue_struct
.
size
:
break
break
...
@@ -786,7 +788,8 @@ class Queue(object):
...
@@ -786,7 +788,8 @@ class Queue(object):
yield
packet
yield
packet
# Increment write index
# Increment write index
# Ring the doorbell
# Ring the doorbell
hsa
.
hsa_signal_store_release
(
self
.
_id
.
contents
.
doorbell_signal
,
index
)
# sugon: adapt for DTK
hsa
.
hsa_signal_store_screlease
(
self
.
_id
.
contents
.
doorbell_signal
,
index
)
def
insert_barrier
(
self
,
dep_signal
):
def
insert_barrier
(
self
,
dep_signal
):
with
self
.
_get_packet
(
drvapi
.
hsa_barrier_and_packet_t
)
as
packet
:
with
self
.
_get_packet
(
drvapi
.
hsa_barrier_and_packet_t
)
as
packet
:
...
@@ -911,7 +914,8 @@ class Signal(object):
...
@@ -911,7 +914,8 @@ class Signal(object):
expire
=
timeout
*
hsa
.
timestamp_frequency
*
mhz
expire
=
timeout
*
hsa
.
timestamp_frequency
*
mhz
# XXX: use active wait instead of blocked seem to avoid hang in docker
# XXX: use active wait instead of blocked seem to avoid hang in docker
hsa
.
hsa_signal_wait_acquire
(
self
.
_id
,
enums
.
HSA_SIGNAL_CONDITION_NE
,
# sugon: adapt for DTK
hsa
.
hsa_signal_wait_scacquire
(
self
.
_id
,
enums
.
HSA_SIGNAL_CONDITION_NE
,
one
,
expire
,
one
,
expire
,
enums
.
HSA_WAIT_STATE_ACTIVE
)
enums
.
HSA_WAIT_STATE_ACTIVE
)
return
self
.
load_relaxed
()
!=
one
return
self
.
load_relaxed
()
!=
one
...
@@ -1156,6 +1160,8 @@ class OwnedPointer(object):
...
@@ -1156,6 +1160,8 @@ class OwnedPointer(object):
self
.
_mem
.
refct
-=
1
self
.
_mem
.
refct
-=
1
assert
self
.
_mem
.
refct
>=
0
assert
self
.
_mem
.
refct
>=
0
if
self
.
_mem
.
refct
==
0
:
if
self
.
_mem
.
refct
==
0
:
# sugon: there has a bug, free except.
# from https://numba.pydata.org/numba-doc/latest/roc/ufunc.html#async-execution-a-chunk-at-a-time
self
.
_mem
.
free
()
self
.
_mem
.
free
()
except
ReferenceError
:
except
ReferenceError
:
pass
pass
...
...
numba/roc/hsadrv/drvapi.py
View file @
aceb5b43
...
@@ -545,10 +545,11 @@ API_PROTOTYPES = {
...
@@ -545,10 +545,11 @@ API_PROTOTYPES = {
'argtypes'
:
[
hsa_signal_t
,
hsa_signal_value_t
]
'argtypes'
:
[
hsa_signal_t
,
hsa_signal_value_t
]
},
},
# void hsa_signal_store_release(
# sugon: adapt for DTK
# void hsa_signal_store_screlease(
# hsa_signal_t signal,
# hsa_signal_t signal,
# hsa_signal_value_t value);
# hsa_signal_value_t value);
'hsa_signal_store_release'
:
{
'hsa_signal_store_
sc
release'
:
{
'restype'
:
None
,
'restype'
:
None
,
'argtypes'
:
[
hsa_signal_t
,
hsa_signal_value_t
],
'argtypes'
:
[
hsa_signal_t
,
hsa_signal_value_t
],
},
},
...
@@ -785,13 +786,14 @@ API_PROTOTYPES = {
...
@@ -785,13 +786,14 @@ API_PROTOTYPES = {
'argtypes'
:
[
hsa_signal_t
,
hsa_signal_value_t
]
'argtypes'
:
[
hsa_signal_t
,
hsa_signal_value_t
]
},
},
# sugon: adapt for DTK
# hsa_signal_value_t HSA_API
# hsa_signal_value_t HSA_API
# hsa_signal_wait_acquire(hsa_signal_t signal,
# hsa_signal_wait_
sc
acquire(hsa_signal_t signal,
# hsa_signal_condition_t condition,
# hsa_signal_condition_t condition,
# hsa_signal_value_t compare_value,
# hsa_signal_value_t compare_value,
# uint64_t timeout_hint,
# uint64_t timeout_hint,
# hsa_wait_state_t wait_state_hint);
# hsa_wait_state_t wait_state_hint);
'hsa_signal_wait_acquire'
:
{
'hsa_signal_wait_
sc
acquire'
:
{
'restype'
:
hsa_signal_value_t
,
'restype'
:
hsa_signal_value_t
,
'argtypes'
:
[
hsa_signal_t
,
'argtypes'
:
[
hsa_signal_t
,
hsa_signal_condition_t
,
hsa_signal_condition_t
,
...
@@ -868,8 +870,9 @@ API_PROTOTYPES = {
...
@@ -868,8 +870,9 @@ API_PROTOTYPES = {
'errcheck'
:
_check_error
'errcheck'
:
_check_error
},
},
# uint64_t hsa_queue_load_read_index_acquire(hsa_queue_t *queue);
# sugon: adapt for DTK
'hsa_queue_load_read_index_acquire'
:
{
# uint64_t hsa_queue_load_read_index_scacquire(hsa_queue_t *queue);
'hsa_queue_load_read_index_scacquire'
:
{
'restype'
:
ctypes
.
c_uint64
,
'restype'
:
ctypes
.
c_uint64
,
'argtypes'
:
[
_PTR
(
hsa_queue_t
)]
'argtypes'
:
[
_PTR
(
hsa_queue_t
)]
},
},
...
@@ -940,10 +943,11 @@ API_PROTOTYPES = {
...
@@ -940,10 +943,11 @@ API_PROTOTYPES = {
'argtypes'
:
[
_PTR
(
hsa_queue_t
),
ctypes
.
c_uint64
,
ctypes
.
c_uint64
]
'argtypes'
:
[
_PTR
(
hsa_queue_t
),
ctypes
.
c_uint64
,
ctypes
.
c_uint64
]
},
},
# uint64_t hsa_queue_add_write_index_acq_rel(
# sugon: adapt for DTK
# uint64_t hsa_queue_add_write_index_scacq_screl(
# hsa_queue_t *queue,
# hsa_queue_t *queue,
# uint64_t value);
# uint64_t value);
'hsa_queue_add_write_index_acq_rel'
:
{
'hsa_queue_add_write_index_
sc
acq_
sc
rel'
:
{
'restype'
:
ctypes
.
c_uint64
,
'restype'
:
ctypes
.
c_uint64
,
'argtypes'
:
[
_PTR
(
hsa_queue_t
),
ctypes
.
c_uint64
]
'argtypes'
:
[
_PTR
(
hsa_queue_t
),
ctypes
.
c_uint64
]
},
},
...
...
numba/roc/hsaimpl.py
View file @
aceb5b43
...
@@ -57,16 +57,16 @@ def _declare_function(context, builder, name, sig, cargs,
...
@@ -57,16 +57,16 @@ def _declare_function(context, builder, name, sig, cargs,
llargs
=
[
context
.
get_value_type
(
t
)
for
t
in
sig
.
args
]
llargs
=
[
context
.
get_value_type
(
t
)
for
t
in
sig
.
args
]
fnty
=
ir
.
FunctionType
(
llretty
,
llargs
)
fnty
=
ir
.
FunctionType
(
llretty
,
llargs
)
mangled
=
mangler
(
name
,
cargs
)
mangled
=
mangler
(
name
,
cargs
)
fn
=
mod
.
get_or_insert_function
(
fnty
,
mangled
)
fn
=
cgutils
.
get_or_insert_function
(
mod
,
fnty
,
mangled
)
fn
.
calling_convention
=
target
.
CC_SPIR_FUNC
fn
.
calling_convention
=
target
.
CC_SPIR_FUNC
return
fn
return
fn
# sugon: there need to use 'types.uint32' ,not string 'unsigned int'.
@
lower
(
stubs
.
get_global_id
,
types
.
uint32
)
@
lower
(
stubs
.
get_global_id
,
types
.
uint32
)
def
get_global_id_impl
(
context
,
builder
,
sig
,
args
):
def
get_global_id_impl
(
context
,
builder
,
sig
,
args
):
[
dim
]
=
args
[
dim
]
=
args
get_global_id
=
_declare_function
(
context
,
builder
,
'get_global_id'
,
sig
,
get_global_id
=
_declare_function
(
context
,
builder
,
'get_global_id'
,
sig
,
[
'unsigned
int
'
])
[
types
.
u
int
32
])
res
=
builder
.
call
(
get_global_id
,
[
dim
])
res
=
builder
.
call
(
get_global_id
,
[
dim
])
return
context
.
cast
(
builder
,
res
,
types
.
uintp
,
types
.
intp
)
return
context
.
cast
(
builder
,
res
,
types
.
uintp
,
types
.
intp
)
...
@@ -75,7 +75,7 @@ def get_global_id_impl(context, builder, sig, args):
...
@@ -75,7 +75,7 @@ def get_global_id_impl(context, builder, sig, args):
def
get_local_id_impl
(
context
,
builder
,
sig
,
args
):
def
get_local_id_impl
(
context
,
builder
,
sig
,
args
):
[
dim
]
=
args
[
dim
]
=
args
get_local_id
=
_declare_function
(
context
,
builder
,
'get_local_id'
,
sig
,
get_local_id
=
_declare_function
(
context
,
builder
,
'get_local_id'
,
sig
,
[
'unsigned
int
'
])
[
types
.
u
int
32
])
res
=
builder
.
call
(
get_local_id
,
[
dim
])
res
=
builder
.
call
(
get_local_id
,
[
dim
])
return
context
.
cast
(
builder
,
res
,
types
.
uintp
,
types
.
intp
)
return
context
.
cast
(
builder
,
res
,
types
.
uintp
,
types
.
intp
)
...
@@ -84,7 +84,7 @@ def get_local_id_impl(context, builder, sig, args):
...
@@ -84,7 +84,7 @@ def get_local_id_impl(context, builder, sig, args):
def
get_group_id_impl
(
context
,
builder
,
sig
,
args
):
def
get_group_id_impl
(
context
,
builder
,
sig
,
args
):
[
dim
]
=
args
[
dim
]
=
args
get_group_id
=
_declare_function
(
context
,
builder
,
'get_group_id'
,
sig
,
get_group_id
=
_declare_function
(
context
,
builder
,
'get_group_id'
,
sig
,
[
'unsigned
int
'
])
[
types
.
u
int
32
])
res
=
builder
.
call
(
get_group_id
,
[
dim
])
res
=
builder
.
call
(
get_group_id
,
[
dim
])
return
context
.
cast
(
builder
,
res
,
types
.
uintp
,
types
.
intp
)
return
context
.
cast
(
builder
,
res
,
types
.
uintp
,
types
.
intp
)
...
@@ -93,7 +93,7 @@ def get_group_id_impl(context, builder, sig, args):
...
@@ -93,7 +93,7 @@ def get_group_id_impl(context, builder, sig, args):
def
get_num_groups_impl
(
context
,
builder
,
sig
,
args
):
def
get_num_groups_impl
(
context
,
builder
,
sig
,
args
):
[
dim
]
=
args
[
dim
]
=
args
get_num_groups
=
_declare_function
(
context
,
builder
,
'get_num_groups'
,
sig
,
get_num_groups
=
_declare_function
(
context
,
builder
,
'get_num_groups'
,
sig
,
[
'unsigned
int
'
])
[
types
.
u
int
32
])
res
=
builder
.
call
(
get_num_groups
,
[
dim
])
res
=
builder
.
call
(
get_num_groups
,
[
dim
])
return
context
.
cast
(
builder
,
res
,
types
.
uintp
,
types
.
intp
)
return
context
.
cast
(
builder
,
res
,
types
.
uintp
,
types
.
intp
)
...
@@ -101,7 +101,7 @@ def get_num_groups_impl(context, builder, sig, args):
...
@@ -101,7 +101,7 @@ def get_num_groups_impl(context, builder, sig, args):
@
lower
(
stubs
.
get_work_dim
)
@
lower
(
stubs
.
get_work_dim
)
def
get_work_dim_impl
(
context
,
builder
,
sig
,
args
):
def
get_work_dim_impl
(
context
,
builder
,
sig
,
args
):
get_work_dim
=
_declare_function
(
context
,
builder
,
'get_work_dim'
,
sig
,
get_work_dim
=
_declare_function
(
context
,
builder
,
'get_work_dim'
,
sig
,
[
"
void
"
])
[
types
.
void
])
res
=
builder
.
call
(
get_work_dim
,
[])
res
=
builder
.
call
(
get_work_dim
,
[])
return
res
return
res
...
@@ -110,7 +110,7 @@ def get_work_dim_impl(context, builder, sig, args):
...
@@ -110,7 +110,7 @@ def get_work_dim_impl(context, builder, sig, args):
def
get_global_size_impl
(
context
,
builder
,
sig
,
args
):
def
get_global_size_impl
(
context
,
builder
,
sig
,
args
):
[
dim
]
=
args
[
dim
]
=
args
get_global_size
=
_declare_function
(
context
,
builder
,
'get_global_size'
,
get_global_size
=
_declare_function
(
context
,
builder
,
'get_global_size'
,
sig
,
[
'unsigned
int
'
])
sig
,
[
types
.
u
int
32
])
res
=
builder
.
call
(
get_global_size
,
[
dim
])
res
=
builder
.
call
(
get_global_size
,
[
dim
])
return
context
.
cast
(
builder
,
res
,
types
.
uintp
,
types
.
intp
)
return
context
.
cast
(
builder
,
res
,
types
.
uintp
,
types
.
intp
)
...
@@ -119,7 +119,7 @@ def get_global_size_impl(context, builder, sig, args):
...
@@ -119,7 +119,7 @@ def get_global_size_impl(context, builder, sig, args):
def
get_local_size_impl
(
context
,
builder
,
sig
,
args
):
def
get_local_size_impl
(
context
,
builder
,
sig
,
args
):
[
dim
]
=
args
[
dim
]
=
args
get_local_size
=
_declare_function
(
context
,
builder
,
'get_local_size'
,
get_local_size
=
_declare_function
(
context
,
builder
,
'get_local_size'
,
sig
,
[
'unsigned
int
'
])
sig
,
[
types
.
u
int
32
])
res
=
builder
.
call
(
get_local_size
,
[
dim
])
res
=
builder
.
call
(
get_local_size
,
[
dim
])
return
context
.
cast
(
builder
,
res
,
types
.
uintp
,
types
.
intp
)
return
context
.
cast
(
builder
,
res
,
types
.
uintp
,
types
.
intp
)
...
@@ -128,7 +128,7 @@ def get_local_size_impl(context, builder, sig, args):
...
@@ -128,7 +128,7 @@ def get_local_size_impl(context, builder, sig, args):
def
barrier_one_arg_impl
(
context
,
builder
,
sig
,
args
):
def
barrier_one_arg_impl
(
context
,
builder
,
sig
,
args
):
[
flags
]
=
args
[
flags
]
=
args
barrier
=
_declare_function
(
context
,
builder
,
'barrier'
,
sig
,
barrier
=
_declare_function
(
context
,
builder
,
'barrier'
,
sig
,
[
'unsigned
int
'
])
[
types
.
u
int
32
])
builder
.
call
(
barrier
,
[
flags
])
builder
.
call
(
barrier
,
[
flags
])
return
_void_value
return
_void_value
...
@@ -137,7 +137,7 @@ def barrier_no_arg_impl(context, builder, sig, args):
...
@@ -137,7 +137,7 @@ def barrier_no_arg_impl(context, builder, sig, args):
assert
not
args
assert
not
args
sig
=
types
.
void
(
types
.
uint32
)
sig
=
types
.
void
(
types
.
uint32
)
barrier
=
_declare_function
(
context
,
builder
,
'barrier'
,
sig
,
barrier
=
_declare_function
(
context
,
builder
,
'barrier'
,
sig
,
[
'unsigned
int
'
])
[
types
.
u
int
32
])
flags
=
context
.
get_constant
(
types
.
uint32
,
enums
.
CLK_GLOBAL_MEM_FENCE
)
flags
=
context
.
get_constant
(
types
.
uint32
,
enums
.
CLK_GLOBAL_MEM_FENCE
)
builder
.
call
(
barrier
,
[
flags
])
builder
.
call
(
barrier
,
[
flags
])
return
_void_value
return
_void_value
...
@@ -147,7 +147,7 @@ def barrier_no_arg_impl(context, builder, sig, args):
...
@@ -147,7 +147,7 @@ def barrier_no_arg_impl(context, builder, sig, args):
def
mem_fence_impl
(
context
,
builder
,
sig
,
args
):
def
mem_fence_impl
(
context
,
builder
,
sig
,
args
):
[
flags
]
=
args
[
flags
]
=
args
mem_fence
=
_declare_function
(
context
,
builder
,
'mem_fence'
,
sig
,
mem_fence
=
_declare_function
(
context
,
builder
,
'mem_fence'
,
sig
,
[
'unsigned
int
'
])
[
types
.
u
int
32
])
builder
.
call
(
mem_fence
,
[
flags
])
builder
.
call
(
mem_fence
,
[
flags
])
return
_void_value
return
_void_value
...
@@ -173,7 +173,7 @@ def activelanepermute_wavewidth_impl(context, builder, sig, args):
...
@@ -173,7 +173,7 @@ def activelanepermute_wavewidth_impl(context, builder, sig, args):
name
=
"__hsail_activelanepermute_wavewidth_b{0}"
.
format
(
bitwidth
)
name
=
"__hsail_activelanepermute_wavewidth_b{0}"
.
format
(
bitwidth
)
fnty
=
ir
.
FunctionType
(
intbitwidth
,
[
intbitwidth
,
i32
,
intbitwidth
,
i1
])
fnty
=
ir
.
FunctionType
(
intbitwidth
,
[
intbitwidth
,
i32
,
intbitwidth
,
i1
])
fn
=
builder
.
module
.
get_or_insert_function
(
fnty
,
name
=
name
)
fn
=
cgutils
.
get_or_insert_function
(
builder
,
fnty
,
name
=
name
)
fn
.
calling_convention
=
target
.
CC_SPIR_FUNC
fn
.
calling_convention
=
target
.
CC_SPIR_FUNC
def
cast
(
val
):
def
cast
(
val
):
...
@@ -265,7 +265,8 @@ def _generic_array(context, builder, shape, dtype, symbol_name, addrspace):
...
@@ -265,7 +265,8 @@ def _generic_array(context, builder, shape, dtype, symbol_name, addrspace):
lmod
=
builder
.
module
lmod
=
builder
.
module
# Create global variable in the requested address-space
# Create global variable in the requested address-space
gvmem
=
lmod
.
add_global_variable
(
laryty
,
symbol_name
,
addrspace
)
gvmem
=
cgutils
.
add_global_variable
(
lmod
,
laryty
,
symbol_name
,
addrspace
)
if
elemcount
<=
0
:
if
elemcount
<=
0
:
raise
ValueError
(
"array length <= 0"
)
raise
ValueError
(
"array length <= 0"
)
...
@@ -314,4 +315,4 @@ def _make_array(context, builder, dataptr, dtype, shape, layout='C'):
...
@@ -314,4 +315,4 @@ def _make_array(context, builder, dataptr, dtype, shape, layout='C'):
def
_get_target_data
(
context
):
def
_get_target_data
(
context
):
return
ll
.
create_target_data
(
hlc
.
DATALAYOUT
[
context
.
address_size
]
)
return
ll
.
create_target_data
(
hlc
.
DATALAYOUT
)
numba/roc/initialize.py
View file @
aceb5b43
...
@@ -7,7 +7,7 @@ def _initialize_ufunc():
...
@@ -7,7 +7,7 @@ def _initialize_ufunc():
return
HsaVectorize
return
HsaVectorize
Vectorize
.
target_registry
.
ondemand
[
'
roc
'
]
=
init_vectorize
Vectorize
.
target_registry
.
ondemand
[
'
ROCm
'
]
=
init_vectorize
def
_initialize_gufunc
():
def
_initialize_gufunc
():
...
@@ -18,7 +18,7 @@ def _initialize_gufunc():
...
@@ -18,7 +18,7 @@ def _initialize_gufunc():
return
HsaGUFuncVectorize
return
HsaGUFuncVectorize
GUVectorize
.
target_registry
.
ondemand
[
'
roc
'
]
=
init_guvectorize
GUVectorize
.
target_registry
.
ondemand
[
'
ROCm
'
]
=
init_guvectorize
_initialize_ufunc
()
_initialize_ufunc
()
...
...
numba/roc/target.py
View file @
aceb5b43
...
@@ -73,7 +73,7 @@ class HSATargetContext(BaseContext):
...
@@ -73,7 +73,7 @@ class HSATargetContext(BaseContext):
def
init
(
self
):
def
init
(
self
):
self
.
_internal_codegen
=
codegen
.
JITHSACodegen
(
"numba.hsa.jit"
)
self
.
_internal_codegen
=
codegen
.
JITHSACodegen
(
"numba.hsa.jit"
)
self
.
_target_data
=
\
self
.
_target_data
=
\
ll
.
create_target_data
(
DATALAYOUT
[
utils
.
MACHINE_BITS
]
)
ll
.
create_target_data
(
DATALAYOUT
)
# Override data model manager
# Override data model manager
self
.
data_model_manager
=
hsa_data_model_manager
self
.
data_model_manager
=
hsa_data_model_manager
...
@@ -82,7 +82,13 @@ class HSATargetContext(BaseContext):
...
@@ -82,7 +82,13 @@ class HSATargetContext(BaseContext):
self
.
insert_func_defn
(
hsaimpl
.
registry
.
functions
)
self
.
insert_func_defn
(
hsaimpl
.
registry
.
functions
)
self
.
insert_func_defn
(
mathimpl
.
registry
.
functions
)
self
.
insert_func_defn
(
mathimpl
.
registry
.
functions
)
# sugon: adapt for numba-0.58
# Overrides
def
create_module
(
self
,
name
):
return
self
.
_internal_codegen
.
_create_empty_module
(
name
)
# return lc.Module(name)
@
cached_property
@
cached_property
def
call_conv
(
self
):
def
call_conv
(
self
):
return
HSACallConv
(
self
)
return
HSACallConv
(
self
)
...
@@ -107,7 +113,7 @@ class HSATargetContext(BaseContext):
...
@@ -107,7 +113,7 @@ class HSATargetContext(BaseContext):
module
=
func
.
module
module
=
func
.
module
func
.
linkage
=
'linkonce_odr'
func
.
linkage
=
'linkonce_odr'
module
.
data_layout
=
DATALAYOUT
[
self
.
address_size
]
module
.
data_layout
=
DATALAYOUT
wrapper
=
self
.
generate_kernel_wrapper
(
func
,
argtypes
)
wrapper
=
self
.
generate_kernel_wrapper
(
func
,
argtypes
)
return
wrapper
return
wrapper
...
@@ -140,14 +146,13 @@ class HSATargetContext(BaseContext):
...
@@ -140,14 +146,13 @@ class HSATargetContext(BaseContext):
wrappername
=
'hsaPy_{name}'
.
format
(
name
=
func
.
name
)
wrappername
=
'hsaPy_{name}'
.
format
(
name
=
func
.
name
)
argtys
=
list
(
arginfo
.
argument_types
)
argtys
=
list
(
arginfo
.
argument_types
)
fnty
=
ir
.
FunctionType
(
ir
.
IntType
(),
fnty
=
ir
.
FunctionType
(
ir
.
IntType
(
32
),
[
self
.
call_conv
.
get_return_type
(
[
self
.
call_conv
.
get_return_type
(
types
.
pyobject
)]
+
argtys
)
types
.
pyobject
)]
+
argtys
)
func
=
ir
.
Function
(
wrapper_module
,
fnty
,
name
=
func
.
name
)
func
=
wrapper_module
.
add_function
(
fnty
,
name
=
func
.
name
)
func
.
calling_convention
=
CC_SPIR_FUNC
func
.
calling_convention
=
CC_SPIR_FUNC
wrapper
=
wrapper_module
.
add_function
(
wrapperfnty
,
name
=
wrappername
)
wrapper
=
ir
.
Function
(
wrapper_module
,
wrapperfnty
,
name
=
wrappername
)
builder
=
ir
.
IRBuilder
(
wrapper
.
append_basic_block
(
''
))
builder
=
ir
.
IRBuilder
(
wrapper
.
append_basic_block
(
''
))
...
@@ -217,7 +222,7 @@ def set_hsa_kernel(fn):
...
@@ -217,7 +222,7 @@ def set_hsa_kernel(fn):
fn
.
calling_convention
=
CC_SPIR_KERNEL
fn
.
calling_convention
=
CC_SPIR_KERNEL
# Mark kernels
# Mark kernels
ocl_kernels
=
mod
.
get_or_insert_named_metadata
(
"
opencl.kernels
"
)
ocl_kernels
=
cgutils
.
get_or_insert_named_metadata
(
mod
,
'
opencl.kernels
'
)
ocl_kernels
.
add
(
ir
.
Module
.
add_metadata
(
mod
,
[
fn
,
ocl_kernels
.
add
(
ir
.
Module
.
add_metadata
(
mod
,
[
fn
,
gen_arg_addrspace_md
(
fn
),
gen_arg_addrspace_md
(
fn
),
gen_arg_access_qual_md
(
fn
),
gen_arg_access_qual_md
(
fn
),
...
@@ -226,14 +231,15 @@ def set_hsa_kernel(fn):
...
@@ -226,14 +231,15 @@ def set_hsa_kernel(fn):
gen_arg_base_type
(
fn
)]))
gen_arg_base_type
(
fn
)]))
# SPIR version 2.0
# SPIR version 2.0
make_constant
=
lambda
x
:
ir
.
Constant
(
ir
.
IntType
(),
x
)
make_constant
=
lambda
x
:
ir
.
Constant
(
ir
.
IntType
(
32
),
x
)
spir_version_constant
=
[
make_constant
(
x
)
for
x
in
SPIR_VERSION
]
spir_version_constant
=
[
make_constant
(
x
)
for
x
in
SPIR_VERSION
]
spir_version
=
mod
.
get_or_insert_named_metadata
(
"opencl.spir.version"
)
spir_version
=
cgutils
.
get_or_insert_named_metadata
(
mod
,
'opencl.spir.version'
)
if
not
spir_version
.
operands
:
if
not
spir_version
.
operands
:
spir_version
.
add
(
ir
.
Module
.
add_metadata
(
mod
,
spir_version_constant
))
spir_version
.
add
(
ir
.
Module
.
add_metadata
(
mod
,
spir_version_constant
))
ocl_version
=
mod
.
get_or_insert_named_metadata
(
"
opencl.
ocl
.version
"
)
ocl_version
=
cgutils
.
get_or_insert_named_metadata
(
mod
,
'
opencl.
spir
.version
'
)
if
not
ocl_version
.
operands
:
if
not
ocl_version
.
operands
:
ocl_version
.
add
(
ir
.
Module
.
add_metadata
(
mod
,
spir_version_constant
))
ocl_version
.
add
(
ir
.
Module
.
add_metadata
(
mod
,
spir_version_constant
))
...
@@ -264,7 +270,7 @@ def gen_arg_addrspace_md(fn):
...
@@ -264,7 +270,7 @@ def gen_arg_addrspace_md(fn):
else
:
else
:
codes
.
append
(
SPIR_PRIVATE_ADDRSPACE
)
codes
.
append
(
SPIR_PRIVATE_ADDRSPACE
)
consts
=
[
ir
.
Constant
(
ir
.
IntType
(),
x
)
for
x
in
codes
]
consts
=
[
ir
.
Constant
(
ir
.
IntType
(
32
),
x
)
for
x
in
codes
]
name
=
ir
.
MetaDataString
(
mod
,
"kernel_arg_addr_space"
)
name
=
ir
.
MetaDataString
(
mod
,
"kernel_arg_addr_space"
)
return
ir
.
Module
.
add_metadata
(
mod
,
[
name
]
+
consts
)
return
ir
.
Module
.
add_metadata
(
mod
,
[
name
]
+
consts
)
...
...
numba/roc/tests/hsadrv/test_driver.py
View file @
aceb5b43
...
@@ -46,13 +46,17 @@ class TestAgents(unittest.TestCase):
...
@@ -46,13 +46,17 @@ class TestAgents(unittest.TestCase):
def
test_agents_create_queue_single
(
self
):
def
test_agents_create_queue_single
(
self
):
for
agent
in
roc
.
agents
:
for
agent
in
roc
.
agents
:
if
agent
.
is_component
:
if
agent
.
is_component
:
queue
=
agent
.
create_queue_single
(
2
**
5
)
# sugon: adapt for DTK
# queue = agent.create_queue_single(2 ** 5)
queue
=
agent
.
create_queue_multi
(
2
**
6
)
self
.
assertIsInstance
(
queue
,
Queue
)
self
.
assertIsInstance
(
queue
,
Queue
)
def
test_agents_create_queue_multi
(
self
):
def
test_agents_create_queue_multi
(
self
):
for
agent
in
roc
.
agents
:
for
agent
in
roc
.
agents
:
if
agent
.
is_component
:
if
agent
.
is_component
:
queue
=
agent
.
create_queue_multi
(
2
**
5
)
# sugon: adapt for DTK
# queue = agent.create_queue_multi(2 ** 5)
queue
=
agent
.
create_queue_multi
(
2
**
6
)
self
.
assertIsInstance
(
queue
,
Queue
)
self
.
assertIsInstance
(
queue
,
Queue
)
def
test_agent_wavebits
(
self
):
def
test_agent_wavebits
(
self
):
...
@@ -568,7 +572,8 @@ class TestContext(_TestBase):
...
@@ -568,7 +572,8 @@ class TestContext(_TestBase):
class
validatorThread
(
threading
.
Thread
):
class
validatorThread
(
threading
.
Thread
):
def
run
(
self
):
def
run
(
self
):
val
=
roc
.
hsa_signal_wait_acquire
(
# sugon: adapt for DTK
val
=
roc
.
hsa_signal_wait_scacquire
(
completion_signal
,
completion_signal
,
enums
.
HSA_SIGNAL_CONDITION_EQ
,
enums
.
HSA_SIGNAL_CONDITION_EQ
,
0
,
0
,
...
...
numba/roc/tests/hsapy/test_intrinsics.py
View file @
aceb5b43
...
@@ -63,16 +63,17 @@ class TestDsPermute(unittest.TestCase):
...
@@ -63,16 +63,17 @@ class TestDsPermute(unittest.TestCase):
kernel
[
1
,
_WAVESIZE
](
inp
,
outp
,
shuf
)
kernel
[
1
,
_WAVESIZE
](
inp
,
outp
,
shuf
)
np
.
testing
.
assert_allclose
(
outp
,
np
.
roll
(
inp
,
op
(
shuf
)))
np
.
testing
.
assert_allclose
(
outp
,
np
.
roll
(
inp
,
op
(
shuf
)))
def
test_ds_permute_type_safety
(
self
):
# not support
""" Checks that float64's are not being downcast to float32"""
# def test_ds_permute_type_safety(self):
kernel
=
gen_kernel
(
shuffle_down
)
# """ Checks that float64's are not being downcast to float32"""
inp
=
np
.
linspace
(
0
,
1
,
_WAVESIZE
).
astype
(
np
.
float64
)
# kernel = gen_kernel(shuffle_down)
outp
=
np
.
zeros_like
(
inp
)
# inp = np.linspace(0, 1, _WAVESIZE).astype(np.float64)
with
self
.
assertRaises
(
TypingError
)
as
e
:
# outp = np.zeros_like(inp)
kernel
[
1
,
_WAVESIZE
](
inp
,
outp
,
1
)
# with self.assertRaises(TypingError) as e:
errmsg
=
e
.
exception
.
msg
# kernel[1, _WAVESIZE](inp, outp, 1)
self
.
assertIn
(
'Invalid use of Function'
,
errmsg
)
# errmsg = e.exception.msg
self
.
assertIn
(
'with argument(s) of type(s): (float64, int64)'
,
errmsg
)
# self.assertIn('Invalid use of Function', errmsg)
# self.assertIn('with argument(s) of type(s): (float64, int64)', errmsg)
def
test_ds_bpermute
(
self
):
def
test_ds_bpermute
(
self
):
...
...
numba/roc/tests/hsapy/test_memory.py
View file @
aceb5b43
...
@@ -70,21 +70,21 @@ class TestMemory(unittest.TestCase):
...
@@ -70,21 +70,21 @@ class TestMemory(unittest.TestCase):
logger
.
info
(
'post launch'
)
logger
.
info
(
'post launch'
)
np
.
testing
.
assert_equal
(
got
,
expect
)
np
.
testing
.
assert_equal
(
got
,
expect
)
@
unittest
.
skipUnless
(
dgpu_present
,
'test only on dGPU system'
)
#
@unittest.skipUnless(dgpu_present, 'test only on dGPU system')
class
TestDeviceMemorye
(
unittest
.
TestCase
):
#
class TestDeviceMemorye(unittest.TestCase):
def
test_device_device_transfer
(
self
):
#
def test_device_device_transfer(self):
# This has to be run in isolation and before the above
#
# This has to be run in isolation and before the above
# TODO: investigate why?!
#
# TODO: investigate why?!
nelem
=
1000
#
nelem = 1000
expect
=
np
.
arange
(
nelem
,
dtype
=
np
.
int32
)
+
1
#
expect = np.arange(nelem, dtype=np.int32) + 1
logger
.
info
(
'device array like'
)
#
logger.info('device array like')
darr
=
roc
.
device_array_like
(
expect
)
#
darr = roc.device_array_like(expect)
self
.
assertTrue
(
np
.
all
(
expect
!=
darr
.
copy_to_host
()))
#
self.assertTrue(np.all(expect != darr.copy_to_host()))
logger
.
info
(
'to_device'
)
#
logger.info('to_device')
stage
=
roc
.
to_device
(
expect
)
#
stage = roc.to_device(expect)
logger
.
info
(
'device -> device'
)
#
logger.info('device -> device')
darr
.
copy_to_device
(
stage
)
#
darr.copy_to_device(stage)
np
.
testing
.
assert_equal
(
expect
,
darr
.
copy_to_host
())
#
np.testing.assert_equal(expect, darr.copy_to_host())
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment