Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
MIGraphX
Commits
1692db3c
Commit
1692db3c
authored
Aug 27, 2018
by
Paul
Browse files
Vectorized loads
parent
33fe5e12
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
23 additions
and
8 deletions
+23
-8
src/targets/gpu/device/include/migraph/gpu/device/launch.hpp
src/targets/gpu/device/include/migraph/gpu/device/launch.hpp
+1
-1
src/targets/gpu/device/include/migraph/gpu/device/nary.hpp
src/targets/gpu/device/include/migraph/gpu/device/nary.hpp
+22
-7
No files found.
src/targets/gpu/device/include/migraph/gpu/device/launch.hpp
View file @
1692db3c
...
...
@@ -33,7 +33,7 @@ inline auto launch(std::size_t global, std::size_t local)
};
}
inline
auto
gs_launch
(
std
::
size_t
n
,
std
::
size_t
local
=
256
)
inline
auto
gs_launch
(
std
::
size_t
n
,
std
::
size_t
local
=
1024
)
{
std
::
size_t
groups
=
1
+
n
/
local
;
std
::
size_t
nglobal
=
std
::
min
<
std
::
size_t
>
(
256
,
groups
)
*
local
;
...
...
src/targets/gpu/device/include/migraph/gpu/device/nary.hpp
View file @
1692db3c
...
...
@@ -10,6 +10,15 @@ namespace migraph {
namespace
gpu
{
namespace
device
{
template
<
class
T
>
using
vec4
=
T
__attribute__
((
ext_vector_type
(
4
)));
template
<
class
T
>
vec4
<
T
>*
as_vec4
(
T
*
x
)
{
return
reinterpret_cast
<
vec4
<
T
>*>
(
x
);
}
template
<
class
F
,
class
...
Arguments
>
auto
nary_nonstandard_impl
(
F
f
,
argument
result
,
Arguments
...
args
)
{
...
...
@@ -49,13 +58,13 @@ inline auto binary_broadcast(argument result, argument arg1, argument arg2)
visit_all
(
result
,
arg1
,
arg2
)([
&
](
auto
output
,
auto
input1
,
auto
input2
)
{
using
type
=
std
::
remove_cv_t
<
typename
decltype
(
output
)
::
value_type
>
;
auto
*
xp
=
input1
.
data
();
auto
*
xp
=
as_vec4
(
input1
.
data
()
)
;
auto
*
yp
=
input2
.
data
();
auto
*
outp
=
output
.
data
();
auto
*
outp
=
as_vec4
(
output
.
data
()
)
;
const
std
::
size_t
nlocal
=
256
;
const
std
::
size_t
nlocal
=
1024
;
const
std
::
size_t
nglobal
=
256
*
nlocal
;
const
std
::
size_t
n
=
output
.
size
();
const
std
::
size_t
n
=
output
.
size
()
/
4
;
launch
(
nglobal
,
nlocal
)([
=
](
auto
idx
)
__device__
{
__shared__
type
buffer
[
2048
];
...
...
@@ -66,9 +75,15 @@ inline auto binary_broadcast(argument result, argument arg1, argument arg2)
__syncthreads
();
for
(
size_t
i
=
idx
.
global
;
i
<
n
;
i
+=
nglobal
)
{
auto
bidx
=
i
%
bdim_len
;
auto
b
=
buffer
[
bidx
];
outp
[
i
]
=
f
(
xp
[
i
],
b
);
vec4
<
type
>
x
=
xp
[
i
];
vec4
<
type
>
out
=
outp
[
i
];
for
(
std
::
size_t
j
=
0
;
j
<
4
;
j
++
)
{
auto
gidx
=
i
*
4
+
j
;
auto
bidx
=
gidx
%
bdim_len
;
auto
b
=
buffer
[
bidx
];
out
[
j
]
=
f
(
x
[
j
],
b
);
}
}
});
});
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment