Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
MIGraphX
Commits
5f4e8561
Commit
5f4e8561
authored
Feb 26, 2022
by
Shucai Xiao
Browse files
change mul_add gpu implementation to use half2 for fp16 data type
parent
9e610129
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
21 additions
and
7 deletions
+21
-7
src/targets/gpu/device/mul_add.cpp
src/targets/gpu/device/mul_add.cpp
+21
-7
No files found.
src/targets/gpu/device/mul_add.cpp
View file @
5f4e8561
...
...
@@ -8,30 +8,44 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace
gpu
{
namespace
device
{
//__global__ void mul_add_kernel(void* a, void* x, void* b, void* r, int n)
//{
// int id = blockDim.x * blockIdx.x + threadIdx.x;
// __half* ha = reinterpret_cast<__half*>(a);
// __half* hb = reinterpret_cast<__half*>(b);
// __half* hx = reinterpret_cast<__half*>(x);
// __half* hr = reinterpret_cast<__half*>(r);
// if (id < n)
// {
// hr[id] = __float2half(__half2float(ha[id]) * __half2float(hx[id]) + __half2float(hb[id]));
// }
//}
// Element-wise fused multiply/add on packed fp16 data: r = a * x + b.
//
// Each thread handles one __half2 (two packed half-precision values), so all
// four buffers are reinterpreted as arrays of __half2 and `n` is the number of
// __half2 elements to process (the visible host launcher passes
// elements() / 2).
//
// Parameters:
//   a, x, b - input buffers, read as __half2 arrays
//   r       - output buffer, written as a __half2 array
//   n       - number of __half2 elements; threads with id >= n do nothing
//
// NOTE(review): assumes every buffer is suitably aligned for __half2 access
// and holds at least n __half2 values - confirm against the callers.
// NOTE(review): because n counts pairs, a tensor with an odd number of half
// elements would leave its last element unprocessed by this kernel - confirm
// whether the caller guarantees an even element count.
__global__ void mul_add_kernel(void* a, void* x, void* b, void* r, int n)
{
    // Flat 1-D global thread index.
    int id = blockDim.x * blockIdx.x + threadIdx.x;

    __half2* ha = reinterpret_cast<__half2*>(a);
    __half2* hb = reinterpret_cast<__half2*>(b);
    __half2* hx = reinterpret_cast<__half2*>(x);
    __half2* hr = reinterpret_cast<__half2*>(r);

    // Guard the tail: the grid is rounded up to a whole number of blocks.
    if(id < n)
    {
        // Packed half2 multiply followed by packed half2 add (two lanes at once).
        hr[id] = __hadd2(__hmul2(ha[id], hx[id]), hb[id]);
    }
}
void
mul_add
(
hipStream_t
stream
,
const
argument
&
result
,
const
argument
&
arg1
,
const
argument
&
arg2
,
const
argument
&
arg3
)
{
auto
elem_num
=
result
.
get_shape
().
elements
();
auto
type
=
result
.
get_shape
().
type
();
if
(
type
==
shape
::
half_type
)
{
int
block_size
=
256
;
auto
elem_num
=
result
.
get_shape
().
elements
()
/
2
;
int
block_size
=
1024
;
int
block_num
=
(
elem_num
+
block_size
-
1
)
/
block_size
;
mul_add_kernel
<<<
block_num
,
block_size
>>>
(
arg1
.
data
(),
arg2
.
data
(),
arg3
.
data
(),
result
.
data
(),
elem_num
);
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment