Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
e72eece8
"...composable_kernel-1.git" did not exist on "6dfb92bbef33b4caea55f6b4ed7c449927ae771c"
Commit
e72eece8
authored
Mar 21, 2019
by
Chao Liu
Browse files
added int8x4
parent
02d72160
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing 2 changed files with 2 additions and 8 deletions
+2
-8
driver/driver.hip.cpp
driver/driver.hip.cpp
+1
-1
src/include/data_type.hip.hpp
src/include/data_type.hip.hpp
+1
-7
No files found.
driver/driver.hip.cpp
View file @
e72eece8
...
@@ -617,7 +617,7 @@ int main(int argc, char* argv[])
...
@@ -617,7 +617,7 @@ int main(int argc, char* argv[])
#if 0
#if 0
in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
#elif
1
#elif
0
in_nchw
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
in_nchw
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
wei_kcyx
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
wei_kcyx
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
#elif 1
#elif 1
...
...
src/include/data_type.hip.hpp
View file @
e72eece8
...
@@ -231,17 +231,11 @@ __device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2
...
@@ -231,17 +231,11 @@ __device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2
__device__
void
fused_multiply_accumulate
(
char
&
d
,
const
char
&
s0
,
const
char
&
s1
)
{
d
+=
s0
*
s1
;
}
__device__
void
fused_multiply_accumulate
(
char
&
d
,
const
char
&
s0
,
const
char
&
s1
)
{
d
+=
s0
*
s1
;
}
// TODO:: this interface is misleading, int32 is actually int8x4
// TODO:: this interface is misleading, s0, s1 are actually int8x4
// need to make a better interface
// need to make a better interface
__device__
void
fused_multiply_accumulate
(
int32_t
&
d
,
const
int32_t
&
s0
,
const
int32_t
&
s1
)
__device__
void
fused_multiply_accumulate
(
int32_t
&
d
,
const
int32_t
&
s0
,
const
int32_t
&
s1
)
{
{
#if DEVICE_BACKEND_CUDA
#if DEVICE_BACKEND_CUDA
#if 1 // debug
d
=
__dp4a
(
s0
,
s1
,
d
);
d
=
__dp4a
(
s0
,
s1
,
d
);
#elif 1
asm
volatile
(
"dp4a.s32.s32 %0, %1, %2, %3;"
:
"=r"
(
d
)
:
"r"
(
s0
),
"r"
(
s1
),
"r"
(
d
));
#elif 0 // this is wrong! just for debugging
d
+=
(
*
reinterpret_cast
<
const
int32_t
*>
(
&
s0
))
*
(
*
reinterpret_cast
<
const
int32_t
*>
(
&
s1
));
#endif
#endif
#endif
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please register or sign in to comment