Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
a7361926
Commit
a7361926
authored
Nov 23, 2021
by
Chao Liu
Browse files
clean up
parent
81b26528
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
20 additions
and
9 deletions
+20
-9
example/2_gemm_xdl_bias_add/gemm_xdl_bias_add.cpp
example/2_gemm_xdl_bias_add/gemm_xdl_bias_add.cpp
+20
-9
No files found.
example/2_gemm_xdl_bias_add/gemm_xdl_bias_add.cpp
View file @
a7361926
...
@@ -36,28 +36,37 @@ struct PassThrough
...
@@ -36,28 +36,37 @@ struct PassThrough
// v2 is from bias vector
// v2 is from bias vector
struct
BiasAdd
struct
BiasAdd
{
{
#if
1
#if
0
// correct result
// correct result
// no scratch memory, good VGPR allocation (59)
// no scratch memory, good VGPR allocation (59)
// good perf (101Tflops)
// good perf (101Tflops)
template <typename T1, typename T2>
template <typename T1, typename T2>
__host__ __device__ constexpr float operator()(float v0, T1 v1, T2 v2) const
__host__ __device__ constexpr float operator()(float v0, T1 v1, T2 v2) const
{
{
constexpr float alpha = 0.1;
constexpr float beta = 0.2;
constexpr float gamma = 0.3;
// compiler seems very volatile to the order of these calculation:
// compiler seems very volatile to the order of these calculation:
// compiler is very eager to read AccVgpr (v0) out prematurely, resulting in register
// compiler is very eager to read AccVgpr (v0) out prematurely, resulting in register
// over-allocation. Therefore, move v0 calculation to the very end
// over-allocation. Therefore, move v0 calculation to the very end
float
a
=
T1
(
0.2
)
*
v1
+
T2
(
0.3
)
*
v2
;
float a = T1(
beta
) * v1 + T2(
gamma
) * v2;
float
b
=
a
+
float
(
0.1
)
*
v0
;
float b = a + float(
alpha
) * v0;
return b;
return b;
}
}
#elif 0
#elif
1
// correct result
float
alpha
=
0.1
;
// some scratch memory (68), large VGPR usage (126)
float
beta
=
0.2
;
// very little perf drop (101Tflops)
float
gamma
=
0.3
;
__host__
__device__
constexpr
auto
operator
()(
float
v0
,
ck
::
half_t
v1
,
ck
::
half_t
v2
)
const
// wrong result
// lots of scratch memory
// huge perf drop
template
<
typename
T1
,
typename
T2
>
__host__
__device__
constexpr
float
operator
()(
float
v0
,
T1
v1
,
T2
v2
)
const
{
{
return
float
(
0.1
)
*
v0
+
ck
::
half_t
(
0.2
)
*
v1
+
ck
::
half_t
(
0.3
)
*
v2
;
return
alpha
*
v0
+
beta
*
v1
+
gamma
*
v2
;
}
}
#elif 0
#elif 0
// correct result
// correct result
...
@@ -361,5 +370,7 @@ int main(int argc, char* argv[])
...
@@ -361,5 +370,7 @@ int main(int argc, char* argv[])
PassThrough
{},
PassThrough
{},
PassThrough
{},
PassThrough
{},
c_element_op
);
c_element_op
);
check_error
(
c_m_n_host_result
,
c_m_n_device_result
);
}
}
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment