Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
29d881df
Commit
29d881df
authored
Apr 22, 2022
by
Anthony Chang
Browse files
format
parent
e8c7de8d
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
10 additions
and
11 deletions
+10
-11
include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
...e/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
+10
-11
No files found.
include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
View file @
29d881df
...
...
@@ -250,9 +250,8 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
#if CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING
static
constexpr
index_t
KPerInnerLoop
=
math
::
max
(
KPerThread
/
CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING_MAC_CLUSTERS
,
KPack
);
static
constexpr
index_t
KPerInnerLoop
=
math
::
max
(
KPerThread
/
CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING_MAC_CLUSTERS
,
KPack
);
// 2-wave optimized blockwise gemm
template
<
typename
ABlockBuffer
,
typename
BBlockBuffer
,
typename
CThreadBuffer
>
...
...
@@ -319,8 +318,9 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
// moved here B) reduce VMEM FIFO congestion by applying small delays to
// different wavefronts. It is performed near the end of the MAC cluster to
// minimize lgkmcnt penalty
if
constexpr
(
int
(
k
)
==
KPerThread
-
KPerInnerLoop
&&
int
(
k_
)
==
KPerInnerLoop
-
KPack
&&
int
(
m0
)
==
MRepeat
-
1
&&
int
(
n0
)
==
NRepeat
-
1
)
if
constexpr
(
int
(
k
)
==
KPerThread
-
KPerInnerLoop
&&
int
(
k_
)
==
KPerInnerLoop
-
KPack
&&
int
(
m0
)
==
MRepeat
-
1
&&
int
(
n0
)
==
NRepeat
-
1
)
{
__builtin_amdgcn_sched_barrier
();
block_sync_lds
();
...
...
@@ -350,12 +350,12 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
private:
// A[M0, M1, M2, KPerInnerLoop]
static
constexpr
auto
a_thread_desc_
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
Number
<
MRepeat
>
{},
I1
,
I1
,
Number
<
KPerInnerLoop
>
{}));
static
constexpr
auto
a_thread_desc_
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
Number
<
MRepeat
>
{},
I1
,
I1
,
Number
<
KPerInnerLoop
>
{}));
// B[N0, N1, N2, KPerInnerLoop]
static
constexpr
auto
b_thread_desc_
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
Number
<
NRepeat
>
{},
I1
,
I1
,
Number
<
KPerInnerLoop
>
{}));
static
constexpr
auto
b_thread_desc_
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
Number
<
NRepeat
>
{},
I1
,
I1
,
Number
<
KPerInnerLoop
>
{}));
using
AThreadCopy
=
ThreadwiseTensorSliceTransfer_v4
<
FloatAB
,
FloatAB
,
...
...
@@ -377,7 +377,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
B_K1
,
B_K1
>
;
#else
// #if CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING
#else // #if CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING
template
<
typename
ABlockBuffer
,
typename
BBlockBuffer
,
typename
CThreadBuffer
>
__device__
void
Run
(
const
ABlockBuffer
&
a_block_buf
,
...
...
@@ -468,7 +468,6 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
static
constexpr
auto
c_thread_desc_
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
Number
<
MRepeat
>
{},
Number
<
NRepeat
>
{},
xdlops_gemm
.
GetRegSizePerXdlops
()));
AThreadCopy
a_thread_copy_
{
CalculateAThreadOriginDataIndex
()};
BThreadCopy
b_thread_copy_
{
CalculateBThreadOriginDataIndex
()};
};
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment