Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
b3e80872
Commit
b3e80872
authored
May 24, 2021
by
Chao Liu
Browse files
refactor
parent
6c37035f
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
166 additions
and
150 deletions
+166
-150
composable_kernel/include/driver/driver_dynamic_gemm_v1r1.hpp
...osable_kernel/include/driver/driver_dynamic_gemm_v1r1.hpp
+14
-14
composable_kernel/include/tensor_operation/blockwise_gemm_v2.hpp
...ble_kernel/include/tensor_operation/blockwise_gemm_v2.hpp
+32
-24
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_v1r1.hpp
...l/include/tensor_operation/gridwise_dynamic_gemm_v1r1.hpp
+45
-39
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp
...convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp
+75
-73
No files found.
composable_kernel/include/driver/driver_dynamic_gemm_v1r1.hpp
View file @
b3e80872
...
...
@@ -20,13 +20,13 @@ template <index_t BlockSize,
index_t
MPerBlock
,
index_t
NPerBlock
,
index_t
KPerBlock
,
index_t
MPerThread
,
index_t
NPerThread
,
index_t
M
1
PerThread
,
index_t
N
1
PerThread
,
index_t
KPerThread
,
index_t
M
Level0
Cluster
,
index_t
NLevel0
Cluster
,
index_t
M
Level1
Cluster
,
index_t
NLevel1
Cluster
,
index_t
M
1N1Thread
Cluster
M10
,
index_t
M1N1Thread
Cluster
N10
,
index_t
M
1N1Thread
Cluster
M11
,
index_t
M1N1Thread
Cluster
N11
,
typename
ABlockTransferThreadSliceLengths_K_M
,
typename
ABlockTransferThreadClusterLengths_K_M
,
typename
ABlockTransferThreadClusterArrangeOrder
,
...
...
@@ -80,8 +80,8 @@ __host__ float launch_kernel_dynamic_gemm_v1r1(const FloatAB* p_a_global,
throw
std
::
runtime_error
(
"wrong! GEMM size no divisible"
);
}
constexpr
auto
M1
=
Number
<
MPerThread
*
M
Level0
Cluster
*
M
Level1
Cluster
>
{};
constexpr
auto
N1
=
Number
<
NPerThread
*
NLevel0Cluster
*
NLevel1
Cluster
>
{};
constexpr
auto
M1
=
Number
<
M
1
PerThread
*
M
1N1Thread
Cluster
M11
*
M
1N1Thread
Cluster
M10
>
{};
constexpr
auto
N1
=
Number
<
N
1
PerThread
*
M1N1ThreadClusterN11
*
M1N1Thread
Cluster
N10
>
{};
if
(
!
(
MPerBlock
%
M1
==
0
&&
NPerBlock
%
N1
==
0
))
{
...
...
@@ -102,13 +102,13 @@ __host__ float launch_kernel_dynamic_gemm_v1r1(const FloatAB* p_a_global,
MPerBlock
,
NPerBlock
,
KPerBlock
,
MPerThread
,
NPerThread
,
M
1
PerThread
,
N
1
PerThread
,
KPerThread
,
M
Level0
Cluster
,
NLevel0
Cluster
,
M
Level1
Cluster
,
NLevel1
Cluster
,
M
1N1Thread
Cluster
M10
,
M1N1Thread
Cluster
N10
,
M
1N1Thread
Cluster
M11
,
M1N1Thread
Cluster
N11
,
ABlockTransferThreadSliceLengths_K_M
,
ABlockTransferThreadClusterLengths_K_M
,
ABlockTransferThreadClusterArrangeOrder
,
...
...
composable_kernel/include/tensor_operation/blockwise_gemm_v2.hpp
View file @
b3e80872
...
...
@@ -29,10 +29,10 @@ template <index_t BlockSize,
index_t
M1PerThread
,
index_t
N1PerThread
,
index_t
KPerThread
,
index_t
M
Level0
ThreadCluster
,
index_t
NLevel0
ThreadCluster
,
index_t
M
Level
1ThreadCluster
,
index_t
NLevel
1ThreadCluster
,
index_t
M
1N1
ThreadCluster
M10
,
index_t
M1N1
ThreadCluster
N10
,
index_t
M
1N
1ThreadCluster
M11
,
index_t
M1N
1ThreadCluster
N11
,
index_t
AThreadCopyScalarPerVector_M1
,
index_t
BThreadCopyScalarPerVector_N1
,
typename
std
::
enable_if
<
ABlockDesc
::
IsKnownAtCompileTime
()
&&
...
...
@@ -62,8 +62,8 @@ struct BlockwiseGemm_km0m1_kn0n1_m0m1n0n1_v1
CThreadDesc
::
IsKnownAtCompileTime
(),
"wrong! Desc should be known at compile-time"
);
static_assert
(
BlockSize
==
M
Level0
ThreadCluster
*
M
Level
1ThreadCluster
*
NLevel0
ThreadCluster
*
NLevel
1ThreadCluster
,
static_assert
(
BlockSize
==
M
1N1
ThreadCluster
M11
*
M
1N
1ThreadCluster
M10
*
M1N1
ThreadCluster
N11
*
M1N
1ThreadCluster
N10
,
"wrong! blocksize and cluster size not consistent"
);
static_assert
(
ABlockDesc
{}.
GetLength
(
I0
)
==
BBlockDesc
{}.
GetLength
(
I0
),
...
...
@@ -78,6 +78,8 @@ struct BlockwiseGemm_km0m1_kn0n1_m0m1n0n1_v1
constexpr
index_t
N1
=
BBlockDesc
{}.
GetLength
(
I2
);
// 4-d data space into 4-d thread space
// upper: {1, M1N1ThreadClusterM10 * M1N1ThreadClusterM11, 1, M1N1ThreadClusterN10 *
// M1N1ThreadClusterN11} lower: {M0, M1, N0, N1}
constexpr
auto
adaptor0
=
make_single_stage_tensor_adaptor
(
make_tuple
(
make_vectorize_transform
(
M0
,
1
),
make_vectorize_transform
(
M1PerThread
,
M1
/
M1PerThread
),
...
...
@@ -87,21 +89,27 @@ struct BlockwiseGemm_km0m1_kn0n1_m0m1n0n1_v1
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}));
// thread position 4-d thread space
// upper: {M1N1ThreadClusterM10, M1N1ThreadClusterM11, M1N1ThreadClusterN10,
// M1N1ThreadClusterN11} lower: {1, M1N1ThreadClusterM10 * M1N1ThreadClusterM11, 1,
// M1N1ThreadClusterN10 * M1N1ThreadClusterN11}
constexpr
auto
adaptor1
=
make_single_stage_tensor_adaptor
(
make_tuple
(
make_freeze_transform
(
make_multi_index
(
0
)),
make_unmerge_transform
(
make_tuple
(
M
Level
1ThreadCluster
,
MLevel0
ThreadCluster
)),
make_unmerge_transform
(
make_tuple
(
M
1N
1ThreadCluster
M10
,
M1N1
ThreadCluster
M11
)),
make_freeze_transform
(
make_multi_index
(
0
)),
make_unmerge_transform
(
make_tuple
(
NLevel
1ThreadCluster
,
NLevel0
ThreadCluster
))),
make_unmerge_transform
(
make_tuple
(
M1N
1ThreadCluster
N10
,
M1N1
ThreadCluster
N11
))),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<>
{},
Sequence
<
0
,
1
>
{},
Sequence
<>
{},
Sequence
<
2
,
3
>
{}));
// 4-d thread space to 1-d thread space
// upper: {BlockSize}
// lower: {M1N1ThreadClusterM10, M1N1ThreadClusterM11, M1N1ThreadClusterN10,
// M1N1ThreadClusterN11}
constexpr
auto
adaptor2
=
make_single_stage_tensor_adaptor
(
make_tuple
(
make_merge_transform
(
make_tuple
(
M
Level
1ThreadCluster
,
NLevel
1ThreadCluster
,
M
Level0
ThreadCluster
,
NLevel0
ThreadCluster
))),
make_tuple
(
make_merge_transform
(
make_tuple
(
M
1N
1ThreadCluster
M10
,
M1N
1ThreadCluster
N10
,
M
1N1
ThreadCluster
M11
,
M1N1
ThreadCluster
N11
))),
make_tuple
(
Sequence
<
0
,
2
,
1
,
3
>
{}),
make_tuple
(
Sequence
<
0
>
{}));
...
...
@@ -221,10 +229,10 @@ template <index_t BlockSize,
index_t
M1PerThread
,
index_t
N1PerThread
,
index_t
KPerThread
,
index_t
M
Level0
ThreadCluster
,
index_t
NLevel0
ThreadCluster
,
index_t
M
Level
1ThreadCluster
,
index_t
NLevel
1ThreadCluster
,
index_t
M
1N1
ThreadCluster
M11
,
index_t
M1N1
ThreadCluster
N11
,
index_t
M
1N
1ThreadCluster
M10
,
index_t
M1N
1ThreadCluster
N10
,
index_t
AThreadCopyScalarPerVector_M1
,
index_t
BThreadCopyScalarPerVector_N1
,
typename
std
::
enable_if
<
ABlockDesc
::
IsKnownAtCompileTime
()
&&
...
...
@@ -254,8 +262,8 @@ struct BlockwiseGemm_km0m1_kn0n1_m0m1n0n1_v2_pipeline_2x2
CThreadDesc
::
IsKnownAtCompileTime
(),
"wrong! Desc should be known at compile-time"
);
static_assert
(
BlockSize
==
M
Level0
ThreadCluster
*
M
Level
1ThreadCluster
*
NLevel0
ThreadCluster
*
NLevel
1ThreadCluster
,
static_assert
(
BlockSize
==
M
1N1
ThreadCluster
M11
*
M
1N
1ThreadCluster
M10
*
M1N1
ThreadCluster
N11
*
M1N
1ThreadCluster
N10
,
"wrong! blocksize and cluster size not consistent"
);
static_assert
(
ABlockDesc
{}.
GetLength
(
I0
)
==
BBlockDesc
{}.
GetLength
(
I0
),
...
...
@@ -287,18 +295,18 @@ struct BlockwiseGemm_km0m1_kn0n1_m0m1n0n1_v2_pipeline_2x2
constexpr
auto
adaptor1
=
make_single_stage_tensor_adaptor
(
make_tuple
(
make_freeze_transform
(
make_multi_index
(
0
)),
make_unmerge_transform
(
make_tuple
(
M
Level
1ThreadCluster
,
MLevel0
ThreadCluster
)),
make_unmerge_transform
(
make_tuple
(
M
1N
1ThreadCluster
M10
,
M1N1
ThreadCluster
M11
)),
make_freeze_transform
(
make_multi_index
(
0
)),
make_unmerge_transform
(
make_tuple
(
NLevel
1ThreadCluster
,
NLevel0
ThreadCluster
))),
make_unmerge_transform
(
make_tuple
(
M1N
1ThreadCluster
N10
,
M1N1
ThreadCluster
N11
))),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<>
{},
Sequence
<
0
,
1
>
{},
Sequence
<>
{},
Sequence
<
2
,
3
>
{}));
// 4-d thread space to 1-d thread space
constexpr
auto
adaptor2
=
make_single_stage_tensor_adaptor
(
make_tuple
(
make_merge_transform
(
make_tuple
(
M
Level
1ThreadCluster
,
NLevel
1ThreadCluster
,
M
Level0
ThreadCluster
,
NLevel0
ThreadCluster
))),
make_tuple
(
make_merge_transform
(
make_tuple
(
M
1N
1ThreadCluster
M10
,
M1N
1ThreadCluster
N10
,
M
1N1
ThreadCluster
M11
,
M1N1
ThreadCluster
N11
))),
make_tuple
(
Sequence
<
0
,
2
,
1
,
3
>
{}),
make_tuple
(
Sequence
<
0
>
{}));
...
...
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_v1r1.hpp
View file @
b3e80872
...
...
@@ -108,13 +108,13 @@ template <index_t BlockSize,
index_t
MPerBlock
,
index_t
NPerBlock
,
index_t
KPerBlock
,
index_t
MPerThread
,
index_t
NPerThread
,
index_t
M
1
PerThread
,
index_t
N
1
PerThread
,
index_t
KPerThread
,
index_t
M
Level0
Cluster
,
index_t
NLevel0
Cluster
,
index_t
M
Level1
Cluster
,
index_t
NLevel1
Cluster
,
index_t
M
1N1Thread
Cluster
M10
,
index_t
M1N1Thread
Cluster
N10
,
index_t
M
1N1Thread
Cluster
M11
,
index_t
M1N1Thread
Cluster
N11
,
typename
ABlockTransferThreadSliceLengths_K_M
,
typename
ABlockTransferThreadClusterLengths_K_M
,
typename
ABlockTransferThreadClusterArrangeOrder
,
...
...
@@ -145,8 +145,8 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_v1r1
{
constexpr
auto
max_lds_align
=
math
::
lcm
(
Number
<
ABlockTransferDstScalarPerVector_M
>
{},
Number
<
BBlockTransferDstScalarPerVector_N
>
{},
Number
<
MPerThread
>
{},
Number
<
NPerThread
>
{});
Number
<
M
1
PerThread
>
{},
Number
<
N
1
PerThread
>
{});
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
...
...
@@ -210,8 +210,8 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_v1r1
// lds max alignment
constexpr
auto
max_lds_align
=
math
::
lcm
(
Number
<
ABlockTransferDstScalarPerVector_M
>
{},
Number
<
BBlockTransferDstScalarPerVector_N
>
{},
Number
<
MPerThread
>
{},
Number
<
NPerThread
>
{});
Number
<
M
1
PerThread
>
{},
Number
<
N
1
PerThread
>
{});
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
...
...
@@ -284,34 +284,39 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_v1r1
// c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
// register
// sanity check
static_assert
(
MPerBlock
%
(
MPerThread
*
MLevel0Cluster
*
MLevel1Cluster
)
==
0
&&
NPerBlock
%
(
NPerThread
*
NLevel0Cluster
*
NLevel1Cluster
)
==
0
,
static_assert
(
MPerBlock
%
(
M1PerThread
*
M1N1ThreadClusterM11
*
M1N1ThreadClusterM10
)
==
0
&&
NPerBlock
%
(
N1PerThread
*
M1N1ThreadClusterN11
*
M1N1ThreadClusterN10
)
==
0
,
"wrong!"
);
constexpr
index_t
MRepeat
=
MPerBlock
/
(
MPerThread
*
MLevel0Cluster
*
MLevel1Cluster
);
constexpr
index_t
NRepeat
=
NPerBlock
/
(
NPerThread
*
NLevel0Cluster
*
NLevel1Cluster
);
constexpr
index_t
M0PerThread
=
MPerBlock
/
(
M1PerThread
*
M1N1ThreadClusterM11
*
M1N1ThreadClusterM10
);
constexpr
index_t
N0PerThread
=
NPerBlock
/
(
N1PerThread
*
M1N1ThreadClusterN11
*
M1N1ThreadClusterN10
);
constexpr
auto
a_k_m0_m1_block_desc
=
transform_dynamic_tensor_descriptor
(
a_k_m_block_desc
,
make_tuple
(
make_pass_through_transform
(
Number
<
KPerBlock
>
{}),
make_tuple
(
make_pass_through_transform
(
Number
<
KPerBlock
>
{}),
make_unmerge_transform
(
make_tuple
(
Number
<
MRepeat
>
{},
Number
<
MPerThread
*
MLevel0Cluster
*
MLevel1Cluster
>
{}))),
Number
<
M0PerThread
>
{},
Number
<
M1PerThread
*
M1N1ThreadClusterM11
*
M1N1ThreadClusterM10
>
{}))),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
,
2
>
{}));
constexpr
auto
b_k_n0_n1_block_desc
=
transform_dynamic_tensor_descriptor
(
b_k_n_block_desc
,
make_tuple
(
make_pass_through_transform
(
Number
<
KPerBlock
>
{}),
make_tuple
(
make_pass_through_transform
(
Number
<
KPerBlock
>
{}),
make_unmerge_transform
(
make_tuple
(
Number
<
NRepeat
>
{},
Number
<
NPerThread
*
NLevel0Cluster
*
NLevel1Cluster
>
{}))),
Number
<
N0PerThread
>
{},
Number
<
N1PerThread
*
M1N1ThreadClusterN11
*
M1N1ThreadClusterN10
>
{}))),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
,
2
>
{}));
constexpr
auto
c_m0_m1_n0_n1_thread_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
make_tuple
(
Number
<
MRepeat
>
{},
Number
<
MPerThread
>
{},
Number
<
NRepeat
>
{},
Number
<
NPerThread
>
{}));
make_dynamic_naive_tensor_descriptor_packed_v2
(
make_tuple
(
Number
<
M0PerThread
>
{},
Number
<
M1PerThread
>
{},
Number
<
N0PerThread
>
{},
Number
<
N1PerThread
>
{}));
const
auto
blockwise_gemm
=
BlockwiseGemm_km0m1_kn0n1_m0m1n0n1_v2_pipeline_2x2
<
BlockSize
,
...
...
@@ -321,15 +326,15 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_v1r1
decltype
(
a_k_m0_m1_block_desc
),
decltype
(
b_k_n0_n1_block_desc
),
decltype
(
c_m0_m1_n0_n1_thread_desc
),
MPerThread
,
NPerThread
,
M
1
PerThread
,
N
1
PerThread
,
KPerThread
,
M
Level0
Cluster
,
NLevel0
Cluster
,
M
Level1
Cluster
,
NLevel1
Cluster
,
MPerThread
,
NPerThread
>
{};
M
1N1Thread
Cluster
M10
,
M1N1Thread
Cluster
N10
,
M
1N1Thread
Cluster
M11
,
M1N1Thread
Cluster
N11
,
M
1
PerThread
,
N
1
PerThread
>
{};
// LDS allocation for A and B: be careful of alignment
constexpr
auto
a_block_space_size
=
...
...
@@ -345,9 +350,10 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_v1r1
auto
c_thread_buf
=
make_static_buffer
<
AddressSpace
::
Vgpr
,
FloatAcc
>
(
c_m0_m1_n0_n1_thread_desc
.
GetElementSpaceSize
());
ThreadwiseDynamicTensorSliceSet_v1
<
FloatAcc
,
ThreadwiseDynamicTensorSliceSet_v1
<
FloatAcc
,
decltype
(
c_m0_m1_n0_n1_thread_desc
),
Sequence
<
M
Rep
ea
t
,
MPerThread
,
N
Rep
ea
t
,
NPerThread
>>
{}
Sequence
<
M
0PerThr
ea
d
,
M
1
PerThread
,
N
0PerThr
ea
d
,
N
1
PerThread
>>
{}
.
Run
(
c_m0_m1_n0_n1_thread_desc
,
make_tuple
(
I0
,
I0
,
I0
,
I0
),
c_thread_buf
,
FloatAcc
{
0
});
constexpr
auto
a_block_slice_copy_step
=
make_multi_index
(
KPerBlock
,
0
);
...
...
@@ -479,8 +485,8 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_v1r1
// output: register to global memory
{
constexpr
auto
M1
=
Number
<
MPerThread
*
M
Level0
Cluster
*
M
Level1
Cluster
>
{};
constexpr
auto
N1
=
Number
<
NPerThread
*
NLevel0Cluster
*
NLevel1
Cluster
>
{};
constexpr
auto
M1
=
Number
<
M
1
PerThread
*
M
1N1Thread
Cluster
M10
*
M
1N1Thread
Cluster
M11
>
{};
constexpr
auto
N1
=
Number
<
N
1
PerThread
*
M1N1ThreadClusterN10
*
M1N1Thread
Cluster
N11
>
{};
// hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor
constexpr
auto
c_m0_m1_n0_n1_global_tensor_iterator_hacks
=
CGlobalIteratorHacks
{};
...
...
@@ -493,7 +499,7 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_v1r1
FloatC
,
decltype
(
c_m0_m1_n0_n1_thread_desc
),
decltype
(
c_m0_m1_n0_n1_global_desc
),
Sequence
<
M
Rep
ea
t
,
MPerThread
,
N
Rep
ea
t
,
NPerThread
>
,
Sequence
<
M
0PerThr
ea
d
,
M
1
PerThread
,
N
0PerThr
ea
d
,
N
1
PerThread
>
,
CThreadTransferSrcDstAccessOrder
,
CThreadTransferSrcDstVectorDim
,
CThreadTransferDstScalarPerVector
,
...
...
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp
View file @
b3e80872
...
...
@@ -125,14 +125,14 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(
constexpr
index_t
GemmNPerBlock
=
64
;
constexpr
index_t
GemmKPerBlock
=
4
;
constexpr
index_t
GemmMPerThread
=
2
;
constexpr
index_t
GemmNPerThread
=
2
;
constexpr
index_t
GemmM
1
PerThread
=
2
;
constexpr
index_t
GemmN
1
PerThread
=
2
;
constexpr
index_t
GemmKPerThread
=
1
;
constexpr
index_t
GemmM
Level0
Cluster
=
2
;
constexpr
index_t
Gemm
NLevel0
Cluster
=
2
;
constexpr
index_t
GemmM
Level1
Cluster
=
2
;
constexpr
index_t
Gemm
NLevel1
Cluster
=
8
;
constexpr
index_t
GemmM
1N1Thread
Cluster
M11
=
2
;
constexpr
index_t
Gemm
M1N1Thread
Cluster
N11
=
2
;
constexpr
index_t
GemmM
1N1Thread
Cluster
M10
=
2
;
constexpr
index_t
Gemm
M1N1Thread
Cluster
N10
=
8
;
constexpr
index_t
ThreadGemmDataPerReadM
=
2
;
constexpr
index_t
ThreadGemmDataPerReadN
=
2
;
...
...
@@ -149,7 +149,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(
constexpr
index_t
GemmBBlockTransferSrcScalarPerVector_GemmK
=
4
;
constexpr
index_t
GemmBBlockTransferDstScalarPerVector_GemmN
=
1
;
constexpr
index_t
GemmCThreadTransferDstScalarPerVector_GemmM1
=
2
;
constexpr
index_t
GemmCThreadTransferDstScalarPerVector_GemmM1
1
=
2
;
#elif 0
// cdata = 32, BlockSize = 64, 16x128x4
constexpr
index_t
BlockSize
=
64
;
...
...
@@ -158,14 +158,14 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(
constexpr
index_t
GemmNPerBlock
=
128
;
constexpr
index_t
GemmKPerBlock
=
4
;
constexpr
index_t
GemmMPerThread
=
2
;
constexpr
index_t
GemmNPerThread
=
4
;
constexpr
index_t
GemmM
1
PerThread
=
2
;
constexpr
index_t
GemmN
1
PerThread
=
4
;
constexpr
index_t
GemmKPerThread
=
1
;
constexpr
index_t
GemmM
Level0
Cluster
=
2
;
constexpr
index_t
Gemm
NLevel0
Cluster
=
2
;
constexpr
index_t
GemmM
Level1
Cluster
=
2
;
constexpr
index_t
Gemm
NLevel1
Cluster
=
8
;
constexpr
index_t
GemmM
1N1Thread
Cluster
M11
=
2
;
constexpr
index_t
Gemm
M1N1Thread
Cluster
N11
=
2
;
constexpr
index_t
GemmM
1N1Thread
Cluster
M10
=
2
;
constexpr
index_t
Gemm
M1N1Thread
Cluster
N10
=
8
;
constexpr
index_t
ThreadGemmDataPerReadM
=
2
;
constexpr
index_t
ThreadGemmDataPerReadN
=
4
;
...
...
@@ -182,7 +182,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(
constexpr
index_t
GemmBBlockTransferSrcScalarPerVector_GemmK
=
4
;
constexpr
index_t
GemmBBlockTransferDstScalarPerVector_GemmN
=
1
;
constexpr
index_t
GemmCThreadTransferDstScalarPerVector_GemmM1
=
2
;
constexpr
index_t
GemmCThreadTransferDstScalarPerVector_GemmM1
1
=
2
;
#elif 0
// cdata = 64, BlockSize = 64, 16x256x2
constexpr
index_t
BlockSize
=
64
;
...
...
@@ -191,14 +191,14 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(
constexpr
index_t
GemmNPerBlock
=
256
;
constexpr
index_t
GemmKPerBlock
=
2
;
constexpr
index_t
GemmMPerThread
=
4
;
constexpr
index_t
GemmNPerThread
=
4
;
constexpr
index_t
GemmM
1
PerThread
=
4
;
constexpr
index_t
GemmN
1
PerThread
=
4
;
constexpr
index_t
GemmKPerThread
=
1
;
constexpr
index_t
GemmM
Level0
Cluster
=
1
;
constexpr
index_t
Gemm
NLevel0
Cluster
=
2
;
constexpr
index_t
GemmM
Level1
Cluster
=
2
;
constexpr
index_t
Gemm
NLevel1
Cluster
=
16
;
constexpr
index_t
GemmM
1N1Thread
Cluster
M11
=
1
;
constexpr
index_t
Gemm
M1N1Thread
Cluster
N11
=
2
;
constexpr
index_t
GemmM
1N1Thread
Cluster
M10
=
2
;
constexpr
index_t
Gemm
M1N1Thread
Cluster
N10
=
16
;
constexpr
index_t
ThreadGemmDataPerReadM
=
4
;
constexpr
index_t
ThreadGemmDataPerReadN
=
4
;
...
...
@@ -215,7 +215,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(
constexpr
index_t
GemmBBlockTransferSrcScalarPerVector_GemmK
=
2
;
constexpr
index_t
GemmBBlockTransferDstScalarPerVector_GemmN
=
1
;
constexpr
index_t
GemmCThreadTransferDstScalarPerVector_GemmM1
=
4
;
constexpr
index_t
GemmCThreadTransferDstScalarPerVector_GemmM1
1
=
4
;
#elif 0
// cdata = 64, BlockSize = 64, 16x256x4
constexpr
index_t
BlockSize
=
64
;
...
...
@@ -224,14 +224,14 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(
constexpr
index_t
GemmNPerBlock
=
256
;
constexpr
index_t
GemmKPerBlock
=
4
;
constexpr
index_t
GemmMPerThread
=
4
;
constexpr
index_t
GemmNPerThread
=
4
;
constexpr
index_t
GemmM
1
PerThread
=
4
;
constexpr
index_t
GemmN
1
PerThread
=
4
;
constexpr
index_t
GemmKPerThread
=
1
;
constexpr
index_t
GemmM
Level0
Cluster
=
2
;
constexpr
index_t
Gemm
NLevel0
Cluster
=
2
;
constexpr
index_t
GemmM
Level1
Cluster
=
1
;
constexpr
index_t
Gemm
NLevel1
Cluster
=
16
;
constexpr
index_t
GemmM
1N1Thread
Cluster
M11
=
2
;
constexpr
index_t
Gemm
M1N1Thread
Cluster
N11
=
2
;
constexpr
index_t
GemmM
1N1Thread
Cluster
M10
=
1
;
constexpr
index_t
Gemm
M1N1Thread
Cluster
N10
=
16
;
constexpr
index_t
ThreadGemmDataPerReadM
=
4
;
constexpr
index_t
ThreadGemmDataPerReadN
=
4
;
...
...
@@ -248,7 +248,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(
constexpr
index_t
GemmBBlockTransferSrcScalarPerVector_GemmK
=
4
;
constexpr
index_t
GemmBBlockTransferDstScalarPerVector_GemmN
=
1
;
constexpr
index_t
GemmCThreadTransferDstScalarPerVector_GemmM1
=
4
;
constexpr
index_t
GemmCThreadTransferDstScalarPerVector_GemmM1
1
=
4
;
#elif 0
// cdata = 64, BlockSize = 128, 32x256x4
constexpr
index_t
BlockSize
=
128
;
...
...
@@ -257,14 +257,14 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(
constexpr
index_t
GemmNPerBlock
=
256
;
constexpr
index_t
GemmKPerBlock
=
4
;
constexpr
index_t
GemmMPerThread
=
4
;
constexpr
index_t
GemmNPerThread
=
4
;
constexpr
index_t
GemmM
1
PerThread
=
4
;
constexpr
index_t
GemmN
1
PerThread
=
4
;
constexpr
index_t
GemmKPerThread
=
1
;
constexpr
index_t
GemmM
Level0
Cluster
=
2
;
constexpr
index_t
Gemm
NLevel0
Cluster
=
2
;
constexpr
index_t
GemmM
Level1
Cluster
=
2
;
constexpr
index_t
Gemm
NLevel1
Cluster
=
16
;
constexpr
index_t
GemmM
1N1Thread
Cluster
M11
=
2
;
constexpr
index_t
Gemm
M1N1Thread
Cluster
N11
=
2
;
constexpr
index_t
GemmM
1N1Thread
Cluster
M10
=
2
;
constexpr
index_t
Gemm
M1N1Thread
Cluster
N10
=
16
;
constexpr
index_t
ThreadGemmDataPerReadM
=
4
;
constexpr
index_t
ThreadGemmDataPerReadN
=
4
;
...
...
@@ -281,7 +281,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(
constexpr
index_t
GemmBBlockTransferSrcScalarPerVector_GemmK
=
4
;
constexpr
index_t
GemmBBlockTransferDstScalarPerVector_GemmN
=
1
;
constexpr
index_t
GemmCThreadTransferDstScalarPerVector_GemmM1
=
4
;
constexpr
index_t
GemmCThreadTransferDstScalarPerVector_GemmM1
1
=
4
;
#elif 0
// cdata = 64, BlockSize = 128, 32x256x8
constexpr
index_t
BlockSize
=
128
;
...
...
@@ -290,14 +290,14 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(
constexpr
index_t
GemmNPerBlock
=
256
;
constexpr
index_t
GemmKPerBlock
=
8
;
constexpr
index_t
GemmMPerThread
=
4
;
constexpr
index_t
GemmNPerThread
=
4
;
constexpr
index_t
GemmM
1
PerThread
=
4
;
constexpr
index_t
GemmN
1
PerThread
=
4
;
constexpr
index_t
GemmKPerThread
=
1
;
constexpr
index_t
GemmM
Level0
Cluster
=
2
;
constexpr
index_t
Gemm
NLevel0
Cluster
=
2
;
constexpr
index_t
GemmM
Level1
Cluster
=
2
;
constexpr
index_t
Gemm
NLevel1
Cluster
=
16
;
constexpr
index_t
GemmM
1N1Thread
Cluster
M11
=
2
;
constexpr
index_t
Gemm
M1N1Thread
Cluster
N11
=
2
;
constexpr
index_t
GemmM
1N1Thread
Cluster
M10
=
2
;
constexpr
index_t
Gemm
M1N1Thread
Cluster
N10
=
16
;
constexpr
index_t
ThreadGemmDataPerReadM
=
4
;
constexpr
index_t
ThreadGemmDataPerReadN
=
4
;
...
...
@@ -314,7 +314,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(
constexpr
index_t
GemmBBlockTransferSrcScalarPerVector_GemmK
=
8
;
constexpr
index_t
GemmBBlockTransferDstScalarPerVector_GemmN
=
1
;
constexpr
index_t
GemmCThreadTransferDstScalarPerVector_GemmM1
=
4
;
constexpr
index_t
GemmCThreadTransferDstScalarPerVector_GemmM1
1
=
4
;
#elif 1
// cdata = 64, BlockSize = 256, 128x128x8
constexpr
index_t
BlockSize
=
256
;
...
...
@@ -323,14 +323,14 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(
constexpr
index_t
GemmNPerBlock
=
128
;
constexpr
index_t
GemmKPerBlock
=
8
;
constexpr
index_t
GemmMPerThread
=
4
;
constexpr
index_t
GemmNPerThread
=
4
;
constexpr
index_t
GemmM
1
PerThread
=
4
;
constexpr
index_t
GemmN
1
PerThread
=
4
;
constexpr
index_t
GemmKPerThread
=
1
;
constexpr
index_t
GemmM
Level0
Cluster
=
2
;
constexpr
index_t
Gemm
NLevel0
Cluster
=
2
;
constexpr
index_t
GemmM
Level1
Cluster
=
8
;
constexpr
index_t
Gemm
NLevel1
Cluster
=
8
;
constexpr
index_t
GemmM
1N1Thread
Cluster
M11
=
2
;
constexpr
index_t
Gemm
M1N1Thread
Cluster
N11
=
2
;
constexpr
index_t
GemmM
1N1Thread
Cluster
M10
=
8
;
constexpr
index_t
Gemm
M1N1Thread
Cluster
N10
=
8
;
using
GemmABlockTransferThreadSliceLengths_GemmK_GemmM
=
Sequence
<
4
,
1
>
;
using
GemmABlockTransferThreadClusterLengths_GemmK_GemmM
=
Sequence
<
2
,
128
>
;
...
...
@@ -344,7 +344,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(
constexpr
index_t
GemmBBlockTransferSrcScalarPerVector_GemmK
=
4
;
constexpr
index_t
GemmBBlockTransferDstScalarPerVector_GemmN
=
1
;
constexpr
index_t
GemmCThreadTransferDstScalarPerVector_GemmM1
=
4
;
constexpr
index_t
GemmCThreadTransferDstScalarPerVector_GemmM1
1
=
4
;
#elif 1
// cdata = 64, BlockSize = 256, 128x128x16
constexpr
index_t
BlockSize
=
256
;
...
...
@@ -353,14 +353,14 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(
constexpr
index_t
GemmNPerBlock
=
128
;
constexpr
index_t
GemmKPerBlock
=
16
;
constexpr
index_t
GemmMPerThread
=
4
;
constexpr
index_t
GemmNPerThread
=
4
;
constexpr
index_t
GemmM
1
PerThread
=
4
;
constexpr
index_t
GemmN
1
PerThread
=
4
;
constexpr
index_t
GemmKPerThread
=
1
;
constexpr
index_t
GemmM
Level0
Cluster
=
2
;
constexpr
index_t
Gemm
NLevel0
Cluster
=
2
;
constexpr
index_t
GemmM
Level1
Cluster
=
8
;
constexpr
index_t
Gemm
NLevel1
Cluster
=
8
;
constexpr
index_t
GemmM
1N1Thread
Cluster
M11
=
2
;
constexpr
index_t
Gemm
M1N1Thread
Cluster
N11
=
2
;
constexpr
index_t
GemmM
1N1Thread
Cluster
M10
=
8
;
constexpr
index_t
Gemm
M1N1Thread
Cluster
N10
=
8
;
using
GemmABlockTransferThreadSliceLengths_GemmK_GemmM
=
Sequence
<
4
,
2
>
;
using
GemmABlockTransferThreadClusterLengths_GemmK_GemmM
=
Sequence
<
4
,
64
>
;
...
...
@@ -374,11 +374,13 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(
constexpr
index_t
GemmBBlockTransferSrcScalarPerVector_GemmK
=
8
;
constexpr
index_t
GemmBBlockTransferDstScalarPerVector_GemmN
=
1
;
constexpr
index_t
GemmCThreadTransferDstScalarPerVector_GemmM1
=
4
;
constexpr
index_t
GemmCThreadTransferDstScalarPerVector_GemmM1
1
=
4
;
#endif
constexpr
index_t
GemmM1
=
GemmMPerThread
*
GemmMLevel0Cluster
*
GemmMLevel1Cluster
;
constexpr
index_t
GemmN1
=
GemmNPerThread
*
GemmNLevel0Cluster
*
GemmNLevel1Cluster
;
constexpr
index_t
GemmM1
=
GemmM1PerThread
*
GemmM1N1ThreadClusterM11
*
GemmM1N1ThreadClusterM10
;
constexpr
index_t
GemmN1
=
GemmN1PerThread
*
GemmM1N1ThreadClusterN11
*
GemmM1N1ThreadClusterN10
;
const
auto
descs
=
#if 1
...
...
@@ -409,13 +411,13 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(
GemmMPerBlock
,
GemmNPerBlock
,
GemmKPerBlock
,
GemmMPerThread
,
GemmNPerThread
,
GemmM
1
PerThread
,
GemmN
1
PerThread
,
GemmKPerThread
,
GemmM
Level0
Cluster
,
Gemm
NLevel0
Cluster
,
GemmM
Level1
Cluster
,
Gemm
NLevel1
Cluster
,
GemmM
1N1Thread
Cluster
M10
,
Gemm
M1N1Thread
Cluster
N10
,
GemmM
1N1Thread
Cluster
M11
,
Gemm
M1N1Thread
Cluster
N11
,
GemmABlockTransferThreadSliceLengths_GemmK_GemmM
,
GemmABlockTransferThreadClusterLengths_GemmK_GemmM
,
Sequence
<
1
,
0
>
,
...
...
@@ -435,7 +437,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(
// MoveSrcSliceWindow() to save addr computation
Sequence
<
2
,
3
,
0
,
1
>
,
1
,
GemmCThreadTransferDstScalarPerVector_GemmM1
,
GemmCThreadTransferDstScalarPerVector_GemmM1
1
,
decltype
(
descs
[
I4
]),
decltype
(
descs
[
I5
]),
decltype
(
descs
[
I6
]),
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment