Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
1480375f
Commit
1480375f
authored
Jul 14, 2019
by
Chao Liu
Browse files
adding implicit GEMM v4r2
parent
a4b52461
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
55 additions
and
55 deletions
+55
-55
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer.hpp
...n_implicit_gemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer.hpp
+6
-43
driver/include/device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp
.../device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp
+49
-12
No files found.
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer.hpp
View file @
1480375f
...
...
@@ -29,9 +29,6 @@ template <index_t GridSize,
index_t
BPerBlock
,
index_t
KPerBlock
,
index_t
EPerBlock
,
index_t
N0PerBlock
,
index_t
Ho0PerBlock
,
index_t
Wo0PerBlock
,
index_t
GemmMPerThreadSubC
,
index_t
GemmNPerThreadSubC
,
index_t
GemmMLevel0Cluster
,
...
...
@@ -164,14 +161,8 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer
// memory layout descriptor in LDS [E, N1, B, N2], dst of blockwise copy
// be careful of LDS alignment
constexpr
auto
in_e_n0_ho0_wo0_b_n2_ho2_wo2_block_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
EPerBlock
,
N0PerBlock
,
Ho0PerBlock
,
Wo0PerBlock
,
BPerBlock
,
N2
,
Ho2
,
Wo2
>
{});
make_ConstantTensorDescriptor_packed
(
Sequence
<
EPerBlock
,
N0
,
Ho0
,
Wo0
,
BPerBlock
,
N2
,
Ho2
,
Wo2
>
{});
// input blockwise copy
// slice a merged tensor, reorder and copy to a normal tensor
...
...
@@ -251,9 +242,8 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer
// c_thread_mtx definition: this is a mess
// TODO:: more elegent way of defining c_thread_mtx
constexpr
auto
c_k0k2_n0ho0wo0n2ho2wo2_thread_mtx_desc
=
make_ConstantMatrixDescriptor_packed
(
Number
<
GemmMRepeat
*
GemmMPerThreadSubC
>
{},
Number
<
N0PerBlock
*
Ho0PerBlock
*
Wo0PerBlock
*
N2
*
Ho2
*
Wo2
>
{});
make_ConstantMatrixDescriptor_packed
(
Number
<
GemmMRepeat
*
GemmMPerThreadSubC
>
{},
Number
<
N0
*
Ho0
*
Wo0
*
N2
*
Ho2
*
Wo2
>
{});
const
auto
blockwise_gemm
=
BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
<
BlockSize
,
...
...
@@ -384,18 +374,8 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer
// define tensor descriptor for threadwise copy
// output memory layout descriptor in register
constexpr
auto
out_k0_k1_k2_n0_ho0_wo0_n1_ho1_wo1_n2_ho2_wo2_thread_mem_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
KPerBlock
/
(
K1
*
K2
),
1
,
K2
,
N0PerBlock
,
Ho0PerBlock
,
Wo0PerBlock
,
1
,
1
,
1
,
N2
,
Ho2
,
Wo2
>
{});
make_ConstantTensorDescriptor_packed
(
Sequence
<
KPerBlock
/
(
K1
*
K2
),
1
,
K2
,
N0
,
Ho0
,
Wo0
,
1
,
1
,
1
,
N2
,
Ho2
,
Wo2
>
{});
// output tensor descriptor in register, src of threadwise copy
constexpr
auto
out_n0_n1_n2_k0_k1_k2_ho0_ho1_ho2_wo0_wo1_wo2_thread_desc
=
...
...
@@ -440,7 +420,6 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer
out_k_n0_ho0_wo0_b_n2_ho2_wo2_global_merged_desc
.
GetOffsetFromMultiIndex
(
k_thread_data_on_global
,
0
,
0
,
0
,
b_thread_data_on_global
,
0
,
0
,
0
);
#if 1
threadwise_generic_tensor_slice_copy_v1
(
out_n0_n1_n2_k0_k1_k2_ho0_ho1_ho2_wo0_wo1_wo2_thread_desc
,
p_out_thread
,
...
...
@@ -451,22 +430,6 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer
out_n0_n1_n2_k0_k1_k2_ho0_ho1_ho2_wo0_wo1_wo2_thread_desc
.
GetLengths
(),
arithmetic_sequence_gen
<
0
,
12
,
1
>::
type
{},
Number
<
1
>
{});
#else
if
(
get_thread_local_1d_id
()
==
0
&&
get_block_1d_id
()
==
0
)
{
print_ConstantTensorDescriptor
(
"out thread: "
,
out_n0_n1_n2_k0_k1_k2_ho0_ho1_ho2_wo0_wo1_wo2_thread_desc
);
printf
(
"size: %d
\n
"
,
out_n0_n1_n2_k0_k1_k2_ho0_ho1_ho2_wo0_wo1_wo2_thread_desc
.
GetElementSize
());
for
(
index_t
i
=
0
;
i
<
out_n0_n1_n2_k0_k1_k2_ho0_ho1_ho2_wo0_wo1_wo2_thread_desc
.
GetElementSize
();
++
i
)
{
p_out_global
[
0
]
=
p_out_thread
[
i
];
}
}
#endif
}
}
};
...
...
driver/include/device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp
View file @
1480375f
...
...
@@ -53,15 +53,14 @@ void device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw(InDesc,
wei_kcyx_device_buf
.
ToDevice
(
wei_kcyx
.
mData
.
data
());
out_nkhw_device_buf
.
ToDevice
(
out_nkhw
.
mData
.
data
());
#if
1
#if
0
// 1x1 filter, 8x8 image
constexpr index_t N0 = 1;
constexpr
index_t
N2
=
1
;
constexpr index_t Ho0 = 1;
constexpr
index_t
Ho2
=
1
;
constexpr index_t Wo0 = 2;
constexpr index_t N2 = 1;
constexpr index_t Ho2 = 1;
constexpr index_t Wo2 = 4;
constexpr index_t BlockSize = 256;
...
...
@@ -70,10 +69,6 @@ void device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw(InDesc,
constexpr index_t KPerBlock = 128;
constexpr index_t EPerBlock = 8;
constexpr
index_t
N0PerBlock
=
1
;
constexpr
index_t
Ho0PerBlock
=
1
;
constexpr
index_t
Wo0PerBlock
=
2
;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
...
...
@@ -101,6 +96,51 @@ void device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw(InDesc,
using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]
constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
#elif
1
// 1x1 filter, 8x8 image
constexpr
index_t
N0
=
1
;
constexpr
index_t
Ho0
=
2
;
constexpr
index_t
Wo0
=
1
;
constexpr
index_t
N2
=
2
;
constexpr
index_t
Ho2
=
2
;
constexpr
index_t
Wo2
=
1
;
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
BPerBlock
=
16
;
constexpr
index_t
KPerBlock
=
128
;
constexpr
index_t
EPerBlock
=
8
;
constexpr
index_t
GemmMPerThreadSubC
=
4
;
constexpr
index_t
GemmNPerThreadSubC
=
4
;
constexpr
index_t
GemmMLevel0Cluster
=
4
;
constexpr
index_t
GemmNLevel0Cluster
=
4
;
constexpr
index_t
GemmMLevel1Cluster
=
4
;
constexpr
index_t
GemmNLevel1Cluster
=
4
;
constexpr
index_t
GemmKPerThreadLoop
=
1
;
constexpr
index_t
GemmDataPerReadA
=
4
;
constexpr
index_t
GemmDataPerReadB
=
4
;
using
InBlockCopySubLengths_E_N0_Ho0_Wo0_B_N2_Ho2_Wo2
=
Sequence
<
1
,
1
,
2
,
1
,
1
,
2
,
1
,
1
>
;
using
InBlockCopyClusterLengths_E_N0_Ho0_Wo0_B_N2_Ho2_Wo2
=
Sequence
<
8
,
1
,
1
,
1
,
16
,
1
,
2
,
1
>
;
using
InBlockCopyThreadClusterArrangeOrder
=
Sequence
<
0
,
1
,
5
,
2
,
6
,
3
,
4
,
7
>
;
// [E, N0, N2, Ho0, Ho2, Wo0, B, Wo2]
using
InBlockCopySrcAccessOrder
=
Sequence
<
0
,
1
,
5
,
2
,
6
,
3
,
4
,
7
>
;
// [E, N0, N2, Ho0, Ho2, Wo0, B, Wo2]
using
InBlockCopyDstAccessOrder
=
Sequence
<
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
>
;
// [E, N0, Ho0, Wo0, B, N2, Ho2, Wo2]
constexpr
index_t
InBlockCopyDataPerAccess_W2
=
1
;
using
WeiBlockCopySubLengths_E_K
=
Sequence
<
4
,
1
>
;
using
WeiBlockCopyClusterLengths_E_K
=
Sequence
<
2
,
128
>
;
using
WeiBlockCopyThreadClusterArrangeOrder
=
Sequence
<
1
,
0
>
;
// [K, E]
using
WeiBlockCopySrcAccessOrder
=
Sequence
<
1
,
0
>
;
// [K, E]
using
WeiBlockCopyDstAccessOrder
=
Sequence
<
0
,
1
>
;
// [E, K]
constexpr
index_t
WeiBlockCopySrcDataPerRead_E
=
4
;
constexpr
index_t
WeiBlockCopyDstDataPerWrite_K
=
1
;
#endif
...
...
@@ -137,9 +177,6 @@ void device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw(InDesc,
BPerBlock
,
KPerBlock
,
EPerBlock
,
N0PerBlock
,
Ho0PerBlock
,
Wo0PerBlock
,
GemmMPerThreadSubC
,
GemmNPerThreadSubC
,
GemmMLevel0Cluster
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment