Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
9bbe9073
Commit
9bbe9073
authored
Feb 04, 2019
by
Chao Liu
Browse files
refactor
parent
3439e4b5
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
138 additions
and
14 deletions
+138
-14
driver/conv.cu
driver/conv.cu
+55
-8
driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh
driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh
+16
-1
driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding.cuh
...plicit_gemm_convolution_1_chwn_csrk_khwn_with_padding.cuh
+47
-2
driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
+18
-1
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
...e/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
+1
-1
src/include/gridwise_implicit_gemm_convolution_3_cnhw_srck_knhw.cuh
...e/gridwise_implicit_gemm_convolution_3_cnhw_srck_knhw.cuh
+1
-1
No files found.
driver/conv.cu
View file @
9bbe9073
...
@@ -453,7 +453,7 @@ int main()
...
@@ -453,7 +453,7 @@ int main()
constexpr
unsigned
HPad
=
0
;
constexpr
unsigned
HPad
=
0
;
constexpr
unsigned
WPad
=
0
;
constexpr
unsigned
WPad
=
0
;
#elif
1
#elif
0
// 3x3 filter, 56x56 image, 1x1 padding
// 3x3 filter, 56x56 image, 1x1 padding
constexpr
unsigned
N
=
16
;
constexpr
unsigned
N
=
16
;
constexpr
unsigned
C
=
128
;
constexpr
unsigned
C
=
128
;
...
@@ -477,6 +477,18 @@ int main()
...
@@ -477,6 +477,18 @@ int main()
constexpr
unsigned
HPad
=
1
;
constexpr
unsigned
HPad
=
1
;
constexpr
unsigned
WPad
=
1
;
constexpr
unsigned
WPad
=
1
;
#elif 1
// 1x1 filter, 28x28 image
constexpr
unsigned
N
=
16
;
constexpr
unsigned
C
=
256
;
constexpr
unsigned
HI
=
28
;
constexpr
unsigned
WI
=
28
;
constexpr
unsigned
K
=
512
;
constexpr
unsigned
S
=
1
;
constexpr
unsigned
R
=
1
;
constexpr
unsigned
HPad
=
0
;
constexpr
unsigned
WPad
=
0
;
#elif 0
#elif 0
// 3x3 filter, 20x84 image, 1x1 padding
// 3x3 filter, 20x84 image, 1x1 padding
constexpr
unsigned
N
=
16
;
constexpr
unsigned
N
=
16
;
...
@@ -489,6 +501,42 @@ int main()
...
@@ -489,6 +501,42 @@ int main()
constexpr
unsigned
HPad
=
1
;
constexpr
unsigned
HPad
=
1
;
constexpr
unsigned
WPad
=
1
;
constexpr
unsigned
WPad
=
1
;
#elif 0
// 3x3 filter, 112x112 image, 1x1 padding
constexpr
unsigned
N
=
16
;
constexpr
unsigned
C
=
64
;
constexpr
unsigned
HI
=
112
;
constexpr
unsigned
WI
=
112
;
constexpr
unsigned
K
=
128
;
constexpr
unsigned
S
=
3
;
constexpr
unsigned
R
=
3
;
constexpr
unsigned
HPad
=
1
;
constexpr
unsigned
WPad
=
1
;
#elif 0
// 5x5 filter, 20x86 image, 1x1 padding
constexpr
unsigned
N
=
16
;
constexpr
unsigned
C
=
256
;
constexpr
unsigned
HI
=
20
;
constexpr
unsigned
WI
=
86
;
constexpr
unsigned
K
=
512
;
constexpr
unsigned
S
=
5
;
constexpr
unsigned
R
=
5
;
constexpr
unsigned
HPad
=
1
;
constexpr
unsigned
WPad
=
1
;
#elif 0
// 5x5 filter, 28x28 image, 2x2 padding
constexpr
unsigned
N
=
16
;
constexpr
unsigned
C
=
192
;
constexpr
unsigned
HI
=
28
;
constexpr
unsigned
WI
=
28
;
constexpr
unsigned
K
=
32
;
constexpr
unsigned
S
=
5
;
constexpr
unsigned
R
=
5
;
constexpr
unsigned
HPad
=
2
;
constexpr
unsigned
WPad
=
2
;
#endif
#endif
auto
lower_pads
=
Sequence
<
HPad
,
WPad
>
{};
auto
lower_pads
=
Sequence
<
HPad
,
WPad
>
{};
...
@@ -510,7 +558,7 @@ int main()
...
@@ -510,7 +558,7 @@ int main()
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
();
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
();
#if
1
#if
0
in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei_kcsr.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei_kcsr.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
#elif
1
#elif
1
...
@@ -518,9 +566,9 @@ int main()
...
@@ -518,9 +566,9 @@ int main()
wei_kcsr
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
wei_kcsr
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
#endif
#endif
unsigned
nrepeat
=
5
0
;
unsigned
nrepeat
=
10
0
;
#if
0
#if
1
#if 0
#if 0
device_direct_convolution_1
device_direct_convolution_1
#elif
0
#elif
0
...
@@ -531,15 +579,14 @@ int main()
...
@@ -531,15 +579,14 @@ int main()
device_implicit_gemm_convolution_1_nchw_srck_nkhw
device_implicit_gemm_convolution_1_nchw_srck_nkhw
#elif 0
#elif 0
device_implicit_gemm_convolution_1_chwn_csrk_khwn
device_implicit_gemm_convolution_1_chwn_csrk_khwn
#elif
0
#elif
1
device_implicit_gemm_convolution_2_cnhw_srck_knhw
device_implicit_gemm_convolution_2_cnhw_srck_knhw
#elif 0
#elif 0
device_winograd_convolution
device_winograd_convolution
#endif
#endif
(
in_nchw_desc
,
in_nchw
,
wei_kcsr_desc
,
wei_kcsr
,
out_nkhw_desc
,
out_nkhw_device
,
nrepeat
);
(
in_nchw_desc
,
in_nchw
,
wei_kcsr_desc
,
wei_kcsr
,
out_nkhw_desc
,
out_nkhw_device
,
nrepeat
);
#endif
#if 1
#
el
if 1
device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding
(
in_nchw_desc
,
device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding
(
in_nchw_desc
,
in_nchw
,
in_nchw
,
wei_kcsr_desc
,
wei_kcsr_desc
,
...
@@ -551,7 +598,7 @@ int main()
...
@@ -551,7 +598,7 @@ int main()
nrepeat
);
nrepeat
);
#endif
#endif
#if
1
#if
0
if(S == 3 && R == 3)
if(S == 3 && R == 3)
{
{
host_winograd_3x3_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads);
host_winograd_3x3_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads);
...
...
driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh
View file @
9bbe9073
...
@@ -102,7 +102,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
...
@@ -102,7 +102,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
constexpr
unsigned
WoPerThread
=
1
;
constexpr
unsigned
WoPerThread
=
1
;
constexpr
unsigned
BlockSize
=
128
;
constexpr
unsigned
BlockSize
=
128
;
#elif
1
#elif
0
// 3x3 58x58, NKC = 16,256,128
// 3x3 58x58, NKC = 16,256,128
constexpr
unsigned
NPerBlock
=
8
;
constexpr
unsigned
NPerBlock
=
8
;
constexpr
unsigned
KPerBlock
=
64
;
constexpr
unsigned
KPerBlock
=
64
;
...
@@ -161,6 +161,21 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
...
@@ -161,6 +161,21 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
constexpr
unsigned
HoPerThread
=
1
;
constexpr
unsigned
HoPerThread
=
1
;
constexpr
unsigned
WoPerThread
=
1
;
constexpr
unsigned
WoPerThread
=
1
;
constexpr
unsigned
BlockSize
=
128
;
#elif 1
// for 1x1, 28x28
constexpr
unsigned
NPerBlock
=
16
;
constexpr
unsigned
KPerBlock
=
128
;
constexpr
unsigned
CPerBlock
=
8
;
constexpr
unsigned
HoPerBlock
=
2
;
constexpr
unsigned
WoPerBlock
=
2
;
constexpr
unsigned
NPerThread
=
4
;
constexpr
unsigned
KPerThread
=
16
;
constexpr
unsigned
CPerThread
=
2
;
constexpr
unsigned
HoPerThread
=
1
;
constexpr
unsigned
WoPerThread
=
1
;
constexpr
unsigned
BlockSize
=
128
;
constexpr
unsigned
BlockSize
=
128
;
#endif
#endif
...
...
driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding.cuh
View file @
9bbe9073
...
@@ -164,7 +164,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding(InDesc,
...
@@ -164,7 +164,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding(InDesc,
constexpr
unsigned
WoPerThread
=
1
;
constexpr
unsigned
WoPerThread
=
1
;
constexpr
unsigned
BlockSize
=
128
;
constexpr
unsigned
BlockSize
=
128
;
#elif
1
#elif
0
// 3x3 56x56, NKC = 16,256,128, with padding
// 3x3 56x56, NKC = 16,256,128, with padding
// 3x3 28x28, NKC = 16,512,256, with padding
// 3x3 28x28, NKC = 16,512,256, with padding
// 3x3 20x84, NKC = 16,256,256, with padding
// 3x3 20x84, NKC = 16,256,256, with padding
...
@@ -180,6 +180,51 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding(InDesc,
...
@@ -180,6 +180,51 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding(InDesc,
constexpr
unsigned
HoPerThread
=
1
;
constexpr
unsigned
HoPerThread
=
1
;
constexpr
unsigned
WoPerThread
=
1
;
constexpr
unsigned
WoPerThread
=
1
;
constexpr
unsigned
BlockSize
=
128
;
#elif 0
// for 5x5 filter, 20x84 image, 1x1 padding
constexpr
unsigned
NPerBlock
=
16
;
constexpr
unsigned
KPerBlock
=
64
;
constexpr
unsigned
CPerBlock
=
1
;
constexpr
unsigned
HoPerBlock
=
2
;
constexpr
unsigned
WoPerBlock
=
4
;
constexpr
unsigned
NPerThread
=
4
;
constexpr
unsigned
KPerThread
=
16
;
constexpr
unsigned
CPerThread
=
1
;
constexpr
unsigned
HoPerThread
=
1
;
constexpr
unsigned
WoPerThread
=
1
;
constexpr
unsigned
BlockSize
=
128
;
#elif 0
// 5x5 filter, 28x28 image, 2x2 padding
constexpr
unsigned
NPerBlock
=
16
;
constexpr
unsigned
KPerBlock
=
32
;
constexpr
unsigned
CPerBlock
=
2
;
constexpr
unsigned
HoPerBlock
=
4
;
constexpr
unsigned
WoPerBlock
=
4
;
constexpr
unsigned
NPerThread
=
4
;
constexpr
unsigned
KPerThread
=
16
;
constexpr
unsigned
CPerThread
=
1
;
constexpr
unsigned
HoPerThread
=
1
;
constexpr
unsigned
WoPerThread
=
1
;
constexpr
unsigned
BlockSize
=
128
;
#elif 1
// for 1x1, 28x28
constexpr
unsigned
NPerBlock
=
16
;
constexpr
unsigned
KPerBlock
=
128
;
constexpr
unsigned
CPerBlock
=
8
;
constexpr
unsigned
HoPerBlock
=
2
;
constexpr
unsigned
WoPerBlock
=
2
;
constexpr
unsigned
NPerThread
=
4
;
constexpr
unsigned
KPerThread
=
16
;
constexpr
unsigned
CPerThread
=
2
;
constexpr
unsigned
HoPerThread
=
1
;
constexpr
unsigned
WoPerThread
=
1
;
constexpr
unsigned
BlockSize
=
128
;
constexpr
unsigned
BlockSize
=
128
;
#endif
#endif
...
@@ -229,7 +274,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding(InDesc,
...
@@ -229,7 +274,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding(InDesc,
cudaEventElapsedTime
(
&
elapsedTime
,
start
,
stop
);
cudaEventElapsedTime
(
&
elapsedTime
,
start
,
stop
);
printf
(
"Elapsed time : %f ms
\n
"
,
elapsedTime
);
printf
(
"Elapsed time : %f ms
\n
"
,
elapsedTime
);
usleep
(
1
0
000
);
usleep
(
elapsedTime
*
1000
);
}
}
checkCudaErrors
(
cudaGetLastError
());
checkCudaErrors
(
cudaGetLastError
());
...
...
driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
View file @
9bbe9073
...
@@ -93,7 +93,7 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
...
@@ -93,7 +93,7 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
constexpr
unsigned
GemmThreadPerClusterColumn
=
4
;
constexpr
unsigned
GemmThreadPerClusterColumn
=
4
;
constexpr
unsigned
BlockSize
=
128
;
constexpr
unsigned
BlockSize
=
128
;
#elif
1
#elif
0
constexpr
unsigned
BPerBlock
=
128
;
constexpr
unsigned
BPerBlock
=
128
;
constexpr
unsigned
KPerBlock
=
64
;
constexpr
unsigned
KPerBlock
=
64
;
constexpr
unsigned
CPerBlock
=
2
;
constexpr
unsigned
CPerBlock
=
2
;
...
@@ -108,6 +108,23 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
...
@@ -108,6 +108,23 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
constexpr
unsigned
InBlockCopyThreadPerDim0
=
2
;
constexpr
unsigned
InBlockCopyThreadPerDim0
=
2
;
constexpr
unsigned
InBlockCopyThreadPerDim1
=
64
;
constexpr
unsigned
InBlockCopyThreadPerDim1
=
64
;
constexpr
unsigned
BlockSize
=
128
;
#elif 1
// 1x1, 28x28
constexpr
unsigned
BPerBlock
=
64
;
constexpr
unsigned
KPerBlock
=
128
;
constexpr
unsigned
CPerBlock
=
8
;
constexpr
unsigned
BPerThread
=
4
;
constexpr
unsigned
KPerThread
=
16
;
constexpr
unsigned
CPerThread
=
2
;
constexpr
unsigned
GemmRowThreadPerCluster
=
8
;
constexpr
unsigned
GemmColumnThreadPerCluster
=
8
;
constexpr
unsigned
InBlockCopyThreadPerDim0
=
2
;
constexpr
unsigned
InBlockCopyThreadPerDim1
=
64
;
constexpr
unsigned
BlockSize
=
128
;
constexpr
unsigned
BlockSize
=
128
;
#endif
#endif
...
...
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
View file @
9bbe9073
...
@@ -7,7 +7,7 @@
...
@@ -7,7 +7,7 @@
#include "threadwise_2d_tensor_op.cuh"
#include "threadwise_2d_tensor_op.cuh"
#include "gemm.cuh"
#include "gemm.cuh"
// define B =
N*Hi*
Wi
// define B =
flatten(N, Hi,
Wi
)
template
<
unsigned
GridSize
,
template
<
unsigned
GridSize
,
unsigned
BlockSize
,
unsigned
BlockSize
,
class
Float
,
class
Float
,
...
...
src/include/gridwise_implicit_gemm_convolution_3_cnhw_srck_knhw.cuh
View file @
9bbe9073
...
@@ -115,7 +115,7 @@ gridwise_implicit_gemm_convolution_3_cnhw_srck_knhw(InGlobalDesc,
...
@@ -115,7 +115,7 @@ gridwise_implicit_gemm_convolution_3_cnhw_srck_knhw(InGlobalDesc,
decltype
(
in_cb_global_desc
),
decltype
(
in_cb_global_desc
),
decltype
(
in_cb_block_desc
),
decltype
(
in_cb_block_desc
),
decltype
(
in_cb_block_desc
.
GetLengths
())
>
{};
decltype
(
in_cb_block_desc
.
GetLengths
())
>
{};
#elif
0
#elif
1
const
auto
blockwise_in_copy
=
const
auto
blockwise_in_copy
=
blockwise_2d_tensor_copy_2
<
BlockSize
,
blockwise_2d_tensor_copy_2
<
BlockSize
,
Float
,
Float
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment