Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
yangql
composable_kernel-1
Commits
3bd51021
"vscode:/vscode.git/clone" did not exist on "14b3504d95082ffd466ae43a05951053f36718a8"
Commit
3bd51021
authored
Jan 17, 2019
by
Chao Liu
Browse files
tune implicit_gemm
parent
216e3da6
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
26 additions
and
26 deletions
+26
-26
driver/conv.cu
driver/conv.cu
+1
-1
driver/device_implicit_gemm_convolution_nchw_kcsr.cuh
driver/device_implicit_gemm_convolution_nchw_kcsr.cuh
+8
-8
driver/device_implicit_gemm_convolution_nchw_srck.cuh
driver/device_implicit_gemm_convolution_nchw_srck.cuh
+15
-15
src/include/gridwise_implicit_gemm_convolution_nchw_kcsr.cuh
src/include/gridwise_implicit_gemm_convolution_nchw_kcsr.cuh
+2
-2
No files found.
driver/conv.cu
View file @
3bd51021
...
@@ -404,7 +404,7 @@ int main()
...
@@ -404,7 +404,7 @@ int main()
#elif 1
#elif 1
device_implicit_gemm_convolution_nchw_kcsr
(
device_implicit_gemm_convolution_nchw_kcsr
(
in_nchw_desc
,
in_nchw
,
wei_kcsr_desc
,
wei_kcsr
,
out_nkhw_desc
,
out_nkhw_device
);
in_nchw_desc
,
in_nchw
,
wei_kcsr_desc
,
wei_kcsr
,
out_nkhw_desc
,
out_nkhw_device
);
#elif
0
#elif
1
device_implicit_gemm_convolution_nchw_srck
(
device_implicit_gemm_convolution_nchw_srck
(
in_nchw_desc
,
in_nchw
,
wei_kcsr_desc
,
wei_kcsr
,
out_nkhw_desc
,
out_nkhw_device
);
in_nchw_desc
,
in_nchw
,
wei_kcsr_desc
,
wei_kcsr
,
out_nkhw_desc
,
out_nkhw_device
);
#elif 0
#elif 0
...
...
driver/device_implicit_gemm_convolution_nchw_kcsr.cuh
View file @
3bd51021
...
@@ -38,20 +38,20 @@ void device_implicit_gemm_convolution_nchw_kcsr(
...
@@ -38,20 +38,20 @@ void device_implicit_gemm_convolution_nchw_kcsr(
constexpr unsigned WoPerThread = 2;
constexpr unsigned WoPerThread = 2;
constexpr unsigned BlockSize = 16;
constexpr unsigned BlockSize = 16;
#elif
0
#elif
1
constexpr
unsigned
NPerBlock
=
2
;
constexpr
unsigned
NPerBlock
=
1
;
constexpr
unsigned
KPerBlock
=
32
;
constexpr
unsigned
KPerBlock
=
64
;
constexpr
unsigned
CPerBlock
=
4
;
constexpr
unsigned
CPerBlock
=
2
;
constexpr
unsigned
HoPerBlock
=
2
;
constexpr
unsigned
HoPerBlock
=
4
;
constexpr
unsigned
WoPerBlock
=
32
;
constexpr
unsigned
WoPerBlock
=
32
;
constexpr
unsigned
KPerThread
=
4
;
constexpr
unsigned
KPerThread
=
16
;
constexpr
unsigned
CPerThread
=
2
;
constexpr
unsigned
CPerThread
=
1
;
constexpr
unsigned
HoPerThread
=
2
;
constexpr
unsigned
HoPerThread
=
2
;
constexpr
unsigned
WoPerThread
=
2
;
constexpr
unsigned
WoPerThread
=
2
;
constexpr
unsigned
BlockSize
=
128
;
constexpr
unsigned
BlockSize
=
128
;
#elif
1
#elif
0
constexpr
unsigned
NPerBlock
=
1
;
constexpr
unsigned
NPerBlock
=
1
;
constexpr
unsigned
KPerBlock
=
64
;
constexpr
unsigned
KPerBlock
=
64
;
constexpr
unsigned
CPerBlock
=
4
;
constexpr
unsigned
CPerBlock
=
4
;
...
...
driver/device_implicit_gemm_convolution_nchw_srck.cuh
View file @
3bd51021
...
@@ -63,32 +63,32 @@ void device_implicit_gemm_convolution_nchw_srck(InDesc,
...
@@ -63,32 +63,32 @@ void device_implicit_gemm_convolution_nchw_srck(InDesc,
constexpr unsigned WoPerThread = 2;
constexpr unsigned WoPerThread = 2;
constexpr unsigned BlockSize = 16;
constexpr unsigned BlockSize = 16;
#elif
1
#elif
0
constexpr
unsigned
NPerBlock
=
2
;
constexpr
unsigned
NPerBlock
=
1
;
constexpr
unsigned
KPerBlock
=
32
;
constexpr
unsigned
KPerBlock
=
64
;
constexpr
unsigned
CPerBlock
=
4
;
constexpr
unsigned
CPerBlock
=
2
;
constexpr
unsigned
HoPerBlock
=
2
;
constexpr
unsigned
HoPerBlock
=
4
;
constexpr
unsigned
WoPerBlock
=
32
;
constexpr
unsigned
WoPerBlock
=
32
;
constexpr
unsigned
KPerThread
=
4
;
constexpr
unsigned
KPerThread
=
8
;
constexpr
unsigned
CPerThread
=
2
;
constexpr
unsigned
CPerThread
=
1
;
constexpr
unsigned
HoPerThread
=
2
;
constexpr
unsigned
HoPerThread
=
2
;
constexpr
unsigned
WoPerThread
=
2
;
constexpr
unsigned
WoPerThread
=
4
;
constexpr
unsigned
BlockSize
=
128
;
constexpr
unsigned
BlockSize
=
128
;
#elif
0
#elif
1
constexpr
unsigned
NPerBlock
=
2
;
constexpr
unsigned
NPerBlock
=
1
;
constexpr
unsigned
KPerBlock
=
64
;
constexpr
unsigned
KPerBlock
=
64
;
constexpr
unsigned
CPerBlock
=
4
;
constexpr
unsigned
CPerBlock
=
2
;
constexpr
unsigned
HoPerBlock
=
2
;
constexpr
unsigned
HoPerBlock
=
4
;
constexpr
unsigned
WoPerBlock
=
32
;
constexpr
unsigned
WoPerBlock
=
32
;
constexpr
unsigned
KPerThread
=
4
;
constexpr
unsigned
KPerThread
=
16
;
constexpr
unsigned
CPerThread
=
2
;
constexpr
unsigned
CPerThread
=
1
;
constexpr
unsigned
HoPerThread
=
2
;
constexpr
unsigned
HoPerThread
=
2
;
constexpr
unsigned
WoPerThread
=
2
;
constexpr
unsigned
WoPerThread
=
2
;
constexpr
unsigned
BlockSize
=
256
;
constexpr
unsigned
BlockSize
=
128
;
#endif
#endif
constexpr
unsigned
GridSize
=
constexpr
unsigned
GridSize
=
...
...
src/include/gridwise_implicit_gemm_convolution_nchw_kcsr.cuh
View file @
3bd51021
...
@@ -185,7 +185,7 @@ __global__ void gridwise_implicit_gemm_convolution_nchw_kcsr(InGlobalDesc,
...
@@ -185,7 +185,7 @@ __global__ void gridwise_implicit_gemm_convolution_nchw_kcsr(InGlobalDesc,
in_nchw_block_desc
.
GetLengths
());
in_nchw_block_desc
.
GetLengths
());
#endif
#endif
#if
0
#if
1
// weight: global mem to LDS,
// weight: global mem to LDS,
// convert [K,C,S,R] to [S,R,C,K]
// convert [K,C,S,R] to [S,R,C,K]
blockwise_4d_tensor_copy_reorder_by_get_dst_from_src
<
BlockSize
>
(
blockwise_4d_tensor_copy_reorder_by_get_dst_from_src
<
BlockSize
>
(
...
@@ -238,7 +238,7 @@ __global__ void gridwise_implicit_gemm_convolution_nchw_kcsr(InGlobalDesc,
...
@@ -238,7 +238,7 @@ __global__ void gridwise_implicit_gemm_convolution_nchw_kcsr(InGlobalDesc,
const
unsigned
k_thread_data_begin
=
matrix_c_index
.
row_begin
;
const
unsigned
k_thread_data_begin
=
matrix_c_index
.
row_begin
;
const
unsigned
wo_thread_data_begin
=
matrix_c_index
.
col_begin
/
NPerThread
;
const
unsigned
wo_thread_data_begin
=
matrix_c_index
.
col_begin
/
NPerThread
;
#if
0
#if
1
// output: register to global mem,
// output: register to global mem,
// convert out_thread[Ho,K,Wo,N] to out_global[N,K,Ho,Wo]
// convert out_thread[Ho,K,Wo,N] to out_global[N,K,Ho,Wo]
constexpr
auto
reorder_nkhw_from_hkwn
=
Sequence
<
3
,
1
,
0
,
2
>
{};
constexpr
auto
reorder_nkhw_from_hkwn
=
Sequence
<
3
,
1
,
0
,
2
>
{};
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment