Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
bd0098af
"vscode:/vscode.git/clone" did not exist on "61cce232c72a206a3617b95dbcd4d85e2f3ac7a8"
Commit
bd0098af
authored
Apr 06, 2019
by
Chao Liu
Browse files
use dedicated threadwise_copy for 1x1, perf at 80%
parent
5245a016
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
62 additions
and
28 deletions
+62
-28
driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp
driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp
+4
-2
src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp
...dwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp
+11
-11
src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp
...implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp
+47
-15
No files found.
driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp
View file @
bd0098af
...
...
@@ -192,7 +192,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
constexpr
index_t
BlockSize
=
256
;
#elif 0
// 1x1, 14x14, Vega 20, disable lds_double_buffer, enable register double buffer
// 1x1, 14x14, Pascal, enable lds_double_buffer, disable register double buffer
constexpr
index_t
BPerBlock
=
64
;
constexpr
index_t
KPerBlock
=
128
;
...
...
@@ -245,6 +244,8 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
constexpr
index_t
InBlockCopyDataPerRead
=
4
;
constexpr
index_t
WeiBlockCopyDataPerRead
=
4
;
constexpr
index_t
OutThreadCopyDataPerWrite
=
4
;
constexpr
index_t
BlockSize
=
256
;
#endif
...
...
@@ -295,7 +296,8 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
WeiBlockCopyThreadPerDim0
,
WeiBlockCopyThreadPerDim1
,
InBlockCopyDataPerRead
,
WeiBlockCopyDataPerRead
>
{};
WeiBlockCopyDataPerRead
,
OutThreadCopyDataPerWrite
>
{};
float
time
=
launch_kernel
(
run_gridwise_convolution
<
decltype
(
gridwise_conv
),
T
>
,
dim3
(
GridSize
),
...
...
src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp
View file @
bd0098af
...
...
@@ -206,11 +206,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
p_wei_global_block_offset
+=
CPerBlock
*
wei_cyxk_global_desc
.
GetStride
(
I0
),
__syncthreads
())
{
// load data
#if 1
blockwise_in_copy
.
Run
(
p_in_global_block_offset
,
p_in_block
);
blockwise_wei_copy
.
Run
(
p_wei_global_block_offset
,
p_wei_block
);
#elif 0
// load data
Float
p_in_register_clipboard
[
blockwise_in_copy
.
GetRegisterClipboardSize
()];
Float
p_wei_register_clipboard
[
blockwise_wei_copy
.
GetRegisterClipboardSize
()];
...
...
@@ -219,9 +215,13 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
blockwise_wei_copy
.
RunLoadRegisterClipboard
(
p_wei_global_block_offset
,
p_wei_register_clipboard
);
#if 1
blockwise_in_copy
.
RunStoreRegisterClipboard
(
p_in_register_clipboard
,
p_in_block
);
blockwise_wei_copy
.
RunStoreRegisterClipboard
(
p_wei_register_clipboard
,
p_wei_block
);
#else
vmcnt
(
0
);
blockwise_in_copy
.
RunStoreRegisterClipboard_asm
(
p_in_register_clipboard
,
p_in_block
);
blockwise_wei_copy
.
RunStoreRegisterClipboard_asm
(
p_wei_register_clipboard
,
p_wei_block
);
#endif
__syncthreads
();
...
...
@@ -232,11 +232,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
{
for
(
index_t
x
=
0
;
x
<
X
;
++
x
)
{
#if
1
#if
0
blockwise_gemm.Run
#elif
0
blockwise_gemm
.
Run_RegisterDoubleBuffer
#elif
0
#elif
1
blockwise_gemm
.
Run_asm
#endif
(
p_wei_block
+
wei_cyxk_block_desc
.
Get1dIndex
(
0
,
y
,
x
,
0
),
...
...
src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp
View file @
bd0098af
...
...
@@ -5,6 +5,7 @@
#include "blockwise_4d_tensor_op.hip.hpp"
#include "blockwise_2d_tensor_op.hip.hpp"
#include "threadwise_2d_tensor_op.hip.hpp"
#include "threadwise_nd_tensor_op.hip.hpp"
#include "blockwise_gemm.hip.hpp"
// define B = flatten(N, Hi, Wi)
...
...
@@ -31,7 +32,8 @@ template <index_t GridSize,
index_t
WeiBlockCopyThreadPerDim0
,
index_t
WeiBlockCopyThreadPerDim1
,
index_t
InBlockCopyDataPerRead
,
index_t
WeiBlockCopyDataPerRead
>
index_t
WeiBlockCopyDataPerRead
,
index_t
OutThreadCopyDataPerWrite
>
struct
GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
{
__device__
void
Run
(
const
Float
*
const
__restrict__
p_in_global
,
...
...
@@ -369,6 +371,34 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
const
index_t
k_thread_data_begin
=
k_block_data_begin
+
c_thread_mtx_begin
.
row
;
const
index_t
b_thread_data_begin
=
b_block_data_begin
+
c_thread_mtx_begin
.
col
;
#if 1
if
(
Y
==
1
&&
X
==
1
)
{
// pure 1x1 conv
constexpr
index_t
K2_
=
GemmMPerThreadSubC
;
constexpr
index_t
K1_
=
KPerBlock
/
KPerThread
;
constexpr
index_t
B2_
=
GemmNPerThreadSubC
;
constexpr
index_t
B1_
=
BPerBlock
/
BPerThread
;
constexpr
auto
out_6d_global_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
K
/
(
K1_
*
K2_
),
K1_
,
K2_
,
B
/
(
B1_
*
B2_
),
B1_
,
B2_
>
{});
constexpr
auto
out_6d_thread_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
KPerBlock
/
(
K1_
*
K2_
),
1
,
K2_
,
BPerBlock
/
(
B1_
*
B2_
),
1
,
B2_
>
{});
constexpr
auto
out_kb_global_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
K
,
B
>
{});
threadwise_6d_tensor_copy
(
out_6d_thread_desc
,
p_out_thread
,
out_6d_global_desc
,
p_out_global
+
out_kb_global_desc
.
Get1dIndex
(
k_thread_data_begin
,
b_thread_data_begin
),
out_6d_thread_desc
.
GetLengths
(),
Number
<
OutThreadCopyDataPerWrite
>
{});
}
else
#endif
{
for
(
index_t
k
=
0
;
k
<
out_kb_thread_desc
.
GetLength
(
I0
);
++
k
)
{
for
(
index_t
b
=
0
;
b
<
out_kb_thread_desc
.
GetLength
(
I1
);
++
b
)
...
...
@@ -386,10 +416,12 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
if
(
n_data
<
N
&&
h_data
<
Ho
&&
w_data
<
Wo
)
{
p_out_global
[
out_khwn_global_desc
.
Get1dIndex
(
k_data
,
h_data
,
w_data
,
n_data
)]
=
p_out_global
[
out_khwn_global_desc
.
Get1dIndex
(
k_data
,
h_data
,
w_data
,
n_data
)]
=
p_out_thread
[
out_kb_thread_desc
.
Get1dIndex
(
k
,
b
)];
}
}
}
}
}
};
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment