Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
86ceded9
Commit
86ceded9
authored
Aug 15, 2019
by
Chao Liu
Browse files
Merge remote-tracking branch 'origin/master' into add_padding
parents
4fb81e00
0979fb4a
Changes
8
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
82 additions
and
240 deletions
+82
-240
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp
...n_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp
+36
-27
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp
..._convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp
+3
-43
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
...n_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
+33
-61
driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
...de/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
+1
-63
driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded.hpp
...ce_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded.hpp
+7
-7
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
.../device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
+0
-37
driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
.../device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
+1
-1
driver/src/driver.cpp
driver/src/driver.cpp
+1
-1
No files found.
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp
View file @
86ceded9
...
@@ -126,7 +126,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
...
@@ -126,7 +126,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
// blockwise copy
// blockwise copy
// input: format is [C, Hi, Wi, N]
// input: format is [C, Hi, Wi, N]
auto
blockwise_in_copy
=
auto
blockwise_in_copy
=
BlockwiseGenericTensorSliceCopy_v1
<
BlockSize
,
#if 0
BlockwiseGenericTensorSliceCopy_v1
#else
BlockwiseGenericTensorSliceCopy_v2
#endif
<
BlockSize
,
decltype
(
in_c_h_w_n_global_desc
),
decltype
(
in_c_h_w_n_global_desc
),
decltype
(
in_c_h_w_n_block_desc
),
decltype
(
in_c_h_w_n_block_desc
),
decltype
(
in_c_h_w_n_block_desc
.
GetLengths
()),
decltype
(
in_c_h_w_n_block_desc
.
GetLengths
()),
...
@@ -138,13 +143,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
...
@@ -138,13 +143,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
3
,
3
,
3
,
3
,
InBlockCopyDataPerAccess_N
,
InBlockCopyDataPerAccess_N
,
InBlockCopyDataPerAccess_N
>
({
0
,
0
,
0
,
0
},
InBlockCopyDataPerAccess_N
>
({
0
,
0
,
0
,
0
},
{
0
,
0
,
0
,
0
});
{
0
,
0
,
0
,
0
});
// blockwise wei copy
// blockwise wei copy
// format is [CPerBlock, X * KPerBlock]
// format is [CPerBlock, X * KPerBlock]
const
auto
blockwise_wei_copy
=
const
auto
blockwise_wei_copy
=
BlockwiseGenericTensorSliceCopy_v1
<
BlockSize
,
#if 0
BlockwiseGenericTensorSliceCopy_v1
#else
BlockwiseGenericTensorSliceCopy_v2
#endif
<
BlockSize
,
decltype
(
wei_c_k_global_desc
),
decltype
(
wei_c_k_global_desc
),
decltype
(
wei_c_k_block_desc
),
decltype
(
wei_c_k_block_desc
),
decltype
(
wei_c_k_block_desc
.
GetLengths
()),
decltype
(
wei_c_k_block_desc
.
GetLengths
()),
...
...
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp
View file @
86ceded9
...
@@ -128,7 +128,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
...
@@ -128,7 +128,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
// blockwise copy
// blockwise copy
// input: format is [C, Hi, Wi, N]
// input: format is [C, Hi, Wi, N]
auto
blockwise_in_copy
=
auto
blockwise_in_copy
=
BlockwiseGenericTensorSliceCopy_v
1
<
BlockSize
,
BlockwiseGenericTensorSliceCopy_v
2
<
BlockSize
,
decltype
(
in_c_h_w_n_global_desc
),
decltype
(
in_c_h_w_n_global_desc
),
decltype
(
in_c_h_w_n_block_desc
),
decltype
(
in_c_h_w_n_block_desc
),
decltype
(
in_c_h_w_n_block_desc
.
GetLengths
()),
decltype
(
in_c_h_w_n_block_desc
.
GetLengths
()),
...
@@ -144,9 +144,9 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
...
@@ -144,9 +144,9 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
{
0
,
0
,
0
,
0
});
{
0
,
0
,
0
,
0
});
// blockwise wei copy
// blockwise wei copy
// format is [CPerBlock,
X *
KPerBlock]
// format is [CPerBlock, KPerBlock]
const
auto
blockwise_wei_copy
=
const
auto
blockwise_wei_copy
=
BlockwiseGenericTensorSliceCopy_v
1
<
BlockSize
,
BlockwiseGenericTensorSliceCopy_v
2
<
BlockSize
,
decltype
(
wei_c_k_global_desc
),
decltype
(
wei_c_k_global_desc
),
decltype
(
wei_c_k_block_desc
),
decltype
(
wei_c_k_block_desc
),
decltype
(
wei_c_k_block_desc
.
GetLengths
()),
decltype
(
wei_c_k_block_desc
.
GetLengths
()),
...
@@ -212,19 +212,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
...
@@ -212,19 +212,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
Float
p_out_thread_data
[
out_k_h_w_n_thread_desc
.
GetElementSpace
()];
Float
p_out_thread_data
[
out_k_h_w_n_thread_desc
.
GetElementSpace
()];
Float
*
const
p_out_thread
=
p_out_thread_data
;
Float
*
const
p_out_thread
=
p_out_thread_data
;
#if 0
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
{
print_ConstantTensorDescriptor(in_c_h_w_n_global_desc, "in_c_h_w_n_global_desc");
print_ConstantTensorDescriptor(wei_c_y_x_k_global_desc, "wei_c_y_x_k_global_desc");
print_ConstantTensorDescriptor(in_c_h_w_n_block_desc, "in_c_h_w_n_block_desc");
print_ConstantTensorDescriptor(wei_c_x_k_block_desc, "wei_c_x_k_block_desc");
printf("in_block_space %u, wei_block_space %u\n", in_block_space, wei_block_space);
}
#endif
// set threadwise output tensor to 0
// set threadwise output tensor to 0
threadwise_matrix_set_zero
(
c_k_wn_thread_mtx_desc
,
p_out_thread
);
threadwise_matrix_set_zero
(
c_k_wn_thread_mtx_desc
,
p_out_thread
);
...
@@ -249,7 +236,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
...
@@ -249,7 +236,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
CPerBlock
*
wei_c_y_x_k_global_desc
.
GetStride
(
I0
))
CPerBlock
*
wei_c_y_x_k_global_desc
.
GetStride
(
I0
))
{
{
blockwise_in_copy
.
Run
(
p_in_global_block_offset
,
p_in_block
);
blockwise_in_copy
.
Run
(
p_in_global_block_offset
,
p_in_block
);
blockwise_wei_copy
.
Run
(
p_wei_global_block_offset
,
p_wei_block
);
blockwise_wei_copy
.
Run
(
p_wei_global_block_offset
,
p_wei_block
);
__syncthreads
();
__syncthreads
();
...
@@ -299,19 +285,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
...
@@ -299,19 +285,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
.
Fold
(
I2
,
Number
<
W1
>
{},
Number
<
1
>
{})
.
Fold
(
I2
,
Number
<
W1
>
{},
Number
<
1
>
{})
.
Fold
(
I0
,
Number
<
1
>
{},
Number
<
K2
>
{});
.
Fold
(
I0
,
Number
<
1
>
{},
Number
<
K2
>
{});
#if 0
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
{
print_ConstantTensorDescriptor(out_k_h_w_n_thread_desc,
"a: out_k_h_w_n_thread_desc");
print_ConstantTensorDescriptor(out_10d_thread_desc, "a: out_10d_thread_desc");
print_ConstantTensorDescriptor(out_k_h_w_n_global_desc,
"a: out_k_h_w_n_global_desc");
print_ConstantTensorDescriptor(out_10d_global_desc, "a: out_10d_global_desc");
}
#endif
Float
*
p_out_thread_on_global
=
p_out_global
+
Float
*
p_out_thread_on_global
=
p_out_global
+
out_k_h_w_n_global_desc
.
GetOffsetFromMultiIndex
(
out_k_h_w_n_global_desc
.
GetOffsetFromMultiIndex
(
k_block_data_begin
+
k_thread_data_begin
,
k_block_data_begin
+
k_thread_data_begin
,
...
@@ -369,19 +342,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
...
@@ -369,19 +342,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
.
Fold
(
I2
,
Number
<
W1
>
{},
Number
<
1
>
{},
Number
<
W3
>
{})
.
Fold
(
I2
,
Number
<
W1
>
{},
Number
<
1
>
{},
Number
<
W3
>
{})
.
Fold
(
I0
,
Number
<
1
>
{},
Number
<
K2
>
{});
.
Fold
(
I0
,
Number
<
1
>
{},
Number
<
K2
>
{});
#if 0
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
{
print_ConstantTensorDescriptor(out_k_h_w_n_thread_desc,
"b: out_k_h_w_n_thread_desc");
print_ConstantTensorDescriptor(out_10d_thread_desc, "b: out_10d_thread_desc");
print_ConstantTensorDescriptor(out_k_h_w_n_global_desc,
"b: out_k_h_w_n_global_desc");
print_ConstantTensorDescriptor(out_10d_global_desc, "b: out_10d_global_desc");
}
#endif
Float
*
p_out_thread_on_global
=
p_out_global
+
Float
*
p_out_thread_on_global
=
p_out_global
+
out_k_h_w_n_global_desc
.
GetOffsetFromMultiIndex
(
out_k_h_w_n_global_desc
.
GetOffsetFromMultiIndex
(
k_block_data_begin
+
k_thread_data_begin
,
k_block_data_begin
+
k_thread_data_begin
,
...
...
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
View file @
86ceded9
...
@@ -156,28 +156,16 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
...
@@ -156,28 +156,16 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
static_assert
(
in_e_n1_b_n2_block_desc
.
GetStride
(
I1
)
%
GemmDataPerReadB
==
0
,
static_assert
(
in_e_n1_b_n2_block_desc
.
GetStride
(
I1
)
%
GemmDataPerReadB
==
0
,
"GemmDataPerReadB alignment requirement is not satisfied"
);
"GemmDataPerReadB alignment requirement is not satisfied"
);
#if 0
// input blockwise copy
// input blockwise copy
// slice a merged tensor, reorder and copy to a normal tensor
// slice a merged tensor, reorder and copy to a normal tensor
// this copy operator already has blockwise offset built-in
// this copy operator already has blockwise offset built-in
auto
blockwise_in_copy
=
auto
blockwise_in_copy
=
BlockwiseGenericTensorSliceCopy_v1<BlockSize,
#if 0
decltype(in_e_n1_b_n2_global_merged_desc),
BlockwiseGenericTensorSliceCopy_v1
decltype(in_e_n1_b_n2_block_desc),
decltype(in_e_n1_b_n2_block_desc.GetLengths()),
InBlockCopySubLengths_E_N1_B_N2,
InBlockCopyClusterLengths_E_N1_B_N2,
InBlockCopyThreadClusterArrangeOrder,
InBlockCopySrcAccessOrder,
InBlockCopyDstAccessOrder,
2,
3,
InBlockCopySrcDataPerRead_B,
InBlockCopyDstDataPerWrite_N2>(
{0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});
#else
#else
auto
blockwise_in_copy
=
BlockwiseGenericTensorSliceCopy_v2
BlockwiseGenericTensorSliceCopy_v2
<
BlockSize
,
#endif
<
BlockSize
,
decltype
(
in_e_n1_b_n2_global_merged_desc
),
decltype
(
in_e_n1_b_n2_global_merged_desc
),
decltype
(
in_e_n1_b_n2_block_desc
),
decltype
(
in_e_n1_b_n2_block_desc
),
decltype
(
in_e_n1_b_n2_block_desc
.
GetLengths
()),
decltype
(
in_e_n1_b_n2_block_desc
.
GetLengths
()),
...
@@ -189,9 +177,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
...
@@ -189,9 +177,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
2
,
2
,
3
,
3
,
InBlockCopySrcDataPerRead_B
,
InBlockCopySrcDataPerRead_B
,
InBlockCopyDstDataPerWrite_N2
>
(
InBlockCopyDstDataPerWrite_N2
>
({
0
,
0
,
b_block_data_on_global
,
0
},
{
0
,
0
,
0
,
0
});
{
0
,
0
,
b_block_data_on_global
,
0
},
{
0
,
0
,
0
,
0
});
#endif
// weight tensor
// weight tensor
// tensor descriptor in device memory, src of blockwise copy
// tensor descriptor in device memory, src of blockwise copy
...
@@ -204,28 +190,16 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
...
@@ -204,28 +190,16 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
Sequence
<
EPerBlock
,
KPerBlock
>
{},
Sequence
<
EPerBlock
,
KPerBlock
>
{},
Number
<
math
::
lcm
(
WeiBlockCopyDstDataPerWrite_K
,
GemmDataPerReadA
)
>
{});
Number
<
math
::
lcm
(
WeiBlockCopyDstDataPerWrite_K
,
GemmDataPerReadA
)
>
{});
#if 0
// operator for blockwise copy of weight into LDS
// operator for blockwise copy of weight into LDS
// slice a tensor, and copy it into another tensor
// slice a tensor, and copy it into another tensor
// this copy operator already have blockwise offset built-in
// this copy operator already have blockwise offset built-in
auto
blockwise_wei_copy
=
auto
blockwise_wei_copy
=
BlockwiseGenericTensorSliceCopy_v1<BlockSize,
#if 0
decltype(wei_e_k_global_desc),
BlockwiseGenericTensorSliceCopy_v1
decltype(wei_e_k_block_desc),
decltype(wei_e_k_block_desc.GetLengths()),
WeiBlockCopySubLengths_E_K,
WeiBlockCopyClusterLengths_E_K,
WeiBlockCopyThreadClusterArrangeOrder,
WeiBlockCopySrcAccessOrder,
WeiBlockCopyDstAccessOrder,
0,
1,
WeiBlockCopySrcDataPerRead_E,
WeiBlockCopyDstDataPerWrite_K>(
{0, k_block_data_on_global}, {0, 0});
#else
#else
auto
blockwise_wei_copy
=
BlockwiseGenericTensorSliceCopy_v2
BlockwiseGenericTensorSliceCopy_v2
<
BlockSize
,
#endif
<
BlockSize
,
decltype
(
wei_e_k_global_desc
),
decltype
(
wei_e_k_global_desc
),
decltype
(
wei_e_k_block_desc
),
decltype
(
wei_e_k_block_desc
),
decltype
(
wei_e_k_block_desc
.
GetLengths
()),
decltype
(
wei_e_k_block_desc
.
GetLengths
()),
...
@@ -237,9 +211,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
...
@@ -237,9 +211,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
0
,
0
,
1
,
1
,
WeiBlockCopySrcDataPerRead_E
,
WeiBlockCopySrcDataPerRead_E
,
WeiBlockCopyDstDataPerWrite_K
>
(
WeiBlockCopyDstDataPerWrite_K
>
({
0
,
k_block_data_on_global
},
{
0
,
0
});
{
0
,
k_block_data_on_global
},
{
0
,
0
});
#endif
// GEMM definition
// GEMM definition
// c_mtx += transpose(a_mtx) * b_mtx
// c_mtx += transpose(a_mtx) * b_mtx
...
@@ -430,7 +402,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
...
@@ -430,7 +402,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
out_k_n1_b_n2_global_merged_desc
.
GetOffsetFromMultiIndex
(
out_k_n1_b_n2_global_merged_desc
.
GetOffsetFromMultiIndex
(
k_thread_data_on_global
,
0
,
b_thread_data_on_global
,
0
);
k_thread_data_on_global
,
0
,
b_thread_data_on_global
,
0
);
#if
1
#if
0
ThreadwiseGenericTensorSliceCopy_v1r2<
ThreadwiseGenericTensorSliceCopy_v1r2<
decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
...
...
driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
View file @
86ceded9
...
@@ -112,38 +112,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
...
@@ -112,38 +112,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
constexpr index_t WeiBlockCopyDataPerAccess_K = 4;
constexpr index_t WeiBlockCopyDataPerAccess_K = 4;
constexpr index_t OutThreadCopyDataPerAccess_N = 2;
constexpr index_t OutThreadCopyDataPerAccess_N = 2;
#elif
0
#elif
1
// for 3x3, 34x34, v1r2, Pascal, in-block-copy1
constexpr
index_t
BlockSize
=
128
;
constexpr
index_t
NPerBlock
=
4
;
constexpr
index_t
KPerBlock
=
64
;
constexpr
index_t
CPerBlock
=
8
;
constexpr
index_t
HoPerBlock
=
4
;
constexpr
index_t
WoPerBlock
=
8
;
constexpr
index_t
NPerThread
=
4
;
constexpr
index_t
KPerThread
=
8
;
constexpr
index_t
HoPerThread
=
1
;
constexpr
index_t
WoPerThread
=
2
;
constexpr
index_t
GemmMPerThreadSubC
=
4
;
constexpr
index_t
GemmNPerThreadSubC
=
4
;
constexpr
index_t
GemmMLevel0Cluster
=
4
;
constexpr
index_t
GemmNLevel0Cluster
=
2
;
constexpr
index_t
GemmMLevel1Cluster
=
2
;
constexpr
index_t
GemmNLevel1Cluster
=
2
;
constexpr
index_t
GemmKPerThreadLoop
=
1
;
constexpr
index_t
GemmDataPerReadA
=
4
;
constexpr
index_t
GemmDataPerReadB
=
4
;
using
InBlockCopyClusterLengths_CHWN
=
Sequence
<
0
,
0
,
0
,
0
>
;
// not used
constexpr
index_t
InBlockCopyDataPerAccess_N
=
4
;
constexpr
index_t
WeiBlockCopyDataPerAccess_K
=
4
;
constexpr
index_t
OutThreadCopyDataPerAccess_N
=
2
;
#elif 0
// for 3x3, 34x34, v1r3, Pascal
// for 3x3, 34x34, v1r3, Pascal
// for 3x3, 28x28, v1r3, Pascal
// for 3x3, 28x28, v1r3, Pascal
// for 3x3, 14x14, v1r3, Pascal
// for 3x3, 14x14, v1r3, Pascal
...
@@ -179,37 +148,6 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
...
@@ -179,37 +148,6 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
constexpr
index_t
WeiBlockCopyDataPerAccess_K
=
4
;
constexpr
index_t
WeiBlockCopyDataPerAccess_K
=
4
;
constexpr
index_t
OutThreadCopyDataPerAccess_N
=
2
;
constexpr
index_t
OutThreadCopyDataPerAccess_N
=
2
;
#elif 0
// for 3x3, 34x34, v1r3, Pascal, bad
constexpr
index_t
BlockSize
=
128
;
constexpr
index_t
NPerBlock
=
1
;
constexpr
index_t
KPerBlock
=
128
;
constexpr
index_t
CPerBlock
=
8
;
constexpr
index_t
HoPerBlock
=
2
;
constexpr
index_t
WoPerBlock
=
32
;
constexpr
index_t
NPerThread
=
1
;
constexpr
index_t
KPerThread
=
8
;
constexpr
index_t
HoPerThread
=
1
;
constexpr
index_t
WoPerThread
=
8
;
constexpr
index_t
GemmMPerThreadSubC
=
4
;
constexpr
index_t
GemmNPerThreadSubC
=
4
;
constexpr
index_t
GemmMLevel0Cluster
=
4
;
constexpr
index_t
GemmNLevel0Cluster
=
2
;
constexpr
index_t
GemmMLevel1Cluster
=
4
;
constexpr
index_t
GemmNLevel1Cluster
=
2
;
constexpr
index_t
GemmKPerThreadLoop
=
1
;
constexpr
index_t
GemmDataPerReadA
=
4
;
constexpr
index_t
GemmDataPerReadB
=
4
;
using
InBlockCopyClusterLengths_CHWN
=
Sequence
<
2
,
2
,
32
,
1
>
;
constexpr
index_t
InBlockCopyDataPerAccess_N
=
1
;
constexpr
index_t
WeiBlockCopyDataPerAccess_K
=
2
;
constexpr
index_t
OutThreadCopyDataPerAccess_N
=
1
;
#elif 0
#elif 0
// for 3x3, 34x34, v1r1, Vega 20
// for 3x3, 34x34, v1r1, Vega 20
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
BlockSize
=
256
;
...
...
driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded.hpp
View file @
86ceded9
...
@@ -81,9 +81,9 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded(InDesc,
...
@@ -81,9 +81,9 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded(InDesc,
#if 1
#if 1
// v1r3, 3x3, 32x32, 1x1 pad
// v1r3, 3x3, 32x32, 1x1 pad
constexpr
index_t
BlockSize
=
128
;
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
NPerBlock
=
16
;
constexpr
index_t
NPerBlock
=
32
;
constexpr
index_t
KPerBlock
=
128
;
constexpr
index_t
KPerBlock
=
128
;
constexpr
index_t
CPerBlock
=
8
;
constexpr
index_t
CPerBlock
=
8
;
constexpr
index_t
HoPerBlock
=
2
;
constexpr
index_t
HoPerBlock
=
2
;
...
@@ -97,7 +97,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded(InDesc,
...
@@ -97,7 +97,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded(InDesc,
constexpr
index_t
GemmMPerThreadSubC
=
4
;
constexpr
index_t
GemmMPerThreadSubC
=
4
;
constexpr
index_t
GemmNPerThreadSubC
=
4
;
constexpr
index_t
GemmNPerThreadSubC
=
4
;
constexpr
index_t
GemmMLevel0Cluster
=
4
;
constexpr
index_t
GemmMLevel0Cluster
=
4
;
constexpr
index_t
GemmNLevel0Cluster
=
2
;
constexpr
index_t
GemmNLevel0Cluster
=
4
;
constexpr
index_t
GemmMLevel1Cluster
=
4
;
constexpr
index_t
GemmMLevel1Cluster
=
4
;
constexpr
index_t
GemmNLevel1Cluster
=
2
;
constexpr
index_t
GemmNLevel1Cluster
=
2
;
constexpr
index_t
GemmKPerThreadLoop
=
1
;
constexpr
index_t
GemmKPerThreadLoop
=
1
;
...
@@ -105,14 +105,14 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded(InDesc,
...
@@ -105,14 +105,14 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded(InDesc,
constexpr
index_t
GemmDataPerReadB
=
4
;
constexpr
index_t
GemmDataPerReadB
=
4
;
using
InBlockCopySubLengths_CHWN
=
Sequence
<
1
,
1
,
1
,
4
>
;
using
InBlockCopySubLengths_CHWN
=
Sequence
<
1
,
1
,
1
,
4
>
;
using
InBlockCopyClusterLengths_CHWN
=
Sequence
<
8
,
2
,
2
,
4
>
;
using
InBlockCopyClusterLengths_CHWN
=
Sequence
<
8
,
2
,
2
,
8
>
;
constexpr
index_t
InBlockCopyDataPerAccess_N
=
4
;
constexpr
index_t
InBlockCopyDataPerAccess_N
=
4
;
using
WeiBlockCopySubLengths_CK
=
Sequence
<
2
,
4
>
;
using
WeiBlockCopySubLengths_CK
=
Sequence
<
1
,
4
>
;
using
WeiBlockCopyClusterLengths_CK
=
Sequence
<
4
,
32
>
;
using
WeiBlockCopyClusterLengths_CK
=
Sequence
<
8
,
32
>
;
constexpr
index_t
WeiBlockCopyDataPerAccess_K
=
4
;
constexpr
index_t
WeiBlockCopyDataPerAccess_K
=
4
;
constexpr
index_t
OutThreadCopyDataPerAccess_N
=
2
;
constexpr
index_t
OutThreadCopyDataPerAccess_N
=
4
;
#endif
#endif
constexpr
index_t
GridSize
=
constexpr
index_t
GridSize
=
...
...
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
View file @
86ceded9
...
@@ -91,43 +91,6 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
...
@@ -91,43 +91,6 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
constexpr
index_t
WeiBlockCopySrcDataPerRead_E
=
4
;
constexpr
index_t
WeiBlockCopySrcDataPerRead_E
=
4
;
constexpr
index_t
WeiBlockCopyDstDataPerWrite_K
=
1
;
constexpr
index_t
WeiBlockCopyDstDataPerWrite_K
=
1
;
#elif 1
// each thread hold 64 data
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
BPerBlock
=
16
;
constexpr
index_t
KPerBlock
=
128
;
constexpr
index_t
EPerBlock
=
8
;
constexpr
index_t
GemmNRepeat
=
2
;
constexpr
index_t
GemmMPerThreadSubC
=
4
;
constexpr
index_t
GemmNPerThreadSubC
=
4
;
constexpr
index_t
GemmMLevel0Cluster
=
4
;
constexpr
index_t
GemmNLevel0Cluster
=
4
;
constexpr
index_t
GemmMLevel1Cluster
=
4
;
constexpr
index_t
GemmNLevel1Cluster
=
4
;
constexpr
index_t
GemmKPerThreadLoop
=
1
;
constexpr
index_t
GemmDataPerReadA
=
4
;
constexpr
index_t
GemmDataPerReadB
=
4
;
using
InBlockCopySubLengths_E_N1_B_N2
=
Sequence
<
1
,
1
,
2
,
2
>
;
using
InBlockCopyClusterLengths_E_N1_B_N2
=
Sequence
<
8
,
2
,
8
,
2
>
;
using
InBlockCopyThreadClusterArrangeOrder
=
Sequence
<
0
,
1
,
3
,
2
>
;
// [E, N1, N2, B]
using
InBlockCopySrcAccessOrder
=
Sequence
<
0
,
1
,
3
,
2
>
;
// [E, N1, N2, B]
using
InBlockCopyDstAccessOrder
=
Sequence
<
0
,
1
,
2
,
3
>
;
// [E, N1, B, N2]
constexpr
index_t
InBlockCopySrcDataPerRead_B
=
2
;
constexpr
index_t
InBlockCopyDstDataPerWrite_N2
=
2
;
using
WeiBlockCopySubLengths_E_K
=
Sequence
<
2
,
2
>
;
using
WeiBlockCopyClusterLengths_E_K
=
Sequence
<
4
,
64
>
;
using
WeiBlockCopyThreadClusterArrangeOrder
=
Sequence
<
1
,
0
>
;
// [K, E]
using
WeiBlockCopySrcAccessOrder
=
Sequence
<
1
,
0
>
;
// [K, E]
using
WeiBlockCopyDstAccessOrder
=
Sequence
<
0
,
1
>
;
// [E, K]
constexpr
index_t
WeiBlockCopySrcDataPerRead_E
=
2
;
constexpr
index_t
WeiBlockCopyDstDataPerWrite_K
=
2
;
#elif 0
#elif 0
// each thread hold 32 data
// each thread hold 32 data
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
BlockSize
=
256
;
...
...
driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
View file @
86ceded9
...
@@ -54,7 +54,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
...
@@ -54,7 +54,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
wei_kcyx_device_buf
.
ToDevice
(
wei_kcyx
.
mData
.
data
());
wei_kcyx_device_buf
.
ToDevice
(
wei_kcyx
.
mData
.
data
());
out_nkhw_device_buf
.
ToDevice
(
out_nkhw
.
mData
.
data
());
out_nkhw_device_buf
.
ToDevice
(
out_nkhw
.
mData
.
data
());
#if
0
#if
1
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
BPerBlock
=
128
;
constexpr
index_t
BPerBlock
=
128
;
...
...
driver/src/driver.cpp
View file @
86ceded9
...
@@ -229,7 +229,7 @@ int main(int argc, char* argv[])
...
@@ -229,7 +229,7 @@ int main(int argc, char* argv[])
constexpr
index_t
HPad
=
0
;
constexpr
index_t
HPad
=
0
;
constexpr
index_t
WPad
=
0
;
constexpr
index_t
WPad
=
0
;
#elif
1
#elif
0
// 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
// 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
// cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81%
// cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81%
constexpr
index_t
N
=
128
;
constexpr
index_t
N
=
128
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment