Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
e87aa851
Commit
e87aa851
authored
Jul 13, 2019
by
Chao Liu
Browse files
adding implcit GEMM v4r2
parent
ce4ec7dc
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
198 additions
and
1257 deletions
+198
-1257
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
...n_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
+5
-6
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer.hpp
...n_implicit_gemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer.hpp
+106
-59
composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp
...l/include/tensor_description/ConstantMatrixDescriptor.hpp
+8
-4
composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp
...l/include/tensor_description/ConstantTensorDescriptor.hpp
+21
-1
driver/include/device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp
.../device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp
+35
-16
driver/src/driver.cpp
driver/src/driver.cpp
+21
-11
driver/src/driver.cu
driver/src/driver.cu
+1
-580
driver/src/driver.cu
driver/src/driver.cu
+1
-580
No files found.
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
View file @
e87aa851
...
@@ -208,13 +208,12 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
...
@@ -208,13 +208,12 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
// b_mtx[EPerBlocl, N1 * BPerBlock * N2] is in LDS
// b_mtx[EPerBlocl, N1 * BPerBlock * N2] is in LDS
// c_mtx[KPerBlock, N1 * BPerBlock * N2] is distributed among threads, and saved in
// c_mtx[KPerBlock, N1 * BPerBlock * N2] is distributed among threads, and saved in
// register
// register
constexpr
auto
a_e_k_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
constexpr
auto
a_e_k_block_mtx_desc
=
Number
<
EPerBlock
>
{},
Number
<
KPerBlock
>
{},
Number
<
wei_e_k_block_desc
.
GetStride
(
I0
)
>
{}
);
make_ConstantMatrixDescriptor_from_ConstantTensorDescriptor
(
wei_e_k_block_desc
);
constexpr
auto
b_e_n1bn2_block_mtx_desc
=
constexpr
auto
b_e_n1bn2_block_mtx_desc
=
make_ConstantMatrixDescriptor
(
Number
<
EPerBlock
>
{},
make_ConstantMatrixDescriptor_from_ConstantTensorDescriptor
(
Number
<
N1
*
BPerBlock
*
N2
>
{},
in_e_n1_b_n2_block_desc
.
Unfold
(
I1
,
I3
));
Number
<
in_e_n1_b_n2_block_desc
.
GetStride
(
I0
)
>
{});
// sanity check
// sanity check
static_assert
(
KPerBlock
%
(
GemmMPerThreadSubC
*
GemmMLevel0Cluster
*
GemmMLevel1Cluster
)
==
static_assert
(
KPerBlock
%
(
GemmMPerThreadSubC
*
GemmMLevel0Cluster
*
GemmMLevel1Cluster
)
==
...
@@ -226,7 +225,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
...
@@ -226,7 +225,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
// c_thread_mtx definition: this is a mess
// c_thread_mtx definition: this is a mess
// TODO:: more elegent way of defining c_thread_mtx
// TODO:: more elegent way of defining c_thread_mtx
constexpr
auto
c_k0k2_n1n2_thread_mtx_desc
=
make_ConstantMatrixDescriptor
(
constexpr
auto
c_k0k2_n1n2_thread_mtx_desc
=
make_ConstantMatrixDescriptor
_packed
(
Number
<
GemmMRepeat
*
GemmMPerThreadSubC
>
{},
Number
<
N1
*
N2
>
{});
Number
<
GemmMRepeat
*
GemmMPerThreadSubC
>
{},
Number
<
N1
*
N2
>
{});
const
auto
blockwise_gemm
=
BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
<
const
auto
blockwise_gemm
=
BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
<
...
...
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer.hpp
View file @
e87aa851
...
@@ -20,15 +20,18 @@ template <index_t GridSize,
...
@@ -20,15 +20,18 @@ template <index_t GridSize,
class
OutGlobalDesc
,
class
OutGlobalDesc
,
class
ConvStrides
,
class
ConvStrides
,
class
ConvDilations
,
class
ConvDilations
,
index_t
BPerBlock
,
index_t
N1
,
index_t
KPerBlock
,
index_t
EPerBlock
,
index_t
N0
,
index_t
N2
,
index_t
N2
,
index_t
Ho
0
,
index_t
Ho
1
,
index_t
Ho2
,
index_t
Ho2
,
index_t
Wo
0
,
index_t
Wo
1
,
index_t
Wo2
,
index_t
Wo2
,
index_t
BPerBlock
,
index_t
KPerBlock
,
index_t
EPerBlock
,
index_t
N0PerBlock
,
index_t
Ho0PerBlock
,
index_t
Wo0PerBlock
,
index_t
GemmMPerThreadSubC
,
index_t
GemmMPerThreadSubC
,
index_t
GemmNPerThreadSubC
,
index_t
GemmNPerThreadSubC
,
index_t
GemmMLevel0Cluster
,
index_t
GemmMLevel0Cluster
,
...
@@ -38,13 +41,12 @@ template <index_t GridSize,
...
@@ -38,13 +41,12 @@ template <index_t GridSize,
index_t
GemmKPerThreadLoop
,
index_t
GemmKPerThreadLoop
,
index_t
GemmDataPerReadA
,
index_t
GemmDataPerReadA
,
index_t
GemmDataPerReadB
,
index_t
GemmDataPerReadB
,
class
InBlockCopySubLengths_E_N
1_B_N
2
,
class
InBlockCopySubLengths_E_N
0_Ho0_Wo0_B_N2_Ho2_Wo
2
,
class
InBlockCopyClusterLengths_E_N
1_B_N
2
,
class
InBlockCopyClusterLengths_E_N
0_Ho0_Wo0_B_N2_Ho2_Wo
2
,
class
InBlockCopyThreadClusterArrangeOrder
,
class
InBlockCopyThreadClusterArrangeOrder
,
class
InBlockCopySrcAccessOrder
,
class
InBlockCopySrcAccessOrder
,
class
InBlockCopyDstAccessOrder
,
class
InBlockCopyDstAccessOrder
,
index_t
InBlockCopySrcDataPerRead_B
,
index_t
InBlockCopyDataPerAccess_W2
,
index_t
InBlockCopyDstDataPerWrite_N2
,
class
WeiBlockCopySubLengths_E_K
,
class
WeiBlockCopySubLengths_E_K
,
class
WeiBlockCopyClusterLengths_E_K
,
class
WeiBlockCopyClusterLengths_E_K
,
class
WeiBlockCopyThreadClusterArrangeOrder
,
class
WeiBlockCopyThreadClusterArrangeOrder
,
...
@@ -60,8 +62,8 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer
...
@@ -60,8 +62,8 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer
{
{
// this is a mess
// this is a mess
// TODO: find more elegent way of specifying (or calculating) performance parameters
// TODO: find more elegent way of specifying (or calculating) performance parameters
static_assert
(
N2
==
GemmNPerThreadSubC
,
"wrong!"
);
static_assert
(
N2
*
Ho2
*
Wo2
==
GemmNPerThreadSubC
,
"wrong!"
);
static_assert
((
N1
*
N2
*
BPerBlock
)
%
static_assert
((
N1
*
Ho1
*
Wo1
*
BPerBlock
*
N2
*
Ho2
*
Wo2
)
%
(
GemmNPerThreadSubC
*
GemmNLevel0Cluster
*
GemmNLevel1Cluster
)
==
(
GemmNPerThreadSubC
*
GemmNLevel0Cluster
*
GemmNLevel1Cluster
)
==
0
,
0
,
"wrong!"
);
"wrong!"
);
...
@@ -71,6 +73,7 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer
...
@@ -71,6 +73,7 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
I5
=
Number
<
5
>
{};
constexpr
auto
I5
=
Number
<
5
>
{};
constexpr
auto
I7
=
Number
<
7
>
{};
constexpr
auto
True
=
integral_constant
<
bool
,
true
>
{};
constexpr
auto
True
=
integral_constant
<
bool
,
true
>
{};
...
@@ -96,13 +99,16 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer
...
@@ -96,13 +99,16 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer
constexpr
index_t
E
=
C
*
Y
*
X
;
constexpr
index_t
E
=
C
*
Y
*
X
;
constexpr
index_t
N1
=
N
/
(
N0
*
N2
);
constexpr
index_t
B
=
N1
*
Ho1
*
Wo1
;
constexpr
index_t
Ho1
=
Ho
/
(
Ho0
*
Ho2
);
constexpr
index_t
Wo1
=
Wo
/
(
Wo0
*
Wo2
);
constexpr
index_t
B1
=
N1
*
Ho1
*
Wo1
;
static_assert
(
N
%
(
N1
*
N2
)
==
0
&&
Ho
%
(
Ho1
*
Ho2
)
==
0
&&
Wo
%
(
Wo1
*
Wo2
)
==
0
,
"wrong!"
);
constexpr
index_t
N0
=
N
/
(
N1
*
N2
);
constexpr
index_t
Ho0
=
Ho
/
(
Ho1
*
Ho2
);
constexpr
index_t
Wo0
=
Wo
/
(
Wo1
*
Wo2
);
static_assert
((
X
==
1
||
ConvDilationW
%
InBlockCopy
Src
DataPer
Read_B
==
0
),
static_assert
((
X
==
1
||
ConvDilationW
%
InBlockCopyDataPer
Access_W2
==
0
),
"wrong! aligment requirement for vectorized global load of input tensor will "
"wrong! aligment requirement for vectorized global load of input tensor will "
"be violated"
);
"be violated"
);
...
@@ -110,17 +116,17 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer
...
@@ -110,17 +116,17 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer
static_assert
(
K
%
KPerBlock
==
0
&&
B
%
BPerBlock
==
0
&&
E
%
(
2
*
EPerBlock
)
==
0
,
static_assert
(
K
%
KPerBlock
==
0
&&
B
%
BPerBlock
==
0
&&
E
%
(
2
*
EPerBlock
)
==
0
,
"wrong! cannot divide work evenly among block"
);
"wrong! cannot divide work evenly among block"
);
constexpr
index_t
KBlockWork
=
K
/
KPerBlock
;
constexpr
index_t
KBlockWork
=
K
/
KPerBlock
;
constexpr
index_t
B
1
BlockWork
=
B
1
/
B
1
PerBlock
;
constexpr
index_t
BBlockWork
=
B
/
BPerBlock
;
constexpr
auto
block_work_desc
=
constexpr
auto
block_work_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
KBlockWork
,
B
1
BlockWork
>
{});
make_ConstantTensorDescriptor_packed
(
Sequence
<
KBlockWork
,
BBlockWork
>
{});
const
auto
block_work_multi_id
=
const
auto
block_work_multi_id
=
block_work_desc
.
GetMultiIndexFrom1dIndex
(
get_block_1d_id
());
block_work_desc
.
GetMultiIndexFrom1dIndex
(
get_block_1d_id
());
const
index_t
k_block_data_on_global
=
block_work_multi_id
[
0
]
*
KPerBlock
;
const
index_t
k_block_data_on_global
=
block_work_multi_id
[
0
]
*
KPerBlock
;
const
index_t
b
1
_block_data_on_global
=
block_work_multi_id
[
1
]
*
B
1
PerBlock
;
const
index_t
b_block_data_on_global
=
block_work_multi_id
[
1
]
*
BPerBlock
;
// input tensor
// input tensor
// tensor descriptor in device memory [N0, N1, N2, Ho0, Ho1, Ho2, Wo0, Wo1, Wo2]
// tensor descriptor in device memory [N0, N1, N2, Ho0, Ho1, Ho2, Wo0, Wo1, Wo2]
...
@@ -143,7 +149,7 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer
...
@@ -143,7 +149,7 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer
.
Extract
(
Sequence
<
1
,
2
,
3
>
{});
.
Extract
(
Sequence
<
1
,
2
,
3
>
{});
// merged tensor descriptor in device memory [E, N1, B, N2], src of blockwise copy
// merged tensor descriptor in device memory [E, N1, B, N2], src of blockwise copy
constexpr
auto
in_e_n0_ho0_wo0_b
1
_n2_ho2_wo2_global_merged_desc
=
constexpr
auto
in_e_n0_ho0_wo0_b_n2_ho2_wo2_global_merged_desc
=
make_ConstantMergedTensorDescriptor
(
make_ConstantMergedTensorDescriptor
(
in_c_y_x_global_desc
.
Embed
(
in_n0_ho0_wo0_n1_ho1_wo1_n2_ho2_wo2_global_desc
),
in_c_y_x_global_desc
.
Embed
(
in_n0_ho0_wo0_n1_ho1_wo1_n2_ho2_wo2_global_desc
),
Sequence
<
0
,
1
,
2
>
{},
Sequence
<
0
,
1
,
2
>
{},
...
@@ -157,8 +163,15 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer
...
@@ -157,8 +163,15 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer
// memory layout descriptor in LDS [E, N1, B, N2], dst of blockwise copy
// memory layout descriptor in LDS [E, N1, B, N2], dst of blockwise copy
// be careful of LDS alignment
// be careful of LDS alignment
constexpr
auto
in_e_n0_ho0_wo0_b1_n2_ho2_wo2_block_desc
=
constexpr
auto
in_e_n0_ho0_wo0_b_n2_ho2_wo2_block_desc
=
in_e_n0_ho0_wo0_b1_n2_ho2_wo2_global_merged_desc
.
Pack
();
make_ConstantTensorDescriptor_packed
(
Sequence
<
EPerBlock
,
N0PerBlock
,
Ho0PerBlock
,
Wo0PerBlock
,
BPerBlock
,
N2
,
Ho2
,
Wo2
>
{});
// input blockwise copy
// input blockwise copy
// slice a merged tensor, reorder and copy to a normal tensor
// slice a merged tensor, reorder and copy to a normal tensor
...
@@ -166,17 +179,17 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer
...
@@ -166,17 +179,17 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer
auto
blockwise_in_copy
=
BlockwiseGenericTensorSliceCopy_v1
<
auto
blockwise_in_copy
=
BlockwiseGenericTensorSliceCopy_v1
<
BlockSize
,
BlockSize
,
Float
,
Float
,
decltype
(
in_e_n0_ho0_wo0_b
1
_n2_ho2_wo2_global_merged_desc
),
decltype
(
in_e_n0_ho0_wo0_b_n2_ho2_wo2_global_merged_desc
),
decltype
(
in_e_n0_ho0_wo0_b
1
_n2_ho2_wo2_block_desc
),
decltype
(
in_e_n0_ho0_wo0_b_n2_ho2_wo2_block_desc
),
decltype
(
in_e_n0_ho0_wo0_b
1
_n2_ho2_wo2_block_desc
.
GetLengths
()),
decltype
(
in_e_n0_ho0_wo0_b_n2_ho2_wo2_block_desc
.
GetLengths
()),
InBlockCopySubLengths_E_N0_Ho0_Wo0_B
1
_N2_Ho2_Wo2
,
InBlockCopySubLengths_E_N0_Ho0_Wo0_B_N2_Ho2_Wo2
,
InBlockCopyClusterLengths_E_N0_Ho0_Wo0_B
1
_N2_Ho2_Wo2
,
InBlockCopyClusterLengths_E_N0_Ho0_Wo0_B_N2_Ho2_Wo2
,
InBlockCopyThreadClusterArrangeOrder
,
InBlockCopyThreadClusterArrangeOrder
,
InBlockCopySrcAccessOrder
,
InBlockCopySrcAccessOrder
,
InBlockCopyDstAccessOrder
,
InBlockCopyDstAccessOrder
,
InBlockCopyDataPerAccess_W
o
2
,
InBlockCopyDataPerAccess_W2
,
InBlockCopyDataPerAccess_W
o
2
>
({
0
,
0
,
0
,
0
,
b
1
_block_data_on_global
,
0
,
0
,
0
},
InBlockCopyDataPerAccess_W2
>
({
0
,
0
,
0
,
0
,
b_block_data_on_global
,
0
,
0
,
0
},
{
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
});
{
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
});
// weight tensor
// weight tensor
// tensor descriptor in device memory, src of blockwise copy
// tensor descriptor in device memory, src of blockwise copy
...
@@ -219,13 +232,13 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer
...
@@ -219,13 +232,13 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer
// this check is ad-hoc
// this check is ad-hoc
// TODO: need to properly implement tensor descriptor with multiple alignment
// TODO: need to properly implement tensor descriptor with multiple alignment
// requirements
// requirements
static_assert
(
in_e_n0_ho0_wo0_b
1
_n2_ho2_wo2_block_desc
.
GetStrides
()[
3
]
%
GemmDataPerReadB
==
static_assert
(
in_e_n0_ho0_wo0_b_n2_ho2_wo2_block_desc
.
GetStrides
()[
3
]
%
GemmDataPerReadB
==
0
,
0
,
"GemmDataPerReadB alignment requirement is not satisfied"
);
"GemmDataPerReadB alignment requirement is not satisfied"
);
constexpr
auto
b_e_n0ho0wo0b
1
n2ho2wo2_block_mtx_desc
=
constexpr
auto
b_e_n0ho0wo0bn2ho2wo2_block_mtx_desc
=
make_ConstantMatrixDescriptor_from_ConstantTensorDescriptor
(
make_ConstantMatrixDescriptor_from_ConstantTensorDescriptor
(
in_e_n0_ho0_wo0_b
1
_n2_ho2_wo2_block_desc
.
Unfold
(
I1
,
I7
));
in_e_n0_ho0_wo0_b_n2_ho2_wo2_block_desc
.
Unfold
(
I1
,
I7
));
// sanity check
// sanity check
static_assert
(
KPerBlock
%
(
GemmMPerThreadSubC
*
GemmMLevel0Cluster
*
GemmMLevel1Cluster
)
==
static_assert
(
KPerBlock
%
(
GemmMPerThreadSubC
*
GemmMLevel0Cluster
*
GemmMLevel1Cluster
)
==
...
@@ -237,13 +250,15 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer
...
@@ -237,13 +250,15 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer
// c_thread_mtx definition: this is a mess
// c_thread_mtx definition: this is a mess
// TODO:: more elegent way of defining c_thread_mtx
// TODO:: more elegent way of defining c_thread_mtx
constexpr
auto
c_k0k2_n0ho0wo0n2ho2wo2_thread_mtx_desc
=
make_ConstantMatrixDescriptor
(
constexpr
auto
c_k0k2_n0ho0wo0n2ho2wo2_thread_mtx_desc
=
Number
<
GemmMRepeat
*
GemmMPerThreadSubC
>
{},
Number
<
N0
*
Ho0
*
Wo0
*
N2
*
Ho2
*
Wo2
>
{});
make_ConstantMatrixDescriptor_packed
(
Number
<
GemmMRepeat
*
GemmMPerThreadSubC
>
{},
Number
<
N0PerBlock
*
Ho0PerBlock
*
Wo0PerBlock
*
N2
*
Ho2
*
Wo2
>
{});
const
auto
blockwise_gemm
=
BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
<
const
auto
blockwise_gemm
=
BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
<
BlockSize
,
BlockSize
,
decltype
(
a_e_k_block_mtx_desc
),
decltype
(
a_e_k_block_mtx_desc
),
decltype
(
b_e_n0ho0wo0b
1
n2ho2wo2_block_mtx_desc
),
decltype
(
b_e_n0ho0wo0bn2ho2wo2_block_mtx_desc
),
decltype
(
c_k0k2_n0ho0wo0n2ho2wo2_thread_mtx_desc
),
decltype
(
c_k0k2_n0ho0wo0n2ho2wo2_thread_mtx_desc
),
GemmMPerThreadSubC
,
GemmMPerThreadSubC
,
GemmNPerThreadSubC
,
GemmNPerThreadSubC
,
...
@@ -256,13 +271,13 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer
...
@@ -256,13 +271,13 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer
GemmDataPerReadB
>
{};
GemmDataPerReadB
>
{};
// LDS allocation for input and weight: be careful of alignment
// LDS allocation for input and weight: be careful of alignment
constexpr
index_t
max_align
=
math
::
lcm
(
InBlockCopyD
stD
ataPer
Write_N
2
,
constexpr
index_t
max_align
=
math
::
lcm
(
InBlockCopyDataPer
Access_W
2
,
WeiBlockCopyDstDataPerWrite_K
,
WeiBlockCopyDstDataPerWrite_K
,
GemmDataPerReadA
,
GemmDataPerReadA
,
GemmDataPerReadB
);
GemmDataPerReadB
);
constexpr
index_t
in_block_space
=
math
::
integer_least_multiple
(
constexpr
index_t
in_block_space
=
math
::
integer_least_multiple
(
in_e_n0_ho0_wo0_b
1
_n2_ho2_wo2_block_desc
.
GetElementSpace
(),
max_align
);
in_e_n0_ho0_wo0_b_n2_ho2_wo2_block_desc
.
GetElementSpace
(),
max_align
);
constexpr
index_t
wei_block_space
=
constexpr
index_t
wei_block_space
=
math
::
integer_least_multiple
(
wei_e_k_block_desc
.
GetElementSpace
(),
max_align
);
math
::
integer_least_multiple
(
wei_e_k_block_desc
.
GetElementSpace
(),
max_align
);
...
@@ -369,8 +384,18 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer
...
@@ -369,8 +384,18 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer
// define tensor descriptor for threadwise copy
// define tensor descriptor for threadwise copy
// output memory layout descriptor in register
// output memory layout descriptor in register
constexpr
auto
out_k0_k1_k2_n0_ho0_wo0_n1_ho1_wo1_n2_ho2_wo2_thread_mem_desc
=
constexpr
auto
out_k0_k1_k2_n0_ho0_wo0_n1_ho1_wo1_n2_ho2_wo2_thread_mem_desc
=
make_ConstantTensorDescriptor_packed
(
make_ConstantTensorDescriptor_packed
(
Sequence
<
KPerBlock
/
(
K1
*
K2
),
Sequence
<
KPerBlock
/
(
K1
*
K2
),
1
,
K2
,
N0
,
ho0
,
wo0
,
1
,
1
,
1
,
N2
,
Ho2
,
Wo2
>
{});
1
,
K2
,
N0PerBlock
,
Ho0PerBlock
,
Wo0PerBlock
,
1
,
1
,
1
,
N2
,
Ho2
,
Wo2
>
{});
// output tensor descriptor in register, src of threadwise copy
// output tensor descriptor in register, src of threadwise copy
constexpr
auto
out_n0_n1_n2_k0_k1_k2_ho0_ho1_ho2_wo0_wo1_wo2_thread_desc
=
constexpr
auto
out_n0_n1_n2_k0_k1_k2_ho0_ho1_ho2_wo0_wo1_wo2_thread_desc
=
...
@@ -378,7 +403,7 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer
...
@@ -378,7 +403,7 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer
Sequence
<
3
,
6
,
9
,
0
,
1
,
2
,
4
,
7
,
10
,
5
,
8
,
11
>
{});
Sequence
<
3
,
6
,
9
,
0
,
1
,
2
,
4
,
7
,
10
,
5
,
8
,
11
>
{});
// output memory layout descriptor in device memory, dst of threadwise copy
// output memory layout descriptor in device memory, dst of threadwise copy
constexpr
auto
out_n0_n1_n2_k0_k1_k2_ho0_ho1_ho2_wo0_wo1_wo2_global_desc
=
constexpr
auto
out_n0_n1_n2_k0_k1_k2_ho0_ho1_ho2_wo0_wo1_wo2_global_
mem_
desc
=
out_n_k_h_w_global_desc
.
Fold
(
I3
,
Sequence
<
Wo1
,
Wo2
>
{})
out_n_k_h_w_global_desc
.
Fold
(
I3
,
Sequence
<
Wo1
,
Wo2
>
{})
.
Fold
(
I2
,
Sequence
<
Ho1
,
Ho2
>
{})
.
Fold
(
I2
,
Sequence
<
Ho1
,
Ho2
>
{})
.
Fold
(
I1
,
Sequence
<
K1
,
K2
>
{})
.
Fold
(
I1
,
Sequence
<
K1
,
K2
>
{})
...
@@ -393,33 +418,55 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer
...
@@ -393,33 +418,55 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer
k_block_data_on_global
+
c_thread_mtx_on_block
.
row
;
k_block_data_on_global
+
c_thread_mtx_on_block
.
row
;
const
index_t
b_thread_data_on_global
=
const
index_t
b_thread_data_on_global
=
b_block_data_on_global
+
c_thread_mtx_on_block
.
col
/
N2
;
b_block_data_on_global
+
c_thread_mtx_on_block
.
col
/
(
N2
*
Ho2
*
Wo2
)
;
// output merged global tensor descriptor, for calculating origin of thread tensor
// output merged global tensor descriptor, for calculating origin of thread tensor
// in global memory
// in global memory
constexpr
auto
out_k_n1_b_n2_global_merged_desc
=
make_ConstantMergedTensorDescriptor
(
constexpr
auto
out_k_n0_ho0_wo0_b_n2_ho2_wo2_global_merged_desc
=
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc
.
Unfold
(
I3
,
I5
),
make_ConstantMergedTensorDescriptor
(
Sequence
<
3
>
{},
out_n0_n1_n2_k0_k1_k2_ho0_ho1_ho2_wo0_wo1_wo2_global_mem_desc
.
Unfold
(
I3
,
I5
),
Sequence
<
1
>
{},
Sequence
<
3
>
{},
Sequence
<
0
,
4
,
5
>
{},
Sequence
<
0
>
{},
Sequence
<
2
>
{});
Sequence
<
4
>
{},
Sequence
<
7
>
{},
Sequence
<
1
,
5
,
8
>
{},
Sequence
<
2
>
{},
Sequence
<
6
>
{},
Sequence
<
9
>
{});
// origin of dst in device memory
// origin of dst in device memory
Float
*
p_out_thread_on_global
=
Float
*
p_out_thread_on_global
=
p_out_global
+
p_out_global
+
out_k_n
1_b_n
2_global_merged_desc
.
GetOffsetFromMultiIndex
(
out_k_n
0_ho0_wo0_b_n2_ho2_wo
2_global_merged_desc
.
GetOffsetFromMultiIndex
(
k_thread_data_on_global
,
0
,
b_thread_data_on_global
,
0
);
k_thread_data_on_global
,
0
,
0
,
0
,
b_thread_data_on_global
,
0
,
0
,
0
);
#if 1
threadwise_generic_tensor_slice_copy_v1
(
threadwise_generic_tensor_slice_copy_v1
(
out_n0_n1_n2_k0_k1_k2_h
_w
_thread_desc
,
out_n0_n1_n2_k0_k1_k2_h
o0_ho1_ho2_wo0_wo1_wo2
_thread_desc
,
p_out_thread
,
p_out_thread
,
{
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
},
{
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
},
out_n0_n1_n2_k0_k1_k2_h
_w
_global_mem_desc
,
out_n0_n1_n2_k0_k1_k2_h
o0_ho1_ho2_wo0_wo1_wo2
_global_mem_desc
,
p_out_thread_on_global
,
p_out_thread_on_global
,
{
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
},
{
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
},
out_n0_n1_n2_k0_k1_k2_h
_w
_thread_desc
.
GetLengths
(),
out_n0_n1_n2_k0_k1_k2_h
o0_ho1_ho2_wo0_wo1_wo2
_thread_desc
.
GetLengths
(),
arithmetic_sequence_gen
<
0
,
8
,
1
>::
type
{},
arithmetic_sequence_gen
<
0
,
12
,
1
>::
type
{},
Number
<
1
>
{});
Number
<
1
>
{});
#else
if
(
get_thread_local_1d_id
()
==
0
&&
get_block_1d_id
()
==
0
)
{
print_ConstantTensorDescriptor
(
"out thread: "
,
out_n0_n1_n2_k0_k1_k2_ho0_ho1_ho2_wo0_wo1_wo2_thread_desc
);
printf
(
"size: %d
\n
"
,
out_n0_n1_n2_k0_k1_k2_ho0_ho1_ho2_wo0_wo1_wo2_thread_desc
.
GetElementSize
());
for
(
index_t
i
=
0
;
i
<
out_n0_n1_n2_k0_k1_k2_ho0_ho1_ho2_wo0_wo1_wo2_thread_desc
.
GetElementSize
();
++
i
)
{
p_out_global
[
0
]
=
p_out_thread
[
i
];
}
}
#endif
}
}
}
}
};
};
...
...
composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp
View file @
e87aa851
...
@@ -52,11 +52,15 @@ __host__ __device__ constexpr auto
...
@@ -52,11 +52,15 @@ __host__ __device__ constexpr auto
return
ConstantMatrixDescriptor
<
NRow
,
NCol
,
RowStride
>
{};
return
ConstantMatrixDescriptor
<
NRow
,
NCol
,
RowStride
>
{};
}
}
template
<
index_t
NRow
,
index_t
NCol
,
index_t
RowStride
>
template
<
class
TDesc
>
__host__
__device__
constexpr
auto
make_ConstantMatrixDescriptor_from_ConstantTensorDescriptor
(
__host__
__device__
constexpr
auto
Constant
Tensor
Descriptor
<
Sequence
<
NRow
,
NCol
>
,
Sequence
<
RowStride
,
1
>>
)
make_
Constant
Matrix
Descriptor
_from_ConstantTensorDescriptor
(
TDesc
)
{
{
return
ConstantMatrixDescriptor
<
NRow
,
NCol
,
RowStride
>
{};
static_assert
(
TDesc
::
GetNumOfDimension
()
==
2
,
"wrong"
);
static_assert
(
TDesc
::
GetStrides
()[
1
]
==
1
,
"wrong"
);
return
ConstantMatrixDescriptor
<
TDesc
::
GetLengths
()[
0
],
TDesc
::
GetLengths
()[
1
],
TDesc
::
GetStrides
()[
0
]
>
{};
}
}
template
<
class
TDesc
>
template
<
class
TDesc
>
...
...
composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp
View file @
e87aa851
...
@@ -456,7 +456,7 @@ print_ConstantTensorDescriptor(const char* s,
...
@@ -456,7 +456,7 @@ print_ConstantTensorDescriptor(const char* s,
{
{
constexpr
index_t
ndim
=
sizeof
...(
Lengths
);
constexpr
index_t
ndim
=
sizeof
...(
Lengths
);
static_assert
(
ndim
>
0
&&
ndim
<=
1
0
,
"wrong!"
);
static_assert
(
ndim
>
0
&&
ndim
<=
1
2
,
"wrong!"
);
static_if
<
ndim
==
1
>
{}([
&
](
auto
)
{
static_if
<
ndim
==
1
>
{}([
&
](
auto
)
{
printf
(
"%s dim %u, lengths {%u}, strides {%u}
\n
"
,
s
,
ndim
,
Lengths
...,
Strides
...);
printf
(
"%s dim %u, lengths {%u}, strides {%u}
\n
"
,
s
,
ndim
,
Lengths
...,
Strides
...);
...
@@ -528,6 +528,26 @@ print_ConstantTensorDescriptor(const char* s,
...
@@ -528,6 +528,26 @@ print_ConstantTensorDescriptor(const char* s,
Lengths
...,
Lengths
...,
Strides
...);
Strides
...);
});
});
static_if
<
ndim
==
11
>
{}([
&
](
auto
)
{
printf
(
"%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u "
"%u %u "
"%u %u %u}
\n
"
,
s
,
ndim
,
Lengths
...,
Strides
...);
});
static_if
<
ndim
==
12
>
{}([
&
](
auto
)
{
printf
(
"%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u "
"%u %u %u %u "
"%u %u %u}
\n
"
,
s
,
ndim
,
Lengths
...,
Strides
...);
});
}
}
}
// namespace ck
}
// namespace ck
...
...
driver/include/device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp
View file @
e87aa851
...
@@ -53,18 +53,27 @@ void device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw(InDesc,
...
@@ -53,18 +53,27 @@ void device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw(InDesc,
wei_kcyx_device_buf
.
ToDevice
(
wei_kcyx
.
mData
.
data
());
wei_kcyx_device_buf
.
ToDevice
(
wei_kcyx
.
mData
.
data
());
out_nkhw_device_buf
.
ToDevice
(
out_nkhw
.
mData
.
data
());
out_nkhw_device_buf
.
ToDevice
(
out_nkhw
.
mData
.
data
());
#if 1
// 1x1 filter, 8x8 image
constexpr
index_t
N1
=
2
;
constexpr
index_t
N1
=
2
;
constexpr
index_t
N2
=
4
;
constexpr
index_t
N2
=
1
;
constexpr
index_t
B
=
(
N
*
Ho
*
Wo
)
/
(
N1
*
N2
);
constexpr
index_t
Ho1
=
8
;
constexpr
index_t
Ho2
=
1
;
constexpr
index_t
Wo1
=
1
;
constexpr
index_t
Wo2
=
4
;
#if 1
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
BPerBlock
=
16
;
constexpr
index_t
BPerBlock
=
16
;
constexpr
index_t
KPerBlock
=
128
;
constexpr
index_t
KPerBlock
=
128
;
constexpr
index_t
EPerBlock
=
8
;
constexpr
index_t
EPerBlock
=
8
;
constexpr
index_t
N0PerBlock
=
1
;
constexpr
index_t
Ho0PerBlock
=
1
;
constexpr
index_t
Wo0PerBlock
=
2
;
constexpr
index_t
GemmMPerThreadSubC
=
4
;
constexpr
index_t
GemmMPerThreadSubC
=
4
;
constexpr
index_t
GemmNPerThreadSubC
=
4
;
constexpr
index_t
GemmNPerThreadSubC
=
4
;
constexpr
index_t
GemmMLevel0Cluster
=
4
;
constexpr
index_t
GemmMLevel0Cluster
=
4
;
...
@@ -75,14 +84,16 @@ void device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw(InDesc,
...
@@ -75,14 +84,16 @@ void device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw(InDesc,
constexpr
index_t
GemmDataPerReadA
=
4
;
constexpr
index_t
GemmDataPerReadA
=
4
;
constexpr
index_t
GemmDataPerReadB
=
4
;
constexpr
index_t
GemmDataPerReadB
=
4
;
using
InBlockCopySubLengths_E_N1_B_N2
=
Sequence
<
1
,
1
,
1
,
4
>
;
using
InBlockCopySubLengths_E_N0_Ho0_Wo0_B_N2_Ho2_Wo2
=
Sequence
<
1
,
1
,
1
,
1
,
1
,
1
,
1
,
4
>
;
using
InBlockCopyClusterLengths_E_N1_B_N2
=
Sequence
<
8
,
2
,
16
,
1
>
;
using
InBlockCopyClusterLengths_E_N0_Ho0_Wo0_B_N2_Ho2_Wo2
=
Sequence
<
8
,
1
,
1
,
2
,
16
,
1
,
1
,
1
>
;
using
InBlockCopyThreadClusterArrangeOrder
=
Sequence
<
0
,
1
,
3
,
2
>
;
// [E, N1, N2, B]
using
InBlockCopyThreadClusterArrangeOrder
=
using
InBlockCopySrcAccessOrder
=
Sequence
<
0
,
1
,
3
,
2
>
;
// [E, N1, N2, B]
Sequence
<
0
,
1
,
5
,
2
,
6
,
3
,
4
,
7
>
;
// [E, N0, N2, Ho0, Ho2, Wo0, B, Wo2]
using
InBlockCopyDstAccessOrder
=
Sequence
<
0
,
1
,
2
,
3
>
;
// [E, N1, B, N2]
using
InBlockCopySrcAccessOrder
=
Sequence
<
0
,
1
,
5
,
2
,
6
,
3
,
4
,
7
>
;
// [E, N0, N2, Ho0, Ho2, Wo0, B, Wo2]
using
InBlockCopyDstAccessOrder
=
Sequence
<
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
>
;
// [E, N0, Ho0, Wo0, B, N2, Ho2, Wo2]
constexpr
index_t
InBlockCopySrcDataPerRead_B
=
1
;
constexpr
index_t
InBlockCopyDataPerAccess_W2
=
4
;
constexpr
index_t
InBlockCopyDstDataPerWrite_N2
=
4
;
using
WeiBlockCopySubLengths_E_K
=
Sequence
<
4
,
1
>
;
using
WeiBlockCopySubLengths_E_K
=
Sequence
<
4
,
1
>
;
using
WeiBlockCopyClusterLengths_E_K
=
Sequence
<
2
,
128
>
;
using
WeiBlockCopyClusterLengths_E_K
=
Sequence
<
2
,
128
>
;
...
@@ -94,6 +105,8 @@ void device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw(InDesc,
...
@@ -94,6 +105,8 @@ void device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw(InDesc,
constexpr
index_t
WeiBlockCopyDstDataPerWrite_K
=
1
;
constexpr
index_t
WeiBlockCopyDstDataPerWrite_K
=
1
;
#endif
#endif
constexpr
index_t
B
=
N1
*
Ho1
*
Wo1
;
constexpr
index_t
GridSize
=
constexpr
index_t
GridSize
=
((
B
+
BPerBlock
-
1
)
/
BPerBlock
)
*
((
K
+
KPerBlock
-
1
)
/
KPerBlock
);
((
B
+
BPerBlock
-
1
)
/
BPerBlock
)
*
((
K
+
KPerBlock
-
1
)
/
KPerBlock
);
...
@@ -111,11 +124,18 @@ void device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw(InDesc,
...
@@ -111,11 +124,18 @@ void device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw(InDesc,
decltype
(
out_nkhw_desc
),
decltype
(
out_nkhw_desc
),
ConvStrides
,
ConvStrides
,
ConvDilations
,
ConvDilations
,
N1
,
N2
,
Ho1
,
Ho2
,
Wo1
,
Wo2
,
BPerBlock
,
BPerBlock
,
KPerBlock
,
KPerBlock
,
EPerBlock
,
EPerBlock
,
N1
,
N0PerBlock
,
N2
,
Ho0PerBlock
,
Wo0PerBlock
,
GemmMPerThreadSubC
,
GemmMPerThreadSubC
,
GemmNPerThreadSubC
,
GemmNPerThreadSubC
,
GemmMLevel0Cluster
,
GemmMLevel0Cluster
,
...
@@ -125,13 +145,12 @@ void device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw(InDesc,
...
@@ -125,13 +145,12 @@ void device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw(InDesc,
GemmKPerThreadLoop
,
GemmKPerThreadLoop
,
GemmDataPerReadA
,
GemmDataPerReadA
,
GemmDataPerReadB
,
GemmDataPerReadB
,
InBlockCopySubLengths_E_N
1_B_N
2
,
InBlockCopySubLengths_E_N
0_Ho0_Wo0_B_N2_Ho2_Wo
2
,
InBlockCopyClusterLengths_E_N
1_B_N
2
,
InBlockCopyClusterLengths_E_N
0_Ho0_Wo0_B_N2_Ho2_Wo
2
,
InBlockCopyThreadClusterArrangeOrder
,
InBlockCopyThreadClusterArrangeOrder
,
InBlockCopySrcAccessOrder
,
InBlockCopySrcAccessOrder
,
InBlockCopyDstAccessOrder
,
InBlockCopyDstAccessOrder
,
InBlockCopySrcDataPerRead_B
,
InBlockCopyDataPerAccess_W2
,
InBlockCopyDstDataPerWrite_N2
,
WeiBlockCopySubLengths_E_K
,
WeiBlockCopySubLengths_E_K
,
WeiBlockCopyClusterLengths_E_K
,
WeiBlockCopyClusterLengths_E_K
,
WeiBlockCopyThreadClusterArrangeOrder
,
WeiBlockCopyThreadClusterArrangeOrder
,
...
...
driver/src/driver.cpp
View file @
e87aa851
...
@@ -14,7 +14,7 @@
...
@@ -14,7 +14,7 @@
#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
//
#include "device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp"
struct
GeneratorTensor_1
struct
GeneratorTensor_1
{
{
...
@@ -524,19 +524,29 @@ int main(int argc, char* argv[])
...
@@ -524,19 +524,29 @@ int main(int argc, char* argv[])
#elif 0
#elif 0
device_convolution_implicit_gemm_v2_chwn_cyxk_khwn
device_convolution_implicit_gemm_v2_chwn_cyxk_khwn
in_nchw_desc
,
in_nchw
,
wei_kcyx_desc
,
wei_kcyx
,
out_nkhw_desc
,
out_nkhw_device
,
nrepeat
);
in_nchw_desc
,
in_nchw
,
wei_kcyx_desc
,
wei_kcyx
,
out_nkhw_desc
,
out_nkhw_device
,
nrepeat
);
#elif
1
#elif
0
device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw
(
device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw
(
in_nchw_desc
,
in_nchw
,
wei_kcyx_desc
,
wei_kcyx
,
out_nkhw_desc
,
out_nkhw_device
,
nrepeat
);
in_nchw_desc
,
in_nchw
,
wei_kcyx_desc
,
wei_kcyx
,
out_nkhw_desc
,
out_nkhw_device
,
nrepeat
);
#elif 0
#elif 0
device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw
(
in_nchw_desc
,
device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw
(
in_nchw_desc
,
in_nchw
,
in_nchw
,
wei_kcyx_desc
,
wei_kcyx_desc
,
wei_kcyx
,
wei_kcyx
,
out_nkhw_desc
,
out_nkhw_desc
,
out_nkhw_device
,
out_nkhw_device
,
ConvStrides
{},
ConvStrides
{},
ConvDilations
{},
ConvDilations
{},
nrepeat
);
nrepeat
);
#elif 1
device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw
(
in_nchw_desc
,
in_nchw
,
wei_kcyx_desc
,
wei_kcyx
,
out_nkhw_desc
,
out_nkhw_device
,
ConvStrides
{},
ConvDilations
{},
nrepeat
);
#elif 0
#elif 0
device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded
(
in_nchw_desc
,
device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded
(
in_nchw_desc
,
in_nchw
,
in_nchw
,
...
...
driver/src/driver.cu
deleted
100644 → 0
View file @
ce4ec7dc
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include "config.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "device.hpp"
#include "conv_common.hpp"
#include "host_conv.hpp"
#include "device_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"
#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
//#include "device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp"
struct
GeneratorTensor_1
{
template
<
class
...
Is
>
double
operator
()(
Is
...
is
)
{
return
1
;
}
};
struct
GeneratorTensor_2
{
int
min_value
=
0
;
int
max_value
=
1
;
template
<
class
...
Is
>
double
operator
()(
Is
...)
{
return
(
std
::
rand
()
%
(
max_value
-
min_value
))
+
min_value
;
}
};
struct
GeneratorTensor_3
{
template
<
class
...
Is
>
double
operator
()(
Is
...
is
)
{
std
::
array
<
index_t
,
sizeof
...(
Is
)
>
dims
=
{{
static_cast
<
index_t
>
(
is
)...}};
auto
f_acc
=
[](
auto
a
,
auto
b
)
{
return
100
*
a
+
b
;
};
return
std
::
accumulate
(
dims
.
begin
(),
dims
.
end
(),
index_t
(
0
),
f_acc
);
}
};
struct
GeneratorTensor_Checkboard
{
template
<
class
...
Ts
>
double
operator
()(
Ts
...
Xs
)
const
{
std
::
array
<
index_t
,
sizeof
...(
Ts
)
>
dims
=
{{
Xs
...}};
return
std
::
accumulate
(
dims
.
begin
(),
dims
.
end
(),
true
,
[](
bool
init
,
index_t
x
)
->
int
{
return
init
!=
(
x
%
2
);
})
?
1
:
-
1
;
}
};
int
main
(
int
argc
,
char
*
argv
[])
{
using
namespace
ck
;
#if 0
constexpr index_t N = 8;
constexpr index_t C = 16;
constexpr index_t HI = 3;
constexpr index_t WI = 18;
constexpr index_t K = 128;
constexpr index_t Y = 3;
constexpr index_t X = 3;
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#elif
0
// 3x3, 34x34
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
256
;
constexpr
index_t
HI
=
34
;
constexpr
index_t
WI
=
34
;
constexpr
index_t
K
=
128
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
using
ConvStrides
=
Sequence
<
2
,
2
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
constexpr
index_t
HPad
=
0
;
constexpr
index_t
WPad
=
0
;
#elif 0
// 3x3, 56x56
constexpr
index_t
N
=
64
;
constexpr
index_t
C
=
64
;
constexpr
index_t
HI
=
56
;
constexpr
index_t
WI
=
56
;
constexpr
index_t
K
=
128
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
constexpr
index_t
HPad
=
0
;
constexpr
index_t
WPad
=
0
;
#elif 0
// 3x3 filter, 28x28 image
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
256
;
constexpr
index_t
HI
=
28
;
constexpr
index_t
WI
=
28
;
constexpr
index_t
K
=
128
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
constexpr
index_t
HPad
=
0
;
constexpr
index_t
WPad
=
0
;
#elif 0
// 1x1 filter, 28x28 image
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
512
;
constexpr
index_t
HI
=
28
;
constexpr
index_t
WI
=
28
;
constexpr
index_t
K
=
512
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
constexpr
index_t
HPad
=
0
;
constexpr
index_t
WPad
=
0
;
#elif 0
// 3x3 filter, 20x84 image, 1x1 padding
constexpr
index_t
N
=
16
;
constexpr
index_t
C
=
256
;
constexpr
index_t
HI
=
20
;
constexpr
index_t
WI
=
84
;
constexpr
index_t
K
=
256
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
constexpr
index_t
HPad
=
1
;
constexpr
index_t
WPad
=
1
;
#elif 0
// 3x3 filter, 112x112 image, 1x1 padding
constexpr
index_t
N
=
16
;
constexpr
index_t
C
=
64
;
constexpr
index_t
HI
=
112
;
constexpr
index_t
WI
=
112
;
constexpr
index_t
K
=
128
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
constexpr
index_t
HPad
=
1
;
constexpr
index_t
WPad
=
1
;
#elif 0
// 5x5 filter, 20x86 image
constexpr
index_t
N
=
16
;
constexpr
index_t
C
=
256
;
constexpr
index_t
HI
=
20
;
constexpr
index_t
WI
=
86
;
constexpr
index_t
K
=
512
;
constexpr
index_t
Y
=
5
;
constexpr
index_t
X
=
5
;
constexpr
index_t
HPad
=
0
;
constexpr
index_t
WPad
=
0
;
#elif 0
// 5x5 filter, 20x86 image, 1x1 padding
constexpr
index_t
N
=
16
;
constexpr
index_t
C
=
256
;
constexpr
index_t
HI
=
20
;
constexpr
index_t
WI
=
86
;
constexpr
index_t
K
=
512
;
constexpr
index_t
Y
=
5
;
constexpr
index_t
X
=
5
;
constexpr
index_t
HPad
=
1
;
constexpr
index_t
WPad
=
1
;
#elif 0
// 5x5 filter, 28x28 image, 2x2 padding
constexpr
index_t
N
=
16
;
constexpr
index_t
C
=
192
;
constexpr
index_t
HI
=
28
;
constexpr
index_t
WI
=
28
;
constexpr
index_t
K
=
32
;
constexpr
index_t
Y
=
5
;
constexpr
index_t
X
=
5
;
constexpr
index_t
HPad
=
2
;
constexpr
index_t
WPad
=
2
;
#elif 0
// 3x3 filter, 14x14 image
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
256
;
constexpr
index_t
HI
=
14
;
constexpr
index_t
WI
=
14
;
constexpr
index_t
K
=
128
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
constexpr
index_t
HPad
=
0
;
constexpr
index_t
WPad
=
0
;
#elif 0
// 1x1 filter, 14x14 image
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
512
;
constexpr
index_t
HI
=
14
;
constexpr
index_t
WI
=
14
;
constexpr
index_t
K
=
512
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
constexpr
index_t
HPad
=
0
;
constexpr
index_t
WPad
=
0
;
#elif 0
// 1x1 filter, 7x7 image
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
512
;
constexpr
index_t
HI
=
7
;
constexpr
index_t
WI
=
7
;
constexpr
index_t
K
=
2048
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
constexpr
index_t
HPad
=
0
;
constexpr
index_t
WPad
=
0
;
#elif 0
// 1x1 filter, 73x73 image
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
512
;
constexpr
index_t
HI
=
73
;
constexpr
index_t
WI
=
73
;
constexpr
index_t
K
=
128
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
constexpr
index_t
HPad
=
0
;
constexpr
index_t
WPad
=
0
;
#elif 0
// 1x1 filter, 8x8 image
// cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%
constexpr
index_t
N
=
64
;
constexpr
index_t
C
=
1536
;
constexpr
index_t
HI
=
8
;
constexpr
index_t
WI
=
8
;
constexpr
index_t
K
=
256
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
constexpr
index_t
HPad
=
0
;
constexpr
index_t
WPad
=
0
;
#elif 1
// 1x1 filter, 8x8 image
// cudnn@V100 77%, ck@V100 76%, ck@P100 79%, ck@VII 51%
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
2048
;
constexpr
index_t
HI
=
8
;
constexpr
index_t
WI
=
8
;
constexpr
index_t
K
=
384
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
constexpr
index_t
HPad
=
0
;
constexpr
index_t
WPad
=
0
;
#elif 0
// 1x1 filter, 7x7 image
// cudnn@V100 82%, ck@V100 76%, ck@P100 67%, ck@VII 64%
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
832
;
constexpr
index_t
HI
=
7
;
constexpr
index_t
WI
=
7
;
constexpr
index_t
K
=
384
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
constexpr
index_t
HPad
=
0
;
constexpr
index_t
WPad
=
0
;
#elif 0
// 1x1 filter, 8x8 image
// cudnn@V100 83%, ck@V100 75%, ck@P100 78%, ck@VII 65%
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
1280
;
constexpr
index_t
HI
=
8
;
constexpr
index_t
WI
=
8
;
constexpr
index_t
K
=
384
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
constexpr
index_t
HPad
=
0
;
constexpr
index_t
WPad
=
0
;
#elif 0
// 1x1 filter, 14x14 image
// cudnn@V100 62%, ck@V100 68%, ck@P100 70%, ck@VII 50%
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
512
;
constexpr
index_t
HI
=
14
;
constexpr
index_t
WI
=
14
;
constexpr
index_t
K
=
128
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
constexpr
index_t
HPad
=
0
;
constexpr
index_t
WPad
=
0
;
#elif 0
// 1x1 filter, 8x8 image
// cudnn@V100 74%, ck@V100 57%, ck@P100 78%, ck@VII 61%
constexpr
index_t
N
=
64
;
constexpr
index_t
C
=
1536
;
constexpr
index_t
HI
=
8
;
constexpr
index_t
WI
=
8
;
constexpr
index_t
K
=
384
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
constexpr
index_t
HPad
=
0
;
constexpr
index_t
WPad
=
0
;
#elif 0
// 1x1 filter, 28x28 image
// cudnn@V100 86%, ck@V100 84%, ck@P100 80%, ck@VII 69%
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
256
;
constexpr
index_t
HI
=
28
;
constexpr
index_t
WI
=
28
;
constexpr
index_t
K
=
128
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
constexpr
index_t
HPad
=
0
;
constexpr
index_t
WPad
=
0
;
#elif 0
// 1x1 filter, 7x7 image
// cudnn@V100 71%, ck@V100 55%, ck@P100 70%, ck@VII 62%
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
832
;
constexpr
index_t
HI
=
7
;
constexpr
index_t
WI
=
7
;
constexpr
index_t
K
=
256
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
constexpr
index_t
HPad
=
0
;
constexpr
index_t
WPad
=
0
;
#elif 0
// 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
// cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81%
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
288
;
constexpr
index_t
HI
=
35
;
constexpr
index_t
WI
=
35
;
constexpr
index_t
K
=
384
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
using
ConvStrides
=
Sequence
<
2
,
2
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
constexpr
index_t
HPad
=
0
;
constexpr
index_t
WPad
=
0
;
#elif 0
// 1x1 filter, 17x17 input
// cudnn@V100 81%, ck@V100 76%, ck@P100 70%, ck@VII 76%
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
768
;
constexpr
index_t
HI
=
17
;
constexpr
index_t
WI
=
17
;
constexpr
index_t
K
=
128
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
constexpr
index_t
HPad
=
0
;
constexpr
index_t
WPad
=
0
;
#elif 0
// 1x1 filter, 14x14 image
// cudnn@V100 73%, ck@V100 71%, ck@P100 70%, ck@VII 64%
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
528
;
constexpr
index_t
HI
=
14
;
constexpr
index_t
WI
=
14
;
constexpr
index_t
K
=
128
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
constexpr
index_t
HPad
=
0
;
constexpr
index_t
WPad
=
0
;
#elif 0
// 1x1 filter, 14x14 image
// cudnn@V100 73%, ck@V100 72%, ck@P100 79%, ck@VII 75%
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
528
;
constexpr
index_t
HI
=
14
;
constexpr
index_t
WI
=
14
;
constexpr
index_t
K
=
256
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
constexpr
index_t
HPad
=
0
;
constexpr
index_t
WPad
=
0
;
#elif 0
// 1x1 filter, 7x7 image
// cudnn@V100 49%, ck@V100 50%, ck@P100 61%, ck@VII 52%
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
832
;
constexpr
index_t
HI
=
7
;
constexpr
index_t
WI
=
7
;
constexpr
index_t
K
=
128
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
constexpr
index_t
HPad
=
0
;
constexpr
index_t
WPad
=
0
;
#endif
auto
lower_pads
=
Sequence
<
HPad
,
WPad
>
{};
auto
upper_pads
=
Sequence
<
HPad
,
WPad
>
{};
auto
in_nchw_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
N
,
C
,
HI
,
WI
>
{});
auto
wei_kcyx_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
K
,
C
,
Y
,
X
>
{});
auto
out_nkhw_desc
=
get_convolution_with_padding_output_default_4d_tensor_descriptor
(
in_nchw_desc
,
wei_kcyx_desc
,
ConvStrides
{},
ConvDilations
{},
lower_pads
,
upper_pads
);
ostream_ConstantTensorDescriptor
(
in_nchw_desc
,
std
::
cout
<<
"in_nchw_desc: "
);
ostream_ConstantTensorDescriptor
(
wei_kcyx_desc
,
std
::
cout
<<
"wei_kcyx_desc: "
);
ostream_ConstantTensorDescriptor
(
out_nkhw_desc
,
std
::
cout
<<
"out_nkhw_desc: "
);
using
in_data_t
=
float
;
using
out_data_t
=
float
;
Tensor
<
in_data_t
>
in_nchw
(
make_TensorDescriptor
(
in_nchw_desc
));
Tensor
<
in_data_t
>
wei_kcyx
(
make_TensorDescriptor
(
wei_kcyx_desc
));
Tensor
<
out_data_t
>
out_nkhw_host
(
make_TensorDescriptor
(
out_nkhw_desc
));
Tensor
<
out_data_t
>
out_nkhw_device
(
make_TensorDescriptor
(
out_nkhw_desc
));
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
();
if
(
argc
!=
3
)
{
printf
(
"arg1: do_verification, arg2: nrepeat
\n
"
);
exit
(
1
);
}
bool
do_verification
=
atoi
(
argv
[
1
]);
index_t
nrepeat
=
atoi
(
argv
[
2
]);
if
(
do_verification
)
{
#if 0
in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
#elif
0
in_nchw
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
wei_kcyx
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
#elif 0
in_nchw
.
GenerateTensorValue
(
GeneratorTensor_3
{},
num_thread
);
wei_kcyx
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
#elif 1
in_nchw
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
wei_kcyx
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
#elif 0
in_nchw
.
GenerateTensorValue
(
GeneratorTensor_2
{
1
,
5
},
num_thread
);
auto
gen_wei
=
[](
auto
...
is
)
{
return
GeneratorTensor_2
{
1
,
5
}(
is
...)
*
GeneratorTensor_Checkboard
{}(
is
...);
};
wei_kcyx
.
GenerateTensorValue
(
gen_wei
,
num_thread
);
#endif
}
#if 0
device_convolution_direct_v2_nchw_kcyx_nkhw
in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
#elif
0
device_convolution_implicit_gemm_v1_chwn_cyxk_khwn
in_nchw_desc
,
in_nchw
,
wei_kcyx_desc
,
wei_kcyx
,
out_nkhw_desc
,
out_nkhw_device
,
nrepeat
);
#elif 0
device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw
in_nchw_desc
,
in_nchw
,
wei_kcyx_desc
,
wei_kcyx
,
out_nkhw_desc
,
out_nkhw_device
,
nrepeat
);
#elif 0
device_convolution_implicit_gemm_v2_chwn_cyxk_khwn
in_nchw_desc
,
in_nchw
,
wei_kcyx_desc
,
wei_kcyx
,
out_nkhw_desc
,
out_nkhw_device
,
nrepeat
);
#elif 1
device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw
(
in_nchw_desc
,
in_nchw
,
wei_kcyx_desc
,
wei_kcyx
,
out_nkhw_desc
,
out_nkhw_device
,
nrepeat
);
#elif 0
device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw
(
in_nchw_desc
,
in_nchw
,
wei_kcyx_desc
,
wei_kcyx
,
out_nkhw_desc
,
out_nkhw_device
,
ConvStrides
{},
ConvDilations
{},
nrepeat
);
#elif 0
device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded
(
in_nchw_desc
,
in_nchw
,
wei_kcyx_desc
,
wei_kcyx
,
out_nkhw_desc
,
out_nkhw_device
,
lower_pads
,
upper_pads
,
nrepeat
);
#endif
if
(
do_verification
)
{
#if 1
if
(
Y
==
3
&&
X
==
3
&&
ConvStrides
{}[
0
]
==
1
&&
ConvStrides
{}[
1
]
==
1
&&
ConvDilations
{}[
0
]
==
1
&&
ConvDilations
{}[
1
]
==
1
)
{
host_winograd_3x3_convolution
(
in_nchw
,
wei_kcyx
,
out_nkhw_host
,
lower_pads
,
upper_pads
);
}
else
#endif
{
host_direct_convolution
(
in_nchw
,
wei_kcyx
,
out_nkhw_host
,
ConvStrides
{},
ConvDilations
{},
lower_pads
,
upper_pads
);
}
check_error
(
out_nkhw_host
,
out_nkhw_device
);
#if 0
LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
LogRange(std::cout << "wei_kcyx: ", wei_kcyx.mData, ",") << std::endl;
LogRange(std::cout << "out_nkhw_host : ", out_nkhw_host.mData, ",") << std::endl;
LogRange(std::cout << "out_nkhw_device: ", out_nkhw_device.mData, ",") << std::endl;
#endif
}
}
driver/src/driver.cu
0 → 120000
View file @
e87aa851
driver
.
cpp
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment