gaoqiong / composable_kernel / Commits / 5ed51b71

Commit 5ed51b71 authored Jul 26, 2021 by Jing Zhang
init commit of conv+add
parent b53926e9
Changes 11
Showing 11 changed files with 2672 additions and 122 deletions (+2672 -122)
composable_kernel/include/driver/driver_static_convolution_add_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp  +1139 -0
composable_kernel/include/driver/driver_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp  +9 -11
composable_kernel/include/tensor_operation/gridwise_static_gemm_v2.hpp  +6 -4
composable_kernel/include/tensor_operation/gridwise_static_gemm_v3.hpp  +476 -0
host/driver_offline/CMakeLists.txt  +6 -0
host/driver_offline/conv_activ_fwd_driver_offline.cpp  +341 -0
host/driver_offline/conv_add_fwd_driver_offline.cpp  +367 -0
host/driver_offline/conv_fwd_driver_offline.cpp  +33 -105
host/driver_offline/include/device_static_convolution_add_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp  +184 -0
host/host_tensor/include/host_conv.hpp  +109 -0
script/run.sh  +2 -2
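Taken together, the new driver and gridwise GEMM implement a fused forward convolution plus "resize-add": an implicit-GEMM convolution, an optional pointwise activation selected by activ_type (1 = ReLU, 2 = sigmoid, anything else leaves the value unchanged), and then an accumulation of each convolution output pixel into the matching 2x2 patch of a double-resolution "add" tensor, whose sum is what the kernel stores. As a reading aid, a minimal host-side sketch of that semantics follows; the function name naive_conv_add_nchw, the single stride/pad arguments, the absence of dilation, and the plain NCHW layout (no K0/K1 channel split) are simplifying assumptions for illustration, not part of this commit.

#include <cstddef>
#include <cmath>
#include <vector>

// Illustrative reference only (not from the commit):
// out2x[n][k][h][w] = add2x[n][k][h][w] + activ(conv(in, wei))[n][k][h/2][w/2]
// Layouts: in = NCHW, wei = KCYX, add_out = N x K x (2*Ho) x (2*Wo).
void naive_conv_add_nchw(const std::vector<float>& in,
                         const std::vector<float>& wei,
                         std::vector<float>& add_out, // holds "add" on entry, result on exit
                         std::size_t N, std::size_t C, std::size_t Hi, std::size_t Wi,
                         std::size_t K, std::size_t Y, std::size_t X,
                         std::size_t stride, std::size_t pad, int activ_type)
{
    const std::size_t Ho = (Hi + 2 * pad - Y) / stride + 1;
    const std::size_t Wo = (Wi + 2 * pad - X) / stride + 1;

    for(std::size_t n = 0; n < N; ++n)
        for(std::size_t k = 0; k < K; ++k)
            for(std::size_t ho = 0; ho < Ho; ++ho)
                for(std::size_t wo = 0; wo < Wo; ++wo)
                {
                    // implicit-GEMM reduction dimension E = C * Y * X
                    float acc = 0.f;
                    for(std::size_t c = 0; c < C; ++c)
                        for(std::size_t y = 0; y < Y; ++y)
                            for(std::size_t x = 0; x < X; ++x)
                            {
                                const std::ptrdiff_t hi = static_cast<std::ptrdiff_t>(ho * stride + y) -
                                                          static_cast<std::ptrdiff_t>(pad);
                                const std::ptrdiff_t wi = static_cast<std::ptrdiff_t>(wo * stride + x) -
                                                          static_cast<std::ptrdiff_t>(pad);

                                if(hi < 0 || wi < 0 || hi >= static_cast<std::ptrdiff_t>(Hi) ||
                                   wi >= static_cast<std::ptrdiff_t>(Wi))
                                    continue; // zero padding

                                acc += in[((n * C + c) * Hi + hi) * Wi + wi] *
                                       wei[((k * C + c) * Y + y) * X + x];
                            }

                    if(activ_type == 1)
                        acc = acc >= 0.f ? acc : 0.f; // ReLU
                    else if(activ_type == 2)
                        acc = 1.f / (1.f + std::exp(-acc)); // sigmoid

                    // resize-add: the conv output pixel feeds the 2x2 patch at (2*ho, 2*wo)
                    for(std::size_t dh = 0; dh < 2; ++dh)
                        for(std::size_t dw = 0; dw < 2; ++dw)
                        {
                            const std::size_t o =
                                ((n * K + k) * (2 * Ho) + (2 * ho + dh)) * (2 * Wo) + (2 * wo + dw);
                            add_out[o] += acc;
                        }
                }
}

The GPU path in gridwise_static_gemm_v3.hpp reaches the same result without materializing the GEMM matrices: it keeps the convolution accumulator in registers, applies the activation in place, loads the matching window of the add tensor, adds, and stores the double-resolution result.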
composable_kernel/include/driver/driver_static_convolution_add_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp  0 → 100644
#ifndef CK_DRIVER_STATIC_CONVOLUTION_ADD_FORWARD_IMPLICIT_GEMM_V5R1_NCHW_KCYX_NKHW_OUTPAD_HPP
#define CK_DRIVER_STATIC_CONVOLUTION_ADD_FORWARD_IMPLICIT_GEMM_V5R1_NCHW_KCYX_NKHW_OUTPAD_HPP

#include "common_header.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "gridwise_static_gemm_v3.hpp"
#include "gridwise_operation_wrapper.hpp"

namespace ck {

template <index_t BlockSize,
          typename FloatAB,
          typename FloatAcc,
          typename FloatC,
          index_t KPerBlock,
          index_t HoPerBlock,
          index_t WoPerBlock,
          index_t EPerBlock,
          index_t KPerThread,
          index_t HoPerThread,
          index_t WoPerThread,
          index_t EPerThread,
          typename ABlockTransferThreadSliceLengths_E_K,
          typename ABlockTransferThreadClusterLengths_E_K,
          index_t ABlockTransferSrcScalarPerVector_E,
          index_t ABlockTransferDstScalarPerVector_K,
          index_t BThreadTransferSrcScalarPerVector_W,
          index_t CThreadTransferDstScalarPerVector_W>
struct DriverStaticConvolutionAddForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
{
    template <typename... Wei,
              typename... In,
              typename... Add,
              typename... Out,
              typename ConvStrides,
              typename ConvDilations,
              typename InLeftPads,
              typename InRightPads,
              index_t activ_type>
    __host__ void Run(const DynamicTensorDescriptor<Wei...>& wei_k_c_y_x_global_desc,
                      const DynamicTensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
                      const DynamicTensorDescriptor<Add...>& add_n_k0_hox2_wox2_k1_global_desc,
                      const DynamicTensorDescriptor<Out...>& out_n_k0_ho_wo_k1_global_desc,
                      const ConvStrides& conv_strides,
                      const ConvDilations& conv_dilations,
                      const InLeftPads& in_left_pads,
                      const InRightPads& in_right_pads_,
                      Number<activ_type>,
                      const FloatAB* __restrict__ p_wei_global,
                      const FloatAB* __restrict__ p_in_global,
                      const FloatC* __restrict__ p_add_global,
                      FloatC* __restrict__ p_out_global) const
    {
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};
        constexpr auto I4 = Number<4>{};

        const auto N_  = in_n_c_hi_wi_global_desc.GetLength(I0);
        const auto C_  = in_n_c_hi_wi_global_desc.GetLength(I1);
        const auto Hi_ = in_n_c_hi_wi_global_desc.GetLength(I2);
        const auto Wi_ = in_n_c_hi_wi_global_desc.GetLength(I3);

        const auto K0_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I1);
        const auto Ho_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I2);
        const auto Wo_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I3);
        const auto K1_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I4);

        const auto K_ = wei_k_c_y_x_global_desc.GetLength(I0);
        const auto Y_ = wei_k_c_y_x_global_desc.GetLength(I2);
        const auto X_ = wei_k_c_y_x_global_desc.GetLength(I3);

        constexpr auto N  = Number<N_>{};
        constexpr auto C  = Number<C_>{};
        constexpr auto K0 = Number<K0_>{};
        constexpr auto K1 = Number<K1_>{};
        constexpr auto Hi = Number<Hi_>{};
        constexpr auto Wi = Number<Wi_>{};
        constexpr auto Ho = Number<Ho_>{};
        constexpr auto Wo = Number<Wo_>{};

        constexpr auto Hox2 = Number<Ho * 2>{};
        constexpr auto Wox2 = Number<Wo * 2>{};

        constexpr auto K = Number<K_>{};
        constexpr auto Y = Number<Y_>{};
        constexpr auto X = Number<X_>{};

        const auto ConvStrideH_ = conv_strides[I0];
        const auto ConvStrideW_ = conv_strides[I1];

        const auto ConvDilationH_ = conv_dilations[I0];
        const auto ConvDilationW_ = conv_dilations[I1];

        constexpr auto ConvStrideH = Number<ConvStrideH_>{};
        constexpr auto ConvStrideW = Number<ConvStrideW_>{};

        constexpr auto ConvDilationH = Number<ConvDilationH_>{};
        constexpr auto ConvDilationW = Number<ConvDilationW_>{};

        constexpr auto Hop = Number<(Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock>{};
        constexpr auto Wop = Number<(Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock>{};

        constexpr auto OutRightPadH = Hop - Ho;
        constexpr auto OutRightPadW = Wop - Wo;

        const auto InLeftPadH_ = in_left_pads[I0];
        const auto InLeftPadW_ = in_left_pads[I1];

        constexpr auto InLeftPadH = Number<InLeftPadH_>{};
        constexpr auto InLeftPadW = Number<InLeftPadW_>{};

        constexpr auto in_right_pads = InRightPads{};

        const auto InRightPadH_ = in_right_pads[I0] + OutRightPadH * ConvStrideH;
        const auto InRightPadW_ = in_right_pads[I1] + OutRightPadW * ConvStrideW;

        constexpr auto InRightPadH = Number<InRightPadH_>{};
        constexpr auto InRightPadW = Number<InRightPadW_>{};

        std::cerr << "OutRightPadH = " << OutRightPadH << " OutRightPadW = " << OutRightPadW
                  << std::endl;
        std::cerr << "InRightPadH = " << InRightPadH << " InRightPadW = " << InRightPadW
                  << std::endl;

        // weight tensor
        const auto wei_e_k_global_desc = transform_dynamic_tensor_descriptor(
            make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C * Y * X)),
            make_tuple(make_pass_through_transform(K), make_pass_through_transform(C * Y * X)),
            make_tuple(Sequence<0>{}, Sequence<1>{}),
            make_tuple(Sequence<1>{}, Sequence<0>{}));

        static_assert(wei_e_k_global_desc.IsKnownAtCompileTime(),
                      "wrong! wei_e_k_global_desc need to known at compile-time");

        // input tensor
        const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor(
            in_n_c_hi_wi_global_desc,
            make_tuple(make_pass_through_transform(N),
                       make_pass_through_transform(C),
                       make_pad_transform(Hi, InLeftPadH, InRightPadH),
                       make_pad_transform(Wi, InLeftPadW, InRightPadW)),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));

        static_assert(in_n_c_hip_wip_global_desc.IsKnownAtCompileTime(),
                      "wrong! in_n_c_hip_wip_global_desc need to known at compile-time");

        const auto in_n_c_y_ho_x_wo_global_desc = transform_dynamic_tensor_descriptor(
            in_n_c_hip_wip_global_desc,
            make_tuple(
                make_pass_through_transform(N),
                make_pass_through_transform(C),
                make_embed_transform(make_tuple(Y, Hop), make_tuple(ConvDilationH, ConvStrideH)),
                make_embed_transform(make_tuple(X, Wop), make_tuple(ConvDilationW, ConvStrideW))),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));

        static_assert(in_n_c_y_ho_x_wo_global_desc.IsKnownAtCompileTime(),
                      "wrong! in_n_c_y_ho_x_wo_global_desc need to known at compile-time");

        const auto in_e_n_ho_wo_global_desc = transform_dynamic_tensor_descriptor(
            in_n_c_y_ho_x_wo_global_desc,
            make_tuple(make_merge_transform(make_tuple(C, Y, X)),
                       make_pass_through_transform(N),
                       make_pass_through_transform(Hop),
                       make_pass_through_transform(Wop)),
            make_tuple(Sequence<1, 2, 4>{}, Sequence<0>{}, Sequence<3>{}, Sequence<5>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));

        static_assert(in_e_n_ho_wo_global_desc.IsKnownAtCompileTime(),
                      "wrong! in_e_n_ho_wo_global_desc need to known at compile-time");

        // output tensor
        const auto out_k_n_hop_wop_global_desc = transform_dynamic_tensor_descriptor(
            make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K0, Ho, Wo, K1)),
            make_tuple(make_merge_transform(make_tuple(K0, K1)),
                       make_pass_through_transform(N),
                       make_right_pad_transform(Ho, OutRightPadH),
                       make_right_pad_transform(Wo, OutRightPadW)),
            make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));

        static_assert(out_k_n_hop_wop_global_desc.IsKnownAtCompileTime(),
                      "wrong! out_k_n_hop_wop_global_desc need to known at compile-time");

        // add tensor
        const auto add_k_n_hopx2_wopx2_global_desc = transform_dynamic_tensor_descriptor(
            make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K0, Hox2, Wox2)),
            make_tuple(make_pass_through_transform(K0),
                       make_pass_through_transform(N),
                       make_pass_through_transform(Hox2),
                       make_pass_through_transform(Wox2)),
            make_tuple(Sequence<1>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));

        static_assert(add_k_n_hopx2_wopx2_global_desc.IsKnownAtCompileTime(),
                      "wrong! add_k_n_hopx2_wopx2_global_desc need to known at compile-time");

        const auto E = C * Y * X;

        std::cerr << "Hop = " << Hop << " Wop = " << Wop << std::endl;

        if(!((K % KPerBlock) == 0 && (Hop % HoPerBlock) == 0 && (Wop % WoPerBlock) == 0 &&
             (E % EPerBlock) == 0))
        {
            throw std::runtime_error("wrong! GEMM size no divisible");
        }

        // hack to control index calculation when iterating over a_k_m_global tensor
        constexpr auto a_e_k_global_iterator_hacks =
            make_tuple(make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}),
                       make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}));

        constexpr auto a_e_k_global_move_slice_window_iterator_hack = Sequence<0, 0, 0>{};

        constexpr auto b_e_n_ho_wo_global_iterator_hacks =
            make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
                       make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}));

        constexpr auto b_e_n_ho_wo_global_move_slice_window_iterator_hack =
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{};

        // hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor
        // hack for NKHW format
        constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks =
            make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{}),
                       make_tuple(Sequence<0, 2, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{}));

        // GEMM
        using gridwise_gemm = GridwiseStaticGemm_km_kn_mn_v3<
            BlockSize,
            FloatAB,
            FloatAcc,
            FloatC,
            InMemoryDataOperation::Set,
            decltype(wei_e_k_global_desc),
            decltype(in_e_n_ho_wo_global_desc),
            decltype(add_k_n_hopx2_wopx2_global_desc),
            decltype(out_k_n_hop_wop_global_desc),
            KPerBlock,
            HoPerBlock,
            WoPerBlock,
            EPerBlock,
            KPerThread,
            HoPerThread,
            WoPerThread,
            EPerThread,
            ABlockTransferThreadSliceLengths_E_K,
            ABlockTransferThreadClusterLengths_E_K,
            Sequence<1, 0>,
            Sequence<1, 0>,
            0,
            ABlockTransferSrcScalarPerVector_E,
            ABlockTransferDstScalarPerVector_K,
            false, // don't move back src coordinate after threadwise copy
            Sequence<0, 2, 3, 1>,
            3,
            BThreadTransferSrcScalarPerVector_W,
            false, // don't move back src coordinate after threadwise copy, which will be fused with
                   // MoveSrcSliceWindow() to save addr computation
            Sequence<0, 2, 3, 1>,
            0,
            CThreadTransferDstScalarPerVector_W,
            decltype(a_e_k_global_iterator_hacks),
            decltype(b_e_n_ho_wo_global_iterator_hacks),
            decltype(c_k_n_ho_wo_global_tensor_iterator_hacks),
            decltype(a_e_k_global_move_slice_window_iterator_hack),
            decltype(b_e_n_ho_wo_global_move_slice_window_iterator_hack)>;

        const auto GridSize = (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N;

        constexpr bool has_main_k_block_loop = (E + EPerBlock) / (2 * EPerBlock) > 1;

        constexpr bool has_double_tail_k_block_loop = (E / EPerBlock) % 2 == 0;

        index_t nrepeat = 100;

        std::cout << "NCHWc" << K1 << "_n" << N << "c" << C << "h" << Hi << "w" << Wi << "-k" << K
                  << "c" << C << "y" << Y << "x" << X << "-u" << conv_strides[I0] << "v"
                  << conv_strides[I1] << "l" << conv_dilations[I0] << "j" << conv_dilations[I1]
                  << "q" << in_left_pads[I0] << "p" << in_right_pads[I0] << std::endl;

        std::cout << "GridSize = " << GridSize << " BlockSize = " << BlockSize << std::endl;

        for(index_t i = 0; i < 5; ++i)
        {
            std::cout << "Start running " << nrepeat << " times..." << std::endl;

            KernelTimer timer;
            timer.Start();

            std::cout << "has_main_k_block_loop: " << has_main_k_block_loop
                      << " has_double_tail_k_block_loop: " << has_double_tail_k_block_loop
                      << std::endl;

            for(index_t j = 0; j < nrepeat; ++j)
            {
                if constexpr(has_main_k_block_loop && has_double_tail_k_block_loop)
                {
                    const auto kernel = run_gridwise_operation<gridwise_gemm,
                                                               const FloatAB*,
                                                               const FloatAB*,
                                                               const FloatC*,
                                                               FloatC*,
                                                               Number<activ_type>,
                                                               integral_constant<bool, true>,
                                                               integral_constant<bool, true>>;

                    launch_kernel(kernel,
                                  dim3(GridSize),
                                  dim3(BlockSize),
                                  0,
                                  0,
                                  p_wei_global,
                                  p_in_global,
                                  p_add_global,
                                  p_out_global,
                                  Number<activ_type>{},
                                  integral_constant<bool, true>{},
                                  integral_constant<bool, true>{});
                }
                else if constexpr(has_main_k_block_loop && !has_double_tail_k_block_loop)
                {
                    const auto kernel = run_gridwise_operation<gridwise_gemm,
                                                               const FloatAB*,
                                                               const FloatAB*,
                                                               const FloatC*,
                                                               FloatC*,
                                                               Number<activ_type>,
                                                               integral_constant<bool, true>,
                                                               integral_constant<bool, false>>;

                    launch_kernel(kernel,
                                  dim3(GridSize),
                                  dim3(BlockSize),
                                  0,
                                  0,
                                  p_wei_global,
                                  p_in_global,
                                  p_add_global,
                                  p_out_global,
                                  Number<activ_type>{},
                                  integral_constant<bool, true>{},
                                  integral_constant<bool, false>{});
                }
                else if constexpr(!has_main_k_block_loop && has_double_tail_k_block_loop)
                {
                    const auto kernel = run_gridwise_operation<gridwise_gemm,
                                                               const FloatAB*,
                                                               const FloatAB*,
                                                               const FloatC*,
                                                               FloatC*,
                                                               Number<activ_type>,
                                                               integral_constant<bool, false>,
                                                               integral_constant<bool, true>>;

                    launch_kernel(kernel,
                                  dim3(GridSize),
                                  dim3(BlockSize),
                                  0,
                                  0,
                                  p_wei_global,
                                  p_in_global,
                                  p_add_global,
                                  p_out_global,
                                  Number<activ_type>{},
                                  integral_constant<bool, false>{},
                                  integral_constant<bool, true>{});
                }
                else
                {
                    const auto kernel = run_gridwise_operation<gridwise_gemm,
                                                               const FloatAB*,
                                                               const FloatAB*,
                                                               const FloatC*,
                                                               FloatC*,
                                                               Number<activ_type>,
                                                               integral_constant<bool, false>,
                                                               integral_constant<bool, false>>;

                    launch_kernel(kernel,
                                  dim3(GridSize),
                                  dim3(BlockSize),
                                  0,
                                  0,
                                  p_wei_global,
                                  p_in_global,
                                  p_add_global,
                                  p_out_global,
                                  Number<activ_type>{},
                                  integral_constant<bool, false>{},
                                  integral_constant<bool, false>{});
                }
            }

            timer.End();

            float ave_time = timer.GetElapsedTime() / nrepeat;

            float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc,
                                                            wei_k_c_y_x_global_desc,
                                                            out_n_k0_ho_wo_k1_global_desc) /
                         (std::size_t(1000) * 1000 * 1000) / ave_time;

            std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
                      << std::endl;
        }
    }
};
#if 0
template <index_t BlockSize,
typename FloatAB,
typename FloatAcc,
typename FloatC,
index_t KPerBlock,
index_t HoPerBlock,
index_t WoPerBlock,
index_t EPerBlock,
index_t KPerThread,
index_t HoPerThread,
index_t WoPerThread,
index_t EPerThread,
typename ABlockTransferThreadSliceLengths_E_K,
typename ABlockTransferThreadClusterLengths_E_K,
index_t ABlockTransferSrcScalarPerVector_E,
index_t ABlockTransferDstScalarPerVector_K,
index_t BThreadTransferSrcScalarPerVector_W,
index_t CThreadTransferDstScalarPerVector_W>
struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad_1x1
{
template <typename... Wei,
typename... In,
typename... Out,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
__host__ void Run(const DynamicTensorDescriptor<Wei...>& wei_k_c_y_x_global_desc,
const DynamicTensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
const DynamicTensorDescriptor<Out...>& out_n_k0_ho_wo_k1_global_desc,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads_,
const FloatAB* __restrict__ p_wei_global,
const FloatAB* __restrict__ p_in_global,
FloatC* __restrict__ p_out_global) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
const auto N_ = in_n_c_hi_wi_global_desc.GetLength(I0);
const auto C_ = in_n_c_hi_wi_global_desc.GetLength(I1);
const auto Hi_ = in_n_c_hi_wi_global_desc.GetLength(I2);
const auto Wi_ = in_n_c_hi_wi_global_desc.GetLength(I3);
const auto K0_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I1);
const auto Ho_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I2);
const auto Wo_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I3);
const auto K1_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I4);
const auto K_ = wei_k_c_y_x_global_desc.GetLength(I0);
const auto Y_ = wei_k_c_y_x_global_desc.GetLength(I2);
const auto X_ = wei_k_c_y_x_global_desc.GetLength(I3);
constexpr auto N = Number<N_>{};
constexpr auto C = Number<C_>{};
constexpr auto K0 = Number<K0_>{};
constexpr auto K1 = Number<K1_>{};
constexpr auto Hi = Number<Hi_>{};
constexpr auto Wi = Number<Wi_>{};
constexpr auto Ho = Number<Ho_>{};
constexpr auto Wo = Number<Wo_>{};
constexpr auto K = Number<K_>{};
constexpr auto Y = Number<Y_>{};
constexpr auto X = Number<X_>{};
const auto ConvStrideH_ = conv_strides[I0];
const auto ConvStrideW_ = conv_strides[I1];
const auto ConvDilationH_ = conv_dilations[I0];
const auto ConvDilationW_ = conv_dilations[I1];
constexpr auto ConvStrideH = Number<ConvStrideH_>{};
constexpr auto ConvStrideW = Number<ConvStrideW_>{};
constexpr auto ConvDilationH = Number<ConvDilationH_>{};
constexpr auto ConvDilationW = Number<ConvDilationW_>{};
constexpr auto Hop = Number<(Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock>{};
constexpr auto Wop = Number<(Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock>{};
constexpr auto OutRightPadH = Hop - Ho;
constexpr auto OutRightPadW = Wop - Wo;
const auto InLeftPadH_ = in_left_pads[I0];
const auto InLeftPadW_ = in_left_pads[I1];
constexpr auto InLeftPadH = Number<InLeftPadH_>{};
constexpr auto InLeftPadW = Number<InLeftPadW_>{};
static_assert(InLeftPadH == 0 and InLeftPadW == 0, "");
constexpr auto in_right_pads = InRightPads{};
const auto InRightPadH_ = in_right_pads[I0] + OutRightPadH * ConvStrideH;
const auto InRightPadW_ = in_right_pads[I1] + OutRightPadW * ConvStrideW;
constexpr auto InRightPadH = Number<InRightPadH_>{};
constexpr auto InRightPadW = Number<InRightPadW_>{};
std::cerr << "OutRightPadH = " << OutRightPadH << " OutRightPadW = " << OutRightPadW
<< std::endl;
std::cerr << "InRightPadH = " << InRightPadH << " InRightPadW = " << InRightPadW
<< std::endl;
// weight tensor
const auto wei_e_k_global_desc = transform_dynamic_tensor_descriptor(
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C)),
make_tuple(make_pass_through_transform(K), make_pass_through_transform(C)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<1>{}, Sequence<0>{}));
static_assert(wei_e_k_global_desc.IsKnownAtCompileTime(),
"wrong! wei_e_k_global_desc need to known at compile-time");
// input tensor
const auto in_e_n_ho_wo_global_desc = transform_dynamic_tensor_descriptor(
in_n_c_hi_wi_global_desc,
make_tuple(make_pass_through_transform(C),
make_pass_through_transform(N),
make_right_pad_transform(Hi, InRightPadH),
make_right_pad_transform(Wi, InRightPadW)),
make_tuple(Sequence<1>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
static_assert(in_e_n_ho_wo_global_desc.IsKnownAtCompileTime(),
"wrong! in_e_n_ho_wo_global_desc need to known at compile-time");
// output tensor
const auto out_k_n_hop_wop_global_desc = transform_dynamic_tensor_descriptor(
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K0, Ho, Wo, K1)),
make_tuple(make_merge_transform(make_tuple(K0, K1)),
make_pass_through_transform(N),
make_right_pad_transform(Ho, OutRightPadH),
make_right_pad_transform(Wo, OutRightPadW)),
make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
static_assert(out_k_n_hop_wop_global_desc.IsKnownAtCompileTime(),
"wrong! out_k_n_hop_wop_global_desc need to known at compile-time");
const auto E = C;
std::cerr << "Hop = " << Hop << " Wop = " << Wop << std::endl;
if(!((K % KPerBlock) == 0 && (Hop % HoPerBlock) == 0 && (Wop % WoPerBlock) == 0 &&
(E % EPerBlock) == 0))
{
throw std::runtime_error("wrong! GEMM size no divisible");
}
// hack to control index calculation when iterating over a_k_m_global tensor
constexpr auto a_e_k_global_iterator_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}));
constexpr auto a_e_k_global_move_slice_window_iterator_hack = Sequence<0, 0, 0>{};
constexpr auto b_e_n_ho_wo_global_iterator_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}));
constexpr auto b_e_n_ho_wo_global_move_slice_window_iterator_hack =
Sequence<0, 0, 0, 0, 0>{};
// hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor
// hack for NKHW format
constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks =
make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 2, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}));
// GEMM
using gridwise_gemm = GridwiseStaticGemm_km_kn_mn_v3<
BlockSize,
FloatAB,
FloatAcc,
FloatC,
InMemoryDataOperation::Set,
decltype(wei_e_k_global_desc),
decltype(in_e_n_ho_wo_global_desc),
decltype(out_k_n_hop_wop_global_desc),
KPerBlock,
HoPerBlock,
WoPerBlock,
EPerBlock,
KPerThread,
HoPerThread,
WoPerThread,
EPerThread,
ABlockTransferThreadSliceLengths_E_K,
ABlockTransferThreadClusterLengths_E_K,
Sequence<1, 0>,
Sequence<1, 0>,
0,
ABlockTransferSrcScalarPerVector_E,
ABlockTransferDstScalarPerVector_K,
false, // don't move back src coordinate after threadwise copy
Sequence<0, 2, 3, 1>,
3,
BThreadTransferSrcScalarPerVector_W,
false, // don't move back src coordinate after threadwise copy, which will be fused with
// MoveSrcSliceWindow() to save addr computation
Sequence<0, 2, 3, 1>,
0,
CThreadTransferDstScalarPerVector_W,
decltype(a_e_k_global_iterator_hacks),
decltype(b_e_n_ho_wo_global_iterator_hacks),
decltype(c_k_n_ho_wo_global_tensor_iterator_hacks),
decltype(a_e_k_global_move_slice_window_iterator_hack),
decltype(b_e_n_ho_wo_global_move_slice_window_iterator_hack)>;
const auto GridSize = (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N;
constexpr bool has_main_k_block_loop = (E + EPerBlock) / (2 * EPerBlock) > 1;
constexpr bool has_double_tail_k_block_loop = (E / EPerBlock) % 2 == 0;
index_t nrepeat = 100;
std::cout << "conv_v5r1__NCHWc" << K1 << "_n" << N << "c" << C << "h" << Hi << "w" << Wi
<< "-k" << K << "c" << C << "y" << Y << "x" << X << "-u" << conv_strides[I0]
<< "v" << conv_strides[I1] << "l" << conv_dilations[I0] << "j"
<< conv_dilations[I1] << "q" << in_left_pads[I0] << "p" << in_right_pads[I0]
<< std::endl;
std::cout << "GridSize = " << GridSize << " BlockSize = " << BlockSize << std::endl;
for(index_t i = 0; i < 5; ++i)
{
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
std::cout << "has_main_k_block_loop: " << has_main_k_block_loop
<< " has_double_tail_k_block_loop: " << has_double_tail_k_block_loop
<< std::endl;
for(index_t j = 0; j < nrepeat; ++j)
{
if constexpr(has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const FloatAB*,
const FloatAB*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
p_wei_global,
p_in_global,
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, true>{});
}
else if constexpr(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const FloatAB*,
const FloatAB*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
p_wei_global,
p_in_global,
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, false>{});
}
else if constexpr(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const FloatAB*,
const FloatAB*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
p_wei_global,
p_in_global,
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, true>{});
}
else
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const FloatAB*,
const FloatAB*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
p_wei_global,
p_in_global,
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, false>{});
}
}
timer.End();
float ave_time = timer.GetElapsedTime() / nrepeat;
float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc,
wei_k_c_y_x_global_desc,
out_n_k0_ho_wo_k1_global_desc) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl;
}
}
};
template <index_t BlockSize,
typename FloatAB,
typename FloatAcc,
typename FloatC,
index_t KPerBlock,
index_t HoPerBlock,
index_t WoPerBlock,
index_t EPerBlock,
index_t KPerThread,
index_t HoPerThread,
index_t WoPerThread,
index_t EPerThread,
typename ABlockTransferThreadSliceLengths_E_K,
typename ABlockTransferThreadClusterLengths_E_K,
index_t ABlockTransferSrcScalarPerVector_E,
index_t ABlockTransferDstScalarPerVector_K,
index_t BThreadTransferSrcScalarPerVector_W,
index_t CThreadTransferDstScalarPerVector_W>
struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_1x1
{
template <typename... Wei,
typename... In,
typename... Out,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
__host__ void Run(const DynamicTensorDescriptor<Wei...>& wei_k_c_y_x_global_desc,
const DynamicTensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
const DynamicTensorDescriptor<Out...>& out_n_k0_ho_wo_k1_global_desc,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads_,
const FloatAB* __restrict__ p_wei_global,
const FloatAB* __restrict__ p_in_global,
FloatC* __restrict__ p_out_global) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
const auto N_ = in_n_c_hi_wi_global_desc.GetLength(I0);
const auto C_ = in_n_c_hi_wi_global_desc.GetLength(I1);
const auto Hi_ = in_n_c_hi_wi_global_desc.GetLength(I2);
const auto Wi_ = in_n_c_hi_wi_global_desc.GetLength(I3);
const auto K0_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I1);
const auto Ho_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I2);
const auto Wo_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I3);
const auto K1_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I4);
const auto K_ = wei_k_c_y_x_global_desc.GetLength(I0);
const auto Y_ = wei_k_c_y_x_global_desc.GetLength(I2);
const auto X_ = wei_k_c_y_x_global_desc.GetLength(I3);
constexpr auto N = Number<N_>{};
constexpr auto C = Number<C_>{};
constexpr auto K0 = Number<K0_>{};
constexpr auto K1 = Number<K1_>{};
constexpr auto Hi = Number<Hi_>{};
constexpr auto Wi = Number<Wi_>{};
constexpr auto Ho = Number<Ho_>{};
constexpr auto Wo = Number<Wo_>{};
constexpr auto K = Number<K_>{};
constexpr auto Y = Number<Y_>{};
constexpr auto X = Number<X_>{};
const auto ConvStrideH_ = conv_strides[I0];
const auto ConvStrideW_ = conv_strides[I1];
const auto ConvDilationH_ = conv_dilations[I0];
const auto ConvDilationW_ = conv_dilations[I1];
constexpr auto ConvStrideH = Number<ConvStrideH_>{};
constexpr auto ConvStrideW = Number<ConvStrideW_>{};
constexpr auto ConvDilationH = Number<ConvDilationH_>{};
constexpr auto ConvDilationW = Number<ConvDilationW_>{};
constexpr auto Hop = Number<(Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock>{};
constexpr auto Wop = Number<(Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock>{};
constexpr auto OutRightPadH = Hop - Ho;
constexpr auto OutRightPadW = Wop - Wo;
const auto InLeftPadH_ = in_left_pads[I0];
const auto InLeftPadW_ = in_left_pads[I1];
constexpr auto InLeftPadH = Number<InLeftPadH_>{};
constexpr auto InLeftPadW = Number<InLeftPadW_>{};
static_assert(InLeftPadH == 0 and InLeftPadW == 0, "");
constexpr auto in_right_pads = InRightPads{};
const auto InRightPadH_ = in_right_pads[I0] + OutRightPadH * ConvStrideH;
const auto InRightPadW_ = in_right_pads[I1] + OutRightPadW * ConvStrideW;
constexpr auto InRightPadH = Number<InRightPadH_>{};
constexpr auto InRightPadW = Number<InRightPadW_>{};
static_assert(OutRightPadW == 0 and OutRightPadH == 0, "");
// weight tensor
const auto wei_e_k_global_desc = transform_dynamic_tensor_descriptor(
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C)),
make_tuple(make_pass_through_transform(K), make_pass_through_transform(C)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<1>{}, Sequence<0>{}));
static_assert(wei_e_k_global_desc.IsKnownAtCompileTime(),
"wrong! wei_e_k_global_desc need to known at compile-time");
// input tensor
const auto in_e_n_ho_wo_global_desc = transform_dynamic_tensor_descriptor(
in_n_c_hi_wi_global_desc,
make_tuple(make_pass_through_transform(C),
make_pass_through_transform(N),
make_pass_through_transform(Ho),
make_pass_through_transform(Wo)),
make_tuple(Sequence<1>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
static_assert(in_e_n_ho_wo_global_desc.IsKnownAtCompileTime(),
"wrong! in_e_n_ho_wo_global_desc need to known at compile-time");
// output tensor
const auto out_k_n_hop_wop_global_desc = transform_dynamic_tensor_descriptor(
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K0, Ho, Wo, K1)),
make_tuple(make_merge_transform(make_tuple(K0, K1)),
make_pass_through_transform(N),
make_pass_through_transform(Ho),
make_pass_through_transform(Wo)),
make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
static_assert(out_k_n_hop_wop_global_desc.IsKnownAtCompileTime(),
"wrong! out_k_n_hop_wop_global_desc need to known at compile-time");
const auto E = C;
std::cerr << "Hop = " << Hop << " Wop = " << Wop << std::endl;
if(!((K % KPerBlock) == 0 && (Hop % HoPerBlock) == 0 && (Wop % WoPerBlock) == 0 &&
(E % EPerBlock) == 0))
{
throw std::runtime_error("wrong! GEMM size no divisible");
}
// hack to control index calculation when iterating over a_k_m_global tensor
constexpr auto a_e_k_global_iterator_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}));
constexpr auto a_e_k_global_move_slice_window_iterator_hack = Sequence<0, 0, 0>{};
constexpr auto b_e_n_ho_wo_global_iterator_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}));
constexpr auto b_e_n_ho_wo_global_move_slice_window_iterator_hack =
Sequence<0, 0, 0, 0, 0>{};
// hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor
// hack for NKHW format
constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks =
make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 2, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}));
// GEMM
using gridwise_gemm = GridwiseStaticGemm_km_kn_mn_v3<
BlockSize,
FloatAB,
FloatAcc,
FloatC,
InMemoryDataOperation::Set,
decltype(wei_e_k_global_desc),
decltype(in_e_n_ho_wo_global_desc),
decltype(out_k_n_hop_wop_global_desc),
KPerBlock,
HoPerBlock,
WoPerBlock,
EPerBlock,
KPerThread,
HoPerThread,
WoPerThread,
EPerThread,
ABlockTransferThreadSliceLengths_E_K,
ABlockTransferThreadClusterLengths_E_K,
Sequence<1, 0>,
Sequence<1, 0>,
0,
ABlockTransferSrcScalarPerVector_E,
ABlockTransferDstScalarPerVector_K,
false, // don't move back src coordinate after threadwise copy
Sequence<0, 2, 3, 1>,
3,
BThreadTransferSrcScalarPerVector_W,
false, // don't move back src coordinate after threadwise copy, which will be fused with
// MoveSrcSliceWindow() to save addr computation
Sequence<0, 2, 3, 1>,
0,
CThreadTransferDstScalarPerVector_W,
decltype(a_e_k_global_iterator_hacks),
decltype(b_e_n_ho_wo_global_iterator_hacks),
decltype(c_k_n_ho_wo_global_tensor_iterator_hacks),
decltype(a_e_k_global_move_slice_window_iterator_hack),
decltype(b_e_n_ho_wo_global_move_slice_window_iterator_hack)>;
const auto GridSize = (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N;
constexpr bool has_main_k_block_loop = (E + EPerBlock) / (2 * EPerBlock) > 1;
constexpr bool has_double_tail_k_block_loop = (E / EPerBlock) % 2 == 0;
index_t nrepeat = 100;
std::cout << "NCHWc" << K1 << "_n" << N << "c" << C << "h" << Hi << "w" << Wi << "-k" << K
<< "c" << C << "y" << Y << "x" << X << "-u" << conv_strides[I0] << "v"
<< conv_strides[I1] << "l" << conv_dilations[I0] << "j" << conv_dilations[I1]
<< "q" << in_left_pads[I0] << "p" << in_right_pads[I0] << std::endl;
std::cout << "GridSize = " << GridSize << " BlockSize = " << BlockSize << std::endl;
for(index_t i = 0; i < 5; ++i)
{
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
std::cout << "has_main_k_block_loop: " << has_main_k_block_loop
<< " has_double_tail_k_block_loop: " << has_double_tail_k_block_loop
<< std::endl;
for(index_t j = 0; j < nrepeat; ++j)
{
if constexpr(has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const FloatAB*,
const FloatAB*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
p_wei_global,
p_in_global,
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, true>{});
}
else if constexpr(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const FloatAB*,
const FloatAB*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
p_wei_global,
p_in_global,
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, false>{});
}
else if constexpr(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const FloatAB*,
const FloatAB*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
p_wei_global,
p_in_global,
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, true>{});
}
else
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const FloatAB*,
const FloatAB*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
p_wei_global,
p_in_global,
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, false>{});
}
}
timer.End();
float ave_time = timer.GetElapsedTime() / nrepeat;
float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc,
wei_k_c_y_x_global_desc,
out_n_k0_ho_wo_k1_global_desc) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl;
}
}
};
#endif
} // namespace ck
#endif
composable_kernel/include/driver/driver_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp
@@ -226,7 +226,7 @@ struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
                                   Sequence<0, 0, 0, 0, 0>{}));
 
         // GEMM
-        using gridwise_gemm = GridwiseStaticGemm_km_kn_mn_v3<
+        using gridwise_gemm = GridwiseStaticGemm_km_kn_mn_v2<
             BlockSize,
             FloatAB,
             FloatAcc,
@@ -273,11 +273,10 @@ struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
         index_t nrepeat = 100;
 
-        std::cout << "conv_v5r1__NCHWc" << K1 << "_n" << N << "c" << C << "h" << Hi << "w" << Wi
-                  << "-k" << K << "c" << C << "y" << Y << "x" << X << "-u" << conv_strides[I0]
-                  << "v" << conv_strides[I1] << "l" << conv_dilations[I0] << "j"
-                  << conv_dilations[I1] << "q" << in_left_pads[I0] << "p" << in_right_pads[I0]
-                  << std::endl;
+        std::cout << "NCHWc" << K1 << "_n" << N << "c" << C << "h" << Hi << "w" << Wi << "-k" << K
+                  << "c" << C << "y" << Y << "x" << X << "-u" << conv_strides[I0] << "v"
+                  << conv_strides[I1] << "l" << conv_dilations[I0] << "j" << conv_dilations[I1]
+                  << "q" << in_left_pads[I0] << "p" << in_right_pads[I0] << std::endl;
 
         std::cout << "GridSize = " << GridSize << " BlockSize = " << BlockSize << std::endl;
@@ -990,11 +989,10 @@ struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_1x1
         index_t nrepeat = 100;
 
-        std::cout << "conv_v5r1_NCHWc" << K1 << "_n" << N << "c" << C << "h" << Hi << "w" << Wi
-                  << "-k" << K << "c" << C << "y" << Y << "x" << X << "-u" << conv_strides[I0]
-                  << "v" << conv_strides[I1] << "l" << conv_dilations[I0] << "j"
-                  << conv_dilations[I1] << "q" << in_left_pads[I0] << "p" << in_right_pads[I0]
-                  << std::endl;
+        std::cout << "NCHWc" << K1 << "_n" << N << "c" << C << "h" << Hi << "w" << Wi << "-k" << K
+                  << "c" << C << "y" << Y << "x" << X << "-u" << conv_strides[I0] << "v"
+                  << conv_strides[I1] << "l" << conv_dilations[I0] << "j" << conv_dilations[I1]
+                  << "q" << in_left_pads[I0] << "p" << in_right_pads[I0] << std::endl;
 
         std::cout << "GridSize = " << GridSize << " BlockSize = " << BlockSize << std::endl;
composable_kernel/include/tensor_operation/gridwise_static_gemm_v2.hpp
@@ -47,7 +47,7 @@ template <index_t BlockSize,
           typename CGlobalIteratorHacks,
           typename AGlobalMoveSliceWindowIteratorHacks,
           typename BGlobalMoveSliceWindowIteratorHacks>
-struct GridwiseStaticGemm_km_kn_mn_v3
+struct GridwiseStaticGemm_km_kn_mn_v2
 {
     __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
     {
@@ -237,10 +237,12 @@ struct GridwiseStaticGemm_km_kn_mn_v3
             c_thread_buf;
 
         // initialize output thread tensor
-        ThreadwiseDynamicTensorSliceSet_v1<FloatAcc,
-                                           decltype(c_k_n_ho_wo_thread_desc),
-                                           Sequence<KPerThread, 1, HoPerThread, WoPerThread>>{}
-            .Run(c_k_n_ho_wo_thread_desc, make_tuple(I0, I0, I0, I0), c_thread_buf, FloatAcc{0});
+#if 0
+        ThreadwiseDynamicTensorSliceSet_v1<FloatAcc,
+                                           decltype(c_k_n_ho_wo_thread_desc),
+                                           Sequence<KPerThread, 1, HoPerThread, WoPerThread>>{}
+            .Run(c_k_n_ho_wo_thread_desc, make_tuple(I0, I0, I0, I0), c_thread_buf, FloatAcc{0});
+#endif
 
         constexpr auto b_thread_slice_copy_step = make_multi_index(EPerBlock, 0, 0, 0);
composable_kernel/include/tensor_operation/gridwise_static_gemm_v3.hpp  0 → 100644
#ifndef CK_GRIDWISE_STATIC_GEMM_V3_HPP
#define CK_GRIDWISE_STATIC_GEMM_V3_HPP
#include "common_header.hpp"
#include "dynamic_multi_index_transform_helper.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "blockwise_dynamic_tensor_slice_transfer.hpp"
#include "threadwise_dynamic_tensor_slice_transfer.hpp"
#include "blockwise_gemm_v3.hpp"
namespace
ck
{
template
<
index_t
BlockSize
,
typename
FloatAB
,
typename
FloatAcc
,
typename
FloatC
,
InMemoryDataOperation
CGlobalMemoryDataOperation
,
typename
AGlobalDesc
,
typename
BGlobalDesc
,
typename
DGlobalDesc
,
typename
CGlobalDesc
,
index_t
KPerBlock
,
index_t
HoPerBlock
,
index_t
WoPerBlock
,
index_t
EPerBlock
,
index_t
KPerThread
,
index_t
HoPerThread
,
index_t
WoPerThread
,
index_t
EPerThread
,
typename
ABlockTransferThreadSliceLengths_E_K
,
typename
ABlockTransferThreadClusterLengths_E_K
,
typename
ABlockTransferThreadClusterArrangeOrder
,
typename
ABlockTransferSrcAccessOrder
,
index_t
ABlockTransferSrcVectorDim
,
index_t
ABlockTransferSrcScalarPerVector
,
index_t
ABlockTransferDstScalarPerVector_K
,
bool
AThreadTransferSrcResetCoordinateAfterRun
,
typename
BBlockTransferSrcAccessOrder
,
index_t
BBlockTransferSrcVectorDim
,
index_t
BBlockTransferSrcScalarPerVector
,
bool
BThreadTransferSrcResetCoordinateAfterRun
,
typename
CThreadTransferSrcDstAccessOrder
,
index_t
CThreadTransferSrcDstVectorDim
,
index_t
CThreadTransferDstScalarPerVector
,
typename
AGlobalIteratorHacks
,
typename
BGlobalIteratorHacks
,
typename
CGlobalIteratorHacks
,
typename
AGlobalMoveSliceWindowIteratorHacks
,
typename
BGlobalMoveSliceWindowIteratorHacks
>
struct
GridwiseStaticGemm_km_kn_mn_v3
{
__host__
__device__
static
constexpr
index_t
GetSharedMemoryNumberOfByte
()
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
a_e_k_global_desc
=
AGlobalDesc
{};
constexpr
auto
E
=
a_e_k_global_desc
.
GetLength
(
I0
);
constexpr
auto
max_lds_align
=
math
::
lcm
(
Number
<
ABlockTransferDstScalarPerVector_K
>
{},
Number
<
KPerBlock
>
{});
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr
auto
a_e_k_desc
=
make_dynamic_naive_tensor_descriptor_aligned_v2
(
make_tuple
(
Number
<
E
>
{},
Number
<
KPerBlock
>
{}),
max_lds_align
);
// LDS allocation for A and B: be careful of alignment
constexpr
auto
a_block_space_size
=
math
::
integer_least_multiple
(
a_e_k_desc
.
GetElementSpaceSize
(),
max_lds_align
);
return
a_block_space_size
*
sizeof
(
FloatAB
);
}
template
<
index_t
activ_type
,
bool
HasMainKBlockLoop
,
bool
HasDoubleTailKBlockLoop
>
__device__
void
Run
(
const
FloatAB
*
__restrict__
p_a_global
,
const
FloatAB
*
__restrict__
p_b_global
,
const
FloatC
*
__restrict__
p_d_global
,
FloatC
*
__restrict__
p_c_global
,
FloatAB
*
__restrict__
p_shared_block
,
Number
<
activ_type
>
,
integral_constant
<
bool
,
HasMainKBlockLoop
>
,
integral_constant
<
bool
,
HasDoubleTailKBlockLoop
>
)
const
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
a_e_k_global_desc
=
AGlobalDesc
{};
constexpr
auto
b_e_n_ho_wo_global_desc
=
BGlobalDesc
{};
constexpr
auto
d_k_n_hox2_wox2_global_desc
=
DGlobalDesc
{};
constexpr
auto
c_k_n_ho_wo_global_desc
=
CGlobalDesc
{};
const
auto
a_global_buf
=
make_dynamic_buffer
<
AddressSpace
::
Global
>
(
p_a_global
,
a_e_k_global_desc
.
GetElementSpaceSize
());
const
auto
b_global_buf
=
make_dynamic_buffer
<
AddressSpace
::
Global
>
(
p_b_global
,
b_e_n_ho_wo_global_desc
.
GetElementSpaceSize
());
auto
d_global_buf
=
make_dynamic_buffer
<
AddressSpace
::
Global
>
(
p_d_global
,
d_k_n_hox2_wox2_global_desc
.
GetElementSpaceSize
());
auto
c_global_buf
=
make_dynamic_buffer
<
AddressSpace
::
Global
>
(
p_c_global
,
c_k_n_ho_wo_global_desc
.
GetElementSpaceSize
());
constexpr
auto
E
=
a_e_k_global_desc
.
GetLength
(
I0
);
constexpr
auto
K
=
a_e_k_global_desc
.
GetLength
(
I1
);
constexpr
auto
N
=
b_e_n_ho_wo_global_desc
.
GetLength
(
I1
);
constexpr
auto
Ho
=
b_e_n_ho_wo_global_desc
.
GetLength
(
I2
);
constexpr
auto
Wo
=
b_e_n_ho_wo_global_desc
.
GetLength
(
I3
);
// divide block work by [M, N]
#if 0
const auto k_block_work_num = K / Number<KPerBlock>{};
const auto ho_block_work_num = Ho / Number<HoPerBlock>{};
const auto wo_block_work_num = Wo / Number<WoPerBlock>{};
const auto hwo_block_work_num = ho_block_work_num * wo_block_work_num;
const index_t k_block_work_id = get_block_1d_id() / hwo_block_work_num;
const index_t hwo_block_work_id = get_block_1d_id() - k_block_work_id * hwo_block_work_num;
const index_t ho_block_work_id = hwo_block_work_id / wo_block_work_num;
const index_t wo_block_work_id = hwo_block_work_id - ho_block_work_id * wo_block_work_num;
#else
// Hack: this force result into SGPR
const
index_t
k_block_work_num
=
__builtin_amdgcn_readfirstlane
(
K
/
KPerBlock
);
const
index_t
ho_block_work_num
=
__builtin_amdgcn_readfirstlane
(
Ho
/
HoPerBlock
);
const
index_t
wo_block_work_num
=
__builtin_amdgcn_readfirstlane
(
Wo
/
WoPerBlock
);
const
index_t
hwo_block_work_num
=
ho_block_work_num
*
wo_block_work_num
;
const
index_t
k_block_work_id
=
__builtin_amdgcn_readfirstlane
(
get_block_1d_id
()
/
hwo_block_work_num
);
const
index_t
hwo_block_work_id
=
get_block_1d_id
()
-
k_block_work_id
*
hwo_block_work_num
;
const
index_t
ho_block_work_id
=
__builtin_amdgcn_readfirstlane
(
hwo_block_work_id
/
wo_block_work_num
);
const
index_t
wo_block_work_id
=
hwo_block_work_id
-
ho_block_work_id
*
wo_block_work_num
;
#endif
// lds max alignment
constexpr
auto
max_lds_align
=
math
::
lcm
(
Number
<
ABlockTransferDstScalarPerVector_K
>
{},
Number
<
KPerBlock
>
{});
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr
auto
a_e_k_block_desc
=
make_dynamic_naive_tensor_descriptor_aligned_v2
(
make_tuple
(
Number
<
EPerBlock
>
{},
Number
<
KPerBlock
>
{}),
max_lds_align
);
constexpr
auto
a_e_k_desc
=
make_dynamic_naive_tensor_descriptor_aligned_v2
(
make_tuple
(
Number
<
E
>
{},
Number
<
KPerBlock
>
{}),
max_lds_align
);
// B matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr
auto
b_e_n_ho_wo_block_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
make_tuple
(
Number
<
EPerBlock
>
{},
Number
<
1
>
{},
Number
<
HoPerBlock
>
{},
Number
<
WoPerBlock
>
{}));
// c_thread_mtx definition: this is a mess
// TODO:: more elegent way of defining c_thread_mtx
constexpr
auto
c_k_n_ho_wo_thread_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
make_tuple
(
Number
<
KPerThread
>
{},
Number
<
1
>
{},
Number
<
HoPerThread
>
{},
Number
<
WoPerThread
>
{}));
auto
blockwise_gemm
=
BlockwiseGemm_km_kn_m0m1n0n1_v3
<
BlockSize
,
FloatAB
,
FloatAB
,
FloatAcc
,
decltype
(
a_e_k_block_desc
),
decltype
(
b_e_n_ho_wo_block_desc
),
decltype
(
c_k_n_ho_wo_thread_desc
),
KPerThread
,
HoPerThread
,
WoPerThread
,
EPerThread
,
ABlockTransferSrcScalarPerVector
,
ABlockTransferDstScalarPerVector_K
>
{};
auto
c_thread_mtx_index
=
blockwise_gemm
.
GetBeginOfThreadMatrixC
(
get_thread_local_1d_id
());
const
auto
k_thread_id
=
c_thread_mtx_index
.
k
;
const
auto
ho_thread_id
=
c_thread_mtx_index
.
h
;
const
auto
wo_thread_id
=
c_thread_mtx_index
.
w
;
const
index_t
k_block_data_on_global
=
k_block_work_id
*
KPerBlock
;
const
index_t
ho_block_data_on_global
=
ho_block_work_id
*
HoPerBlock
;
const
index_t
wo_block_data_on_global
=
wo_block_work_id
*
WoPerBlock
;
const
index_t
ho_thread_data_on_global
=
ho_block_data_on_global
+
ho_thread_id
*
HoPerThread
;
const
index_t
wo_thread_data_on_global
=
wo_block_data_on_global
+
wo_thread_id
*
WoPerThread
;
// A matrix blockwise copy
auto
a_blockwise_copy
=
BlockwiseDynamicTensorSliceTransfer_v4
<
BlockSize
,
InMemoryDataOperation
::
Set
,
Sequence
<
E
,
KPerBlock
>
,
ABlockTransferThreadSliceLengths_E_K
,
ABlockTransferThreadClusterLengths_E_K
,
ABlockTransferThreadClusterArrangeOrder
,
FloatAB
,
FloatAB
,
decltype
(
a_e_k_global_desc
),
decltype
(
a_e_k_desc
),
ABlockTransferSrcAccessOrder
,
Sequence
<
0
,
1
>
,
ABlockTransferSrcVectorDim
,
1
,
ABlockTransferSrcScalarPerVector
,
ABlockTransferDstScalarPerVector_K
,
1
,
1
,
AThreadTransferSrcResetCoordinateAfterRun
,
true
>
(
a_e_k_global_desc
,
make_multi_index
(
0
,
k_block_data_on_global
),
a_e_k_desc
,
make_multi_index
(
0
,
0
));
constexpr
auto
b_e_n_ho_wo_thread_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
make_tuple
(
Number
<
EPerBlock
>
{},
Number
<
1
>
{},
Number
<
HoPerThread
>
{},
Number
<
WoPerThread
>
{}));
auto
b_threadwise_transfer
=
ThreadwiseDynamicTensorSliceTransfer_v2
<
FloatAB
,
FloatAB
,
decltype
(
b_e_n_ho_wo_global_desc
),
decltype
(
b_e_n_ho_wo_thread_desc
),
Sequence
<
EPerBlock
,
1
,
HoPerThread
,
WoPerThread
>
,
BBlockTransferSrcAccessOrder
,
BBlockTransferSrcVectorDim
,
BBlockTransferSrcScalarPerVector
,
1
,
true
>
(
b_e_n_ho_wo_global_desc
,
make_multi_index
(
0
,
0
,
ho_thread_data_on_global
,
wo_thread_data_on_global
));
auto
a_block_buf
=
make_dynamic_buffer
<
AddressSpace
::
Lds
>
(
p_shared_block
,
a_e_k_desc
.
GetElementSpaceSize
());
// register allocation for output
StaticBuffer
<
AddressSpace
::
Vgpr
,
FloatAcc
,
c_k_n_ho_wo_thread_desc
.
GetElementSpaceSize
()
>
c_thread_buf
;
// initialize output thread tensor
#if 0
ThreadwiseDynamicTensorSliceSet_v1<FloatAcc,
decltype(c_k_n_ho_wo_thread_desc),
Sequence<KPerThread, 1, HoPerThread, WoPerThread>>{}
.Run(c_k_n_ho_wo_thread_desc, make_tuple(I0, I0, I0, I0), c_thread_buf, FloatAcc{0});
#endif
constexpr
auto
b_thread_slice_copy_step
=
make_multi_index
(
EPerBlock
,
0
,
0
,
0
);
// hack to control index calculation when iterating over A and B matrix for threadwise copy
constexpr
auto
a_e_k_global_iterator_hacks
=
AGlobalIteratorHacks
{};
constexpr
auto
b_e_n_ho_wo_global_iterator_hacks
=
BGlobalIteratorHacks
{};
// hack to control index calculation when move slice window for A and B matrix for
// threadwise copy
constexpr
auto
a_e_k_global_move_slice_window_iterator_hack
=
AGlobalMoveSliceWindowIteratorHacks
{};
constexpr
auto
b_e_n_ho_wo_global_move_slice_window_iterator_hack
=
BGlobalMoveSliceWindowIteratorHacks
{};
// double regsiter buffer for b
StaticBuffer
<
AddressSpace
::
Vgpr
,
FloatAB
,
b_e_n_ho_wo_thread_desc
.
GetElementSpaceSize
()
>
b_thread_even_buf
,
b_thread_odd_buf
;
// LDS double buffer: preload data
{
a_blockwise_copy
.
RunRead
(
a_e_k_global_desc
,
a_global_buf
,
a_e_k_global_iterator_hacks
);
b_threadwise_transfer
.
Run
(
b_e_n_ho_wo_global_desc
,
b_global_buf
,
b_e_n_ho_wo_thread_desc
,
make_tuple
(
I0
,
I0
,
I0
,
I0
),
b_thread_even_buf
,
b_e_n_ho_wo_global_iterator_hacks
);
a_blockwise_copy
.
RunWrite
(
a_e_k_desc
,
a_block_buf
);
}
__syncthreads
();
if
constexpr
(
HasMainKBlockLoop
)
{
index_t
e_block_data_begin
=
0
;
// LDS double buffer: main body
// use Do-While loop instead of For loop to simplify control flow
do
{
// even iteration
b_threadwise_transfer
.
MoveSrcSliceWindow
(
b_e_n_ho_wo_global_desc
,
b_thread_slice_copy_step
);
b_threadwise_transfer
.
Run
(
b_e_n_ho_wo_global_desc
,
b_global_buf
,
b_e_n_ho_wo_thread_desc
,
make_tuple
(
I0
,
I0
,
I0
,
I0
),
b_thread_odd_buf
,
b_e_n_ho_wo_global_iterator_hacks
);
// LDS double buffer: GEMM on current data
// TODO: @Zhang Jing: blockwise gemm should be able to move slice window
blockwise_gemm
.
Run
(
a_block_buf
,
b_thread_even_buf
,
c_thread_buf
);
blockwise_gemm
.
MoveASliceWindow
(
a_e_k_block_desc
,
make_tuple
(
EPerBlock
,
0
));
b_threadwise_transfer
.
MoveSrcSliceWindow
(
b_e_n_ho_wo_global_desc
,
b_thread_slice_copy_step
);
b_threadwise_transfer
.
Run
(
b_e_n_ho_wo_global_desc
,
b_global_buf
,
b_e_n_ho_wo_thread_desc
,
make_tuple
(
I0
,
I0
,
I0
,
I0
),
b_thread_even_buf
,
b_e_n_ho_wo_global_iterator_hacks
);
// LDS double buffer: GEMM on current data
blockwise_gemm
.
Run
(
a_block_buf
,
b_thread_odd_buf
,
c_thread_buf
);
blockwise_gemm
.
MoveASliceWindow
(
a_e_k_block_desc
,
make_tuple
(
EPerBlock
,
0
));
e_block_data_begin
+=
2
*
EPerBlock
;
}
while
(
e_block_data_begin
<
E
-
2
*
EPerBlock
);
}
// LDS double buffer: tail
if
constexpr
(
HasDoubleTailKBlockLoop
)
// if has 2 iteration left
{
b_threadwise_transfer
.
MoveSrcSliceWindow
(
b_e_n_ho_wo_global_desc
,
b_thread_slice_copy_step
);
b_threadwise_transfer
.
Run
(
b_e_n_ho_wo_global_desc
,
b_global_buf
,
b_e_n_ho_wo_thread_desc
,
make_tuple
(
I0
,
I0
,
I0
,
I0
),
b_thread_odd_buf
,
b_e_n_ho_wo_global_iterator_hacks
);
// LDS double buffer: GEMM on 2nd-last data
blockwise_gemm
.
Run
(
a_block_buf
,
b_thread_even_buf
,
c_thread_buf
);
blockwise_gemm
.
MoveASliceWindow
(
a_e_k_block_desc
,
make_tuple
(
EPerBlock
,
0
));
// LDS double buffer: GEMM on last data
blockwise_gemm
.
Run
(
a_block_buf
,
b_thread_odd_buf
,
c_thread_buf
);
}
else
// if has 1 iteration left
{
// LDS double buffer: GEMM on last data
blockwise_gemm
.
Run
(
a_block_buf
,
b_thread_even_buf
,
c_thread_buf
);
}
// activ
{
static_for
<
0
,
c_k_n_ho_wo_thread_desc
.
GetElementSpaceSize
(),
1
>
{}([
&
](
auto
i
)
{
if
constexpr
(
activ_type
==
1
)
c_thread_buf
(
i
)
=
c_thread_buf
[
i
]
>=
0
?
c_thread_buf
[
i
]
:
0.0
;
else
if
constexpr
(
activ_type
==
2
)
c_thread_buf
(
i
)
=
1.0
/
(
1.0
+
exp
(
-
c_thread_buf
[
i
]));
});
}
constexpr
auto
HoPerThreadx2
=
HoPerThread
*
2
;
constexpr
auto
WoPerThreadx2
=
WoPerThread
*
2
;
constexpr
auto
d_k_n_hox2_wox2_thread_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
make_tuple
(
Number
<
KPerThread
>
{},
Number
<
1
>
{},
Number
<
HoPerThreadx2
>
{},
Number
<
WoPerThreadx2
>
{}));
StaticBuffer
<
AddressSpace
::
Vgpr
,
FloatC
,
d_k_n_hox2_wox2_thread_desc
.
GetElementSpaceSize
()
>
d_thread_buf
;
const
index_t
hox2_thread_data_on_global
=
ho_thread_data_on_global
*
2
;
const
index_t
wox2_thread_data_on_global
=
wo_thread_data_on_global
*
2
;
// hack to control index calculation when iterating over c_k_n_ho_wo_global tensor
constexpr
auto
c_k_n_ho_wo_global_tensor_iterator_hacks
=
CGlobalIteratorHacks
{};
const
index_t
k_thread_data_on_global
=
k_block_data_on_global
+
k_thread_id
*
KPerThread
;
        // Resize_Add
        {
            ThreadwiseDynamicTensorSliceTransfer_v2<
                FloatC,
                FloatC,
                decltype(d_k_n_hox2_wox2_global_desc),
                decltype(d_k_n_hox2_wox2_thread_desc),
                Sequence<KPerThread, 1, HoPerThreadx2, WoPerThreadx2>,
                CThreadTransferSrcDstAccessOrder,
                CThreadTransferSrcDstVectorDim,
                CThreadTransferDstScalarPerVector,
                1,
                true>(d_k_n_hox2_wox2_global_desc,
                      make_multi_index(k_thread_data_on_global,
                                       0,
                                       hox2_thread_data_on_global,
                                       wox2_thread_data_on_global))
                .Run(d_k_n_hox2_wox2_global_desc,
                     d_global_buf,
                     d_k_n_hox2_wox2_thread_desc,
                     make_tuple(I0, I0, I0, I0),
                     d_thread_buf,
                     c_k_n_ho_wo_global_tensor_iterator_hacks);

            static_for<0, KPerThread, 1>{}([&](auto k_i) {
                static_for<0, HoPerThreadx2, 1>{}([&](auto h_i) {
                    static_for<0, WoPerThreadx2, 1>{}([&](auto w_i) {
                        d_thread_buf(Number<d_k_n_hox2_wox2_thread_desc.CalculateOffset(
                            make_tuple(k_i, 0, h_i, w_i))>{}) +=
                            c_thread_buf[Number<c_k_n_ho_wo_thread_desc.CalculateOffset(
                                make_tuple(k_i, 0, h_i / 2, w_i / 2))>{}];
                    });
                });
            });
        }
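In plain terms, the Resize_Add block first pulls in the matching tile of the d (skip) tensor, which is stored at twice the convolution output resolution, and then adds each convolution result c[k, ho, wo] into the 2x2 window d[k, 2*ho .. 2*ho+1, 2*wo .. 2*wo+1], i.e. a nearest-neighbour 2x upsample of the convolution output fused with an elementwise add. A stripped-down sketch of that indexing, with hypothetical row-major buffers and names of my own:

#include <vector>

// Nearest-neighbour 2x upsample of `c` fused with an elementwise add into `d`.
// c has shape [K][Ho][Wo]; d has shape [K][2*Ho][2*Wo]; both are row-major.
void resize_add_ref(const std::vector<float>& c, std::vector<float>& d, int K, int Ho, int Wo)
{
    const int Hox2 = 2 * Ho;
    const int Wox2 = 2 * Wo;

    for(int k = 0; k < K; ++k)
        for(int h = 0; h < Hox2; ++h)
            for(int w = 0; w < Wox2; ++w)
                d[(k * Hox2 + h) * Wox2 + w] +=
                    c[(k * Ho + h / 2) * Wo + w / 2]; // each c value feeds a 2x2 window
}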
        // output: register to global memory
        {
            ThreadwiseDynamicTensorSliceTransfer_v1r3<
                FloatC,
                FloatC,
                decltype(d_k_n_hox2_wox2_thread_desc),
                decltype(d_k_n_hox2_wox2_global_desc),
                Sequence<KPerThread, 1, HoPerThreadx2, WoPerThreadx2>,
                CThreadTransferSrcDstAccessOrder,
                CThreadTransferSrcDstVectorDim,
                CThreadTransferDstScalarPerVector,
                CGlobalMemoryDataOperation,
                1,
                true>(d_k_n_hox2_wox2_global_desc,
                      make_multi_index(k_thread_data_on_global,
                                       0,
                                       hox2_thread_data_on_global,
                                       wox2_thread_data_on_global))
                .Run(d_k_n_hox2_wox2_thread_desc,
                     make_tuple(I0, I0, I0, I0),
                     d_thread_buf,
                     d_k_n_hox2_wox2_global_desc,
                     c_global_buf,
                     c_k_n_ho_wo_global_tensor_iterator_hacks);
        }
    }
    // pass tensor descriptor by reference
    template <index_t activ_type, bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
    __device__ void Run(const FloatAB* __restrict__ p_a_global,
                        const FloatAB* __restrict__ p_b_global,
                        const FloatC* __restrict__ p_d_global,
                        FloatC* __restrict__ p_c_global,
                        Number<activ_type>,
                        integral_constant<bool, HasMainKBlockLoop>,
                        integral_constant<bool, HasDoubleTailKBlockLoop>) const
    {
        constexpr index_t shared_block_size = GetSharedMemoryNumberOfByte() / sizeof(FloatAB);

        __shared__ FloatAB p_shared_block[shared_block_size];

        Run(p_a_global,
            p_b_global,
            p_d_global,
            p_c_global,
            p_shared_block,
            Number<activ_type>{},
            integral_constant<bool, HasMainKBlockLoop>{},
            integral_constant<bool, HasDoubleTailKBlockLoop>{});
    }
};

} // namespace ck
#endif
host/driver_offline/CMakeLists.txt
@@ -13,9 +13,15 @@ include_directories(BEFORE
set(CONV_FWD_DRIVER_OFFLINE_SOURCE conv_fwd_driver_offline.cpp)
set(CONV_BWD_DRIVER_OFFLINE_SOURCE conv_bwd_driver_offline.cpp)
set(CONV_ADD_FWD_DRIVER_OFFLINE_SOURCE conv_add_fwd_driver_offline.cpp)
set(CONV_ACTIV_FWD_DRIVER_OFFLINE_SOURCE conv_activ_fwd_driver_offline.cpp)

add_executable(conv_fwd_driver_offline ${CONV_FWD_DRIVER_OFFLINE_SOURCE})
add_executable(conv_bwd_driver_offline ${CONV_BWD_DRIVER_OFFLINE_SOURCE})
add_executable(conv_add_fwd_driver_offline ${CONV_ADD_FWD_DRIVER_OFFLINE_SOURCE})
add_executable(conv_activ_fwd_driver_offline ${CONV_ACTIV_FWD_DRIVER_OFFLINE_SOURCE})

target_link_libraries(conv_fwd_driver_offline PRIVATE host_tensor)
target_link_libraries(conv_bwd_driver_offline PRIVATE host_tensor)
target_link_libraries(conv_add_fwd_driver_offline PRIVATE host_tensor)
target_link_libraries(conv_activ_fwd_driver_offline PRIVATE host_tensor)
host/driver_offline/conv_activ_fwd_driver_offline.cpp
0 → 100644
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "conv_common.hpp"
#include "host_conv.hpp"
#include "device_tensor.hpp"
#include "device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp"

#define USE_DYNAMIC_MODE 0
#define USE_CONV_FWD_V5R1_NCHW 1

enum ConvForwardAlgo
{
    V5R1NCHW
};

int main(int argc, char* argv[])
{
    using namespace ck;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
    constexpr auto I4 = Number<4>{};
    constexpr auto I5 = Number<5>{};
    constexpr auto I6 = Number<6>{};

    // static mode
    if(argc < 7)
    {
        printf("arg1 to 6: layout, algo, do_verification, init_method, do_log, nrepeat\n");
        exit(1);
    }

    const ConvTensorLayout layout = static_cast<ConvTensorLayout>(atoi(argv[1]));
    const ConvForwardAlgo algo    = static_cast<ConvForwardAlgo>(atoi(argv[2]));
    const bool do_verification    = atoi(argv[3]);
    const int init_method         = atoi(argv[4]);
    const bool do_log             = atoi(argv[5]);
    const int nrepeat             = atoi(argv[6]);

#if 1
    constexpr index_t N = 1, C = 16, Hi = 1080, Wi = 1920, K = 16, Y = 3, X = 3;
#elif 0
    constexpr index_t N = 1, C = 16, Hi = 540, Wi = 960, K = 16, Y = 3, X = 3;
#elif 0
    constexpr index_t N = 1, C = 16, Hi = 480, Wi = 270, K = 16, Y = 3, X = 3;
#elif 0
    constexpr index_t N = 1, C = 16, Hi = 240, Wi = 135, K = 16, Y = 3, X = 3;
#elif 0
    constexpr index_t N = 1, C = 16, Hi = 1080, Wi = 1920, K = 16, Y = 1, X = 1;
#elif 0
    constexpr index_t N = 1, C = 16, Hi = 540, Wi = 960, K = 16, Y = 1, X = 1;
#elif 0
    constexpr index_t N = 1, C = 16, Hi = 480, Wi = 270, K = 16, Y = 1, X = 1;
#elif 0
    constexpr index_t N = 1, C = 8, Hi = 1080, Wi = 1920, K = 16, Y = 3, X = 3;
#elif 0
    constexpr index_t N = 1, C = 16, Hi = 1080, Wi = 1920, K = 4, Y = 3, X = 3;
#endif

    const index_t conv_stride_h   = 1;
    const index_t conv_stride_w   = 1;
    const index_t conv_dilation_h = 1;
    const index_t conv_dilation_w = 1;
    const index_t in_left_pad_h   = 1;
    const index_t in_left_pad_w   = 1;
    const index_t in_right_pad_h  = 1;
    const index_t in_right_pad_w  = 1;

    const index_t YEff = (Y - 1) * conv_dilation_h + 1;
    const index_t XEff = (X - 1) * conv_dilation_w + 1;

    const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
    const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;

#if 0
    using in_data_t  = float;
    using acc_data_t = float;
    using out_data_t = float;
#elif 1
    using in_data_t  = half_t;
    using acc_data_t = float;
    using out_data_t = half_t;
#elif 1
    using in_data_t  = int8_t;
    using acc_data_t = int32_t;
    using out_data_t = int8_t;
#endif

    std::vector<std::size_t> in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4);

    switch(layout)
    {
    case ConvTensorLayout::NCHW:
        // NCHW
        in_lengths_host[0]  = static_cast<std::size_t>(N);
        in_lengths_host[1]  = static_cast<std::size_t>(C);
        in_lengths_host[2]  = static_cast<std::size_t>(Hi);
        in_lengths_host[3]  = static_cast<std::size_t>(Wi);
        wei_lengths_host[0] = static_cast<std::size_t>(K);
        wei_lengths_host[1] = static_cast<std::size_t>(C);
        wei_lengths_host[2] = static_cast<std::size_t>(Y);
        wei_lengths_host[3] = static_cast<std::size_t>(X);
        out_lengths_host[0] = static_cast<std::size_t>(N);
        out_lengths_host[1] = static_cast<std::size_t>(K);
        out_lengths_host[2] = static_cast<std::size_t>(Ho);
        out_lengths_host[3] = static_cast<std::size_t>(Wo);
        break;
    case ConvTensorLayout::NHWC:
        // NHWC
        in_lengths_host[0]  = static_cast<std::size_t>(N);
        in_lengths_host[1]  = static_cast<std::size_t>(Hi);
        in_lengths_host[2]  = static_cast<std::size_t>(Wi);
        in_lengths_host[3]  = static_cast<std::size_t>(C);
        wei_lengths_host[0] = static_cast<std::size_t>(K);
        wei_lengths_host[1] = static_cast<std::size_t>(Y);
        wei_lengths_host[2] = static_cast<std::size_t>(X);
        wei_lengths_host[3] = static_cast<std::size_t>(C);
        out_lengths_host[0] = static_cast<std::size_t>(N);
        out_lengths_host[1] = static_cast<std::size_t>(Ho);
        out_lengths_host[2] = static_cast<std::size_t>(Wo);
        out_lengths_host[3] = static_cast<std::size_t>(K);
        break;
    default: throw std::runtime_error("wrong! not implemented");
    }

    Tensor<in_data_t> in(in_lengths_host);
    Tensor<in_data_t> wei(wei_lengths_host);
    Tensor<out_data_t> out_host(out_lengths_host);
    Tensor<out_data_t> out_device(out_lengths_host);
    Tensor<out_data_t> add_device(out_lengths_host);

    std::cout << "layout: " << layout << std::endl;
    ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
    ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
    ostream_HostTensorDescriptor(out_host.mDesc, std::cout << "out: ");
    print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
    print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
    print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
    print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));

    std::size_t num_thread = std::thread::hardware_concurrency();

    switch(init_method)
    {
    case 0:
        // no initialization
        break;
    case 1:
        in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        break;
    case 2:
        in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        break;
    case 3:
        in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        break;
    case 4:
        in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        break;
    case 5:
        in.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
        break;
    default:
        in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);

        auto gen_wei = [](auto... is) {
            return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
        };
        wei.GenerateTensorValue(gen_wei, num_thread);
    }

    auto f_make_for_device_nchw = [&]() {
        const auto in_lengths_dev  = make_tuple(Number<N>{}, Number<C>{}, Number<Hi>{}, Number<Wi>{});
        const auto wei_lengths_dev = make_tuple(Number<K>{}, Number<C>{}, Number<Y>{}, Number<X>{});
        const auto out_lengths_dev = make_tuple(Number<N>{}, Number<K>{}, Number<Ho>{}, Number<Wo>{});
        const auto conv_strides_dev   = make_tuple(Number<conv_stride_h>{}, Number<conv_stride_w>{});
        const auto conv_dilations_dev = make_tuple(Number<conv_dilation_h>{}, Number<conv_dilation_w>{});
        const auto in_left_pads_dev   = make_tuple(Number<in_left_pad_h>{}, Number<in_left_pad_w>{});
        const auto in_right_pads_dev  = make_tuple(Number<in_right_pad_h>{}, Number<in_right_pad_w>{});

        return make_tuple(in_lengths_dev,
                          wei_lengths_dev,
                          out_lengths_dev,
                          conv_strides_dev,
                          conv_dilations_dev,
                          in_left_pads_dev,
                          in_right_pads_dev);
    };

    auto f_make_for_device_nhwc = [&]() {
        const auto in_lengths_dev  = make_tuple(Number<N>{}, Number<Hi>{}, Number<Wi>{}, Number<C>{});
        const auto wei_lengths_dev = make_tuple(Number<K>{}, Number<Y>{}, Number<X>{}, Number<C>{});
        const auto out_lengths_dev = make_tuple(Number<N>{}, Number<Ho>{}, Number<Wo>{}, Number<K>{});
        const auto conv_strides_dev   = make_tuple(Number<conv_stride_h>{}, Number<conv_stride_w>{});
        const auto conv_dilations_dev = make_tuple(Number<conv_dilation_h>{}, Number<conv_dilation_w>{});
        const auto in_left_pads_dev   = make_tuple(Number<in_left_pad_h>{}, Number<in_left_pad_w>{});
        const auto in_right_pads_dev  = make_tuple(Number<in_right_pad_h>{}, Number<in_right_pad_w>{});

        return make_tuple(in_lengths_dev,
                          wei_lengths_dev,
                          out_lengths_dev,
                          conv_strides_dev,
                          conv_dilations_dev,
                          in_left_pads_dev,
                          in_right_pads_dev);
    };

    constexpr ck::index_t activ_type = 2;

#if USE_CONV_FWD_V5R1_NCHW
    if(algo == ConvForwardAlgo::V5R1NCHW)
    {
        if(layout != ConvTensorLayout::NCHW)
        {
            throw std::runtime_error("wrong! layout");
        }

        const auto tmp = f_make_for_device_nchw();

        device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw<in_data_t,
                                                                            8,
                                                                            8,
                                                                            activ_type,
                                                                            acc_data_t,
                                                                            out_data_t>(
            tmp[I0], tmp[I1], tmp[I2], tmp[I3], tmp[I4], tmp[I5], tmp[I6], in, wei, out_device, nrepeat);
    }
#endif

    if(do_verification)
    {
        host_direct_convolution_activ(in,
                                      wei,
                                      out_host,
                                      make_tuple(conv_stride_h, conv_stride_w),
                                      make_tuple(conv_dilation_h, conv_dilation_w),
                                      make_tuple(in_left_pad_h, in_left_pad_w),
                                      make_tuple(in_right_pad_h, in_right_pad_w),
                                      activ_type,
                                      layout);

        check_error(out_host, out_device);

#if 0
        if(do_log)
        {
            LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
            LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
            LogRangeAsType<float>(std::cout << "out_host  : ", out_host.mData, ",") << std::endl;
            LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") << std::endl;
        }
#endif
    }
}
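A quick sanity check on the output-size arithmetic the driver uses: with the default #if 1 problem (Hi = 1080, Wi = 1920, 3x3 filter, stride 1, dilation 1, pad 1 on each side) the effective filter extent is YEff = XEff = 3, so Ho = (1080 + 1 + 1 - 3) / 1 + 1 = 1080 and Wo = 1920, i.e. the spatial size is preserved. A standalone snippet of my own that checks this at compile time:

#include <cassert>

// Standard convolution output-length formula, matching the driver's Ho/Wo computation.
constexpr int conv_out_len(int in_len, int pad_l, int pad_r, int filter, int dilation, int stride)
{
    const int eff = (filter - 1) * dilation + 1; // effective filter extent
    return (in_len + pad_l + pad_r - eff) / stride + 1;
}

int main()
{
    static_assert(conv_out_len(1080, 1, 1, 3, 1, 1) == 1080, "Ho");
    static_assert(conv_out_len(1920, 1, 1, 3, 1, 1) == 1920, "Wo");
    return 0;
}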
host/driver_offline/conv_add_fwd_driver_offline.cpp
0 → 100644
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "conv_common.hpp"
#include "host_conv.hpp"
#include "device_tensor.hpp"
#include "device_static_convolution_add_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp"

#define USE_DYNAMIC_MODE 0
#define USE_CONV_FWD_V5R1_NCHW 1

enum ConvForwardAlgo
{
    V5R1NCHW
};

int main(int argc, char* argv[])
{
    using namespace ck;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
    constexpr auto I4 = Number<4>{};
    constexpr auto I5 = Number<5>{};
    constexpr auto I6 = Number<6>{};
    constexpr auto I7 = Number<7>{};

    // static mode
    if(argc < 7)
    {
        printf("arg1 to 6: layout, algo, do_verification, init_method, do_log, nrepeat\n");
        exit(1);
    }

    const ConvTensorLayout layout = static_cast<ConvTensorLayout>(atoi(argv[1]));
    const ConvForwardAlgo algo    = static_cast<ConvForwardAlgo>(atoi(argv[2]));
    const bool do_verification    = atoi(argv[3]);
    const int init_method         = atoi(argv[4]);
    const bool do_log             = atoi(argv[5]);
    const int nrepeat             = atoi(argv[6]);

#if 0
    constexpr index_t N = 1, C = 16, Hi = 1080, Wi = 1920, K = 16, Y = 3, X = 3;
#elif 0
    constexpr index_t N = 1, C = 16, Hi = 540, Wi = 960, K = 16, Y = 3, X = 3;
#elif 0
    constexpr index_t N = 1, C = 16, Hi = 480, Wi = 270, K = 16, Y = 3, X = 3;
#elif 1
    constexpr index_t N = 1, C = 16, Hi = 240, Wi = 135, K = 16, Y = 3, X = 3;
#elif 0
    constexpr index_t N = 1, C = 16, Hi = 1080, Wi = 1920, K = 16, Y = 1, X = 1;
#elif 0
    constexpr index_t N = 1, C = 16, Hi = 540, Wi = 960, K = 16, Y = 1, X = 1;
#elif 0
    constexpr index_t N = 1, C = 16, Hi = 480, Wi = 270, K = 16, Y = 1, X = 1;
#elif 0
    constexpr index_t N = 1, C = 8, Hi = 1080, Wi = 1920, K = 16, Y = 3, X = 3;
#elif 0
    constexpr index_t N = 1, C = 16, Hi = 1080, Wi = 1920, K = 4, Y = 3, X = 3;
#endif

    const index_t conv_stride_h   = 1;
    const index_t conv_stride_w   = 1;
    const index_t conv_dilation_h = 1;
    const index_t conv_dilation_w = 1;
    const index_t in_left_pad_h   = 1;
    const index_t in_left_pad_w   = 1;
    const index_t in_right_pad_h  = 1;
    const index_t in_right_pad_w  = 1;

    const index_t YEff = (Y - 1) * conv_dilation_h + 1;
    const index_t XEff = (X - 1) * conv_dilation_w + 1;

    const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
    const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;

    const index_t Hox2 = Ho * 2;
    const index_t Wox2 = Wo * 2;

#if 0
    using in_data_t  = float;
    using acc_data_t = float;
    using out_data_t = float;
#elif 1
    using in_data_t  = half_t;
    using acc_data_t = float;
    using out_data_t = half_t;
#elif 1
    using in_data_t  = int8_t;
    using acc_data_t = int32_t;
    using out_data_t = int8_t;
#endif

    std::vector<std::size_t> in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4),
        add_lengths_host(4);

    switch(layout)
    {
    case ConvTensorLayout::NCHW:
        // NCHW
        in_lengths_host[0]  = static_cast<std::size_t>(N);
        in_lengths_host[1]  = static_cast<std::size_t>(C);
        in_lengths_host[2]  = static_cast<std::size_t>(Hi);
        in_lengths_host[3]  = static_cast<std::size_t>(Wi);
        wei_lengths_host[0] = static_cast<std::size_t>(K);
        wei_lengths_host[1] = static_cast<std::size_t>(C);
        wei_lengths_host[2] = static_cast<std::size_t>(Y);
        wei_lengths_host[3] = static_cast<std::size_t>(X);
        out_lengths_host[0] = static_cast<std::size_t>(N);
        out_lengths_host[1] = static_cast<std::size_t>(K);
        out_lengths_host[2] = static_cast<std::size_t>(Ho);
        out_lengths_host[3] = static_cast<std::size_t>(Wo);
        add_lengths_host[0] = static_cast<std::size_t>(N);
        add_lengths_host[1] = static_cast<std::size_t>(K);
        add_lengths_host[2] = static_cast<std::size_t>(Hox2);
        add_lengths_host[3] = static_cast<std::size_t>(Wox2);
        break;
    case ConvTensorLayout::NHWC:
        // NHWC
        in_lengths_host[0]  = static_cast<std::size_t>(N);
        in_lengths_host[1]  = static_cast<std::size_t>(Hi);
        in_lengths_host[2]  = static_cast<std::size_t>(Wi);
        in_lengths_host[3]  = static_cast<std::size_t>(C);
        wei_lengths_host[0] = static_cast<std::size_t>(K);
        wei_lengths_host[1] = static_cast<std::size_t>(Y);
        wei_lengths_host[2] = static_cast<std::size_t>(X);
        wei_lengths_host[3] = static_cast<std::size_t>(C);
        out_lengths_host[0] = static_cast<std::size_t>(N);
        out_lengths_host[1] = static_cast<std::size_t>(Ho);
        out_lengths_host[2] = static_cast<std::size_t>(Wo);
        out_lengths_host[3] = static_cast<std::size_t>(K);
        add_lengths_host[0] = static_cast<std::size_t>(N);
        add_lengths_host[1] = static_cast<std::size_t>(Hox2);
        add_lengths_host[2] = static_cast<std::size_t>(Wox2);
        add_lengths_host[3] = static_cast<std::size_t>(K);
        break;
    default: throw std::runtime_error("wrong! not implemented");
    }

    Tensor<in_data_t> in(in_lengths_host);
    Tensor<in_data_t> wei(wei_lengths_host);
    Tensor<in_data_t> add(add_lengths_host);
    Tensor<out_data_t> out_host(add_lengths_host);
    Tensor<out_data_t> out_device(add_lengths_host);

    std::cout << "layout: " << layout << std::endl;
    ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
    ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
    ostream_HostTensorDescriptor(add.mDesc, std::cout << "add: ");
    ostream_HostTensorDescriptor(out_host.mDesc, std::cout << "out: ");
    print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
    print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
    print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
    print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));

    std::size_t num_thread = std::thread::hardware_concurrency();

    switch(init_method)
    {
    case 0:
        // no initialization
        break;
    case 1:
        in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        break;
    case 2:
        in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        break;
    case 3:
        in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        break;
    case 4:
        in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        break;
    case 5:
        in.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
        break;
    default:
        in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);

        auto gen_wei = [](auto... is) {
            return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
        };
        wei.GenerateTensorValue(gen_wei, num_thread);
    }

    auto f_make_for_device_nchw = [&]() {
        const auto in_lengths_dev  = make_tuple(Number<N>{}, Number<C>{}, Number<Hi>{}, Number<Wi>{});
        const auto wei_lengths_dev = make_tuple(Number<K>{}, Number<C>{}, Number<Y>{}, Number<X>{});
        const auto out_lengths_dev = make_tuple(Number<N>{}, Number<K>{}, Number<Ho>{}, Number<Wo>{});
        const auto add_lengths_dev = make_tuple(Number<N>{}, Number<K>{}, Number<Hox2>{}, Number<Wox2>{});
        const auto conv_strides_dev   = make_tuple(Number<conv_stride_h>{}, Number<conv_stride_w>{});
        const auto conv_dilations_dev = make_tuple(Number<conv_dilation_h>{}, Number<conv_dilation_w>{});
        const auto in_left_pads_dev   = make_tuple(Number<in_left_pad_h>{}, Number<in_left_pad_w>{});
        const auto in_right_pads_dev  = make_tuple(Number<in_right_pad_h>{}, Number<in_right_pad_w>{});

        return make_tuple(in_lengths_dev,
                          wei_lengths_dev,
                          add_lengths_dev,
                          out_lengths_dev,
                          conv_strides_dev,
                          conv_dilations_dev,
                          in_left_pads_dev,
                          in_right_pads_dev);
    };

    auto f_make_for_device_nhwc = [&]() {
        const auto in_lengths_dev  = make_tuple(Number<N>{}, Number<Hi>{}, Number<Wi>{}, Number<C>{});
        const auto wei_lengths_dev = make_tuple(Number<K>{}, Number<Y>{}, Number<X>{}, Number<C>{});
        const auto out_lengths_dev = make_tuple(Number<N>{}, Number<Ho>{}, Number<Wo>{}, Number<K>{});
        const auto add_lengths_dev = make_tuple(Number<N>{}, Number<Hox2>{}, Number<Wox2>{}, Number<K>{});
        const auto conv_strides_dev   = make_tuple(Number<conv_stride_h>{}, Number<conv_stride_w>{});
        const auto conv_dilations_dev = make_tuple(Number<conv_dilation_h>{}, Number<conv_dilation_w>{});
        const auto in_left_pads_dev   = make_tuple(Number<in_left_pad_h>{}, Number<in_left_pad_w>{});
        const auto in_right_pads_dev  = make_tuple(Number<in_right_pad_h>{}, Number<in_right_pad_w>{});

        return make_tuple(in_lengths_dev,
                          wei_lengths_dev,
                          add_lengths_dev,
                          out_lengths_dev,
                          conv_strides_dev,
                          conv_dilations_dev,
                          in_left_pads_dev,
                          in_right_pads_dev);
    };

    constexpr ck::index_t activ_type = 2;

#if USE_CONV_FWD_V5R1_NCHW
    if(algo == ConvForwardAlgo::V5R1NCHW)
    {
        if(layout != ConvTensorLayout::NCHW)
        {
            throw std::runtime_error("wrong! layout");
        }

        const auto tmp = f_make_for_device_nchw();

#if 1
        device_static_convolution_add_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw<in_data_t,
                                                                                8,
                                                                                8,
                                                                                activ_type,
                                                                                acc_data_t,
                                                                                out_data_t>(
            tmp[I0], // in_lengths_dev
            tmp[I1], // wei_lengths_dev
            tmp[I2], // add_lengths_dev
            tmp[I3], // out_lengths_dev
            tmp[I4],
            tmp[I5],
            tmp[I6],
            tmp[I7],
            in,
            wei,
            add,
            out_device,
            nrepeat);
#endif
    }
#endif

    if(do_verification)
    {
        host_direct_convolution_add(in,
                                    wei,
                                    add,
                                    out_host,
                                    make_tuple(conv_stride_h, conv_stride_w),
                                    make_tuple(conv_dilation_h, conv_dilation_w),
                                    make_tuple(in_left_pad_h, in_left_pad_w),
                                    make_tuple(in_right_pad_h, in_right_pad_w),
                                    activ_type,
                                    layout);

        check_error(out_host, out_device);

#if 0
        if(do_log)
        {
            LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
            LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
            LogRangeAsType<float>(std::cout << "out_host  : ", out_host.mData, ",") << std::endl;
            LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") << std::endl;
        }
#endif
    }
}
host/driver_offline/conv_fwd_driver_offline.cpp
@@ -15,16 +15,15 @@
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v6r1_nchw_kcyx_nkhw.hpp"
#include "device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"
#define USE_DYNAMIC_MODE
0
#define USE_CONV_FWD_V4R4_NCHW
0
#define USE_DYNAMIC_MODE
1
#define USE_CONV_FWD_V4R4_NCHW
1
#define USE_CONV_FWD_V4R4R2_NHWC 0
#define USE_CONV_FWD_V6R1_NCHW 0
#define USE_CONV_FWD_V5R1_NCHW
1
#define USE_CONV_FWD_V5R1_NCHW
0
#define USE_CONV_FWD_V4R4R2_XDL_NCHW 0
#define USE_CONV_FWD_V4R4R4_XDL_NHWC 0
...
...
@@ -103,82 +102,16 @@ int main(int argc, char* argv[])
const
bool
do_log
=
atoi
(
argv
[
5
]);
const
int
nrepeat
=
atoi
(
argv
[
6
]);
#if 1
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
Hi
=
1080
;
constexpr
index_t
Wi
=
1920
;
constexpr
index_t
K
=
16
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
#elif 0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
Hi
=
540
;
constexpr
index_t
Wi
=
960
;
constexpr
index_t
K
=
16
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
#elif 0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
Hi
=
480
;
constexpr
index_t
Wi
=
270
;
constexpr
index_t
K
=
16
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
#elif 0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
Hi
=
240
;
constexpr
index_t
Wi
=
135
;
constexpr
index_t
K
=
16
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
#elif 0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
Hi
=
1080
;
constexpr
index_t
Wi
=
1920
;
constexpr
index_t
K
=
16
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
#elif 0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
Hi
=
540
;
constexpr
index_t
Wi
=
960
;
constexpr
index_t
K
=
16
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
#elif 0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
Hi
=
480
;
constexpr
index_t
Wi
=
270
;
constexpr
index_t
K
=
16
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
#elif 0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
8
;
constexpr
index_t
Hi
=
1080
;
constexpr
index_t
Wi
=
1920
;
constexpr
index_t
K
=
16
;
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
192
;
constexpr
index_t
Hi
=
71
;
constexpr
index_t
Wi
=
71
;
constexpr
index_t
K
=
256
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
#elif 0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
Hi
=
1080
;
constexpr
index_t
Wi
=
1920
;
constexpr
index_t
K
=
4
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
#endif
const
index_t
conv_stride_h
=
1
;
const
index_t
conv_stride_w
=
1
;
const
index_t
conv_stride_h
=
2
;
const
index_t
conv_stride_w
=
2
;
const
index_t
conv_dilation_h
=
1
;
const
index_t
conv_dilation_w
=
1
;
const
index_t
in_left_pad_h
=
1
;
...
...
@@ -193,7 +126,7 @@ int main(int argc, char* argv[])
const
index_t
Wo
=
(
Wi
+
in_left_pad_w
+
in_right_pad_w
-
XEff
)
/
conv_stride_w
+
1
;
#endif
#if
0
#if
1
using
in_data_t
=
float
;
using
acc_data_t
=
float
;
using
out_data_t
=
float
;
...
...
@@ -437,8 +370,6 @@ int main(int argc, char* argv[])
}
#endif
constexpr
ck
::
index_t
activ_type
=
2
;
#if USE_CONV_FWD_V5R1_NCHW
if
(
algo
==
ConvForwardAlgo
::
V5R1NCHW
)
{
...
...
@@ -449,22 +380,20 @@ int main(int argc, char* argv[])
const
auto
tmp
=
f_make_for_device_nchw
();
#if 1
device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw
#else
device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw
#endif
<
in_data_t
,
8
,
8
,
activ_type
,
acc_data_t
,
out_data_t
>
(
tmp
[
I0
],
tmp
[
I1
],
tmp
[
I2
],
tmp
[
I3
],
tmp
[
I4
],
tmp
[
I5
],
tmp
[
I6
],
in
,
wei
,
out_device
,
nrepeat
);
device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw
<
in_data_t
,
16
,
acc_data_t
,
out_data_t
>
(
tmp
[
I0
],
tmp
[
I1
],
tmp
[
I2
],
tmp
[
I3
],
tmp
[
I4
],
tmp
[
I5
],
tmp
[
I6
],
in
,
wei
,
out_device
,
nrepeat
);
}
#endif
...
...
@@ -524,15 +453,14 @@ int main(int argc, char* argv[])
if
(
do_verification
)
{
host_direct_convolution_activ
(
in
,
wei
,
out_host
,
make_tuple
(
conv_stride_h
,
conv_stride_w
),
make_tuple
(
conv_dilation_h
,
conv_dilation_w
),
make_tuple
(
in_left_pad_h
,
in_left_pad_w
),
make_tuple
(
in_right_pad_h
,
in_right_pad_w
),
activ_type
,
layout
);
host_direct_convolution
(
in
,
wei
,
out_host
,
make_tuple
(
conv_stride_h
,
conv_stride_w
),
make_tuple
(
conv_dilation_h
,
conv_dilation_w
),
make_tuple
(
in_left_pad_h
,
in_left_pad_w
),
make_tuple
(
in_right_pad_h
,
in_right_pad_w
),
layout
);
check_error
(
out_host
,
out_device
);
...
...
host/driver_offline/include/device_static_convolution_add_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
0 → 100644
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_static_convolution_add_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp"

template <typename TInWei,
          ck::index_t InWeiVectorSize,
          ck::index_t OutVectorSize,
          ck::index_t activ_type,
          typename TAcc,
          typename TOut,
          typename InLengths,
          typename WeiLengths,
          typename AddLengths,
          typename OutLengths,
          typename ConvStrides,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads>
void device_static_convolution_add_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
    const InLengths& in_n_c_hi_wi_lengths,
    const WeiLengths& wei_k_c_y_x_lengths,
    const AddLengths& add_n_k_hox2_wox2_lengths,
    const OutLengths& out_n_k_ho_wo_lengths,
    const ConvStrides& conv_strides,
    const ConvDilations& conv_dilations,
    const InLeftPads& in_left_pads,
    const InRightPads& in_right_pads,
    const Tensor<TInWei>& in_n_c_hi_wi,
    const Tensor<TInWei>& wei_k_c_y_x,
    const Tensor<TOut>& add_n_k_hox2_wox2,
    Tensor<TOut>& out_n_k_hox2_wox2,
    ck::index_t nrepeat)
{
    using namespace ck;

    std::cout << __func__ << std::endl;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    const auto N = out_n_k_ho_wo_lengths[I0];
    const auto K = out_n_k_ho_wo_lengths[I1];
    const auto C = wei_k_c_y_x_lengths[I1];

    const auto Hi = in_n_c_hi_wi_lengths[I2];
    const auto Wi = in_n_c_hi_wi_lengths[I3];

    const auto Ho = out_n_k_ho_wo_lengths[I2];
    const auto Wo = out_n_k_ho_wo_lengths[I3];

    const auto Hox2 = Ho * 2;
    const auto Wox2 = Wo * 2;

    const auto Y = wei_k_c_y_x_lengths[I2];
    const auto X = wei_k_c_y_x_lengths[I3];

    const auto C0 = C / Number<InWeiVectorSize>{};
    const auto C1 = Number<InWeiVectorSize>{};

    const auto K0 = K / Number<OutVectorSize>{};
    const auto K1 = Number<OutVectorSize>{};

    Tensor<TInWei> in_n_c0_hi_wi_c1(
        HostTensorDescriptor(std::initializer_list<index_t>{N, C0, Hi, Wi, C1}));
    Tensor<TInWei> wei_k_c0_y_x_c1(
        HostTensorDescriptor(std::initializer_list<index_t>{K, C0, Y, X, C1}));
    Tensor<TOut> out_n_k0_hox2_wox2_k1(
        HostTensorDescriptor(std::initializer_list<index_t>{N, K0, Hox2, Wox2, K1}));
    Tensor<TOut> add_n_k0_hox2_wox2_k1(
        HostTensorDescriptor(std::initializer_list<index_t>{N, K0, Hox2, Wox2, K1}));

    auto f_nchw2nc0hwc1 = [&](auto n, auto hi, auto wi, auto c) {
        in_n_c0_hi_wi_c1(n, c / InWeiVectorSize, hi, wi, c % InWeiVectorSize) =
            in_n_c_hi_wi(n, c, hi, wi);
    };

    auto f_kcyx2kc0yxc1 = [&](auto k, auto y, auto x, auto c) {
        wei_k_c0_y_x_c1(k, c / InWeiVectorSize, y, x, c % InWeiVectorSize) =
            wei_k_c_y_x(k, c, y, x);
    };

    auto f_nchx2wx2_to_nc0hx2wx2c1 = [&](auto n, auto ho, auto wo, auto c) {
        add_n_k0_hox2_wox2_k1(n, c / InWeiVectorSize, ho, wo, c % InWeiVectorSize) =
            add_n_k_hox2_wox2(n, c, ho, wo);
    };

    make_ParallelTensorFunctor(f_nchw2nc0hwc1, N, Hi, Wi, C)();
    make_ParallelTensorFunctor(f_kcyx2kc0yxc1, K, Y, X, C)();
    make_ParallelTensorFunctor(f_nchx2wx2_to_nc0hx2wx2c1, N, Hox2, Wox2, K)();

    DeviceMem in_n_c0_hi_wi_c1_device_buf(sizeof(TInWei) *
                                          in_n_c0_hi_wi_c1.mDesc.GetElementSpace());
    DeviceMem wei_k_c0_y_x_c1_device_buf(sizeof(TInWei) *
                                         wei_k_c0_y_x_c1.mDesc.GetElementSpace());
    DeviceMem add_n_k0_hox2_wox2_k1_device_buf(sizeof(TOut) *
                                               add_n_k0_hox2_wox2_k1.mDesc.GetElementSpace());
    DeviceMem out_n_k0_hox2_wox2_k1_device_buf(sizeof(TOut) *
                                               out_n_k0_hox2_wox2_k1.mDesc.GetElementSpace());

    in_n_c0_hi_wi_c1_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data());
    wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data());

    const auto in_n_c0_hi_wi_desc =
        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, C0, Hi, Wi));
    const auto wei_k_c0_y_x_desc =
        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C0, Y, X));
    const auto out_n_k0_ho_wo_k1_desc =
        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K0, Ho, Wo, K1));
    const auto add_n_k0_hox2_wox2_k1_desc =
        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K0, Hox2, Wox2, K1));

    // cdata = 64, BlockSize = 64, 16x8x32x4
    constexpr index_t BlockSize = 64;

    constexpr index_t KPerBlock  = K;
    constexpr index_t HoPerBlock = 8;
    constexpr index_t WoPerBlock = 32;
    constexpr index_t EPerBlock  = C0;

    constexpr index_t KPerThread  = KPerBlock;
    constexpr index_t HoPerThread = 2;
    constexpr index_t WoPerThread = 2;
    constexpr index_t EPerThread  = EPerBlock;

    using ABlockTransferThreadSliceLengths_E_K   = Sequence<Y * X, 1>;
    using ABlockTransferThreadClusterLengths_E_K = Sequence<EPerBlock, KPerBlock>;

    constexpr index_t ABlockTransferSrcScalarPerVector_E = 1;
    constexpr index_t ABlockTransferDstScalarPerVector_K = 1;

    constexpr index_t BThreadTransferSrcScalarPerVector_W = 1;

    constexpr index_t CThreadTransferDstScalarPerVector_W = K1;

    static_assert(KPerThread % CThreadTransferDstScalarPerVector_W == 0, "");

    constexpr auto conv_driver =
        DriverStaticConvolutionAddForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad<
            BlockSize,
            typename vector_type<TInWei, InWeiVectorSize>::type,
            TAcc,
            TOut,
            KPerBlock,
            HoPerBlock,
            WoPerBlock,
            EPerBlock,
            KPerThread,
            HoPerThread,
            WoPerThread,
            EPerThread,
            ABlockTransferThreadSliceLengths_E_K,
            ABlockTransferThreadClusterLengths_E_K,
            ABlockTransferSrcScalarPerVector_E,
            ABlockTransferDstScalarPerVector_K,
            BThreadTransferSrcScalarPerVector_W,
            CThreadTransferDstScalarPerVector_W>{};

    conv_driver.Run(wei_k_c0_y_x_desc,
                    in_n_c0_hi_wi_desc,
                    add_n_k0_hox2_wox2_k1_desc,
                    out_n_k0_ho_wo_k1_desc,
                    conv_strides,
                    conv_dilations,
                    in_left_pads,
                    in_right_pads,
                    Number<activ_type>{},
                    static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
                        wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()),
                    static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
                        in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()),
                    static_cast<TOut*>(add_n_k0_hox2_wox2_k1_device_buf.GetDeviceBuffer()),
                    static_cast<TOut*>(out_n_k0_hox2_wox2_k1_device_buf.GetDeviceBuffer()));

    out_n_k0_hox2_wox2_k1_device_buf.FromDevice(out_n_k0_hox2_wox2_k1.mData.data());

    auto f_nk0hwk1_to_nkhw = [&](auto n, auto k, auto ho, auto wo) {
        out_n_k_hox2_wox2(n, k, ho, wo) =
            out_n_k0_hox2_wox2_k1(n, k / InWeiVectorSize, ho, wo, k % InWeiVectorSize);
    };

    make_ParallelTensorFunctor(f_nk0hwk1_to_nkhw, N, K, Hox2, Wox2)();
}
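The host-side repacking above splits the channel dimension into a vectorised pair C = C0 * C1 with C1 = InWeiVectorSize, so the kernel can read C1 channels as a single vector; with the template arguments the drivers pass (InWeiVectorSize = 8, C = 16) that gives C0 = 2 and C1 = 8. A stripped-down sketch of the same NCHW -> N,C0,H,W,C1 shuffle, with plain arrays and names of my own, mirroring f_nchw2nc0hwc1:

#include <vector>

// Repack an NCHW tensor into N,C0,H,W,C1 layout with C = C0 * C1.
std::vector<float> pack_nchw_to_nc0hwc1(const std::vector<float>& in,
                                        int N, int C, int H, int W, int C1)
{
    const int C0 = C / C1; // assumes C is divisible by C1
    std::vector<float> out(in.size());

    for(int n = 0; n < N; ++n)
        for(int c = 0; c < C; ++c)
            for(int h = 0; h < H; ++h)
                for(int w = 0; w < W; ++w)
                    out[(((n * C0 + c / C1) * H + h) * W + w) * C1 + c % C1] =
                        in[((n * C + c) * H + h) * W + w];

    return out;
}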
host/host_tensor/include/host_conv.hpp
@@ -188,6 +188,115 @@ void host_direct_convolution_activ(const Tensor<TIn>& in,
    }
}

template <typename TIn,
          typename TWei,
          typename TOut,
          typename ConvStrides,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads>
void host_direct_convolution_add(const Tensor<TIn>& in,
                                 const Tensor<TWei>& wei,
                                 const Tensor<TOut>& add,
                                 Tensor<TOut>& out,
                                 const ConvStrides& conv_strides,
                                 const ConvDilations& conv_dilations,
                                 const InLeftPads& in_left_pads,
                                 const InRightPads& in_right_pads,
                                 const ck::index_t activ_type,
                                 const ConvTensorLayout layout = ConvTensorLayout::NCHW)
{
    using namespace ck;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
        double v = 0;
        for(int c = 0; c < wei.mDesc.GetLengths()[1]; ++c)
        {
            for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
            {
                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
                for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
                {
                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
                    if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
                       wi < in.mDesc.GetLengths()[3])
                    {
                        v += static_cast<const double>(in(n, c, hi, wi)) *
                             static_cast<const double>(wei(k, c, y, x));
                    }
                }
            }
        }

        index_t hox2 = ho * 2;
        index_t wox2 = wo * 2;

        v = activ(v, activ_type);

        out(n, k, hox2, wox2)         = v + add(n, k, hox2, wox2);
        out(n, k, hox2, wox2 + 1)     = v + add(n, k, hox2, wox2 + 1);
        out(n, k, hox2 + 1, wox2)     = v + add(n, k, hox2 + 1, wox2);
        out(n, k, hox2 + 1, wox2 + 1) = v + add(n, k, hox2 + 1, wox2 + 1);
    };

    auto f_nhwc = [&](auto n, auto ho, auto wo, auto k) {
        double v = 0;
        for(int c = 0; c < wei.mDesc.GetLengths()[3]; ++c)
        {
            for(int y = 0; y < wei.mDesc.GetLengths()[1]; ++y)
            {
                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
                for(int x = 0; x < wei.mDesc.GetLengths()[2]; ++x)
                {
                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
                    if(hi >= 0 && hi < in.mDesc.GetLengths()[1] && wi >= 0 &&
                       wi < in.mDesc.GetLengths()[2])
                    {
                        v += static_cast<const double>(in(n, hi, wi, c)) *
                             static_cast<const double>(wei(k, y, x, c));
                    }
                }
            }
        }

        index_t hox2 = ho * 2;
        index_t wox2 = wo * 2;

        v = activ(v, activ_type);

        out(n, k, hox2, wox2)         = v + add(n, k, hox2, wox2);
        out(n, k, hox2, wox2 + 1)     = v + add(n, k, hox2, wox2 + 1);
        out(n, k, hox2 + 1, wox2)     = v + add(n, k, hox2 + 1, wox2);
        out(n, k, hox2 + 1, wox2 + 1) = v + add(n, k, hox2 + 1, wox2 + 1);
    };

    switch(layout)
    {
    case ConvTensorLayout::NCHW:
        make_ParallelTensorFunctor(f_nchw,
                                   out.mDesc.GetLengths()[0],
                                   out.mDesc.GetLengths()[1],
                                   out.mDesc.GetLengths()[2] / 2,
                                   out.mDesc.GetLengths()[3] / 2)(
            std::thread::hardware_concurrency());
        break;
    case ConvTensorLayout::NHWC:
        make_ParallelTensorFunctor(f_nhwc,
                                   out.mDesc.GetLengths()[0],
                                   out.mDesc.GetLengths()[1],
                                   out.mDesc.GetLengths()[2] / 2,
                                   out.mDesc.GetLengths()[3] / 2)(
            std::thread::hardware_concurrency());
        break;
    default: throw std::runtime_error("wrong! not supported layout");
    }
}
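Written as a formula (my notation: s_h, s_w are the strides, \delta_h, \delta_w the dilations, p_h, p_w the left pads), the reference above computes, for every coarse output position (h_o, w_o) and for d_h, d_w \in \{0, 1\}:

$$\text{out}(n,k,\,2h_o+d_h,\,2w_o+d_w) \;=\; \operatorname{activ}\!\Big(\sum_{c,y,x}\text{in}(n,c,h_i,w_i)\,\text{wei}(k,c,y,x)\Big) \;+\; \text{add}(n,k,\,2h_o+d_h,\,2w_o+d_w)$$

with $h_i = h_o s_h + y\,\delta_h - p_h$ and $w_i = w_o s_w + x\,\delta_w - p_w$; terms whose $(h_i, w_i)$ fall outside the input are skipped (zero padding). The NHWC branch is identical up to the index order of in and wei.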
template <typename TIn,
          typename TWei,
          typename TOut,
          typename InLeftPads,
          typename InRightPads>
void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
                                   const Tensor<TWei>& wei_kcyx,
script/run.sh
@@ -12,7 +12,7 @@
 #export OLC_DEBUG_HIP_DUMP=1
 #export OLC_DEBUG_SAVE_TEMP_DIR=1

-make -j conv_fwd_driver_offline
+make -j conv_add_fwd_driver_offline
 #make -j conv_bwd_driver_offline
 #make -j conv_fwd_driver_online

@@ -26,7 +26,7 @@ INIT=$4
 LOG=$5
 REPEAT=$6

-./host/driver_offline/conv_fwd_driver_offline     $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT
+./host/driver_offline/conv_add_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT

 ################################################ layout algo verify init log repeat N__ K___ C___ Y X Hi_ Wi__ Strides Dilations LeftPads RightPads
 #./host/driver_offline/conv_fwd_driver_offline    $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 128 128 192 3 3 71 71 2 2 1 1 1 1 1 1