Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
a037693f
"vscode:/vscode.git/clone" did not exist on "57defc8acaee08810bd61b0f8575e3bd8ad32aff"
Commit
a037693f
authored
Dec 01, 2021
by
ltqin
Browse files
Merge branch 'develop' into conv_splitk_f32
parents
0694d6ed
4041850f
Changes
38
Hide whitespace changes
Inline
Side-by-side
Showing
18 changed files
with
3381 additions
and
790 deletions
+3381
-790
host/driver_offline/include/device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
...ward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
+212
-0
host/driver_offline/include/driver_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
...ward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
+565
-0
host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
...ward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
+500
-0
host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp
...ution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp
+0
-349
host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp
...orward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp
+0
-364
host/driver_offline/include/driver_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
...ward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
+569
-0
host/driver_offline/src/conv_add_fwd_driver_offline_nchwc.cpp
.../driver_offline/src/conv_add_fwd_driver_offline_nchwc.cpp
+414
-0
host/driver_offline/src/conv_fwd_driver_offline.cpp
host/driver_offline/src/conv_fwd_driver_offline.cpp
+10
-40
host/driver_offline/src/conv_fwd_driver_offline_nchwc.cpp
host/driver_offline/src/conv_fwd_driver_offline_nchwc.cpp
+391
-0
host/driver_offline/src/conv_maxpool_fwd_driver_offline_nchwc.cpp
...ver_offline/src/conv_maxpool_fwd_driver_offline_nchwc.cpp
+413
-0
host/driver_offline/src/gemm_driver_offline.cpp
host/driver_offline/src/gemm_driver_offline.cpp
+1
-5
host/host_tensor/include/conv_common.hpp
host/host_tensor/include/conv_common.hpp
+13
-0
host/host_tensor/include/host_tensor.hpp
host/host_tensor/include/host_tensor.hpp
+12
-0
profiler/conv_profiler.cpp
profiler/conv_profiler.cpp
+8
-8
profiler/gemm_profiler.cpp
profiler/gemm_profiler.cpp
+89
-22
script/profile_gemm.sh
script/profile_gemm.sh
+23
-2
test/CMakeLists.txt
test/CMakeLists.txt
+18
-0
test/magic_number_division/main.cpp
test/magic_number_division/main.cpp
+143
-0
No files found.
host/driver_offline/include/device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
0 → 100644
View file @
a037693f
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
/// Host-side launcher that uploads the tensors, runs the
/// `DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_maxpool`
/// driver five times (printing the measured time after every run) and copies
/// the two result tensors back to the host.
///
/// @tparam TInWei      element type of the input and weight tensors
/// @tparam TAcc        accumulator type forwarded to the driver
/// @tparam TOut        element type of the bias, convolution-output and max-pool tensors
/// @tparam activ_type  activation selector forwarded to the driver
///
/// @param in_n_c0_hi_wi_c1_lengths   input lengths, indexed as [N, C0, Hi, Wi, C1]
/// @param wei_k_c0_y_x_c1_lengths    weight lengths, indexed as [K, C0, Y, X, C1]
/// @param max_n_k0_hx_wx_k1_lengths  max-pool output lengths; only [2] (Hx) and [3] (Wx) are read
/// @param out_n_k0_ho_wo_k1_lengths  convolution output lengths, indexed as [N, K0, Ho, Wo, K1]
/// @param conv_strides, conv_dilations, in_left_pads, in_right_pads
///                                   forwarded unchanged to the driver
/// @param in_n_c0_hi_wi_c1           host input tensor (copied to the device)
/// @param wei_k_c0_y_x_c1            host weight tensor (copied to the device)
/// @param bias_k0_k1                 host bias tensor (copied to the device)
/// @param out_n_k0_ho_wo_k1          host tensor that receives the convolution output buffer
/// @param max_n_k0_hx_wx_k1          host tensor copied to the device before the runs and
///                                   overwritten with the device buffer afterwards
/// @param nrepeat                    repeat count forwarded to the driver's `Run`
///
/// @throws std::runtime_error if C1 or KPerThread is not a multiple of InWeiVectorSize
template <typename TInWei,
          typename TAcc,
          typename TOut,
          ck::ActivTypeEnum_t activ_type,
          typename InLengths,
          typename WeiLengths,
          typename MaxLengths,
          typename OutLengths,
          typename ConvStrides,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads>
void device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1(
    const InLengths& in_n_c0_hi_wi_c1_lengths,
    const WeiLengths& wei_k_c0_y_x_c1_lengths,
    const MaxLengths& max_n_k0_hx_wx_k1_lengths,
    const OutLengths& out_n_k0_ho_wo_k1_lengths,
    const ConvStrides& conv_strides,
    const ConvDilations& conv_dilations,
    const InLeftPads& in_left_pads,
    const InRightPads& in_right_pads,
    const Tensor<TInWei>& in_n_c0_hi_wi_c1,
    const Tensor<TInWei>& wei_k_c0_y_x_c1,
    const Tensor<TOut>& bias_k0_k1,
    Tensor<TOut>& out_n_k0_ho_wo_k1,
    Tensor<TOut>& max_n_k0_hx_wx_k1,
    ck::index_t nrepeat)
{
    using namespace ck;

    std::cout << __func__ << std::endl;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
    constexpr auto I4 = Number<4>{};

    // Problem sizes, read from the length tuples supplied by the caller.
    const auto N  = out_n_k0_ho_wo_k1_lengths[I0];
    const auto K0 = out_n_k0_ho_wo_k1_lengths[I1];
    const auto Ho = out_n_k0_ho_wo_k1_lengths[I2];
    const auto Wo = out_n_k0_ho_wo_k1_lengths[I3];
    const auto K1 = out_n_k0_ho_wo_k1_lengths[I4];

    const auto C0 = in_n_c0_hi_wi_c1_lengths[I1];
    const auto Hi = in_n_c0_hi_wi_c1_lengths[I2];
    const auto Wi = in_n_c0_hi_wi_c1_lengths[I3];
    const auto C1 = in_n_c0_hi_wi_c1_lengths[I4];

    const auto K = wei_k_c0_y_x_c1_lengths[I0];
    const auto Y = wei_k_c0_y_x_c1_lengths[I2];
    const auto X = wei_k_c0_y_x_c1_lengths[I3];

    const auto Hx = max_n_k0_hx_wx_k1_lengths[I2];
    const auto Wx = max_n_k0_hx_wx_k1_lengths[I3];

    // One device buffer per tensor, sized from the host tensor's element space.
    DeviceMem in_n_c0_hi_wi_c1_device_buf(sizeof(TInWei) *
                                          in_n_c0_hi_wi_c1.mDesc.GetElementSpace());
    DeviceMem wei_k_c0_y_x_c1_device_buf(sizeof(TInWei) * wei_k_c0_y_x_c1.mDesc.GetElementSpace());
    DeviceMem bias_k0_k1_device_buf(sizeof(TOut) * bias_k0_k1.mDesc.GetElementSpace());
    DeviceMem out_n_k0_ho_wo_k1_device_buf(sizeof(TOut) *
                                           out_n_k0_ho_wo_k1.mDesc.GetElementSpace());
    DeviceMem max_n_k0_hx_wx_k1_device_buf(sizeof(TOut) *
                                           max_n_k0_hx_wx_k1.mDesc.GetElementSpace());

    in_n_c0_hi_wi_c1_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data());
    wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data());
    bias_k0_k1_device_buf.ToDevice(bias_k0_k1.mData.data());
    max_n_k0_hx_wx_k1_device_buf.ToDevice(max_n_k0_hx_wx_k1.mData.data());

    // Input and weight data are handed to the driver as vectors of InWeiVectorSize elements.
    constexpr index_t InWeiVectorSize = 8;

    if(C1 % InWeiVectorSize != 0)
    {
        throw std::runtime_error("wrong! C1 cannot be divided by InWeiVectorSize");
    }

    // Tuning parameters. The first set is disabled and kept for reference only.
#if 0
    constexpr index_t BlockSize = 256;

    constexpr index_t KPerBlock  = 32;
    constexpr index_t HoPerBlock = 8;
    constexpr index_t WoPerBlock = 64;

    constexpr index_t E1 = C0 * 9;
    constexpr index_t E2 = 1;
    constexpr index_t E1PerBlock = C0;

    constexpr index_t KPerThread  = 16;
    constexpr index_t HoPerThread = 2;
    constexpr index_t WoPerThread = 2;
    constexpr index_t EPerThread  = 1;

    using ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2   = Sequence<1, 9, 1, E2>;
    using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 = Sequence<1, E1PerBlock, KPerBlock, 1>;

    constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2;
    constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2;

    constexpr index_t BThreadTransferSrcScalarPerVector_E2 = E2;

    constexpr index_t CThreadTransferDstScalarPerVector_K = K1;
#elif 1
    constexpr index_t BlockSize = 64;

    constexpr index_t KPerBlock  = 8;
    constexpr index_t HoPerBlock = 8;
    constexpr index_t WoPerBlock = 32;

    constexpr index_t E1 = 2 * 9;
    constexpr index_t E2 = 1;
    constexpr index_t K2 = 2;

    constexpr index_t E1PerBlock = 2;

    constexpr index_t KPerThread  = KPerBlock;
    constexpr index_t HoPerThread = 2;
    constexpr index_t WoPerThread = 2;
    constexpr index_t EPerThread  = 1;

    using ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2 = Sequence<1, 9, 1, 1, E2>;
    using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 =
        Sequence<1, E1PerBlock, 1, KPerBlock, 1>;

    constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2;
    constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2;

    constexpr index_t BThreadTransferSrcScalarPerVector_E2 = E2;

    constexpr index_t CThreadTransferDstScalarPerVector_K = InWeiVectorSize;
#endif

    // The check is on KPerThread, so the message names KPerThread (it used to repeat the
    // C1 message from the check above).
    if(KPerThread % InWeiVectorSize != 0)
    {
        throw std::runtime_error("wrong! KPerThread cannot be divided by InWeiVectorSize");
    }

    // Packed descriptors handed to the driver. The innermost length of the input and weight
    // descriptors is E2, not C1.
    // NOTE(review): this looks like it assumes C1 == InWeiVectorSize * E2 (one vector per
    // innermost slot); only C1 % InWeiVectorSize == 0 is enforced above - confirm.
    const auto in_n_c0_hi_wi_c1_desc =
        make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi, E2));
    const auto wei_k_c0_y_x_c1_desc =
        make_naive_tensor_descriptor_packed(make_tuple(K, C0, Y, X, E2));
    const auto max_n_k0_hx_wx_k1_desc =
        make_naive_tensor_descriptor_packed(make_tuple(N, K0, Hx, Wx, K1));
    const auto out_n_k0_ho_wo_k1_desc =
        make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1));

    constexpr auto conv_driver =
        DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_maxpool<
            BlockSize,
            typename vector_type<TInWei, InWeiVectorSize>::type,
            TAcc,
            TOut,
            E1,
            E2,
            K2,
            KPerBlock,
            HoPerBlock,
            WoPerBlock,
            E1PerBlock,
            KPerThread,
            HoPerThread,
            WoPerThread,
            EPerThread,
            ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
            ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
            ABlockTransferSrcScalarPerVector_E2,
            ABlockTransferDstScalarPerVector_E2,
            BThreadTransferSrcScalarPerVector_E2,
            CThreadTransferDstScalarPerVector_K,
            activ_type>{};

    // Problem tag on stderr. The max-pool extents are printed as Ho / 2 and Wo / 2, not as
    // the Hx / Wx values used in the descriptor.
    std::cerr << "conv_bias_activ_maxpool_input_"
              << "n" << N << "c" << C0 << "h" << Hi << "w" << Wi << "c" << C1 << "_filter_k" << K
              << "c" << C0 << "y" << Y << "x" << X << "c" << C1 << "_convout_n" << N << "k" << K0
              << "h" << Ho << "w" << Wo << "k" << K1 << "_maxpoolout_n" << N << "k" << K0 << "h"
              << Ho / 2 << "w" << Wo / 2 << "k" << K1 << std::endl;

    for(int i = 0; i < 5; i++)
    {
        const auto ave_time =
            conv_driver.Run(wei_k_c0_y_x_c1_desc,
                            in_n_c0_hi_wi_c1_desc,
                            out_n_k0_ho_wo_k1_desc,
                            max_n_k0_hx_wx_k1_desc,
                            conv_strides,
                            conv_dilations,
                            in_left_pads,
                            in_right_pads,
                            static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
                                wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()),
                            static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
                                in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()),
                            static_cast<TOut*>(bias_k0_k1_device_buf.GetDeviceBuffer()),
                            static_cast<TOut*>(out_n_k0_ho_wo_k1_device_buf.GetDeviceBuffer()),
                            static_cast<TOut*>(max_n_k0_hx_wx_k1_device_buf.GetDeviceBuffer()),
                            nrepeat);

        {
            // 2 * N * K * Ho * Wo * C0 * C1 * Y * X operations, divided by 1e9 and by the
            // time reported by the driver (printed with the unit "ms" below).
            float perf = static_cast<float>(std::size_t(2) * N * K * Ho * Wo * C0 * C1 * Y * X) /
                         (std::size_t(1000) * 1000 * 1000) / ave_time;

            std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
                      << std::endl;
        }
    }

    // Bring both result buffers back into the caller's host tensors.
    out_n_k0_ho_wo_k1_device_buf.FromDevice(out_n_k0_ho_wo_k1.mData.data());
    max_n_k0_hx_wx_k1_device_buf.FromDevice(max_n_k0_hx_wx_k1.mData.data());
}
host/driver_offline/include/driver_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
0 → 100644
View file @
a037693f
#ifndef DRIVER_CONVOLUTION_ADD_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NC0HWc1_KC0YXC1_NK0HWK1_HPP
#define DRIVER_CONVOLUTION_ADD_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NC0HWc1_KC0YXC1_NK0HWK1_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_dlops_v3.hpp"
/// Host driver that builds the grid descriptors for `GridwiseGemmDlops_km_kn_mn_v3` from
/// the weight / input / output / add tensor descriptors and launches
/// `kernel_gemm_dlops_v3_resize_add`.
///
/// The template parameters are tuning constants and transfer configurations that are
/// forwarded to `GridwiseGemmDlops_km_kn_mn_v3`; `activ_type` is forwarded to the kernel.
template <ck::index_t BlockSize,
          typename FloatAB,
          typename FloatAcc,
          typename FloatC,
          ck::index_t E1_,
          ck::index_t E2_,
          ck::index_t K2_,
          ck::index_t KPerBlock,
          ck::index_t HoPerBlock,
          ck::index_t WoPerBlock,
          ck::index_t E1PerBlock,
          ck::index_t KPerThread,
          ck::index_t HoPerThread,
          ck::index_t WoPerThread,
          ck::index_t EPerThread,
          typename ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
          typename ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
          ck::index_t ABlockTransferSrcScalarPerVector_E2,
          ck::index_t ABlockTransferDstScalarPerVector_E2,
          ck::index_t BThreadTransferSrcScalarPerVector_E2,
          ck::index_t CThreadTransferDstScalarPerVector_K,
          ck::ActivTypeEnum_t activ_type>
struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_add
{
    /// Build all descriptors, validate the tiling and launch the kernel.
    ///
    /// @param wei_k_c0_y_x_c1_global_desc        weight descriptor; lengths [0], [2], [3] are read as K, Y, X
    /// @param in_n_c0_hi_wi_c1_global_desc       input descriptor; lengths [0..3] are read as N, C0, Hi, Wi
    /// @param out_n_k0_ho_wo_k1_global_desc      output descriptor; lengths [1..4] are read as K0, Ho, Wo, K1
    /// @param add_n_k0_hox2_wox2_k1_global_desc  add descriptor; lengths [2], [3] are read as Hox2, Wox2
    /// @param conv_strides    [0] = H stride, [1] = W stride
    /// @param conv_dilations  [0] = H dilation, [1] = W dilation
    /// @param in_left_pads    [0] = H left pad, [1] = W left pad
    /// @param in_right_pads   [0] = H right pad, [1] = W right pad (extended internally by
    ///                        the output right pad times the stride)
    /// @param p_a_grid     device pointer passed to the kernel as the A operand
    /// @param p_b_grid     device pointer passed to the kernel as the B operand
    /// @param p_bias_grid  device pointer passed to the kernel as the bias
    /// @param p_d_grid     writable device pointer passed to the kernel as the D tensor
    /// @param nrepeat      repeat count forwarded to `launch_and_time_kernel`
    /// @return the value returned by `launch_and_time_kernel`; stays 0 when none of the
    ///         CK_EXPERIMENTAL_* descriptor-passing modes is enabled
    /// @throws std::runtime_error if C0 * Y * X is not divisible by E1, or if K, Hop, Wop,
    ///         E1 are not divisible by their per-block tile sizes
    template <typename... Wei,
              typename... In,
              typename... Add,
              typename... Out,
              typename ConvStrides,
              typename ConvDilations,
              typename InLeftPads,
              typename InRightPads>
    __host__ float Run(const ck::TensorDescriptor<Wei...>& wei_k_c0_y_x_c1_global_desc,
                       const ck::TensorDescriptor<In...>& in_n_c0_hi_wi_c1_global_desc,
                       const ck::TensorDescriptor<Out...>& out_n_k0_ho_wo_k1_global_desc,
                       const ck::TensorDescriptor<Add...>& add_n_k0_hox2_wox2_k1_global_desc,
                       const ConvStrides& conv_strides,
                       const ConvDilations& conv_dilations,
                       const InLeftPads& in_left_pads,
                       const InRightPads& in_right_pads,
                       const FloatAB* __restrict__ p_a_grid,
                       const FloatAB* __restrict__ p_b_grid,
                       const FloatC* __restrict__ p_bias_grid,
                       FloatC* __restrict__ p_d_grid,
                       const int nrepeat) const
    {
        using namespace ck;

        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};
        constexpr auto I4 = Number<4>{};

        const auto N  = in_n_c0_hi_wi_c1_global_desc.GetLength(I0);
        const auto C0 = in_n_c0_hi_wi_c1_global_desc.GetLength(I1);
        const auto Hi = in_n_c0_hi_wi_c1_global_desc.GetLength(I2);
        const auto Wi = in_n_c0_hi_wi_c1_global_desc.GetLength(I3);
        // const auto C1 = in_n_c0_hi_wi_c1_global_desc.GetLength(I4);

        const auto K0 = out_n_k0_ho_wo_k1_global_desc.GetLength(I1);
        const auto Ho = out_n_k0_ho_wo_k1_global_desc.GetLength(I2);
        const auto Wo = out_n_k0_ho_wo_k1_global_desc.GetLength(I3);
        const auto K1 = out_n_k0_ho_wo_k1_global_desc.GetLength(I4);

        const auto Hox2 = add_n_k0_hox2_wox2_k1_global_desc.GetLength(I2);
        const auto Wox2 = add_n_k0_hox2_wox2_k1_global_desc.GetLength(I3);

        const auto K = wei_k_c0_y_x_c1_global_desc.GetLength(I0);
        const auto Y = wei_k_c0_y_x_c1_global_desc.GetLength(I2);
        const auto X = wei_k_c0_y_x_c1_global_desc.GetLength(I3);

        const auto ConvStrideH = conv_strides[I0];
        const auto ConvStrideW = conv_strides[I1];

        const auto ConvDilationH = conv_dilations[I0];
        const auto ConvDilationW = conv_dilations[I1];

        // Round Ho / Wo up to a whole number of block tiles; the difference becomes right
        // padding on the output, and twice that on the add tensor.
#if CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
        const auto Hop = Number<(Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock>{};
        const auto Wop = Number<(Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock>{};

        const auto OutRightPadH = Hop - Ho;
        const auto OutRightPadW = Wop - Wo;

        const auto OutRightPadHx = Number<OutRightPadH * 2>{};
        const auto OutRightPadWx = Number<OutRightPadW * 2>{};
#else
        const auto Hop = (Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock;
        const auto Wop = (Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock;

        const auto OutRightPadH = Hop - Ho;
        const auto OutRightPadW = Wop - Wo;

        const auto OutRightPadHx = OutRightPadH * 2;
        const auto OutRightPadWx = OutRightPadW * 2;
#endif

        const auto InLeftPadH = in_left_pads[I0];
        const auto InLeftPadW = in_left_pads[I1];

        // The input right pad grows by the extra output rows / columns times the stride.
        const auto InRightPadH = in_right_pads[I0] + OutRightPadH * ConvStrideH;
        const auto InRightPadW = in_right_pads[I1] + OutRightPadW * ConvStrideW;

        const auto E = C0 * Y * X;

        constexpr auto E1 = Number<E1_>{};
        constexpr auto E2 = Number<E2_>{};
        constexpr auto K2 = Number<K2_>{};

        // E is split into (E0, E1) below. A truncating division would make the unmerged
        // view cover only E0 * E1 < E elements, so reject such sizes up front.
        if((E % E1) != 0)
        {
            throw std::runtime_error("wrong! E (C0 * Y * X) is not divisible by E1");
        }

        const auto E0 = E / E1;

        // weight tensor
        const auto a_e_k_e2_grid_desc = transform_tensor_descriptor(
            make_naive_tensor_descriptor_packed(make_tuple(K, C0 * Y * X, E2)),
            make_tuple(make_pass_through_transform(K),
                       make_pass_through_transform(C0 * Y * X),
                       make_pass_through_transform(E2)),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
            make_tuple(Sequence<1>{}, Sequence<0>{}, Sequence<2>{}));

        const auto a_e0_e1_k_e2_grid_desc =
            transform_tensor_descriptor(a_e_k_e2_grid_desc,
                                        make_tuple(make_unmerge_transform(make_tuple(E0, E1)),
                                                   make_pass_through_transform(K),
                                                   make_pass_through_transform(E2)),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
                                        make_tuple(Sequence<0, 1>{}, Sequence<2>{}, Sequence<3>{}));

        // input tensor
        const auto in_n_c0_hip_wip_e2_global_desc = transform_tensor_descriptor(
            make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi, E2)),
            make_tuple(make_pass_through_transform(N),
                       make_pass_through_transform(C0),
                       make_pad_transform(Hi, InLeftPadH, InRightPadH),
                       make_pad_transform(Wi, InLeftPadW, InRightPadW),
                       make_pass_through_transform(E2)),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));

        const auto in_n_c0_y_ho_x_wo_e2_global_desc = transform_tensor_descriptor(
            in_n_c0_hip_wip_e2_global_desc,
            make_tuple(
                make_pass_through_transform(N),
                make_pass_through_transform(C0),
                make_embed_transform(make_tuple(Y, Hop), make_tuple(ConvDilationH, ConvStrideH)),
                make_embed_transform(make_tuple(X, Wop), make_tuple(ConvDilationW, ConvStrideW)),
                make_pass_through_transform(E2)),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
            make_tuple(
                Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}, Sequence<6>{}));

        const auto in_e_n_ho_wo_e2_grid_desc = transform_tensor_descriptor(
            in_n_c0_y_ho_x_wo_e2_global_desc,
            make_tuple(make_merge_transform(make_tuple(C0, Y, X)),
                       make_pass_through_transform(N),
                       make_pass_through_transform(Hop),
                       make_pass_through_transform(Wop),
                       make_pass_through_transform(E2)),
            make_tuple(
                Sequence<1, 2, 4>{}, Sequence<0>{}, Sequence<3>{}, Sequence<5>{}, Sequence<6>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));

        const auto b_e0_e1_n_ho_wo_e2_grid_desc = transform_tensor_descriptor(
            in_e_n_ho_wo_e2_grid_desc,
            make_tuple(make_unmerge_transform(make_tuple(E0, E1)),
                       make_pass_through_transform(N),
                       make_pass_through_transform(Hop),
                       make_pass_through_transform(Wop),
                       make_pass_through_transform(E2)),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
            make_tuple(
                Sequence<0, 1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}, Sequence<5>{}));

        // output tensor
        const auto c_k_n_hop_wop_grid_desc = transform_tensor_descriptor(
            make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1)),
            make_tuple(make_merge_transform(make_tuple(K0, K1)),
                       make_pass_through_transform(N),
                       make_pad_transform(Ho, I0, OutRightPadH),
                       make_pad_transform(Wo, I0, OutRightPadW)),
            make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));

        // add tensor
        const auto d_k_n_hopx2_wopx2_grid_desc = transform_tensor_descriptor(
            make_naive_tensor_descriptor_packed(make_tuple(N, K0, Hox2, Wox2, K1)),
            make_tuple(make_merge_transform(make_tuple(K0, K1)),
                       make_pass_through_transform(N),
                       make_pad_transform(Hox2, I0, OutRightPadHx),
                       make_pad_transform(Wox2, I0, OutRightPadWx)),
            make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));

        std::cerr << "Hop = " << Hop << " Wop = " << Wop << std::endl;

        if(!((K % KPerBlock) == 0 && (Hop % HoPerBlock) == 0 && (Wop % WoPerBlock) == 0 &&
             (E1 % E1PerBlock) == 0))
        {
            throw std::runtime_error("wrong! GEMM size no divisible");
        }

        // clang-format off
        // The step-hack tuples below repeat a handful of distinct sequences many times.
        // The aliases name each distinct sequence once; the tuple types are unchanged.

        // 11 entries, all zero (A tensor)
        using AStepHackNone = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>;

        // 27 entries (B tensor): all zero, or a single 1 / 2 at index 11
        using BStepHackNone = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                       0,
                                       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>;
        using BStepHackOne  = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                       1,
                                       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>;
        using BStepHackTwo  = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                       2,
                                       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>;

        // 9 entries (C and D tensors): all zero, or a single 1 / 2 at index 1
        using CDStepHackNone = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>;
        using CDStepHackOne  = Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>;
        using CDStepHackTwo  = Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>;

        // hack to control index calculation when iterating over a_e0_e1_k_e2_global tensor
        constexpr auto a_e0_e1_k_e2_global_step_hacks =
            make_tuple(make_tuple(AStepHackNone{},
                                  AStepHackNone{},
                                  AStepHackNone{},
                                  AStepHackNone{},
                                  AStepHackNone{}),
                       make_tuple(AStepHackNone{},
                                  AStepHackNone{},
                                  AStepHackNone{},
                                  AStepHackNone{},
                                  AStepHackNone{}));

        constexpr auto a_e0_e1_k_e2_global_move_slice_window_step_hack = AStepHackNone{};

        // hack to control index calculation when iterating over b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global tensor
        constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks =
            make_tuple(make_tuple(BStepHackOne{},
                                  BStepHackOne{},
                                  BStepHackNone{},
                                  BStepHackNone{},
                                  BStepHackNone{},
                                  BStepHackNone{},
                                  BStepHackNone{},
                                  BStepHackNone{},
                                  BStepHackNone{},
                                  BStepHackNone{}),
                       make_tuple(BStepHackTwo{},
                                  BStepHackTwo{},
                                  BStepHackNone{},
                                  BStepHackNone{},
                                  BStepHackNone{},
                                  BStepHackNone{},
                                  BStepHackNone{},
                                  BStepHackNone{},
                                  BStepHackNone{},
                                  BStepHackNone{})
                       );

        constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack =
            BStepHackOne{};

        // hack to control index calculation when iterating over c_k0_k1_n_h0_h1_h2_w0_w1_w2_global tensor
        constexpr auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks =
            make_tuple(make_tuple(CDStepHackOne{},
                                  CDStepHackOne{},
                                  CDStepHackNone{},
                                  CDStepHackNone{},
                                  CDStepHackNone{},
                                  CDStepHackNone{},
                                  CDStepHackNone{},
                                  CDStepHackNone{},
                                  CDStepHackNone{}),
                       make_tuple(CDStepHackTwo{},
                                  CDStepHackTwo{},
                                  CDStepHackNone{},
                                  CDStepHackNone{},
                                  CDStepHackNone{},
                                  CDStepHackNone{},
                                  CDStepHackNone{},
                                  CDStepHackNone{},
                                  CDStepHackNone{}));

        constexpr auto d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_global_tensor_step_hacks =
            make_tuple(make_tuple(CDStepHackOne{},
                                  CDStepHackOne{},
                                  CDStepHackNone{},
                                  CDStepHackNone{},
                                  CDStepHackNone{},
                                  CDStepHackNone{},
                                  CDStepHackNone{},
                                  CDStepHackNone{},
                                  CDStepHackNone{}),
                       make_tuple(CDStepHackTwo{},
                                  CDStepHackTwo{},
                                  CDStepHackNone{},
                                  CDStepHackNone{},
                                  CDStepHackNone{},
                                  CDStepHackNone{},
                                  CDStepHackNone{},
                                  CDStepHackNone{},
                                  CDStepHackNone{}));
        // clang-format on

        // GEMM
        using GridwiseGemm = GridwiseGemmDlops_km_kn_mn_v3<
            BlockSize,
            FloatAB,
            FloatAcc,
            FloatC,
            InMemoryDataOperationEnum_t::Set,
            decltype(a_e0_e1_k_e2_grid_desc),
            decltype(b_e0_e1_n_ho_wo_e2_grid_desc),
            decltype(c_k_n_hop_wop_grid_desc),
            decltype(d_k_n_hopx2_wopx2_grid_desc),
            E1,
            E2,
            K2,
            KPerBlock,
            HoPerBlock,
            WoPerBlock,
            E1PerBlock,
            KPerThread,
            HoPerThread,
            WoPerThread,
            EPerThread,
            ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
            ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
            Sequence<2, 3, 0, 1, 4>,
            Sequence<0, 1, 2, 3, 4>,
            4,
            ABlockTransferSrcScalarPerVector_E2,
            ABlockTransferDstScalarPerVector_E2,
            false, // don't move back src coordinate after threadwise copy
            Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9>, // E0, E1, N, H0, H1, H2, W0, W1, W2, E2
            9,
            BThreadTransferSrcScalarPerVector_E2,
            false, // don't move back src coordinate after threadwise copy, which will be fused with
                   // MoveSrcSliceWindow() to save addr computation
            Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8>, // K0, K1, N, H0, H1, I2, H2, W0, W1, I2, W2
            1,
            CThreadTransferDstScalarPerVector_K,
            decltype(a_e0_e1_k_e2_global_step_hacks),
            decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks),
            decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks),
            decltype(d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_global_tensor_step_hacks),
            decltype(a_e0_e1_k_e2_global_move_slice_window_step_hack),
            decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack)>;

        // Descriptors in the layout the gridwise GEMM consumes.
        const auto a_e0_e1_k0_k1_e2_grid_desc =
            GridwiseGemm::MakeAE0E1K0K1E2GridDescriptor(a_e0_e1_k_e2_grid_desc);
        const auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc =
            GridwiseGemm::MakeBE0E1NH0H1H2W0W1W2E2GridDescriptor(b_e0_e1_n_ho_wo_e2_grid_desc);
        const auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc =
            GridwiseGemm::MakeCK0K1NH0H1H2W0W1W2GridDescriptor(c_k_n_hop_wop_grid_desc);
        const auto d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc =
            GridwiseGemm::MakeDK0K1NH0H1HxW0W1WxGridDescriptorResizeAdd(
                d_k_n_hopx2_wopx2_grid_desc);

        using AGridDesc_E0_E1_K0_K1_E2 = decltype(a_e0_e1_k0_k1_e2_grid_desc);
        using BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2 =
            decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc);
        using CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2 = decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);
        using DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2 =
            decltype(d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc);

        // One workgroup per (K tile, Ho tile, Wo tile, batch element).
        const auto grid_size = (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N;

        const bool has_main_e0_block_loop = E0 > 1;

        std::cerr << "has_main_e0_block_loop = " << has_main_e0_block_loop << std::endl;

        const auto c_blockid_to_k_n_h_w_block_cluster_adaptor =
            GridwiseGemm::MakeCBlockIdToKNHoWoBlockClusterAdaptor(c_k_n_hop_wop_grid_desc);

        using CBlockIdToBlockClusterAdaptor_K_N_H_W =
            decltype(c_blockid_to_k_n_h_w_block_cluster_adaptor);

        float ave_time = 0;

#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
        // Descriptors are passed to the kernel by value. The boolean template argument
        // mirrors has_main_e0_block_loop, hence the two instantiations.
        if(has_main_e0_block_loop)
        {
            const auto kernel = kernel_gemm_dlops_v3_resize_add<
                GridwiseGemm,
                FloatAB,
                FloatC,
                remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
                remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
                remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
                remove_reference_t<DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2>,
                remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
                true,
                activ_type>;

            ave_time = launch_and_time_kernel(kernel,
                                              nrepeat,
                                              dim3(grid_size),
                                              dim3(BlockSize),
                                              0,
                                              p_a_grid,
                                              p_b_grid,
                                              p_bias_grid,
                                              p_d_grid,
                                              a_e0_e1_k0_k1_e2_grid_desc,
                                              b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
                                              c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
                                              d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc,
                                              c_blockid_to_k_n_h_w_block_cluster_adaptor);
        }
        else
        {
            const auto kernel = kernel_gemm_dlops_v3_resize_add<
                GridwiseGemm,
                FloatAB,
                FloatC,
                remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
                remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
                remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
                remove_reference_t<DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2>,
                remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
                false,
                activ_type>;

            ave_time = launch_and_time_kernel(kernel,
                                              nrepeat,
                                              dim3(grid_size),
                                              dim3(BlockSize),
                                              0,
                                              p_a_grid,
                                              p_b_grid,
                                              p_bias_grid,
                                              p_d_grid,
                                              a_e0_e1_k0_k1_e2_grid_desc,
                                              b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
                                              c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
                                              d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc,
                                              c_blockid_to_k_n_h_w_block_cluster_adaptor);
        }

#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
        // Descriptors are copied into device buffers and passed to the kernel as pointers.
        DeviceMem a_e0_e1_k0_k1_e2_grid_desc_dev_buf(sizeof(AGridDesc_E0_E1_K0_K1_E2));
        DeviceMem b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf(
            sizeof(BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2));
        DeviceMem c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf(
            sizeof(CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2));
        DeviceMem d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc_dev_buf(
            sizeof(DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2));
        DeviceMem c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf(
            sizeof(CBlockIdToBlockClusterAdaptor_K_N_H_W));

        a_e0_e1_k0_k1_e2_grid_desc_dev_buf.ToDevice(&a_e0_e1_k0_k1_e2_grid_desc);
        b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf.ToDevice(
            &b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc);
        c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf.ToDevice(
            &c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);
        d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc_dev_buf.ToDevice(
            &d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc);
        c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.ToDevice(
            &c_blockid_to_k_n_h_w_block_cluster_adaptor);

        if(has_main_e0_block_loop)
        {
            const auto kernel = kernel_gemm_dlops_v3_resize_add<
                GridwiseGemm,
                FloatAB,
                FloatC,
                remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
                remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
                remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
                remove_reference_t<DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2>,
                remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
                true,
                activ_type>;

            ave_time = launch_and_time_kernel(
                kernel,
                nrepeat,
                dim3(grid_size),
                dim3(BlockSize),
                0,
                p_a_grid,
                p_b_grid,
                p_bias_grid,
                p_d_grid,
                cast_pointer_to_constant_address_space(
                    a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()),
                cast_pointer_to_constant_address_space(
                    b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf.GetDeviceBuffer()),
                cast_pointer_to_constant_address_space(
                    c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf.GetDeviceBuffer()),
                cast_pointer_to_constant_address_space(
                    d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc_dev_buf.GetDeviceBuffer()),
                cast_pointer_to_constant_address_space(
                    c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
        }
        else
        {
            const auto kernel = kernel_gemm_dlops_v3_resize_add<
                GridwiseGemm,
                FloatAB,
                FloatC,
                remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
                remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
                remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
                remove_reference_t<DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2>,
                remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
                false,
                activ_type>;

            ave_time = launch_and_time_kernel(
                kernel,
                nrepeat,
                dim3(grid_size),
                dim3(BlockSize),
                0,
                p_a_grid,
                p_b_grid,
                p_bias_grid,
                p_d_grid,
                cast_pointer_to_constant_address_space(
                    a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()),
                cast_pointer_to_constant_address_space(
                    b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf.GetDeviceBuffer()),
                cast_pointer_to_constant_address_space(
                    c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf.GetDeviceBuffer()),
                cast_pointer_to_constant_address_space(
                    d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc_dev_buf.GetDeviceBuffer()),
                cast_pointer_to_constant_address_space(
                    c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
        }
#elif CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
        {
            // Only the data pointers are passed to the kernel in this mode, so every
            // descriptor has to be a compile-time constant.
            // NOTE(review): the first assert checks a_e0_e1_k_e2_grid_desc, not the
            // a_e0_e1_k0_k1_e2_grid_desc whose type is given to the kernel - confirm intent.
            static_assert(a_e0_e1_k_e2_grid_desc.IsKnownAtCompileTime(), "");
            static_assert(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.IsKnownAtCompileTime(), "");
            static_assert(d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc.IsKnownAtCompileTime(), "");
            static_assert(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.IsKnownAtCompileTime(), "");
            static_assert(c_blockid_to_k_n_h_w_block_cluster_adaptor.IsKnownAtCompileTime(), "");

            const auto kernel = kernel_gemm_dlops_v3_resize_add<
                GridwiseGemm,
                FloatAB,
                FloatC,
                remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
                remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
                remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
                remove_reference_t<DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2>,
                remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
                has_main_e0_block_loop,
                activ_type>;

            ave_time = launch_and_time_kernel(kernel,
                                              nrepeat,
                                              dim3(grid_size),
                                              dim3(BlockSize),
                                              0,
                                              p_a_grid,
                                              p_b_grid,
                                              p_bias_grid,
                                              p_d_grid);
        }
#endif
        return ave_time;
    }
};
#endif
host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
0 → 100644
View file @
a037693f
#ifndef DRIVER_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NC0HWc1_KC0YXC1_NK0HWK1_HPP
#define DRIVER_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NC0HWc1_KC0YXC1_NK0HWK1_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_dlops_v3.hpp"
// Host-side driver for the v5r1 "dlops" implicit-GEMM forward convolution.
//
// The tensor descriptors handed to Run() are only used for their lengths; the
// driver rebuilds packed descriptors from them, rounds the output height/width
// up to a multiple of HoPerBlock/WoPerBlock ("outpad"), derives the A/B/C GEMM
// descriptors, and launches kernel_gemm_dlops_v3 through launch_and_time_kernel.
//
// E1_, E2_ and K2_ are wrapped into Number<> and forwarded to the gridwise GEMM;
// the remaining template parameters are forwarded to it unchanged.
template <ck::index_t BlockSize,
          typename FloatAB,
          typename FloatAcc,
          typename FloatC,
          ck::index_t E1_,
          ck::index_t E2_,
          ck::index_t K2_,
          ck::index_t KPerBlock,
          ck::index_t HoPerBlock,
          ck::index_t WoPerBlock,
          ck::index_t E1PerBlock,
          ck::index_t KPerThread,
          ck::index_t HoPerThread,
          ck::index_t WoPerThread,
          ck::index_t EPerThread,
          typename ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
          typename ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
          ck::index_t ABlockTransferSrcScalarPerVector_E2,
          ck::index_t ABlockTransferDstScalarPerVector_E2,
          ck::index_t BThreadTransferSrcScalarPerVector_E2,
          ck::index_t CThreadTransferDstScalarPerVector_K,
          ck::ActivTypeEnum_t activ_type>
struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_outpad
{
    // Builds the GEMM descriptors, launches the kernel and returns the value
    // produced by launch_and_time_kernel (0 if none of the
    // CK_EXPERIMENTAL_* launch modes is enabled).
    //
    // wei/in/out descriptors : only GetLength() is queried on them.
    // conv_strides/dilations : indexed with I0 (H) and I1 (W).
    // in_left_pads/right_pads: indexed with I0 (H) and I1 (W); the right pad is
    //                          enlarged to cover the rounded-up output extent.
    // p_a_grid / p_b_grid    : A and B operands of the GEMM (presumably weight
    //                          and input respectively -- confirm against caller).
    // p_bias_grid / p_c_grid : bias input and output buffer, passed to the kernel.
    // nrepeat                : forwarded to launch_and_time_kernel.
    //
    // Throws std::runtime_error when K, Hop, Wop, E1 or E are not divisible by
    // their respective tile sizes. Writes diagnostics to std::cerr.
    template <typename... Wei,
              typename... In,
              typename... Out,
              typename ConvStrides,
              typename ConvDilations,
              typename InLeftPads,
              typename InRightPads>
    __host__ float Run(const ck::TensorDescriptor<Wei...>& wei_k_c0_y_x_c1_global_desc,
                       const ck::TensorDescriptor<In...>& in_n_c0_hi_wi_c1_global_desc,
                       const ck::TensorDescriptor<Out...>& out_n_k0_ho_wo_k1_global_desc,
                       const ConvStrides& conv_strides,
                       const ConvDilations& conv_dilations,
                       const InLeftPads& in_left_pads,
                       const InRightPads& in_right_pads,
                       const FloatAB* __restrict__ p_a_grid,
                       const FloatAB* __restrict__ p_b_grid,
                       const FloatC* __restrict__ p_bias_grid,
                       FloatC* __restrict__ p_c_grid,
                       const int nrepeat) const
    {
        using namespace ck;

        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};
        constexpr auto I4 = Number<4>{};

        // ---- problem sizes -------------------------------------------------
        const auto N  = in_n_c0_hi_wi_c1_global_desc.GetLength(I0);
        const auto C0 = in_n_c0_hi_wi_c1_global_desc.GetLength(I1);
        const auto Hi = in_n_c0_hi_wi_c1_global_desc.GetLength(I2);
        const auto Wi = in_n_c0_hi_wi_c1_global_desc.GetLength(I3);
        // const auto C1 = in_n_c0_hi_wi_c1_global_desc.GetLength(I4);

        const auto K0 = out_n_k0_ho_wo_k1_global_desc.GetLength(I1);
        const auto Ho = out_n_k0_ho_wo_k1_global_desc.GetLength(I2);
        const auto Wo = out_n_k0_ho_wo_k1_global_desc.GetLength(I3);
        const auto K1 = out_n_k0_ho_wo_k1_global_desc.GetLength(I4);

        const auto K = wei_k_c0_y_x_c1_global_desc.GetLength(I0);
        const auto Y = wei_k_c0_y_x_c1_global_desc.GetLength(I2);
        const auto X = wei_k_c0_y_x_c1_global_desc.GetLength(I3);

        const auto ConvStrideH = conv_strides[I0];
        const auto ConvStrideW = conv_strides[I1];

        const auto ConvDilationH = conv_dilations[I0];
        const auto ConvDilationW = conv_dilations[I1];

        // Output extent rounded up to a whole number of block tiles.
#if CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
        const auto Hop = Number<(Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock>{};
        const auto Wop = Number<(Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock>{};
#else
        const auto Hop = (Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock;
        const auto Wop = (Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock;
#endif

        const auto OutRightPadH = Hop - Ho;
        const auto OutRightPadW = Wop - Wo;

        const auto InLeftPadH = in_left_pads[I0];
        const auto InLeftPadW = in_left_pads[I1];

        // The extra output rows/columns need extra (padded) input to read from.
        const auto InRightPadH = in_right_pads[I0] + OutRightPadH * ConvStrideH;
        const auto InRightPadW = in_right_pads[I1] + OutRightPadW * ConvStrideW;

        const auto E = C0 * Y * X;

        constexpr auto E1 = Number<E1_>{};
        constexpr auto E2 = Number<E2_>{};
        constexpr auto K2 = Number<K2_>{};

        // Truncating division; divisibility of E by E1 is validated below.
        const auto E0 = E / E1;

        // ---- weight tensor -------------------------------------------------
        const auto a_e_k_e2_grid_desc = transform_tensor_descriptor(
            make_naive_tensor_descriptor_packed(make_tuple(K, C0 * Y * X, E2)),
            make_tuple(make_pass_through_transform(K),
                       make_pass_through_transform(C0 * Y * X),
                       make_pass_through_transform(E2)),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
            make_tuple(Sequence<1>{}, Sequence<0>{}, Sequence<2>{}));

        const auto a_e0_e1_k_e2_grid_desc = transform_tensor_descriptor(
            a_e_k_e2_grid_desc,
            make_tuple(make_unmerge_transform(make_tuple(E0, E1)),
                       make_pass_through_transform(K),
                       make_pass_through_transform(E2)),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
            make_tuple(Sequence<0, 1>{}, Sequence<2>{}, Sequence<3>{}));

        // ---- input tensor --------------------------------------------------
        const auto in_n_c0_hip_wip_e2_global_desc = transform_tensor_descriptor(
            make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi, E2)),
            make_tuple(make_pass_through_transform(N),
                       make_pass_through_transform(C0),
                       make_pad_transform(Hi, InLeftPadH, InRightPadH),
                       make_pad_transform(Wi, InLeftPadW, InRightPadW),
                       make_pass_through_transform(E2)),
            make_tuple(
                Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
            make_tuple(
                Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));

        const auto in_n_c0_y_ho_x_wo_e2_global_desc = transform_tensor_descriptor(
            in_n_c0_hip_wip_e2_global_desc,
            make_tuple(
                make_pass_through_transform(N),
                make_pass_through_transform(C0),
                make_embed_transform(make_tuple(Y, Hop), make_tuple(ConvDilationH, ConvStrideH)),
                make_embed_transform(make_tuple(X, Wop), make_tuple(ConvDilationW, ConvStrideW)),
                make_pass_through_transform(E2)),
            make_tuple(
                Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
            make_tuple(Sequence<0>{},
                       Sequence<1>{},
                       Sequence<2, 3>{},
                       Sequence<4, 5>{},
                       Sequence<6>{}));

        const auto in_e_n_ho_wo_e2_grid_desc = transform_tensor_descriptor(
            in_n_c0_y_ho_x_wo_e2_global_desc,
            make_tuple(make_merge_transform(make_tuple(C0, Y, X)),
                       make_pass_through_transform(N),
                       make_pass_through_transform(Hop),
                       make_pass_through_transform(Wop),
                       make_pass_through_transform(E2)),
            make_tuple(Sequence<1, 2, 4>{},
                       Sequence<0>{},
                       Sequence<3>{},
                       Sequence<5>{},
                       Sequence<6>{}),
            make_tuple(
                Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));

        const auto b_e0_e1_n_ho_wo_e2_grid_desc = transform_tensor_descriptor(
            in_e_n_ho_wo_e2_grid_desc,
            make_tuple(make_unmerge_transform(make_tuple(E0, E1)),
                       make_pass_through_transform(N),
                       make_pass_through_transform(Hop),
                       make_pass_through_transform(Wop),
                       make_pass_through_transform(E2)),
            make_tuple(
                Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
            make_tuple(Sequence<0, 1>{},
                       Sequence<2>{},
                       Sequence<3>{},
                       Sequence<4>{},
                       Sequence<5>{}));

        // ---- output tensor -------------------------------------------------
        const auto c_k_n_hop_wop_grid_desc = transform_tensor_descriptor(
            make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1)),
            make_tuple(make_merge_transform(make_tuple(K0, K1)),
                       make_pass_through_transform(N),
                       make_pad_transform(Ho, I0, OutRightPadH),
                       make_pad_transform(Wo, I0, OutRightPadW)),
            make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));

        std::cerr << "Hop = " << Hop << " Wop = " << Wop << std::endl;

        // E must split exactly into E0 x E1; otherwise the unmerge transforms above
        // would describe fewer than E reduction elements and the tail would be
        // silently ignored.
        if(!((K % KPerBlock) == 0 && (Hop % HoPerBlock) == 0 && (Wop % WoPerBlock) == 0 &&
             (E1 % E1PerBlock) == 0 && (E % E1) == 0))
        {
            throw std::runtime_error("wrong! GEMM size no divisible");
        }

        // clang-format off
        // Shorthands for the Sequence<> types that make up the step hacks.
        // 11 entries, all zero (A descriptor hacks).
        using AHackZero = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>;

        // 27 entries (B descriptor hacks): 11 zeros, one selector entry, 15 zeros.
        using BHackZero = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                   0,
                                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>;
        using BHackOne  = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                   1,
                                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>;
        using BHackTwo  = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                   2,
                                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>;

        // 9 entries (C descriptor hacks): selector entry at index 1.
        using CHackZero = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>;
        using CHackOne  = Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>;
        using CHackTwo  = Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>;

        // hack to control index calculation when iterating over a_e0_e1_k_e2_global tensor
        constexpr auto a_e0_e1_k_e2_global_step_hacks =
            make_tuple(make_tuple(AHackZero{}, AHackZero{}, AHackZero{}, AHackZero{}, AHackZero{}),
                       make_tuple(AHackZero{}, AHackZero{}, AHackZero{}, AHackZero{}, AHackZero{}));

        constexpr auto a_e0_e1_k_e2_global_move_slice_window_step_hack = AHackZero{};

        // hack to control index calculation when iterating over b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global tensor
        constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks =
            make_tuple(make_tuple(BHackOne{},  BHackOne{},
                                  BHackZero{}, BHackZero{}, BHackZero{}, BHackZero{},
                                  BHackZero{}, BHackZero{}, BHackZero{}, BHackZero{}),
                       make_tuple(BHackTwo{},  BHackTwo{},
                                  BHackZero{}, BHackZero{}, BHackZero{}, BHackZero{},
                                  BHackZero{}, BHackZero{}, BHackZero{}, BHackZero{})
                       );

        constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack = BHackOne{};

        // hack to control index calculation when iterating over c_k0_k1_n_h0_h1_h2_w0_w1_w2_global tensor
        constexpr auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks =
            make_tuple(make_tuple(CHackOne{},  CHackOne{},
                                  CHackZero{}, CHackZero{}, CHackZero{}, CHackZero{},
                                  CHackZero{}, CHackZero{}, CHackZero{}),
                       make_tuple(CHackTwo{},  CHackTwo{},
                                  CHackZero{}, CHackZero{}, CHackZero{}, CHackZero{},
                                  CHackZero{}, CHackZero{}, CHackZero{}));
        // clang-format on

        // GEMM
        using GridwiseGemm = GridwiseGemmDlops_km_kn_mn_v3<
            BlockSize,
            FloatAB,
            FloatAcc,
            FloatC,
            InMemoryDataOperationEnum_t::Set,
            decltype(a_e0_e1_k_e2_grid_desc),
            decltype(b_e0_e1_n_ho_wo_e2_grid_desc),
            decltype(c_k_n_hop_wop_grid_desc),
            decltype(c_k_n_hop_wop_grid_desc),
            E1,
            E2,
            K2,
            KPerBlock,
            HoPerBlock,
            WoPerBlock,
            E1PerBlock,
            KPerThread,
            HoPerThread,
            WoPerThread,
            EPerThread,
            ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
            ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
            Sequence<2, 3, 0, 1, 4>,
            Sequence<0, 1, 2, 3, 4>,
            4,
            ABlockTransferSrcScalarPerVector_E2,
            ABlockTransferDstScalarPerVector_E2,
            false, // don't move back src coordinate after threadwise copy
            Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9>, // E0, E1, N, H0, H1, H2, W0, W1, W2, E2
            9,
            BThreadTransferSrcScalarPerVector_E2,
            false, // don't move back src coordinate after threadwise copy, which will be fused
                   // with MoveSrcSliceWindow() to save addr computation
            Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8>, // K0, K1, N, H0, H1, H2, W0, W1, W2
            1,
            CThreadTransferDstScalarPerVector_K,
            decltype(a_e0_e1_k_e2_global_step_hacks),
            decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks),
            decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks),
            decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks),
            decltype(a_e0_e1_k_e2_global_move_slice_window_step_hack),
            decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack)>;

        // Final descriptors in the layout expected by the kernel.
        const auto a_e0_e1_k0_k1_e2_grid_desc =
            GridwiseGemm::MakeAE0E1K0K1E2GridDescriptor(a_e0_e1_k_e2_grid_desc);
        const auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc =
            GridwiseGemm::MakeBE0E1NH0H1H2W0W1W2E2GridDescriptor(b_e0_e1_n_ho_wo_e2_grid_desc);
        const auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc =
            GridwiseGemm::MakeCK0K1NH0H1H2W0W1W2GridDescriptor(c_k_n_hop_wop_grid_desc);

        using AGridDesc_E0_E1_K0_K1_E2 = decltype(a_e0_e1_k0_k1_e2_grid_desc);
        using BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2 =
            decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc);
        using CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2 = decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);

        // One workgroup per (K tile, Ho tile, Wo tile, batch element).
        const auto grid_size = (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N;

        const bool has_main_e0_block_loop = E0 > 1;

        std::cerr << "has_main_e0_block_loop = " << has_main_e0_block_loop << std::endl;

        const auto c_blockid_to_k_n_h_w_block_cluster_adaptor =
            GridwiseGemm::MakeCBlockIdToKNHoWoBlockClusterAdaptor(c_k_n_hop_wop_grid_desc);

        using CBlockIdToBlockClusterAdaptor_K_N_H_W =
            decltype(c_blockid_to_k_n_h_w_block_cluster_adaptor);

        float ave_time = 0;

#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
        // Descriptors travel as ordinary kernel arguments.
        const auto launch = [&](const auto kernel) {
            return launch_and_time_kernel(kernel,
                                          nrepeat,
                                          dim3(grid_size),
                                          dim3(BlockSize),
                                          0,
                                          p_a_grid,
                                          p_b_grid,
                                          p_bias_grid,
                                          p_c_grid,
                                          a_e0_e1_k0_k1_e2_grid_desc,
                                          b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
                                          c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
                                          c_blockid_to_k_n_h_w_block_cluster_adaptor);
        };

        if(has_main_e0_block_loop)
        {
            const auto kernel =
                kernel_gemm_dlops_v3<GridwiseGemm,
                                     FloatAB,
                                     FloatC,
                                     remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
                                     remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
                                     remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
                                     remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
                                     true,
                                     activ_type>;

            ave_time = launch(kernel);
        }
        else
        {
            const auto kernel =
                kernel_gemm_dlops_v3<GridwiseGemm,
                                     FloatAB,
                                     FloatC,
                                     remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
                                     remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
                                     remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
                                     remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
                                     false,
                                     activ_type>;

            ave_time = launch(kernel);
        }
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
        // Descriptors are copied into device buffers and passed by pointer.
        DeviceMem a_e0_e1_k0_k1_e2_grid_desc_dev_buf(sizeof(AGridDesc_E0_E1_K0_K1_E2));
        DeviceMem b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf(
            sizeof(BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2));
        DeviceMem c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf(
            sizeof(CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2));
        DeviceMem c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf(
            sizeof(CBlockIdToBlockClusterAdaptor_K_N_H_W));

        a_e0_e1_k0_k1_e2_grid_desc_dev_buf.ToDevice(&a_e0_e1_k0_k1_e2_grid_desc);
        b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf.ToDevice(
            &b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc);
        c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf.ToDevice(
            &c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);
        c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.ToDevice(
            &c_blockid_to_k_n_h_w_block_cluster_adaptor);

        const auto launch = [&](const auto kernel) {
            return launch_and_time_kernel(
                kernel,
                nrepeat,
                dim3(grid_size),
                dim3(BlockSize),
                0,
                p_a_grid,
                p_b_grid,
                p_bias_grid,
                p_c_grid,
                cast_pointer_to_constant_address_space(
                    a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()),
                cast_pointer_to_constant_address_space(
                    b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf.GetDeviceBuffer()),
                cast_pointer_to_constant_address_space(
                    c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf.GetDeviceBuffer()),
                cast_pointer_to_constant_address_space(
                    c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
        };

        if(has_main_e0_block_loop)
        {
            const auto kernel =
                kernel_gemm_dlops_v3<GridwiseGemm,
                                     FloatAB,
                                     FloatC,
                                     remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
                                     remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
                                     remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
                                     remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
                                     true,
                                     activ_type>;

            ave_time = launch(kernel);
        }
        else
        {
            const auto kernel =
                kernel_gemm_dlops_v3<GridwiseGemm,
                                     FloatAB,
                                     FloatC,
                                     remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
                                     remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
                                     remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
                                     remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
                                     false,
                                     activ_type>;

            ave_time = launch(kernel);
        }
#elif CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
        {
            // In this mode only the data pointers are passed at launch, so every
            // descriptor named in the kernel's template arguments has to be fully
            // known at compile time.
            static_assert(a_e0_e1_k0_k1_e2_grid_desc.IsKnownAtCompileTime(), "");
            static_assert(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.IsKnownAtCompileTime(), "");
            static_assert(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.IsKnownAtCompileTime(), "");
            static_assert(c_blockid_to_k_n_h_w_block_cluster_adaptor.IsKnownAtCompileTime(), "");

            const auto kernel =
                kernel_gemm_dlops_v3<GridwiseGemm,
                                     FloatAB,
                                     FloatC,
                                     remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
                                     remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
                                     remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
                                     remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
                                     has_main_e0_block_loop,
                                     activ_type>;

            ave_time = launch_and_time_kernel(kernel,
                                              nrepeat,
                                              dim3(grid_size),
                                              dim3(BlockSize),
                                              0,
                                              p_a_grid,
                                              p_b_grid,
                                              p_bias_grid,
                                              p_c_grid);
        }
#endif
        return ave_time;
    }
};
#endif
host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp
deleted
100644 → 0
View file @
0694d6ed
#ifndef DRIVER_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_NCHW_KCYX_NKHW_HPP
#define DRIVER_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_NCHW_KCYX_NKHW_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_dlops_v2.hpp"
#include "gridwise_operation_wrapper.hpp"
// Host-side benchmark driver for the v5r1 "dlops" implicit-GEMM forward
// convolution with NCHW input, KCYX weight and an N-K0-Ho-Wo-K1 output buffer.
//
// Run() derives the E x K weight, E x N x Ho x Wo input and K x N x Ho x Wo
// output GEMM descriptors, then launches the gridwise GEMM 5 x 100 times and
// prints the average time and throughput of each batch to std::cout.
template <ck::index_t BlockSize,
          typename FloatAB,
          typename FloatAcc,
          typename FloatC,
          ck::index_t KPerBlock,
          ck::index_t HoPerBlock,
          ck::index_t WoPerBlock,
          ck::index_t EPerBlock,
          ck::index_t KPerThread,
          ck::index_t HoPerThread,
          ck::index_t WoPerThread,
          ck::index_t EPerThread,
          typename ABlockTransferThreadSliceLengths_E_K,
          typename ABlockTransferThreadClusterLengths_E_K,
          ck::index_t ABlockTransferSrcScalarPerVector_E,
          ck::index_t ABlockTransferDstScalarPerVector_K,
          ck::index_t BThreadTransferSrcScalarPerVector_W,
          ck::index_t CThreadTransferDstScalarPerVector_W>
struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_pad
{
    // Builds the GEMM descriptors and repeatedly launches the kernel.
    //
    // wei/out descriptors : only GetLength() is queried on them.
    // in descriptor       : used directly as the base of the input transforms.
    // conv_strides/dilations, in_left_pads/right_pads: indexed with I0 (H), I1 (W).
    // p_wei_global / p_in_global / p_out_global: buffers handed to the kernel.
    //
    // Throws std::runtime_error when K, Ho, Wo or E are not divisible by their
    // per-block tile sizes.
    template <typename... Wei,
              typename... In,
              typename... Out,
              typename ConvStrides,
              typename ConvDilations,
              typename InLeftPads,
              typename InRightPads>
    __host__ void Run(const ck::TensorDescriptor<Wei...>& wei_k_c_y_x_global_desc,
                      const ck::TensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
                      const ck::TensorDescriptor<Out...>& out_n_k0_ho_wo_k1_global_desc,
                      const ConvStrides& conv_strides,
                      const ConvDilations& conv_dilations,
                      const InLeftPads& in_left_pads,
                      const InRightPads& in_right_pads,
                      const FloatAB* __restrict__ p_wei_global,
                      const FloatAB* __restrict__ p_in_global,
                      FloatC* __restrict__ p_out_global) const
    {
        using namespace ck;

        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};
        constexpr auto I4 = Number<4>{};

        // ---- problem sizes -------------------------------------------------
        const auto N  = in_n_c_hi_wi_global_desc.GetLength(I0);
        const auto C  = in_n_c_hi_wi_global_desc.GetLength(I1);
        const auto K0 = out_n_k0_ho_wo_k1_global_desc.GetLength(I1);

        const auto Hi = in_n_c_hi_wi_global_desc.GetLength(I2);
        const auto Wi = in_n_c_hi_wi_global_desc.GetLength(I3);

        const auto Ho = out_n_k0_ho_wo_k1_global_desc.GetLength(I2);
        const auto Wo = out_n_k0_ho_wo_k1_global_desc.GetLength(I3);

        const auto K1 = out_n_k0_ho_wo_k1_global_desc.GetLength(I4);

        const auto K = wei_k_c_y_x_global_desc.GetLength(I0);
        const auto Y = wei_k_c_y_x_global_desc.GetLength(I2);
        const auto X = wei_k_c_y_x_global_desc.GetLength(I3);

        const auto ConvStrideH = conv_strides[I0];
        const auto ConvStrideW = conv_strides[I1];

        const auto ConvDilationH = conv_dilations[I0];
        const auto ConvDilationW = conv_dilations[I1];

        const auto InLeftPadH = in_left_pads[I0];
        const auto InLeftPadW = in_left_pads[I1];

        const auto InRightPadH = in_right_pads[I0];
        const auto InRightPadW = in_right_pads[I1];

        // ---- weight tensor -------------------------------------------------
        const auto wei_e_k_global_desc = transform_tensor_descriptor(
            make_naive_tensor_descriptor_packed(make_tuple(K, C * Y * X)),
            make_tuple(make_pass_through_transform(K), make_pass_through_transform(C * Y * X)),
            make_tuple(Sequence<0>{}, Sequence<1>{}),
            make_tuple(Sequence<1>{}, Sequence<0>{}));

        // ---- input tensor --------------------------------------------------
        const auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor(
            in_n_c_hi_wi_global_desc,
            make_tuple(make_pass_through_transform(N),
                       make_pass_through_transform(C),
                       make_pad_transform(Hi, InLeftPadH, InRightPadH),
                       make_pad_transform(Wi, InLeftPadW, InRightPadW)),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));

        const auto in_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor(
            in_n_c_hip_wip_global_desc,
            make_tuple(
                make_pass_through_transform(N),
                make_pass_through_transform(C),
                make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
                make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW))),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));

        const auto in_e_n_ho_wo_global_desc = transform_tensor_descriptor(
            in_n_c_y_ho_x_wo_global_desc,
            make_tuple(make_merge_transform(make_tuple(C, Y, X)),
                       make_pass_through_transform(N),
                       make_pass_through_transform(Ho),
                       make_pass_through_transform(Wo)),
            make_tuple(Sequence<1, 2, 4>{}, Sequence<0>{}, Sequence<3>{}, Sequence<5>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));

        // ---- output tensor -------------------------------------------------
        const auto out_k_n_ho_wo_global_desc = transform_tensor_descriptor(
            make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1)),
            make_tuple(make_merge_transform(make_tuple(K0, K1)),
                       make_pass_through_transform(N),
                       make_pass_through_transform(Ho),
                       make_pass_through_transform(Wo)),
            make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));

        const auto E = C * Y * X;

        if(!((K % KPerBlock) == 0 && (Ho % HoPerBlock) == 0 && (Wo % WoPerBlock) == 0 &&
             (E % EPerBlock) == 0))
        {
            throw std::runtime_error("wrong! GEMM size no divisible");
        }

        // Shorthands for the Sequence<> types that make up the step hacks.
        using AHackZero = Sequence<0, 0, 0>;
        // 13 entries: 9 zeros, one selector entry, 3 zeros.
        using BHackZero = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>;
        using BHackOne  = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>;
        using BHackTwo  = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>;
        // 5 entries: selector entry at index 1.
        using CHackZero = Sequence<0, 0, 0, 0, 0>;
        using CHackOne  = Sequence<0, 1, 0, 0, 0>;
        using CHackTwo  = Sequence<0, 2, 0, 0, 0>;

        // hack to control index calculation when iterating over a_e_k_global tensor
        constexpr auto a_e_k_global_step_hacks = make_tuple(
            make_tuple(AHackZero{}, AHackZero{}), make_tuple(AHackZero{}, AHackZero{}));

        constexpr auto a_e_k_global_move_slice_window_step_hack = AHackZero{};

        // hack to control index calculation when iterating over b_e_n_ho_wo_global tensor
        constexpr auto b_e_n_ho_wo_global_step_hacks =
            make_tuple(make_tuple(BHackOne{}, BHackZero{}, BHackZero{}, BHackZero{}),
                       make_tuple(BHackTwo{}, BHackZero{}, BHackZero{}, BHackZero{}));

        constexpr auto b_e_n_ho_wo_global_move_slice_window_step_hack = BHackOne{};

        // hack to control index calculation when iterating over c_k_n_ho_wo_global tensor
        // hack for NKHW format
        constexpr auto c_k_n_ho_wo_global_tensor_step_hacks =
            make_tuple(make_tuple(CHackOne{}, CHackZero{}, CHackZero{}, CHackZero{}),
                       make_tuple(CHackTwo{}, CHackZero{}, CHackZero{}, CHackZero{}));

        // GEMM
        using gridwise_gemm = GridwiseGemmDlops_km_kn_mn_v3<
            BlockSize,
            FloatAB,
            FloatAcc,
            FloatC,
            InMemoryDataOperationEnum_t::Set,
            decltype(wei_e_k_global_desc),
            decltype(in_e_n_ho_wo_global_desc),
            decltype(out_k_n_ho_wo_global_desc),
            KPerBlock,
            HoPerBlock,
            WoPerBlock,
            EPerBlock,
            KPerThread,
            HoPerThread,
            WoPerThread,
            EPerThread,
            ABlockTransferThreadSliceLengths_E_K,
            ABlockTransferThreadClusterLengths_E_K,
            Sequence<1, 0>,
            Sequence<1, 0>,
            0,
            ABlockTransferSrcScalarPerVector_E,
            ABlockTransferDstScalarPerVector_K,
            false, // don't move back src coordinate after threadwise copy
            Sequence<0, 2, 3, 1>,
            3,
            BThreadTransferSrcScalarPerVector_W,
            false, // don't move back src coordinate after threadwise copy, which will be fused
                   // with MoveSrcSliceWindow() to save addr computation
            Sequence<0, 2, 3, 1>,
            0,
            CThreadTransferDstScalarPerVector_W,
            decltype(a_e_k_global_step_hacks),
            decltype(b_e_n_ho_wo_global_step_hacks),
            decltype(c_k_n_ho_wo_global_tensor_step_hacks),
            decltype(a_e_k_global_move_slice_window_step_hack),
            decltype(b_e_n_ho_wo_global_move_slice_window_step_hack)>;

        // One workgroup per (K tile, Ho tile, Wo tile, batch element).
        const auto GridSize = (K / KPerBlock) * (Ho / HoPerBlock) * (Wo / WoPerBlock) * N;

        const bool has_main_k_block_loop = (E + EPerBlock) / (2 * EPerBlock) > 1;

        const bool has_double_tail_k_block_loop = (E / EPerBlock) % 2 == 0;

        using TrueFlag  = integral_constant<bool, true>;
        using FalseFlag = integral_constant<bool, false>;

        // Instantiates the kernel variant selected by the two compile-time flags
        // and launches it once. The flag objects are also passed as the trailing
        // kernel arguments.
        const auto launch_variant = [&](auto has_main_loop, auto has_double_tail) {
            const auto kernel = run_gridwise_operation<gridwise_gemm,
                                                       decltype(wei_e_k_global_desc),
                                                       const FloatAB*,
                                                       decltype(in_e_n_ho_wo_global_desc),
                                                       const FloatAB*,
                                                       decltype(out_k_n_ho_wo_global_desc),
                                                       FloatC*,
                                                       decltype(has_main_loop),
                                                       decltype(has_double_tail)>;

            launch_kernel(kernel,
                          dim3(GridSize),
                          dim3(BlockSize),
                          0,
                          wei_e_k_global_desc,
                          p_wei_global,
                          in_e_n_ho_wo_global_desc,
                          p_in_global,
                          out_k_n_ho_wo_global_desc,
                          p_out_global,
                          has_main_loop,
                          has_double_tail);
        };

        index_t nrepeat = 100;

        for(index_t i = 0; i < 5; ++i)
        {
            std::cout << "Start running " << nrepeat << " times..." << std::endl;

            // Emit diagnostics before the timer starts so console I/O is not
            // included in the measured interval.
            std::cout << "has_main_k_block_loop: " << has_main_k_block_loop
                      << " has_double_tail_k_block_loop: " << has_double_tail_k_block_loop
                      << std::endl;

            KernelTimer timer;
            timer.Start();

            for(index_t j = 0; j < nrepeat; ++j)
            {
                if(has_main_k_block_loop && has_double_tail_k_block_loop)
                {
                    launch_variant(TrueFlag{}, TrueFlag{});
                }
                else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
                {
                    launch_variant(TrueFlag{}, FalseFlag{});
                }
                else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
                {
                    launch_variant(FalseFlag{}, TrueFlag{});
                }
                else
                {
                    launch_variant(FalseFlag{}, FalseFlag{});
                }
            }

            timer.End();

            float ave_time = timer.GetElapsedTime() / nrepeat;

            // flops / 1e9 / time; this is TFlop/s provided GetElapsedTime() reports
            // milliseconds, as the label printed below suggests.
            float perf =
                static_cast<float>(calculate_convolution_flops(in_n_c_hi_wi_global_desc,
                                                               wei_k_c_y_x_global_desc,
                                                               out_n_k0_ho_wo_k1_global_desc)) /
                (std::size_t(1000) * 1000 * 1000) / ave_time;

            std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
                      << std::endl;
        }
    }
};
#endif
host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp
deleted
100644 → 0
View file @
0694d6ed
#ifndef DRIVER_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NCHW_KCYX_NKHW_OUTPAD_HPP
#define DRIVER_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NCHW_KCYX_NKHW_OUTPAD_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_dlops_v2.hpp"
#include "gridwise_operation_wrapper.hpp"
// Host-side driver for the v5r1 DLOPS implicit-GEMM forward convolution with output padding.
//
// The output height/width (Ho, Wo) are rounded up to multiples of HoPerBlock/WoPerBlock
// (Hop, Wop). The extra output rows/columns are expressed as right padding on the output
// descriptor, and the input right padding is enlarged by (output padding * conv stride) so
// that the padded output positions map onto padded input positions.
//
// Template parameters are forwarded unchanged to GridwiseGemmDlops_km_kn_mn_v3.
template <ck::index_t BlockSize,
          typename FloatAB,
          typename FloatAcc,
          typename FloatC,
          ck::index_t KPerBlock,
          ck::index_t HoPerBlock,
          ck::index_t WoPerBlock,
          ck::index_t EPerBlock,
          ck::index_t KPerThread,
          ck::index_t HoPerThread,
          ck::index_t WoPerThread,
          ck::index_t EPerThread,
          typename ABlockTransferThreadSliceLengths_E_K,
          typename ABlockTransferThreadClusterLengths_E_K,
          ck::index_t ABlockTransferSrcScalarPerVector_E,
          ck::index_t ABlockTransferDstScalarPerVector_K,
          ck::index_t BThreadTransferSrcScalarPerVector_W,
          ck::index_t CThreadTransferDstScalarPerVector_W>
struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outpad
{
    // Builds the GEMM views of the weight / input / output tensors, then launches the
    // gridwise GEMM kernel in 5 rounds of 100 launches each, printing the average time and
    // throughput of every round to std::cout. Diagnostic padding values go to std::cerr.
    //
    // wei_k_c_y_x_global_desc        weight descriptor; lengths 0, 2, 3 are read as K, Y, X
    // in_n_c_hi_wi_global_desc       input descriptor; lengths 0..3 are read as N, C, Hi, Wi
    // out_n_k0_ho_wo_k1_global_desc  output descriptor; lengths 1..4 are read as K0, Ho, Wo, K1
    // conv_strides / conv_dilations  indexed with I0 (H) and I1 (W)
    // in_left_pads / in_right_pads   indexed with I0 (H) and I1 (W)
    // p_wei_global / p_in_global / p_out_global
    //                                pointers handed straight to the kernel launch
    //
    // Throws std::runtime_error when K, Hop, Wop or E is not divisible by the corresponding
    // per-block tile size.
    template <typename... Wei,
              typename... In,
              typename... Out,
              typename ConvStrides,
              typename ConvDilations,
              typename InLeftPads,
              typename InRightPads>
    __host__ void Run(const ck::TensorDescriptor<Wei...>& wei_k_c_y_x_global_desc,
                      const ck::TensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
                      const ck::TensorDescriptor<Out...>& out_n_k0_ho_wo_k1_global_desc,
                      const ConvStrides& conv_strides,
                      const ConvDilations& conv_dilations,
                      const InLeftPads& in_left_pads,
                      const InRightPads& in_right_pads,
                      const FloatAB* __restrict__ p_wei_global,
                      const FloatAB* __restrict__ p_in_global,
                      FloatC* __restrict__ p_out_global) const
    {
        using namespace ck;

        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};
        constexpr auto I4 = Number<4>{};

        // problem sizes
        const auto N  = in_n_c_hi_wi_global_desc.GetLength(I0);
        const auto C  = in_n_c_hi_wi_global_desc.GetLength(I1);
        const auto K0 = out_n_k0_ho_wo_k1_global_desc.GetLength(I1);

        const auto Hi = in_n_c_hi_wi_global_desc.GetLength(I2);
        const auto Wi = in_n_c_hi_wi_global_desc.GetLength(I3);

        const auto Ho = out_n_k0_ho_wo_k1_global_desc.GetLength(I2);
        const auto Wo = out_n_k0_ho_wo_k1_global_desc.GetLength(I3);
        const auto K1 = out_n_k0_ho_wo_k1_global_desc.GetLength(I4);

        const auto K = wei_k_c_y_x_global_desc.GetLength(I0);
        const auto Y = wei_k_c_y_x_global_desc.GetLength(I2);
        const auto X = wei_k_c_y_x_global_desc.GetLength(I3);

        const auto ConvStrideH = conv_strides[I0];
        const auto ConvStrideW = conv_strides[I1];

        const auto ConvDilationH = conv_dilations[I0];
        const auto ConvDilationW = conv_dilations[I1];

        // output extents rounded up to a whole number of block tiles
        const auto Hop = (Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock;
        const auto Wop = (Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock;

        const auto OutRightPadH = Hop - Ho;
        const auto OutRightPadW = Wop - Wo;

        const auto InLeftPadH = in_left_pads[I0];
        const auto InLeftPadW = in_left_pads[I1];

        // every padded output row/column needs ConvStride extra input rows/columns
        const auto InRightPadH = in_right_pads[I0] + OutRightPadH * ConvStrideH;
        const auto InRightPadW = in_right_pads[I1] + OutRightPadW * ConvStrideW;

        std::cerr << "OutRightPadH = " << OutRightPadH << " OutRightPadW = " << OutRightPadW
                  << std::endl;
        std::cerr << "InRightPadH = " << InRightPadH << " InRightPadW = " << InRightPadW
                  << std::endl;

        // weight tensor
        const auto wei_e_k_global_desc = transform_tensor_descriptor(
            make_naive_tensor_descriptor_packed(make_tuple(K, C * Y * X)),
            make_tuple(make_pass_through_transform(K), make_pass_through_transform(C * Y * X)),
            make_tuple(Sequence<0>{}, Sequence<1>{}),
            make_tuple(Sequence<1>{}, Sequence<0>{}));

        // input tensor
        const auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor(
            in_n_c_hi_wi_global_desc,
            make_tuple(make_pass_through_transform(N),
                       make_pass_through_transform(C),
                       make_pad_transform(Hi, InLeftPadH, InRightPadH),
                       make_pad_transform(Wi, InLeftPadW, InRightPadW)),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));

        const auto in_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor(
            in_n_c_hip_wip_global_desc,
            make_tuple(
                make_pass_through_transform(N),
                make_pass_through_transform(C),
                make_embed_transform(make_tuple(Y, Hop), make_tuple(ConvDilationH, ConvStrideH)),
                make_embed_transform(make_tuple(X, Wop), make_tuple(ConvDilationW, ConvStrideW))),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));

        const auto in_e_n_ho_wo_global_desc = transform_tensor_descriptor(
            in_n_c_y_ho_x_wo_global_desc,
            make_tuple(make_merge_transform(make_tuple(C, Y, X)),
                       make_pass_through_transform(N),
                       make_pass_through_transform(Hop),
                       make_pass_through_transform(Wop)),
            make_tuple(Sequence<1, 2, 4>{}, Sequence<0>{}, Sequence<3>{}, Sequence<5>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));

        // output tensor
        const auto out_k_n_hop_wop_global_desc = transform_tensor_descriptor(
            make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1)),
            make_tuple(make_merge_transform(make_tuple(K0, K1)),
                       make_pass_through_transform(N),
                       make_pad_transform(Ho, 0, OutRightPadH),
                       make_pad_transform(Wo, 0, OutRightPadW)),
            make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));

        const auto E = C * Y * X;

        std::cerr << "Hop = " << Hop << " Wop = " << Wop << std::endl;

        // Hop and Wop are multiples of their tile sizes by construction; K and E are not.
        if(!((K % KPerBlock) == 0 && (Hop % HoPerBlock) == 0 && (Wop % WoPerBlock) == 0 &&
             (E % EPerBlock) == 0))
        {
            throw std::runtime_error("wrong! GEMM size no divisible");
        }

        // hack to control index calculation when iterating over a_k_m_global tensor
        constexpr auto a_e_k_global_step_hacks =
            make_tuple(make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}),
                       make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}));

        constexpr auto a_e_k_global_move_slice_window_step_hack = Sequence<0, 0, 0>{};

        constexpr auto b_e_n_ho_wo_global_step_hacks =
            make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
                       make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}));

        constexpr auto b_e_n_ho_wo_global_move_slice_window_step_hack =
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{};

        // hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor
        // hack for NKHW format
        constexpr auto c_k_n_ho_wo_global_tensor_step_hacks =
            make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{}),
                       make_tuple(Sequence<0, 2, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{}));

        // descriptor types, named once so the GEMM and kernel instantiations stay in sync
        using WeiGemmDesc = decltype(wei_e_k_global_desc);
        using InGemmDesc  = decltype(in_e_n_ho_wo_global_desc);
        using OutGemmDesc = decltype(out_k_n_hop_wop_global_desc);

        // GEMM
        using gridwise_gemm = GridwiseGemmDlops_km_kn_mn_v3<
            BlockSize,
            FloatAB,
            FloatAcc,
            FloatC,
            InMemoryDataOperationEnum_t::Set,
            WeiGemmDesc,
            InGemmDesc,
            OutGemmDesc,
            KPerBlock,
            HoPerBlock,
            WoPerBlock,
            EPerBlock,
            KPerThread,
            HoPerThread,
            WoPerThread,
            EPerThread,
            ABlockTransferThreadSliceLengths_E_K,
            ABlockTransferThreadClusterLengths_E_K,
            Sequence<1, 0>,
            Sequence<1, 0>,
            0,
            ABlockTransferSrcScalarPerVector_E,
            ABlockTransferDstScalarPerVector_K,
            false, // don't move back src coordinate after threadwise copy
            Sequence<0, 2, 3, 1>,
            3,
            BThreadTransferSrcScalarPerVector_W,
            false, // don't move back src coordinate after threadwise copy, which will be fused
                   // with MoveSrcSliceWindow() to save addr computation
            Sequence<0, 2, 3, 1>,
            0,
            CThreadTransferDstScalarPerVector_W,
            decltype(a_e_k_global_step_hacks),
            decltype(b_e_n_ho_wo_global_step_hacks),
            decltype(c_k_n_ho_wo_global_tensor_step_hacks),
            decltype(a_e_k_global_move_slice_window_step_hack),
            decltype(b_e_n_ho_wo_global_move_slice_window_step_hack)>;

        const auto GridSize = (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N;

        const bool has_main_k_block_loop = (E + EPerBlock) / (2 * EPerBlock) > 1;

        const bool has_double_tail_k_block_loop = (E / EPerBlock) % 2 == 0;

        using TrueTag  = integral_constant<bool, true>;
        using FalseTag = integral_constant<bool, false>;

        // Instantiates and launches the kernel for one (main-loop, double-tail) combination.
        // The two tags select the kernel specialization at compile time and are also passed
        // through as the trailing kernel arguments.
        const auto launch_gemm = [&](auto main_loop_tag, auto double_tail_tag) {
            const auto kernel = run_gridwise_operation<gridwise_gemm,
                                                       WeiGemmDesc,
                                                       const FloatAB*,
                                                       InGemmDesc,
                                                       const FloatAB*,
                                                       OutGemmDesc,
                                                       FloatC*,
                                                       decltype(main_loop_tag),
                                                       decltype(double_tail_tag)>;

            launch_kernel(kernel,
                          dim3(GridSize),
                          dim3(BlockSize),
                          0,
                          wei_e_k_global_desc,
                          p_wei_global,
                          in_e_n_ho_wo_global_desc,
                          p_in_global,
                          out_k_n_hop_wop_global_desc,
                          p_out_global,
                          main_loop_tag,
                          double_tail_tag);
        };

        index_t nrepeat = 100;

        for(index_t i = 0; i < 5; ++i)
        {
            std::cout << "Start running " << nrepeat << " times..." << std::endl;

            // Print before the timer starts so host-side I/O is not counted in the
            // measured kernel time.
            std::cout << "has_main_k_block_loop: " << has_main_k_block_loop
                      << " has_double_tail_k_block_loop: " << has_double_tail_k_block_loop
                      << std::endl;

            KernelTimer timer;
            timer.Start();

            for(index_t j = 0; j < nrepeat; ++j)
            {
                if(has_main_k_block_loop && has_double_tail_k_block_loop)
                {
                    launch_gemm(TrueTag{}, TrueTag{});
                }
                else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
                {
                    launch_gemm(TrueTag{}, FalseTag{});
                }
                else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
                {
                    launch_gemm(FalseTag{}, TrueTag{});
                }
                else
                {
                    launch_gemm(FalseTag{}, FalseTag{});
                }
            }

            timer.End();

            float ave_time = timer.GetElapsedTime() / nrepeat;

            // flops / 1e9 / time; this is TFlop/s provided GetElapsedTime() reports
            // milliseconds, as the printed label suggests (TODO confirm against KernelTimer).
            float perf =
                static_cast<float>(calculate_convolution_flops(in_n_c_hi_wi_global_desc,
                                                               wei_k_c_y_x_global_desc,
                                                               out_n_k0_ho_wo_k1_global_desc)) /
                (std::size_t(1000) * 1000 * 1000) / ave_time;

            std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
                      << std::endl;
        }
    }
};
...
>&
wei_k_c_y_x_global_desc
,
const
ck
::
TensorDescriptor
<
In
...
>&
in_n_c_hi_wi_global_desc
,
const
ck
::
TensorDescriptor
<
Out
...
>&
out_n_k0_ho_wo_k1_global_desc
,
const
ConvStrides
&
conv_strides
,
const
ConvDilations
&
conv_dilations
,
const
InLeftPads
&
in_left_pads
,
const
InRightPads
&
in_right_pads
,
const
FloatAB
*
__restrict__
p_wei_global
,
const
FloatAB
*
__restrict__
p_in_global
,
FloatC
*
__restrict__
p_out_global
)
const
{
using
namespace
ck
;
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
I4
=
Number
<
4
>
{};
const
auto
N
=
in_n_c_hi_wi_global_desc
.
GetLength
(
I0
);
const
auto
C
=
in_n_c_hi_wi_global_desc
.
GetLength
(
I1
);
const
auto
K0
=
out_n_k0_ho_wo_k1_global_desc
.
GetLength
(
I1
);
const
auto
Hi
=
in_n_c_hi_wi_global_desc
.
GetLength
(
I2
);
const
auto
Wi
=
in_n_c_hi_wi_global_desc
.
GetLength
(
I3
);
const
auto
Ho
=
out_n_k0_ho_wo_k1_global_desc
.
GetLength
(
I2
);
const
auto
Wo
=
out_n_k0_ho_wo_k1_global_desc
.
GetLength
(
I3
);
const
auto
K1
=
out_n_k0_ho_wo_k1_global_desc
.
GetLength
(
I4
);
const
auto
K
=
wei_k_c_y_x_global_desc
.
GetLength
(
I0
);
const
auto
Y
=
wei_k_c_y_x_global_desc
.
GetLength
(
I2
);
const
auto
X
=
wei_k_c_y_x_global_desc
.
GetLength
(
I3
);
const
auto
ConvStrideH
=
conv_strides
[
I0
];
const
auto
ConvStrideW
=
conv_strides
[
I1
];
const
auto
ConvDilationH
=
conv_dilations
[
I0
];
const
auto
ConvDilationW
=
conv_dilations
[
I1
];
const
auto
Hop
=
(
Ho
+
HoPerBlock
-
1
)
/
HoPerBlock
*
HoPerBlock
;
const
auto
Wop
=
(
Wo
+
WoPerBlock
-
1
)
/
WoPerBlock
*
WoPerBlock
;
const
auto
OutRightPadH
=
Hop
-
Ho
;
const
auto
OutRightPadW
=
Wop
-
Wo
;
const
auto
InLeftPadH
=
in_left_pads
[
I0
];
const
auto
InLeftPadW
=
in_left_pads
[
I1
];
const
auto
InRightPadH
=
in_right_pads
[
I0
]
+
OutRightPadH
*
ConvStrideH
;
const
auto
InRightPadW
=
in_right_pads
[
I1
]
+
OutRightPadW
*
ConvStrideW
;
std
::
cerr
<<
"OutRightPadH = "
<<
OutRightPadH
<<
" OutRightPadW = "
<<
OutRightPadW
<<
std
::
endl
;
std
::
cerr
<<
"InRightPadH = "
<<
InRightPadH
<<
" InRightPadW = "
<<
InRightPadW
<<
std
::
endl
;
// weight tensor
const
auto
wei_e_k_global_desc
=
transform_tensor_descriptor
(
make_naive_tensor_descriptor_packed
(
make_tuple
(
K
,
C
*
Y
*
X
)),
make_tuple
(
make_pass_through_transform
(
K
),
make_pass_through_transform
(
C
*
Y
*
X
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}),
make_tuple
(
Sequence
<
1
>
{},
Sequence
<
0
>
{}));
// input tensor
const
auto
in_n_c_hip_wip_global_desc
=
transform_tensor_descriptor
(
in_n_c_hi_wi_global_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_pass_through_transform
(
C
),
make_pad_transform
(
Hi
,
InLeftPadH
,
InRightPadH
),
make_pad_transform
(
Wi
,
InLeftPadW
,
InRightPadW
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}));
const
auto
in_n_c_y_ho_x_wo_global_desc
=
transform_tensor_descriptor
(
in_n_c_hip_wip_global_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_pass_through_transform
(
C
),
make_embed_transform
(
make_tuple
(
Y
,
Hop
),
make_tuple
(
ConvDilationH
,
ConvStrideH
)),
make_embed_transform
(
make_tuple
(
X
,
Wop
),
make_tuple
(
ConvDilationW
,
ConvStrideW
))),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
,
3
>
{},
Sequence
<
4
,
5
>
{}));
const
auto
in_e_n_ho_wo_global_desc
=
transform_tensor_descriptor
(
in_n_c_y_ho_x_wo_global_desc
,
make_tuple
(
make_merge_transform
(
make_tuple
(
C
,
Y
,
X
)),
make_pass_through_transform
(
N
),
make_pass_through_transform
(
Hop
),
make_pass_through_transform
(
Wop
)),
make_tuple
(
Sequence
<
1
,
2
,
4
>
{},
Sequence
<
0
>
{},
Sequence
<
3
>
{},
Sequence
<
5
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}));
// output tensor
const
auto
out_k_n_hop_wop_global_desc
=
transform_tensor_descriptor
(
make_naive_tensor_descriptor_packed
(
make_tuple
(
N
,
K0
,
Ho
,
Wo
,
K1
)),
make_tuple
(
make_merge_transform
(
make_tuple
(
K0
,
K1
)),
make_pass_through_transform
(
N
),
make_pad_transform
(
Ho
,
0
,
OutRightPadH
),
make_pad_transform
(
Wo
,
0
,
OutRightPadW
)),
make_tuple
(
Sequence
<
1
,
4
>
{},
Sequence
<
0
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}));
const
auto
E
=
C
*
Y
*
X
;
std
::
cerr
<<
"Hop = "
<<
Hop
<<
" Wop = "
<<
Wop
<<
std
::
endl
;
if
(
!
((
K
%
KPerBlock
)
==
0
&&
(
Hop
%
HoPerBlock
)
==
0
&&
(
Wop
%
WoPerBlock
)
==
0
&&
(
E
%
EPerBlock
)
==
0
))
{
throw
std
::
runtime_error
(
"wrong! GEMM size no divisible"
);
}
// hack to control index calculation when iterating over a_k_m_global tensor
constexpr
auto
a_e_k_global_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
>
{}),
make_tuple
(
Sequence
<
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
>
{}));
constexpr
auto
a_e_k_global_move_slice_window_step_hack
=
Sequence
<
0
,
0
,
0
>
{};
constexpr
auto
b_e_n_ho_wo_global_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}),
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
2
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}));
constexpr
auto
b_e_n_ho_wo_global_move_slice_window_step_hack
=
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
0
,
0
,
0
>
{};
// hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor
// hack for NKHW format
constexpr
auto
c_k_n_ho_wo_global_tensor_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
1
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{}),
make_tuple
(
Sequence
<
0
,
2
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{}));
// GEMM
using
gridwise_gemm
=
GridwiseGemmDlops_km_kn_mn_v3
<
BlockSize
,
FloatAB
,
FloatAcc
,
FloatC
,
InMemoryDataOperationEnum_t
::
Set
,
decltype
(
wei_e_k_global_desc
),
decltype
(
in_e_n_ho_wo_global_desc
),
decltype
(
out_k_n_hop_wop_global_desc
),
KPerBlock
,
HoPerBlock
,
WoPerBlock
,
EPerBlock
,
KPerThread
,
HoPerThread
,
WoPerThread
,
EPerThread
,
ABlockTransferThreadSliceLengths_E_K
,
ABlockTransferThreadClusterLengths_E_K
,
Sequence
<
1
,
0
>
,
Sequence
<
1
,
0
>
,
0
,
ABlockTransferSrcScalarPerVector_E
,
ABlockTransferDstScalarPerVector_K
,
false
,
// don't move back src coordinate after threadwise copy
Sequence
<
0
,
2
,
3
,
1
>
,
3
,
BThreadTransferSrcScalarPerVector_W
,
false
,
// don't move back src coordinate after threadwise copy, which will be fused with
// MoveSrcSliceWindow() to save addr computation
Sequence
<
0
,
2
,
3
,
1
>
,
0
,
CThreadTransferDstScalarPerVector_W
,
decltype
(
a_e_k_global_step_hacks
),
decltype
(
b_e_n_ho_wo_global_step_hacks
),
decltype
(
c_k_n_ho_wo_global_tensor_step_hacks
),
decltype
(
a_e_k_global_move_slice_window_step_hack
),
decltype
(
b_e_n_ho_wo_global_move_slice_window_step_hack
)
>
;
const
auto
GridSize
=
(
K
/
KPerBlock
)
*
(
Hop
/
HoPerBlock
)
*
(
Wop
/
WoPerBlock
)
*
N
;
const
bool
has_main_k_block_loop
=
(
E
+
EPerBlock
)
/
(
2
*
EPerBlock
)
>
1
;
const
bool
has_double_tail_k_block_loop
=
(
E
/
EPerBlock
)
%
2
==
0
;
index_t
nrepeat
=
100
;
for
(
index_t
i
=
0
;
i
<
5
;
++
i
)
{
std
::
cout
<<
"Start running "
<<
nrepeat
<<
" times..."
<<
std
::
endl
;
KernelTimer
timer
;
timer
.
Start
();
std
::
cout
<<
"has_main_k_block_loop: "
<<
has_main_k_block_loop
<<
" has_double_tail_k_block_loop: "
<<
has_double_tail_k_block_loop
<<
std
::
endl
;
for
(
index_t
j
=
0
;
j
<
nrepeat
;
++
j
)
{
if
(
has_main_k_block_loop
&&
has_double_tail_k_block_loop
)
{
const
auto
kernel
=
run_gridwise_operation
<
gridwise_gemm
,
decltype
(
wei_e_k_global_desc
),
const
FloatAB
*
,
decltype
(
in_e_n_ho_wo_global_desc
),
const
FloatAB
*
,
decltype
(
out_k_n_hop_wop_global_desc
),
FloatC
*
,
integral_constant
<
bool
,
true
>
,
integral_constant
<
bool
,
true
>>
;
launch_kernel
(
kernel
,
dim3
(
GridSize
),
dim3
(
BlockSize
),
0
,
wei_e_k_global_desc
,
p_wei_global
,
in_e_n_ho_wo_global_desc
,
p_in_global
,
out_k_n_hop_wop_global_desc
,
p_out_global
,
integral_constant
<
bool
,
true
>
{},
integral_constant
<
bool
,
true
>
{});
}
else
if
(
has_main_k_block_loop
&&
!
has_double_tail_k_block_loop
)
{
const
auto
kernel
=
run_gridwise_operation
<
gridwise_gemm
,
decltype
(
wei_e_k_global_desc
),
const
FloatAB
*
,
decltype
(
in_e_n_ho_wo_global_desc
),
const
FloatAB
*
,
decltype
(
out_k_n_hop_wop_global_desc
),
FloatC
*
,
integral_constant
<
bool
,
true
>
,
integral_constant
<
bool
,
false
>>
;
launch_kernel
(
kernel
,
dim3
(
GridSize
),
dim3
(
BlockSize
),
0
,
wei_e_k_global_desc
,
p_wei_global
,
in_e_n_ho_wo_global_desc
,
p_in_global
,
out_k_n_hop_wop_global_desc
,
p_out_global
,
integral_constant
<
bool
,
true
>
{},
integral_constant
<
bool
,
false
>
{});
}
else
if
(
!
has_main_k_block_loop
&&
has_double_tail_k_block_loop
)
{
const
auto
kernel
=
run_gridwise_operation
<
gridwise_gemm
,
decltype
(
wei_e_k_global_desc
),
const
FloatAB
*
,
decltype
(
in_e_n_ho_wo_global_desc
),
const
FloatAB
*
,
decltype
(
out_k_n_hop_wop_global_desc
),
FloatC
*
,
integral_constant
<
bool
,
false
>
,
integral_constant
<
bool
,
true
>>
;
launch_kernel
(
kernel
,
dim3
(
GridSize
),
dim3
(
BlockSize
),
0
,
wei_e_k_global_desc
,
p_wei_global
,
in_e_n_ho_wo_global_desc
,
p_in_global
,
out_k_n_hop_wop_global_desc
,
p_out_global
,
integral_constant
<
bool
,
false
>
{},
integral_constant
<
bool
,
true
>
{});
}
else
{
const
auto
kernel
=
run_gridwise_operation
<
gridwise_gemm
,
decltype
(
wei_e_k_global_desc
),
const
FloatAB
*
,
decltype
(
in_e_n_ho_wo_global_desc
),
const
FloatAB
*
,
decltype
(
out_k_n_hop_wop_global_desc
),
FloatC
*
,
integral_constant
<
bool
,
false
>
,
integral_constant
<
bool
,
false
>>
;
launch_kernel
(
kernel
,
dim3
(
GridSize
),
dim3
(
BlockSize
),
0
,
wei_e_k_global_desc
,
p_wei_global
,
in_e_n_ho_wo_global_desc
,
p_in_global
,
out_k_n_hop_wop_global_desc
,
p_out_global
,
integral_constant
<
bool
,
false
>
{},
integral_constant
<
bool
,
false
>
{});
}
}
timer
.
End
();
float
ave_time
=
timer
.
GetElapsedTime
()
/
nrepeat
;
float
perf
=
static_cast
<
float
>
(
calculate_convolution_flops
(
in_n_c_hi_wi_global_desc
,
wei_k_c_y_x_global_desc
,
out_n_k0_ho_wo_k1_global_desc
))
/
(
std
::
size_t
(
1000
)
*
1000
*
1000
)
/
ave_time
;
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
<<
std
::
endl
;
}
}
};
#endif
host/driver_offline/include/driver_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
0 → 100644
View file @
a037693f
#ifndef DRIVER_CONVOLUTION_MAXPOOL_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NC0HWc1_KC0YXC1_NK0HWK1_HPP
#define DRIVER_CONVOLUTION_MAXPOOL_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NC0HWc1_KC0YXC1_NK0HWK1_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_dlops_v3.hpp"
template
<
ck
::
index_t
BlockSize
,
typename
FloatAB
,
typename
FloatAcc
,
typename
FloatC
,
ck
::
index_t
E1_
,
ck
::
index_t
E2_
,
ck
::
index_t
K2_
,
ck
::
index_t
KPerBlock
,
ck
::
index_t
HoPerBlock
,
ck
::
index_t
WoPerBlock
,
ck
::
index_t
E1PerBlock
,
ck
::
index_t
KPerThread
,
ck
::
index_t
HoPerThread
,
ck
::
index_t
WoPerThread
,
ck
::
index_t
EPerThread
,
typename
ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2
,
typename
ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2
,
ck
::
index_t
ABlockTransferSrcScalarPerVector_E2
,
ck
::
index_t
ABlockTransferDstScalarPerVector_E2
,
ck
::
index_t
BThreadTransferSrcScalarPerVector_E2
,
ck
::
index_t
CThreadTransferDstScalarPerVector_K
,
ck
::
ActivTypeEnum_t
activ_type
>
struct
DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_maxpool
{
template
<
typename
...
Wei
,
typename
...
In
,
typename
...
MaxPool
,
typename
...
Out
,
typename
ConvStrides
,
typename
ConvDilations
,
typename
InLeftPads
,
typename
InRightPads
>
__host__
float
Run
(
const
ck
::
TensorDescriptor
<
Wei
...
>&
wei_k_c0_y_x_c1_global_desc
,
const
ck
::
TensorDescriptor
<
In
...
>&
in_n_c0_hi_wi_c1_global_desc
,
const
ck
::
TensorDescriptor
<
Out
...
>&
out_n_k0_ho_wo_k1_global_desc
,
const
ck
::
TensorDescriptor
<
MaxPool
...
>&
max_n_k0_hx_wx_k1_global_desc
,
const
ConvStrides
&
conv_strides
,
const
ConvDilations
&
conv_dilations
,
const
InLeftPads
&
in_left_pads
,
const
InRightPads
&
in_right_pads
,
const
FloatAB
*
__restrict__
p_a_grid
,
const
FloatAB
*
__restrict__
p_b_grid
,
const
FloatC
*
__restrict__
p_bias_grid
,
FloatC
*
__restrict__
p_c_grid
,
FloatC
*
__restrict__
p_d_grid
,
const
int
nrepeat
)
const
{
using
namespace
ck
;
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
I4
=
Number
<
4
>
{};
const
auto
N
=
in_n_c0_hi_wi_c1_global_desc
.
GetLength
(
I0
);
const
auto
C0
=
in_n_c0_hi_wi_c1_global_desc
.
GetLength
(
I1
);
const
auto
Hi
=
in_n_c0_hi_wi_c1_global_desc
.
GetLength
(
I2
);
const
auto
Wi
=
in_n_c0_hi_wi_c1_global_desc
.
GetLength
(
I3
);
// const auto C1 = in_n_c0_hi_wi_c1_global_desc.GetLength(I4);
const
auto
K0
=
out_n_k0_ho_wo_k1_global_desc
.
GetLength
(
I1
);
const
auto
Ho
=
out_n_k0_ho_wo_k1_global_desc
.
GetLength
(
I2
);
const
auto
Wo
=
out_n_k0_ho_wo_k1_global_desc
.
GetLength
(
I3
);
const
auto
K1
=
out_n_k0_ho_wo_k1_global_desc
.
GetLength
(
I4
);
const
auto
Hx
=
max_n_k0_hx_wx_k1_global_desc
.
GetLength
(
I2
);
const
auto
Wx
=
max_n_k0_hx_wx_k1_global_desc
.
GetLength
(
I3
);
const
auto
K
=
wei_k_c0_y_x_c1_global_desc
.
GetLength
(
I0
);
const
auto
Y
=
wei_k_c0_y_x_c1_global_desc
.
GetLength
(
I2
);
const
auto
X
=
wei_k_c0_y_x_c1_global_desc
.
GetLength
(
I3
);
const
auto
ConvStrideH
=
conv_strides
[
I0
];
const
auto
ConvStrideW
=
conv_strides
[
I1
];
const
auto
ConvDilationH
=
conv_dilations
[
I0
];
const
auto
ConvDilationW
=
conv_dilations
[
I1
];
#if CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
const
auto
Hop
=
Number
<
(
Ho
+
HoPerBlock
-
1
)
/
HoPerBlock
*
HoPerBlock
>
{};
const
auto
Wop
=
Number
<
(
Wo
+
WoPerBlock
-
1
)
/
WoPerBlock
*
WoPerBlock
>
{};
const
auto
OutRightPadH
=
Hop
-
Ho
;
const
auto
OutRightPadW
=
Wop
-
Wo
;
const
auto
OutRightPadHx
=
Number
<
OutRightPadH
/
2
>
{};
const
auto
OutRightPadWx
=
Number
<
OutRightPadW
/
2
>
{};
#else
const
auto
Hop
=
(
Ho
+
HoPerBlock
-
1
)
/
HoPerBlock
*
HoPerBlock
;
const
auto
Wop
=
(
Wo
+
WoPerBlock
-
1
)
/
WoPerBlock
*
WoPerBlock
;
const
auto
OutRightPadH
=
Hop
-
Ho
;
const
auto
OutRightPadW
=
Wop
-
Wo
;
const
auto
OutRightPadHx
=
OutRightPadH
/
2
;
const
auto
OutRightPadWx
=
OutRightPadW
/
2
;
#endif
const
auto
InLeftPadH
=
in_left_pads
[
I0
];
const
auto
InLeftPadW
=
in_left_pads
[
I1
];
const
auto
InRightPadH
=
in_right_pads
[
I0
]
+
OutRightPadH
*
ConvStrideH
;
const
auto
InRightPadW
=
in_right_pads
[
I1
]
+
OutRightPadW
*
ConvStrideW
;
const
auto
E
=
C0
*
Y
*
X
;
constexpr
auto
E1
=
Number
<
E1_
>
{};
constexpr
auto
E2
=
Number
<
E2_
>
{};
constexpr
auto
K2
=
Number
<
K2_
>
{};
const
auto
E0
=
E
/
E1
;
// weight tensor
const
auto
a_e_k_e2_grid_desc
=
transform_tensor_descriptor
(
make_naive_tensor_descriptor_packed
(
make_tuple
(
K
,
C0
*
Y
*
X
,
E2
)),
make_tuple
(
make_pass_through_transform
(
K
),
make_pass_through_transform
(
C0
*
Y
*
X
),
make_pass_through_transform
(
E2
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{}),
make_tuple
(
Sequence
<
1
>
{},
Sequence
<
0
>
{},
Sequence
<
2
>
{}));
const
auto
a_e0_e1_k_e2_grid_desc
=
transform_tensor_descriptor
(
a_e_k_e2_grid_desc
,
make_tuple
(
make_unmerge_transform
(
make_tuple
(
E0
,
E1
)),
make_pass_through_transform
(
K
),
make_pass_through_transform
(
E2
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{}),
make_tuple
(
Sequence
<
0
,
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}));
// input tensor
const
auto
in_n_c0_hip_wip_e2_global_desc
=
transform_tensor_descriptor
(
make_naive_tensor_descriptor_packed
(
make_tuple
(
N
,
C0
,
Hi
,
Wi
,
E2
)),
make_tuple
(
make_pass_through_transform
(
N
),
make_pass_through_transform
(
C0
),
make_pad_transform
(
Hi
,
InLeftPadH
,
InRightPadH
),
make_pad_transform
(
Wi
,
InLeftPadW
,
InRightPadW
),
make_pass_through_transform
(
E2
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{}));
const
auto
in_n_c0_y_ho_x_wo_e2_global_desc
=
transform_tensor_descriptor
(
in_n_c0_hip_wip_e2_global_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_pass_through_transform
(
C0
),
make_embed_transform
(
make_tuple
(
Y
,
Hop
),
make_tuple
(
ConvDilationH
,
ConvStrideH
)),
make_embed_transform
(
make_tuple
(
X
,
Wop
),
make_tuple
(
ConvDilationW
,
ConvStrideW
)),
make_pass_through_transform
(
E2
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
,
3
>
{},
Sequence
<
4
,
5
>
{},
Sequence
<
6
>
{}));
const
auto
in_e_n_ho_wo_e2_grid_desc
=
transform_tensor_descriptor
(
in_n_c0_y_ho_x_wo_e2_global_desc
,
make_tuple
(
make_merge_transform
(
make_tuple
(
C0
,
Y
,
X
)),
make_pass_through_transform
(
N
),
make_pass_through_transform
(
Hop
),
make_pass_through_transform
(
Wop
),
make_pass_through_transform
(
E2
)),
make_tuple
(
Sequence
<
1
,
2
,
4
>
{},
Sequence
<
0
>
{},
Sequence
<
3
>
{},
Sequence
<
5
>
{},
Sequence
<
6
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{}));
const
auto
b_e0_e1_n_ho_wo_e2_grid_desc
=
transform_tensor_descriptor
(
in_e_n_ho_wo_e2_grid_desc
,
make_tuple
(
make_unmerge_transform
(
make_tuple
(
E0
,
E1
)),
make_pass_through_transform
(
N
),
make_pass_through_transform
(
Hop
),
make_pass_through_transform
(
Wop
),
make_pass_through_transform
(
E2
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{}),
make_tuple
(
Sequence
<
0
,
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{},
Sequence
<
5
>
{}));
// output tensor
const
auto
c_k_n_hop_wop_grid_desc
=
transform_tensor_descriptor
(
make_naive_tensor_descriptor_packed
(
make_tuple
(
N
,
K0
,
Ho
,
Wo
,
K1
)),
make_tuple
(
make_merge_transform
(
make_tuple
(
K0
,
K1
)),
make_pass_through_transform
(
N
),
make_pad_transform
(
Ho
,
I0
,
OutRightPadH
),
make_pad_transform
(
Wo
,
I0
,
OutRightPadW
)),
make_tuple
(
Sequence
<
1
,
4
>
{},
Sequence
<
0
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}));
// max tensor
const
auto
d_k_n_hx_wx_grid_desc
=
transform_tensor_descriptor
(
make_naive_tensor_descriptor_packed
(
make_tuple
(
N
,
K0
,
Hx
,
Wx
,
K1
)),
make_tuple
(
make_merge_transform
(
make_tuple
(
K0
,
K1
)),
make_pass_through_transform
(
N
),
make_pad_transform
(
Hx
,
I0
,
OutRightPadHx
),
make_pad_transform
(
Wx
,
I0
,
OutRightPadWx
)),
make_tuple
(
Sequence
<
1
,
4
>
{},
Sequence
<
0
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}));
std
::
cerr
<<
"Hop = "
<<
Hop
<<
" Wop = "
<<
Wop
<<
std
::
endl
;
if
(
!
((
K
%
KPerBlock
)
==
0
&&
(
Hop
%
HoPerBlock
)
==
0
&&
(
Wop
%
WoPerBlock
)
==
0
&&
(
E1
%
E1PerBlock
)
==
0
))
{
throw
std
::
runtime_error
(
"wrong! GEMM size no divisible"
);
}
// clang-format off
// hack to control index calculation when iterating over a_e0_e1_k_e2_global tensor
constexpr
auto
a_e0_e1_k_e2_global_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}),
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}));
constexpr
auto
a_e0_e1_k_e2_global_move_slice_window_step_hack
=
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{};
// hack to control index calculation when iterating over b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global tensor
constexpr
auto
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}),
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
2
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
2
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{})
);
constexpr
auto
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack
=
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{};
constexpr
auto
c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
1
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
1
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}),
make_tuple
(
Sequence
<
0
,
2
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
2
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}));
constexpr
auto
d_k0_k1_n_h0_h1_hx_w0_w1_wx_global_tensor_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
1
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
1
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}),
make_tuple
(
Sequence
<
0
,
2
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
2
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}));
// clang-format on
// GEMM
using
GridwiseGemm
=
GridwiseGemmDlops_km_kn_mn_v3
<
BlockSize
,
FloatAB
,
FloatAcc
,
FloatC
,
InMemoryDataOperationEnum_t
::
Set
,
decltype
(
a_e0_e1_k_e2_grid_desc
),
decltype
(
b_e0_e1_n_ho_wo_e2_grid_desc
),
decltype
(
c_k_n_hop_wop_grid_desc
),
decltype
(
d_k_n_hx_wx_grid_desc
),
E1
,
E2
,
K2
,
KPerBlock
,
HoPerBlock
,
WoPerBlock
,
E1PerBlock
,
KPerThread
,
HoPerThread
,
WoPerThread
,
EPerThread
,
ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2
,
ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2
,
Sequence
<
2
,
3
,
0
,
1
,
4
>
,
Sequence
<
0
,
1
,
2
,
3
,
4
>
,
4
,
ABlockTransferSrcScalarPerVector_E2
,
ABlockTransferDstScalarPerVector_E2
,
false
,
// don't move back src coordinate after threadwise copy
Sequence
<
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
>
,
// E0, E1, N, H0, H1, H2, W0, W1, W2, E2
9
,
BThreadTransferSrcScalarPerVector_E2
,
false
,
// don't move back src coordinate after threadwise copy, which will be fused
// with MoveSrcSliceWindow() to save addr computation
Sequence
<
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
>
,
// K0, K1, N, H0, H1, I2, H2, W0, W1, I2, W2
1
,
CThreadTransferDstScalarPerVector_K
,
decltype
(
a_e0_e1_k_e2_global_step_hacks
),
decltype
(
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks
),
decltype
(
c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks
),
decltype
(
d_k0_k1_n_h0_h1_hx_w0_w1_wx_global_tensor_step_hacks
),
decltype
(
a_e0_e1_k_e2_global_move_slice_window_step_hack
),
decltype
(
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack
)
>
;
const
auto
a_e0_e1_k0_k1_e2_grid_desc
=
GridwiseGemm
::
MakeAE0E1K0K1E2GridDescriptor
(
a_e0_e1_k_e2_grid_desc
);
const
auto
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc
=
GridwiseGemm
::
MakeBE0E1NH0H1H2W0W1W2E2GridDescriptor
(
b_e0_e1_n_ho_wo_e2_grid_desc
);
const
auto
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc
=
GridwiseGemm
::
MakeCK0K1NH0H1H2W0W1W2GridDescriptor
(
c_k_n_hop_wop_grid_desc
);
const
auto
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc
=
GridwiseGemm
::
MakeDK0K1NH0H1HxW0W1WxGridDescriptorMaxPool
(
d_k_n_hx_wx_grid_desc
);
using
AGridDesc_E0_E1_K0_K1_E2
=
decltype
(
a_e0_e1_k0_k1_e2_grid_desc
);
using
BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2
=
decltype
(
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc
);
using
CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2
=
decltype
(
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc
);
using
DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx
=
decltype
(
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc
);
const
auto
grid_size
=
(
K
/
KPerBlock
)
*
(
Hop
/
HoPerBlock
)
*
(
Wop
/
WoPerBlock
)
*
N
;
const
bool
has_main_e0_block_loop
=
E0
>
1
;
std
::
cerr
<<
"has_main_e0_block_loop = "
<<
has_main_e0_block_loop
<<
std
::
endl
;
const
auto
c_blockid_to_k_n_h_w_block_cluster_adaptor
=
GridwiseGemm
::
MakeCBlockIdToKNHoWoBlockClusterAdaptor
(
c_k_n_hop_wop_grid_desc
);
using
CBlockIdToBlockClusterAdaptor_K_N_H_W
=
decltype
(
c_blockid_to_k_n_h_w_block_cluster_adaptor
);
float
ave_time
=
0
;
#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
if
(
has_main_e0_block_loop
)
{
const
auto
kernel
=
kernel_gemm_dlops_v3_maxpool
<
GridwiseGemm
,
FloatAB
,
FloatC
,
remove_reference_t
<
AGridDesc_E0_E1_K0_K1_E2
>
,
remove_reference_t
<
BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2
>
,
remove_reference_t
<
CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2
>
,
remove_reference_t
<
DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx
>
,
remove_reference_t
<
CBlockIdToBlockClusterAdaptor_K_N_H_W
>
,
true
,
activ_type
>
;
ave_time
=
launch_and_time_kernel
(
kernel
,
nrepeat
,
dim3
(
grid_size
),
dim3
(
BlockSize
),
0
,
p_a_grid
,
p_b_grid
,
p_bias_grid
,
p_c_grid
,
p_d_grid
,
a_e0_e1_k0_k1_e2_grid_desc
,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc
,
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc
,
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc
,
c_blockid_to_k_n_h_w_block_cluster_adaptor
);
}
else
{
const
auto
kernel
=
kernel_gemm_dlops_v3_maxpool
<
GridwiseGemm
,
FloatAB
,
FloatC
,
remove_reference_t
<
AGridDesc_E0_E1_K0_K1_E2
>
,
remove_reference_t
<
BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2
>
,
remove_reference_t
<
CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2
>
,
remove_reference_t
<
DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx
>
,
remove_reference_t
<
CBlockIdToBlockClusterAdaptor_K_N_H_W
>
,
false
,
activ_type
>
;
ave_time
=
launch_and_time_kernel
(
kernel
,
nrepeat
,
dim3
(
grid_size
),
dim3
(
BlockSize
),
0
,
p_a_grid
,
p_b_grid
,
p_bias_grid
,
p_c_grid
,
p_d_grid
,
a_e0_e1_k0_k1_e2_grid_desc
,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc
,
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc
,
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc
,
c_blockid_to_k_n_h_w_block_cluster_adaptor
);
}
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
DeviceMem
a_e0_e1_k0_k1_e2_grid_desc_dev_buf
(
sizeof
(
AGridDesc_E0_E1_K0_K1_E2
));
DeviceMem
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf
(
sizeof
(
BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2
));
DeviceMem
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf
(
sizeof
(
CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2
));
DeviceMem
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc_dev_buf
(
sizeof
(
DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx
));
DeviceMem
c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf
(
sizeof
(
CBlockIdToBlockClusterAdaptor_K_N_H_W
));
a_e0_e1_k0_k1_e2_grid_desc_dev_buf
.
ToDevice
(
&
a_e0_e1_k0_k1_e2_grid_desc
);
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf
.
ToDevice
(
&
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc
);
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf
.
ToDevice
(
&
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc
);
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc_dev_buf
.
ToDevice
(
&
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc
);
c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf
.
ToDevice
(
&
c_blockid_to_k_n_h_w_block_cluster_adaptor
);
if
(
has_main_e0_block_loop
)
{
const
auto
kernel
=
kernel_gemm_dlops_v3_maxpool
<
GridwiseGemm
,
FloatAB
,
FloatC
,
remove_reference_t
<
AGridDesc_E0_E1_K0_K1_E2
>
,
remove_reference_t
<
BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2
>
,
remove_reference_t
<
CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2
>
,
remove_reference_t
<
DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx
>
,
remove_reference_t
<
CBlockIdToBlockClusterAdaptor_K_N_H_W
>
,
true
,
activ_type
>
;
ave_time
=
launch_and_time_kernel
(
kernel
,
nrepeat
,
dim3
(
grid_size
),
dim3
(
BlockSize
),
0
,
p_a_grid
,
p_b_grid
,
p_bias_grid
,
p_c_grid
,
p_d_grid
,
cast_pointer_to_constant_address_space
(
a_e0_e1_k0_k1_e2_grid_desc_dev_buf
.
GetDeviceBuffer
()),
cast_pointer_to_constant_address_space
(
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf
.
GetDeviceBuffer
()),
cast_pointer_to_constant_address_space
(
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf
.
GetDeviceBuffer
()),
cast_pointer_to_constant_address_space
(
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc_dev_buf
.
GetDeviceBuffer
()),
cast_pointer_to_constant_address_space
(
c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf
.
GetDeviceBuffer
()));
}
else
{
const
auto
kernel
=
kernel_gemm_dlops_v3_maxpool
<
GridwiseGemm
,
FloatAB
,
FloatC
,
remove_reference_t
<
AGridDesc_E0_E1_K0_K1_E2
>
,
remove_reference_t
<
BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2
>
,
remove_reference_t
<
CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2
>
,
remove_reference_t
<
DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx
>
,
remove_reference_t
<
CBlockIdToBlockClusterAdaptor_K_N_H_W
>
,
false
,
activ_type
>
;
ave_time
=
launch_and_time_kernel
(
kernel
,
nrepeat
,
dim3
(
grid_size
),
dim3
(
BlockSize
),
0
,
p_a_grid
,
p_b_grid
,
p_bias_grid
,
p_c_grid
,
p_d_grid
,
cast_pointer_to_constant_address_space
(
a_e0_e1_k0_k1_e2_grid_desc_dev_buf
.
GetDeviceBuffer
()),
cast_pointer_to_constant_address_space
(
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf
.
GetDeviceBuffer
()),
cast_pointer_to_constant_address_space
(
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf
.
GetDeviceBuffer
()),
cast_pointer_to_constant_address_space
(
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc_dev_buf
.
GetDeviceBuffer
()),
cast_pointer_to_constant_address_space
(
c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf
.
GetDeviceBuffer
()));
}
#elif CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
{
static_assert
(
a_e0_e1_k_e2_grid_desc
.
IsKnownAtCompileTime
(),
""
);
static_assert
(
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc
.
IsKnownAtCompileTime
(),
""
);
static_assert
(
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc
.
IsKnownAtCompileTime
(),
""
);
static_assert
(
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc
.
IsKnownAtCompileTime
(),
""
);
static_assert
(
c_blockid_to_k_n_h_w_block_cluster_adaptor
.
IsKnownAtCompileTime
(),
""
);
const
auto
kernel
=
kernel_gemm_dlops_v3_maxpool
<
GridwiseGemm
,
FloatAB
,
FloatC
,
remove_reference_t
<
AGridDesc_E0_E1_K0_K1_E2
>
,
remove_reference_t
<
BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2
>
,
remove_reference_t
<
CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2
>
,
remove_reference_t
<
DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx
>
,
remove_reference_t
<
CBlockIdToBlockClusterAdaptor_K_N_H_W
>
,
has_main_e0_block_loop
,
activ_type
>
;
ave_time
=
launch_and_time_kernel
(
kernel
,
nrepeat
,
dim3
(
grid_size
),
dim3
(
BlockSize
),
0
,
p_a_grid
,
p_b_grid
,
p_bias_grid
,
p_c_grid
,
p_d_grid
);
}
#endif
return
ave_time
;
}
};
#endif
host/driver_offline/src/conv_add_fwd_driver_offline_nchwc.cpp
0 → 100644
View file @
a037693f
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "config.hpp"
#include "debug.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "conv_common.hpp"
#include "device_tensor.hpp"
#include "device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
#define USE_DYNAMIC_MODE 0
#define USE_CONV_FWD_V5R1_NCHWC 1
// Algorithms selectable through the first command-line argument of this driver.
// The numeric value is what the user passes on the command line.
enum ConvForwardAlgo
{
    V5R1NCHWC = 0
};
// Host reference for "convolution + bias + activation + add".
//
// Tensor indexing used by this routine:
//   in   (n, c0, hi, wi, c1)
//   wei  (k, c0, y, x, c1)      with k = k0 * K1 + k1, K1 = out_host length [4]
//   bias (k0, k1)
//   out_host (n, k0, ho, wo, k1)
//   add / add_host (n, k0, 2*ho + {0,1}, 2*wo + {0,1}, k1)
//
// For every output element the routine accumulates the convolution in double,
// adds the bias, applies activ(), stores the value in out_host, and then writes
// (value + add) into the 2x2 patch of add_host that corresponds to (ho, wo).
// Input taps that fall outside the input extent (padding) contribute nothing.
//
// The right-pad argument is accepted for interface symmetry but is not read.
// Work is distributed over std::thread::hardware_concurrency() threads.
template <typename TIn,
          typename TWei,
          typename TOut,
          typename ConvStrides,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads>
void host_direct_convolution_add_nchwc(const Tensor<TIn>& in,
                                       const Tensor<TWei>& wei,
                                       const Tensor<TOut>& add,
                                       const Tensor<TOut>& bias,
                                       Tensor<TOut>& add_host,
                                       Tensor<TOut>& out_host,
                                       const ConvStrides& conv_strides,
                                       const ConvDilations& conv_dilations,
                                       const InLeftPads& in_left_pads,
                                       const InRightPads&,
                                       const ck::ActivTypeEnum_t activ_type)
{
    using namespace ck;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};

    // Length vectors are looked up once instead of at every loop test.
    const auto& in_lens  = in.mDesc.GetLengths();
    const auto& wei_lens = wei.mDesc.GetLengths();
    const auto& out_lens = out_host.mDesc.GetLengths();

    auto compute_one_output = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
        // Flattened output-channel index used to address the weight tensor.
        const auto k_flat = k0 * out_lens[4] + k1;

        double acc = 0;

        for(int c0 = 0; c0 < wei_lens[1]; ++c0)
        {
            for(int y = 0; y < wei_lens[2]; ++y)
            {
                const int hi =
                    ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];

                // Whole filter row lies in the padding: nothing to accumulate.
                if(hi < 0 || hi >= in_lens[2])
                {
                    continue;
                }

                for(int x = 0; x < wei_lens[3]; ++x)
                {
                    const int wi =
                        wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];

                    // Tap lies in the horizontal padding.
                    if(wi < 0 || wi >= in_lens[3])
                    {
                        continue;
                    }

                    for(int c1 = 0; c1 < wei_lens[4]; ++c1)
                    {
                        acc += static_cast<double>(in(n, c0, hi, wi, c1)) *
                               static_cast<double>(wei(k_flat, c0, y, x, c1));
                    }
                }
            }
        }

        acc += bias(k0, k1);
        acc = activ(acc, activ_type);

        out_host(n, k0, ho, wo, k1) = acc;

        // Each output pixel feeds a 2x2 patch of the (twice as large) add tensor.
        const int hox2 = ho * 2;
        const int wox2 = wo * 2;

        for(int dh = 0; dh < 2; ++dh)
        {
            for(int dw = 0; dw < 2; ++dw)
            {
                const int h2 = hox2 + dh;
                const int w2 = wox2 + dw;

                add_host(n, k0, h2, w2, k1) = acc + add(n, k0, h2, w2, k1);
            }
        }
    };

    make_ParallelTensorFunctor(compute_one_output,
                               out_lens[0],
                               out_lens[1],
                               out_lens[2],
                               out_lens[3],
                               out_lens[4])(std::thread::hardware_concurrency());
}
// Driver entry point for the convolution + add forward test.
//
// Steps performed, in order:
//   1. read the run options (algo, do_verification, init_method, do_log, nrepeat)
//      and the problem size (from argv in dynamic mode, from compile-time
//      constants in static mode);
//   2. allocate and initialise the host tensors;
//   3. call the device wrapper when the selected algorithm is compiled in;
//   4. optionally run the host reference and compare add_host with add_device.
int main(int argc, char* argv[])
{
    using namespace ck;

    // Compile-time indices; used below to pick elements out of tuples.
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
    constexpr auto I4 = Number<4>{};
    constexpr auto I5 = Number<5>{};
    constexpr auto I6 = Number<6>{};
    constexpr auto I7 = Number<7>{};

#if USE_DYNAMIC_MODE
    // dynamic mode
    // Everything comes from the command line: program name + 22 arguments.
    if(argc != 23)
    {
        printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n");
        printf("rest: N, K0, K1, C0, C1, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
               "RightPx\n");
        exit(1);
    }

    constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;

    // Note: std::stoi throws on non-numeric input and nothing here catches it.
    const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
    const bool do_verification = std::stoi(argv[2]);
    const int init_method      = std::stoi(argv[3]);
    const bool do_log          = std::stoi(argv[4]);
    const int nrepeat          = std::stoi(argv[5]);

    const index_t N  = std::stoi(argv[6]);
    const index_t K0 = std::stoi(argv[7]);
    const index_t K1 = std::stoi(argv[8]);
    const index_t C0 = std::stoi(argv[9]);
    const index_t C1 = std::stoi(argv[10]);
    const index_t Y  = std::stoi(argv[11]);
    const index_t X  = std::stoi(argv[12]);
    const index_t Hi = std::stoi(argv[13]);
    const index_t Wi = std::stoi(argv[14]);

    const index_t conv_stride_h   = std::stoi(argv[15]);
    const index_t conv_stride_w   = std::stoi(argv[16]);
    const index_t conv_dilation_h = std::stoi(argv[17]);
    const index_t conv_dilation_w = std::stoi(argv[18]);
    const index_t in_left_pad_h   = std::stoi(argv[19]);
    const index_t in_left_pad_w   = std::stoi(argv[20]);
    const index_t in_right_pad_h  = std::stoi(argv[21]);
    const index_t in_right_pad_w  = std::stoi(argv[22]);

    // Filter extent once dilation is taken into account.
    const index_t YEff = (Y - 1) * conv_dilation_h + 1;
    const index_t XEff = (X - 1) * conv_dilation_w + 1;

    // Convolution output size.
    const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
    const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;

    // The add tensor is twice as large as the convolution output in H and W.
    const auto Hox2 = Ho * 2;
    const auto Wox2 = Wo * 2;
#else
    // static mode
    // Only the five run options are read from argv; the problem size is fixed
    // at compile time by the preset chain below. Arguments past the fifth are
    // not read in this mode.
    if(argc < 6)
    {
        printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n");
        exit(1);
    }

    const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
    const bool do_verification = std::stoi(argv[2]);
    const int init_method      = std::stoi(argv[3]);
    const bool do_log          = std::stoi(argv[4]);
    const int nrepeat          = std::stoi(argv[5]);

    constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;

    // Problem-size presets. The preprocessor compiles only the first branch
    // whose condition is non-zero, so with the flags as written the
    // N=128, 135x240 preset is active and the final `#elif 1` preset is not.
#if 0
    constexpr auto N = Number<1>{};
    constexpr auto Hi = Number<1080>{};
    constexpr auto Wi = Number<1920>{};
    constexpr auto Y = Number<3>{};
    constexpr auto X = Number<3>{};
    constexpr auto C0 = Number<2>{};
    constexpr auto C1 = Number<8>{};
    constexpr auto K1 = Number<8>{};
    constexpr auto K0 = Number<8>{};
#elif 0
    constexpr auto N  = Number<1>{};
    constexpr auto Hi = Number<540>{};
    constexpr auto Wi = Number<960>{};
    constexpr auto Y  = Number<3>{};
    constexpr auto X  = Number<3>{};
    constexpr auto C0 = Number<2>{};
    constexpr auto C1 = Number<8>{};
    constexpr auto K0 = Number<2>{};
    constexpr auto K1 = Number<8>{};
#elif 0
    constexpr auto N  = Number<1>{};
    constexpr auto Hi = Number<270>{};
    constexpr auto Wi = Number<480>{};
    constexpr auto Y  = Number<3>{};
    constexpr auto X  = Number<3>{};
    constexpr auto C0 = Number<2>{};
    constexpr auto C1 = Number<8>{};
    constexpr auto K0 = Number<2>{};
    constexpr auto K1 = Number<8>{};
#elif 1
    constexpr auto N  = Number<128>{};
    constexpr auto Hi = Number<135>{};
    constexpr auto Wi = Number<240>{};
    constexpr auto Y  = Number<3>{};
    constexpr auto X  = Number<3>{};
    constexpr auto C0 = Number<2>{};
    constexpr auto C1 = Number<8>{};
    constexpr auto K0 = Number<2>{};
    constexpr auto K1 = Number<8>{};
#elif 1
    constexpr auto N  = Number<1>{};
    constexpr auto Hi = Number<32>{};
    constexpr auto Wi = Number<32>{};
    constexpr auto Y  = Number<3>{};
    constexpr auto X  = Number<3>{};
    constexpr auto C0 = Number<2>{};
    constexpr auto C1 = Number<8>{};
    constexpr auto K1 = Number<8>{};
    constexpr auto K0 = Number<8>{};
#endif

    // Stride 1, dilation 1, padding 1 on every side.
    constexpr auto conv_stride_h   = I1;
    constexpr auto conv_stride_w   = I1;
    constexpr auto conv_dilation_h = I1;
    constexpr auto conv_dilation_w = I1;
    constexpr auto in_left_pad_h   = I1;
    constexpr auto in_left_pad_w   = I1;
    constexpr auto in_right_pad_h  = I1;
    constexpr auto in_right_pad_w  = I1;

    // Filter extent once dilation is taken into account.
    constexpr auto YEff = (Y - I1) * conv_dilation_h + I1;
    constexpr auto XEff = (X - I1) * conv_dilation_w + I1;

    // Convolution output size.
    constexpr auto Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + I1;
    constexpr auto Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + I1;

    // The add tensor is twice as large as the convolution output in H and W.
    constexpr auto Hox2 = Number<Ho * 2>{};
    constexpr auto Wox2 = Number<Wo * 2>{};
#endif

    // Element types. Same first-match rule as above: with the flags as written
    // the half_t / float / half_t branch is compiled and the int8 branch is not.
#if 0
    using in_data_t = float;
    using acc_data_t = float;
    using out_data_t = float;
#elif 1
    using in_data_t  = half_t;
    using acc_data_t = float;
    using out_data_t = half_t;
#elif 1
    using in_data_t  = int8_t;
    using acc_data_t = int32_t;
    using out_data_t = int8_t;
#endif

    // Host-side shapes: four 5-d tensors plus the 2-d bias.
    std::vector<std::size_t> in_lengths_host(5), wei_lengths_host(5), out_lengths_host(5),
        add_lengths_host(5), bias_lengths_host(2);

    // in: (N, C0, Hi, Wi, C1)
    in_lengths_host[0] = static_cast<std::size_t>(N);
    in_lengths_host[1] = static_cast<std::size_t>(C0);
    in_lengths_host[2] = static_cast<std::size_t>(Hi);
    in_lengths_host[3] = static_cast<std::size_t>(Wi);
    in_lengths_host[4] = static_cast<std::size_t>(C1);

    // wei: (K0 * K1, C0, Y, X, C1)
    wei_lengths_host[0] = static_cast<std::size_t>(K0 * K1);
    wei_lengths_host[1] = static_cast<std::size_t>(C0);
    wei_lengths_host[2] = static_cast<std::size_t>(Y);
    wei_lengths_host[3] = static_cast<std::size_t>(X);
    wei_lengths_host[4] = static_cast<std::size_t>(C1);

    // out: (N, K0, Ho, Wo, K1)
    out_lengths_host[0] = static_cast<std::size_t>(N);
    out_lengths_host[1] = static_cast<std::size_t>(K0);
    out_lengths_host[2] = static_cast<std::size_t>(Ho);
    out_lengths_host[3] = static_cast<std::size_t>(Wo);
    out_lengths_host[4] = static_cast<std::size_t>(K1);

    // add: (N, K0, 2 * Ho, 2 * Wo, K1)
    add_lengths_host[0] = static_cast<std::size_t>(N);
    add_lengths_host[1] = static_cast<std::size_t>(K0);
    add_lengths_host[2] = static_cast<std::size_t>(Hox2);
    add_lengths_host[3] = static_cast<std::size_t>(Wox2);
    add_lengths_host[4] = static_cast<std::size_t>(K1);

    // bias: (K0, K1)
    bias_lengths_host[0] = static_cast<std::size_t>(K0);
    bias_lengths_host[1] = static_cast<std::size_t>(K1);

    // NOTE(review): add / add_device / add_host are declared with in_data_t while
    // bias / out_host use out_data_t. host_direct_convolution_add_nchwc takes all
    // of them as Tensor<TOut>, so the verification call below only deduces when
    // in_data_t and out_data_t are the same type (true for every preset above).
    Tensor<in_data_t> in(in_lengths_host);
    Tensor<in_data_t> wei(wei_lengths_host);
    Tensor<in_data_t> add(add_lengths_host);
    Tensor<in_data_t> add_device(add_lengths_host);
    Tensor<in_data_t> add_host(add_lengths_host);
    Tensor<out_data_t> bias(bias_lengths_host);
    Tensor<out_data_t> out_host(out_lengths_host);

    // Echo the problem description.
    ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
    ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
    ostream_HostTensorDescriptor(add.mDesc, std::cout << "add: ");

    print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
    print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
    print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
    print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));

    std::size_t num_thread = std::thread::hardware_concurrency();

    // init_method selects the generators for `in` and `wei` only.
    switch(init_method)
    {
    case 0:
        // no initialization
        break;
    case 1:
        in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        break;
    case 2:
        in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        break;
    case 3:
        in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        break;
    case 4:
        in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        break;
    case 5:
        in.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
        break;
    default:
        in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);

        // Weight generator: product of a GeneratorTensor_2{1, 5} value and a
        // GeneratorTensor_Checkboard value evaluated at the same indices.
        auto gen_wei = [](auto... is) {
            return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
        };
        wei.GenerateTensorValue(gen_wei, num_thread);
    }

    // bias and add are filled the same way regardless of init_method
    // (including init_method == 0).
    bias.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
    add.GenerateTensorValue(GeneratorTensor_1{}, num_thread);

    // Packs the problem description into a single tuple. Element order:
    // in lengths, wei lengths, add lengths, out lengths, strides, dilations,
    // left pads, right pads (indexed with I0..I7 at the call site).
    auto f_make_for_device_nchwc = [&]() {
        const auto in_lengths_dev     = make_tuple(N, C0, Hi, Wi, C1);
        const auto wei_lengths_dev    = make_tuple(K0 * K1, C0, Y, X, C1);
        const auto add_lengths_dev    = make_tuple(N, K0, Hox2, Wox2, K1);
        const auto out_lengths_dev    = make_tuple(N, K0, Ho, Wo, K1);
        const auto conv_strides_dev   = make_tuple(conv_stride_h, conv_stride_w);
        const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
        const auto in_left_pads_dev   = make_tuple(in_left_pad_h, in_left_pad_w);
        const auto in_right_pads_dev  = make_tuple(in_right_pad_h, in_right_pad_w);

        return make_tuple(in_lengths_dev,
                          wei_lengths_dev,
                          add_lengths_dev,
                          out_lengths_dev,
                          conv_strides_dev,
                          conv_dilations_dev,
                          in_left_pads_dev,
                          in_right_pads_dev);
    };

#if USE_CONV_FWD_V5R1_NCHWC
    // Device run. add_device is the only tensor handed over as a non-input here;
    // presumably the wrapper fills it with the device result - confirm in the
    // device_convolution_add_forward_* header.
    if(algo == ConvForwardAlgo::V5R1NCHWC)
    {
        const auto tmp = f_make_for_device_nchwc();

        device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1<in_data_t,
                                                                                        acc_data_t,
                                                                                        out_data_t,
                                                                                        activ_type>(
            tmp[I0], // in_lengths_dev
            tmp[I1], // wei_lengths_dev
            tmp[I2], // add_lengths_dev
            tmp[I3], // out_lengths_dev
            tmp[I4], // conv_strides_dev
            tmp[I5], // conv_dilations_dev
            tmp[I6], // in_left_pads_dev
            tmp[I7], // in_right_pads_dev
            in,
            wei,
            bias,
            add,
            add_device,
            nrepeat);
    }
#endif

    // Verification runs whether or not the device branch above was taken.
    if(do_verification)
    {
        host_direct_convolution_add_nchwc(in,
                                          wei,
                                          add,
                                          bias,
                                          add_host,
                                          out_host,
                                          make_tuple(conv_stride_h, conv_stride_w),
                                          make_tuple(conv_dilation_h, conv_dilation_w),
                                          make_tuple(in_left_pad_h, in_left_pad_w),
                                          make_tuple(in_right_pad_h, in_right_pad_w),
                                          activ_type);

        // Only the add result is compared; out_host is computed by the reference
        // but is not checked against anything here.
        check_error(add_host, add_device);

        if(do_log)
        {
            LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
            LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
            LogRangeAsType<float>(std::cout << "add_host: ", add_host.mData, ",") << std::endl;
            LogRangeAsType<float>(std::cout << "add_device: ", add_device.mData, ",") << std::endl;
        }
    }
}
host/driver_offline/src/conv_fwd_driver_offline.cpp
View file @
a037693f
...
...
@@ -15,15 +15,13 @@
#include "device_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp"
#include "device_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp"
#include "device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp"
#include "device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp"
#include "device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp"
#include "device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"
#define USE_DYNAMIC_MODE
1
#define USE_DYNAMIC_MODE
0
#define USE_CONV_FWD_V4R4_NCHW 0
#define USE_CONV_FWD_V4R4R2_NHWC 0
#define USE_CONV_FWD_V6R1_NCHW 0
#define USE_CONV_FWD_V5R1_NCHW 0
#define USE_CONV_FWD_V4R4R2_XDL_NCHW 0
#define USE_CONV_FWD_V4R4R4_XDL_NHWC 1
...
...
@@ -41,9 +39,8 @@ enum ConvForwardAlgo
V4R4NCHW
,
// 0
V4R4R2NHWC
,
// 1
V6R1NCHW
,
// 2
V5R1NCHW
,
// 3
V4R4R2XDLNCHW
,
// 4
V4R4R4XDLNHWC
// 5
V4R4R2XDLNCHW
,
// 3
V4R4R4XDLNHWC
// 4
};
template
<
typename
TIn
,
...
...
@@ -97,7 +94,7 @@ void host_convolution_forward(const Tensor<TIn>& in,
if
constexpr
(
is_same
<
TOut
,
ushort
>::
value
)
{
out
(
n
,
k
,
ho
,
wo
)
=
type_convert
<
ushort
>
(
v
);
out
(
n
,
k
,
ho
,
wo
)
=
ck
::
type_convert
<
ushort
>
(
static_cast
<
float
>
(
v
)
);
}
else
{
...
...
@@ -134,7 +131,7 @@ void host_convolution_forward(const Tensor<TIn>& in,
}
if
constexpr
(
is_same
<
TOut
,
ushort
>::
value
)
{
out
(
n
,
ho
,
wo
,
k
)
=
ck
::
type_convert
<
ushort
>
(
v
);
out
(
n
,
ho
,
wo
,
k
)
=
ck
::
type_convert
<
ushort
>
(
static_cast
<
float
>
(
v
)
);
}
else
{
...
...
@@ -237,8 +234,8 @@ int main(int argc, char* argv[])
constexpr
auto
Y
=
Number
<
3
>
{};
constexpr
auto
X
=
Number
<
3
>
{};
constexpr
auto
conv_stride_h
=
I
2
;
constexpr
auto
conv_stride_w
=
I
2
;
constexpr
auto
conv_stride_h
=
I
1
;
constexpr
auto
conv_stride_w
=
I
1
;
constexpr
auto
conv_dilation_h
=
I1
;
constexpr
auto
conv_dilation_w
=
I1
;
constexpr
auto
in_left_pad_h
=
I1
;
...
...
@@ -253,15 +250,15 @@ int main(int argc, char* argv[])
constexpr
auto
Wo
=
(
Wi
+
in_left_pad_w
+
in_right_pad_w
-
XEff
)
/
conv_stride_w
+
I1
;
#endif
#if
0
#if
1
using
in_data_t
=
float
;
using
acc_data_t
=
float
;
using
out_data_t
=
float
;
#elif
0
#elif
1
using
in_data_t
=
half_t
;
using
acc_data_t
=
float
;
using
out_data_t
=
half_t
;
#elif
1
#elif
0
using
in_data_t
=
ushort
;
using
acc_data_t
=
float
;
using
out_data_t
=
ushort
;
...
...
@@ -472,33 +469,6 @@ int main(int argc, char* argv[])
}
#endif
#if USE_CONV_FWD_V5R1_NCHW
if
(
algo
==
ConvForwardAlgo
::
V5R1NCHW
)
{
if
(
layout
!=
ConvTensorLayout
::
NCHW
)
{
throw
std
::
runtime_error
(
"wrong! layout"
);
}
const
auto
tmp
=
f_make_for_device_nchw
();
device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw
<
in_data_t
,
16
,
acc_data_t
,
out_data_t
>
(
tmp
[
I0
],
tmp
[
I1
],
tmp
[
I2
],
tmp
[
I3
],
tmp
[
I4
],
tmp
[
I5
],
tmp
[
I6
],
in
,
wei
,
out_device
,
nrepeat
);
}
#endif
#if USE_CONV_FWD_V4R4R2_XDL_NCHW
if
(
algo
==
ConvForwardAlgo
::
V4R4R2XDLNCHW
)
{
...
...
host/driver_offline/src/conv_fwd_driver_offline_nchwc.cpp
0 → 100644
View file @
a037693f
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "config.hpp"
#include "debug.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "conv_common.hpp"
#include "device_tensor.hpp"
#include "device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
#define USE_DYNAMIC_MODE 0
#define USE_CONV_FWD_V5R1_NCHWC 1
// Algorithms selectable through the first command-line argument of this driver.
// The numeric value is what the user passes on the command line.
enum ConvForwardAlgo
{
    V5R1NCHWC = 0
};
// Host reference for "convolution + bias + activation".
//
// Tensor indexing used by this routine:
//   in   (n, c0, hi, wi, c1)
//   wei  (k, c0, y, x, c1)      with k = k0 * K1 + k1, K1 = out length [4]
//   bias (k0, k1)
//   out  (n, k0, ho, wo, k1)
//
// For every output element the routine accumulates the convolution in double,
// adds the bias and stores activ(sum, activ_type) into out. Input taps that
// fall outside the input extent (padding) contribute nothing.
//
// The right-pad argument is accepted for interface symmetry but is not read.
// Work is distributed over std::thread::hardware_concurrency() threads.
template <typename TIn,
          typename TWei,
          typename TOut,
          typename ConvStrides,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads>
void host_direct_convolution_nchwc(const Tensor<TIn>& in,
                                   const Tensor<TWei>& wei,
                                   const Tensor<TOut>& bias,
                                   Tensor<TOut>& out,
                                   const ConvStrides& conv_strides,
                                   const ConvDilations& conv_dilations,
                                   const InLeftPads& in_left_pads,
                                   const InRightPads&,
                                   const ck::ActivTypeEnum_t activ_type)
{
    using namespace ck;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};

    // Length vectors are looked up once instead of at every loop test.
    const auto& in_lens  = in.mDesc.GetLengths();
    const auto& wei_lens = wei.mDesc.GetLengths();
    const auto& out_lens = out.mDesc.GetLengths();

    auto compute_one_output = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
        // Flattened output-channel index used to address the weight tensor.
        const int k = k0 * out_lens[4] + k1;

        double acc = 0;

        for(int c0 = 0; c0 < wei_lens[1]; ++c0)
        {
            for(int y = 0; y < wei_lens[2]; ++y)
            {
                const int hi =
                    ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];

                // Whole filter row lies in the padding: nothing to accumulate.
                if(hi < 0 || hi >= in_lens[2])
                {
                    continue;
                }

                for(int x = 0; x < wei_lens[3]; ++x)
                {
                    const int wi =
                        wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];

                    // Tap lies in the horizontal padding.
                    if(wi < 0 || wi >= in_lens[3])
                    {
                        continue;
                    }

                    for(int c1 = 0; c1 < wei_lens[4]; ++c1)
                    {
                        acc += static_cast<double>(in(n, c0, hi, wi, c1)) *
                               static_cast<double>(wei(k, c0, y, x, c1));
                    }
                }
            }
        }

        acc += bias(k0, k1);

        out(n, k0, ho, wo, k1) = activ(acc, activ_type);
    };

    make_ParallelTensorFunctor(compute_one_output,
                               out_lens[0],
                               out_lens[1],
                               out_lens[2],
                               out_lens[3],
                               out_lens[4])(std::thread::hardware_concurrency());
}
int
main
(
int
argc
,
char
*
argv
[])
{
using
namespace
ck
;
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
I4
=
Number
<
4
>
{};
constexpr
auto
I5
=
Number
<
5
>
{};
constexpr
auto
I6
=
Number
<
6
>
{};
#if USE_DYNAMIC_MODE
// dynamic mode
if
(
argc
!=
23
)
{
printf
(
"arg1 to 5: algo, do_verification, init_method, do_log, nrepeat
\n
"
);
printf
(
"rest: N, K0, K1, C0, C1, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx
\n
"
);
exit
(
1
);
}
constexpr
ck
::
ActivTypeEnum_t
activ_type
=
ActivTypeEnum_t
::
LeakyRelu
;
const
ConvForwardAlgo
algo
=
static_cast
<
ConvForwardAlgo
>
(
std
::
stoi
(
argv
[
1
]));
const
bool
do_verification
=
std
::
stoi
(
argv
[
2
]);
const
int
init_method
=
std
::
stoi
(
argv
[
3
]);
const
bool
do_log
=
std
::
stoi
(
argv
[
4
]);
const
int
nrepeat
=
std
::
stoi
(
argv
[
5
]);
const
index_t
N
=
std
::
stoi
(
argv
[
6
]);
const
index_t
K0
=
std
::
stoi
(
argv
[
7
]);
const
index_t
K1
=
std
::
stoi
(
argv
[
8
]);
const
index_t
C0
=
std
::
stoi
(
argv
[
9
]);
const
index_t
C1
=
std
::
stoi
(
argv
[
10
]);
const
index_t
Y
=
std
::
stoi
(
argv
[
11
]);
const
index_t
X
=
std
::
stoi
(
argv
[
12
]);
const
index_t
Hi
=
std
::
stoi
(
argv
[
13
]);
const
index_t
Wi
=
std
::
stoi
(
argv
[
14
]);
const
index_t
conv_stride_h
=
std
::
stoi
(
argv
[
15
]);
const
index_t
conv_stride_w
=
std
::
stoi
(
argv
[
16
]);
const
index_t
conv_dilation_h
=
std
::
stoi
(
argv
[
17
]);
const
index_t
conv_dilation_w
=
std
::
stoi
(
argv
[
18
]);
const
index_t
in_left_pad_h
=
std
::
stoi
(
argv
[
19
]);
const
index_t
in_left_pad_w
=
std
::
stoi
(
argv
[
20
]);
const
index_t
in_right_pad_h
=
std
::
stoi
(
argv
[
21
]);
const
index_t
in_right_pad_w
=
std
::
stoi
(
argv
[
22
]);
const
index_t
YEff
=
(
Y
-
1
)
*
conv_dilation_h
+
1
;
const
index_t
XEff
=
(
X
-
1
)
*
conv_dilation_w
+
1
;
const
index_t
Ho
=
(
Hi
+
in_left_pad_h
+
in_right_pad_h
-
YEff
)
/
conv_stride_h
+
1
;
const
index_t
Wo
=
(
Wi
+
in_left_pad_w
+
in_right_pad_w
-
XEff
)
/
conv_stride_w
+
1
;
#else
// static mode
if
(
argc
<
6
)
{
printf
(
"arg1 to 5: algo, do_verification, init_method, do_log, nrepeat
\n
"
);
exit
(
1
);
}
const
ConvForwardAlgo
algo
=
static_cast
<
ConvForwardAlgo
>
(
std
::
stoi
(
argv
[
1
]));
const
bool
do_verification
=
std
::
stoi
(
argv
[
2
]);
const
int
init_method
=
std
::
stoi
(
argv
[
3
]);
const
bool
do_log
=
std
::
stoi
(
argv
[
4
]);
const
int
nrepeat
=
std
::
stoi
(
argv
[
5
]);
// constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::Sigmoid;
constexpr
ck
::
ActivTypeEnum_t
activ_type
=
ActivTypeEnum_t
::
LeakyRelu
;
#if 0
constexpr auto N = Number<1>{};
constexpr auto Hi = Number<1080>{};
constexpr auto Wi = Number<1920>{};
constexpr auto Y = Number<3>{};
constexpr auto X = Number<3>{};
constexpr auto C0 = Number<2>{};
constexpr auto C1 = Number<8>{};
constexpr auto K0 = Number<1>{};
constexpr auto K1 = Number<4>{};
#elif
1
constexpr
auto
N
=
Number
<
1
>
{};
constexpr
auto
Hi
=
Number
<
1080
>
{};
constexpr
auto
Wi
=
Number
<
1920
>
{};
constexpr
auto
Y
=
Number
<
3
>
{};
constexpr
auto
X
=
Number
<
3
>
{};
constexpr
auto
C0
=
Number
<
2
>
{};
constexpr
auto
C1
=
Number
<
8
>
{};
constexpr
auto
K0
=
Number
<
2
>
{};
constexpr
auto
K1
=
Number
<
8
>
{};
#elif 0
constexpr
auto
N
=
Number
<
1
>
{};
constexpr
auto
Hi
=
Number
<
1080
>
{};
constexpr
auto
Wi
=
Number
<
1920
>
{};
constexpr
auto
Y
=
Number
<
1
>
{};
constexpr
auto
X
=
Number
<
1
>
{};
constexpr
auto
C0
=
Number
<
2
>
{};
constexpr
auto
C1
=
Number
<
8
>
{};
constexpr
auto
K0
=
Number
<
2
>
{};
constexpr
auto
K1
=
Number
<
8
>
{};
#elif 0
constexpr
auto
N
=
Number
<
1
>
{};
constexpr
auto
Hi
=
Number
<
540
>
{};
constexpr
auto
Wi
=
Number
<
960
>
{};
constexpr
auto
Y
=
Number
<
1
>
{};
constexpr
auto
X
=
Number
<
1
>
{};
constexpr
auto
C0
=
Number
<
2
>
{};
constexpr
auto
C1
=
Number
<
8
>
{};
constexpr
auto
K0
=
Number
<
2
>
{};
constexpr
auto
K1
=
Number
<
8
>
{};
#elif 0
constexpr
auto
N
=
Number
<
128
>
{};
constexpr
auto
Hi
=
Number
<
270
>
{};
constexpr
auto
Wi
=
Number
<
480
>
{};
constexpr
auto
Y
=
Number
<
1
>
{};
constexpr
auto
X
=
Number
<
1
>
{};
constexpr
auto
C0
=
Number
<
2
>
{};
constexpr
auto
C1
=
Number
<
8
>
{};
constexpr
auto
K0
=
Number
<
2
>
{};
constexpr
auto
K1
=
Number
<
8
>
{};
#endif
constexpr
auto
conv_stride_h
=
I1
;
constexpr
auto
conv_stride_w
=
I1
;
constexpr
auto
conv_dilation_h
=
I1
;
constexpr
auto
conv_dilation_w
=
I1
;
#if 1
constexpr
auto
in_left_pad_h
=
I1
;
constexpr
auto
in_left_pad_w
=
I1
;
constexpr
auto
in_right_pad_h
=
I1
;
constexpr
auto
in_right_pad_w
=
I1
;
#else
constexpr
auto
in_left_pad_h
=
I0
;
constexpr
auto
in_left_pad_w
=
I0
;
constexpr
auto
in_right_pad_h
=
I0
;
constexpr
auto
in_right_pad_w
=
I0
;
#endif
constexpr
auto
YEff
=
(
Y
-
I1
)
*
conv_dilation_h
+
I1
;
constexpr
auto
XEff
=
(
X
-
I1
)
*
conv_dilation_w
+
I1
;
constexpr
auto
Ho
=
(
Hi
+
in_left_pad_h
+
in_right_pad_h
-
YEff
)
/
conv_stride_h
+
I1
;
constexpr
auto
Wo
=
(
Wi
+
in_left_pad_w
+
in_right_pad_w
-
XEff
)
/
conv_stride_w
+
I1
;
#endif
#if 0
using in_data_t = float;
using acc_data_t = float;
using out_data_t = float;
#elif
1
using
in_data_t
=
half_t
;
using
acc_data_t
=
float
;
using
out_data_t
=
half_t
;
#elif 1
using
in_data_t
=
int8_t
;
using
acc_data_t
=
int32_t
;
using
out_data_t
=
int8_t
;
#endif
std
::
vector
<
std
::
size_t
>
in_lengths_host
(
5
),
wei_lengths_host
(
5
),
out_lengths_host
(
5
),
bias_lengths_host
(
2
);
in_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
N
);
in_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
C0
);
in_lengths_host
[
2
]
=
static_cast
<
std
::
size_t
>
(
Hi
);
in_lengths_host
[
3
]
=
static_cast
<
std
::
size_t
>
(
Wi
);
in_lengths_host
[
4
]
=
static_cast
<
std
::
size_t
>
(
C1
);
wei_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
K0
*
K1
);
wei_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
C0
);
wei_lengths_host
[
2
]
=
static_cast
<
std
::
size_t
>
(
Y
);
wei_lengths_host
[
3
]
=
static_cast
<
std
::
size_t
>
(
X
);
wei_lengths_host
[
4
]
=
static_cast
<
std
::
size_t
>
(
C1
);
out_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
N
);
out_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
K0
);
out_lengths_host
[
2
]
=
static_cast
<
std
::
size_t
>
(
Ho
);
out_lengths_host
[
3
]
=
static_cast
<
std
::
size_t
>
(
Wo
);
out_lengths_host
[
4
]
=
static_cast
<
std
::
size_t
>
(
K1
);
bias_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
K0
);
bias_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
K1
);
Tensor
<
in_data_t
>
in
(
in_lengths_host
);
Tensor
<
in_data_t
>
wei
(
wei_lengths_host
);
Tensor
<
out_data_t
>
bias
(
bias_lengths_host
);
Tensor
<
out_data_t
>
out_host
(
out_lengths_host
);
Tensor
<
out_data_t
>
out_device
(
out_lengths_host
);
ostream_HostTensorDescriptor
(
in
.
mDesc
,
std
::
cout
<<
"in: "
);
ostream_HostTensorDescriptor
(
wei
.
mDesc
,
std
::
cout
<<
"wei: "
);
ostream_HostTensorDescriptor
(
bias
.
mDesc
,
std
::
cout
<<
"bias: "
);
ostream_HostTensorDescriptor
(
out_host
.
mDesc
,
std
::
cout
<<
"out: "
);
print_array
(
"InLeftPads"
,
make_tuple
(
in_left_pad_h
,
in_left_pad_w
));
print_array
(
"InRightPads"
,
make_tuple
(
in_right_pad_h
,
in_right_pad_w
));
print_array
(
"ConvStrides"
,
make_tuple
(
conv_stride_h
,
conv_stride_w
));
print_array
(
"ConvDilations"
,
make_tuple
(
conv_dilation_h
,
conv_dilation_w
));
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
();
switch
(
init_method
)
{
case
0
:
// no initialization
break
;
case
1
:
in
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
wei
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
bias
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
break
;
case
2
:
in
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
wei
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
bias
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
break
;
case
3
:
in
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
wei
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
bias
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
break
;
case
4
:
in
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
wei
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
bias
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
break
;
case
5
:
in
.
GenerateTensorValue
(
GeneratorTensor_3
<
float
>
{
0.0
,
1.0
},
num_thread
);
wei
.
GenerateTensorValue
(
GeneratorTensor_3
<
float
>
{
-
0.5
,
0.5
},
num_thread
);
bias
.
GenerateTensorValue
(
GeneratorTensor_3
<
float
>
{
-
0.5
,
0.5
},
num_thread
);
break
;
default:
in
.
GenerateTensorValue
(
GeneratorTensor_2
{
1
,
5
},
num_thread
);
auto
gen_wei
=
[](
auto
...
is
)
{
return
GeneratorTensor_2
{
1
,
5
}(
is
...)
*
GeneratorTensor_Checkboard
{}(
is
...);
};
wei
.
GenerateTensorValue
(
gen_wei
,
num_thread
);
}
auto
f_make_for_device_nchwc
=
[
&
]()
{
const
auto
in_lengths_dev
=
make_tuple
(
N
,
C0
,
Hi
,
Wi
,
C1
);
const
auto
wei_lengths_dev
=
make_tuple
(
K0
*
K1
,
C0
,
Y
,
X
,
C1
);
const
auto
out_lengths_dev
=
make_tuple
(
N
,
K0
,
Ho
,
Wo
,
K1
);
const
auto
conv_strides_dev
=
make_tuple
(
conv_stride_h
,
conv_stride_w
);
const
auto
conv_dilations_dev
=
make_tuple
(
conv_dilation_h
,
conv_dilation_w
);
const
auto
in_left_pads_dev
=
make_tuple
(
in_left_pad_h
,
in_left_pad_w
);
const
auto
in_right_pads_dev
=
make_tuple
(
in_right_pad_h
,
in_right_pad_w
);
return
make_tuple
(
in_lengths_dev
,
wei_lengths_dev
,
out_lengths_dev
,
conv_strides_dev
,
conv_dilations_dev
,
in_left_pads_dev
,
in_right_pads_dev
);
};
#if USE_CONV_FWD_V5R1_NCHWC
if
(
algo
==
ConvForwardAlgo
::
V5R1NCHWC
)
{
const
auto
tmp
=
f_make_for_device_nchwc
();
device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1
<
in_data_t
,
acc_data_t
,
out_data_t
,
activ_type
>
(
tmp
[
I0
],
tmp
[
I1
],
tmp
[
I2
],
tmp
[
I3
],
tmp
[
I4
],
tmp
[
I5
],
tmp
[
I6
],
in
,
wei
,
bias
,
out_device
,
nrepeat
);
}
#endif
if
(
do_verification
)
{
host_direct_convolution_nchwc
(
in
,
wei
,
bias
,
out_host
,
make_tuple
(
conv_stride_h
,
conv_stride_w
),
make_tuple
(
conv_dilation_h
,
conv_dilation_w
),
make_tuple
(
in_left_pad_h
,
in_left_pad_w
),
make_tuple
(
in_right_pad_h
,
in_right_pad_w
),
activ_type
);
check_error
(
out_host
,
out_device
);
if
(
do_log
)
{
LogRangeAsType
<
float
>
(
std
::
cout
<<
"in : "
,
in
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"wei: "
,
wei
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"bias: "
,
bias
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"out_host : "
,
out_host
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"out_device: "
,
out_device
.
mData
,
","
)
<<
std
::
endl
;
}
}
}
host/driver_offline/src/conv_maxpool_fwd_driver_offline_nchwc.cpp
0 → 100644
View file @
a037693f
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "config.hpp"
#include "debug.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "conv_common.hpp"
#include "device_tensor.hpp"
#include "device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
#define USE_DYNAMIC_MODE 0
#define USE_CONV_FWD_V5R1_NCHWC 1
// Forward-convolution algorithms selectable through the driver's first
// command-line argument (the integer is cast to this enum).
enum ConvForwardAlgo
{
    V5R1NCHWC = 0
};
// Host reference for a fused convolution + bias + activation followed by a
// 2x2, stride-2 max pooling.
//
// Indexing used by this function:
//   in       (n,  c0, hi, wi, c1)
//   wei      (k,  c0, y,  x,  c1)   with k = k0 * out_host.length[4] + k1
//   bias     (k0, k1)
//   out_host (n,  k0, ho, wo, k1)   convolution result after bias + activ()
//   max_host (n,  k0, ho, wo, k1)   max over out_host rows {2*ho, 2*ho+1} and
//                                   columns {2*wo, 2*wo+1}
//
// Input positions that fall outside the input's H/W extent contribute nothing
// to the accumulation. The right-pad argument is accepted but not used.
template <typename TIn,
          typename TWei,
          typename TOut,
          typename ConvStrides,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads>
void host_direct_convolution_maxpool_nchwc(const Tensor<TIn>& in,
                                           const Tensor<TWei>& wei,
                                           const Tensor<TOut>& bias,
                                           Tensor<TOut>& out_host,
                                           Tensor<TOut>& max_host,
                                           const ConvStrides& conv_strides,
                                           const ConvDilations& conv_dilations,
                                           const InLeftPads& in_left_pads,
                                           const InRightPads&,
                                           const ck::ActivTypeEnum_t activ_type)
{
    using namespace ck;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};

    // Look the length tables up once instead of on every loop test.
    const auto& in_lens  = in.mDesc.GetLengths();
    const auto& wei_lens = wei.mDesc.GetLengths();
    const auto& out_lens = out_host.mDesc.GetLengths();
    const auto& max_lens = max_host.mDesc.GetLengths();

    // Computes one output element of the convolution, accumulating in double.
    auto conv_point = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
        double acc = 0;

        // Flattened output-channel index used to address the weight tensor.
        auto k = k0 * out_lens[4] + k1;

        for(int c0 = 0; c0 < wei_lens[1]; ++c0)
        {
            for(int y = 0; y < wei_lens[2]; ++y)
            {
                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];

                for(int x = 0; x < wei_lens[3]; ++x)
                {
                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];

                    // Skip taps that land in the padding region.
                    if(!(hi >= 0 && hi < in_lens[2] && wi >= 0 && wi < in_lens[3]))
                    {
                        continue;
                    }

                    for(int c1 = 0; c1 < wei_lens[4]; ++c1)
                    {
                        acc += static_cast<const double>(in(n, c0, hi, wi, c1)) *
                               static_cast<const double>(wei(k, c0, y, x, c1));
                    }
                }
            }
        }

        acc += bias(k0, k1);
        acc = activ(acc, activ_type);

        out_host(n, k0, ho, wo, k1) = acc;
    };

    make_ParallelTensorFunctor(
        conv_point, out_lens[0], out_lens[1], out_lens[2], out_lens[3], out_lens[4])(
        std::thread::hardware_concurrency());

    // Reduces each 2x2 window of the convolution output to its maximum.
    auto pool_point = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
        auto row = ho * 2;
        auto col = wo * 2;

        auto top_left     = out_host(n, k0, row, col, k1);
        auto top_right    = out_host(n, k0, row, col + 1, k1);
        auto bottom_left  = out_host(n, k0, row + 1, col, k1);
        auto bottom_right = out_host(n, k0, row + 1, col + 1, k1);

        max_host(n, k0, ho, wo, k1) = std::max({top_left, top_right, bottom_left, bottom_right});
    };

    make_ParallelTensorFunctor(
        pool_point, max_lens[0], max_lens[1], max_lens[2], max_lens[3], max_lens[4])(
        std::thread::hardware_concurrency());
}
int
main
(
int
argc
,
char
*
argv
[])
{
using
namespace
ck
;
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
I4
=
Number
<
4
>
{};
constexpr
auto
I5
=
Number
<
5
>
{};
constexpr
auto
I6
=
Number
<
6
>
{};
constexpr
auto
I7
=
Number
<
7
>
{};
#if USE_DYNAMIC_MODE
// dynamic mode
if
(
argc
!=
23
)
{
printf
(
"arg1 to 5: algo, do_verification, init_method, do_log, nrepeat
\n
"
);
printf
(
"rest: N, K0, K1, C0, C1, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx
\n
"
);
exit
(
1
);
}
constexpr
ck
::
ActivTypeEnum_t
activ_type
=
ActivTypeEnum_t
::
LeakyRelu
;
const
ConvForwardAlgo
algo
=
static_cast
<
ConvForwardAlgo
>
(
std
::
stoi
(
argv
[
1
]));
const
bool
do_verification
=
std
::
stoi
(
argv
[
2
]);
const
int
init_method
=
std
::
stoi
(
argv
[
3
]);
const
bool
do_log
=
std
::
stoi
(
argv
[
4
]);
const
int
nrepeat
=
std
::
stoi
(
argv
[
5
]);
const
index_t
N
=
std
::
stoi
(
argv
[
6
]);
const
index_t
K0
=
std
::
stoi
(
argv
[
7
]);
const
index_t
K1
=
std
::
stoi
(
argv
[
8
]);
const
index_t
C0
=
std
::
stoi
(
argv
[
9
]);
const
index_t
C1
=
std
::
stoi
(
argv
[
10
]);
const
index_t
Y
=
std
::
stoi
(
argv
[
11
]);
const
index_t
X
=
std
::
stoi
(
argv
[
12
]);
const
index_t
Hi
=
std
::
stoi
(
argv
[
13
]);
const
index_t
Wi
=
std
::
stoi
(
argv
[
14
]);
const
index_t
conv_stride_h
=
std
::
stoi
(
argv
[
15
]);
const
index_t
conv_stride_w
=
std
::
stoi
(
argv
[
16
]);
const
index_t
conv_dilation_h
=
std
::
stoi
(
argv
[
17
]);
const
index_t
conv_dilation_w
=
std
::
stoi
(
argv
[
18
]);
const
index_t
in_left_pad_h
=
std
::
stoi
(
argv
[
19
]);
const
index_t
in_left_pad_w
=
std
::
stoi
(
argv
[
20
]);
const
index_t
in_right_pad_h
=
std
::
stoi
(
argv
[
21
]);
const
index_t
in_right_pad_w
=
std
::
stoi
(
argv
[
22
]);
const
index_t
YEff
=
(
Y
-
1
)
*
conv_dilation_h
+
1
;
const
index_t
XEff
=
(
X
-
1
)
*
conv_dilation_w
+
1
;
const
index_t
Ho
=
(
Hi
+
in_left_pad_h
+
in_right_pad_h
-
YEff
)
/
conv_stride_h
+
1
;
const
index_t
Wo
=
(
Wi
+
in_left_pad_w
+
in_right_pad_w
-
XEff
)
/
conv_stride_w
+
1
;
const
index_t
Ho_2
=
Ho
/
2
;
const
index_t
Wo_2
=
Wo
/
2
;
#else
// static mode
if
(
argc
<
6
)
{
printf
(
"arg1 to 5: algo, do_verification, init_method, do_log, nrepeat
\n
"
);
exit
(
1
);
}
const
ConvForwardAlgo
algo
=
static_cast
<
ConvForwardAlgo
>
(
std
::
stoi
(
argv
[
1
]));
const
bool
do_verification
=
std
::
stoi
(
argv
[
2
]);
const
int
init_method
=
std
::
stoi
(
argv
[
3
]);
const
bool
do_log
=
std
::
stoi
(
argv
[
4
]);
const
int
nrepeat
=
std
::
stoi
(
argv
[
5
]);
constexpr
ck
::
ActivTypeEnum_t
activ_type
=
ActivTypeEnum_t
::
LeakyRelu
;
#if 1
constexpr
auto
N
=
Number
<
1
>
{};
constexpr
auto
Hi
=
Number
<
1080
>
{};
constexpr
auto
Wi
=
Number
<
1920
>
{};
constexpr
auto
Y
=
Number
<
3
>
{};
constexpr
auto
X
=
Number
<
3
>
{};
constexpr
auto
C0
=
Number
<
2
>
{};
constexpr
auto
C1
=
Number
<
8
>
{};
constexpr
auto
K0
=
Number
<
2
>
{};
constexpr
auto
K1
=
Number
<
8
>
{};
#elif 0
constexpr
auto
N
=
Number
<
1
>
{};
constexpr
auto
Hi
=
Number
<
1080
>
{};
constexpr
auto
Wi
=
Number
<
1920
>
{};
constexpr
auto
Y
=
Number
<
3
>
{};
constexpr
auto
X
=
Number
<
3
>
{};
constexpr
auto
C0
=
Number
<
3
>
{};
constexpr
auto
C1
=
Number
<
4
>
{};
constexpr
auto
K0
=
Number
<
2
>
{};
constexpr
auto
K1
=
Number
<
8
>
{};
#elif 0
constexpr
auto
N
=
Number
<
1
>
{};
constexpr
auto
Hi
=
Number
<
540
>
{};
constexpr
auto
Wi
=
Number
<
960
>
{};
constexpr
auto
Y
=
Number
<
3
>
{};
constexpr
auto
X
=
Number
<
3
>
{};
constexpr
auto
C0
=
Number
<
2
>
{};
constexpr
auto
C1
=
Number
<
8
>
{};
constexpr
auto
K0
=
Number
<
2
>
{};
constexpr
auto
K1
=
Number
<
8
>
{};
#elif 0
constexpr
auto
N
=
Number
<
128
>
{};
constexpr
auto
Hi
=
Number
<
270
>
{};
constexpr
auto
Wi
=
Number
<
480
>
{};
constexpr
auto
Y
=
Number
<
3
>
{};
constexpr
auto
X
=
Number
<
3
>
{};
constexpr
auto
C0
=
Number
<
2
>
{};
constexpr
auto
C1
=
Number
<
8
>
{};
constexpr
auto
K0
=
Number
<
2
>
{};
constexpr
auto
K1
=
Number
<
8
>
{};
#endif
constexpr
auto
conv_stride_h
=
I1
;
constexpr
auto
conv_stride_w
=
I1
;
constexpr
auto
conv_dilation_h
=
I1
;
constexpr
auto
conv_dilation_w
=
I1
;
constexpr
auto
in_left_pad_h
=
I1
;
constexpr
auto
in_left_pad_w
=
I1
;
constexpr
auto
in_right_pad_h
=
I1
;
constexpr
auto
in_right_pad_w
=
I1
;
constexpr
auto
YEff
=
(
Y
-
I1
)
*
conv_dilation_h
+
I1
;
constexpr
auto
XEff
=
(
X
-
I1
)
*
conv_dilation_w
+
I1
;
constexpr
auto
Ho
=
(
Hi
+
in_left_pad_h
+
in_right_pad_h
-
YEff
)
/
conv_stride_h
+
I1
;
constexpr
auto
Wo
=
(
Wi
+
in_left_pad_w
+
in_right_pad_w
-
XEff
)
/
conv_stride_w
+
I1
;
constexpr
auto
Ho_2
=
Number
<
Ho
/
2
>
{};
constexpr
auto
Wo_2
=
Number
<
Wo
/
2
>
{};
#endif
#if 0
using in_data_t = float;
using acc_data_t = float;
using out_data_t = float;
#elif
1
using
in_data_t
=
half_t
;
using
acc_data_t
=
float
;
using
out_data_t
=
half_t
;
#elif 1
using
in_data_t
=
int8_t
;
using
acc_data_t
=
int32_t
;
using
out_data_t
=
int8_t
;
#endif
std
::
vector
<
std
::
size_t
>
in_lengths_host
(
5
),
wei_lengths_host
(
5
),
out_lengths_host
(
5
),
max_lengths_host
(
5
),
bias_lengths_host
(
2
);
in_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
N
);
in_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
C0
);
in_lengths_host
[
2
]
=
static_cast
<
std
::
size_t
>
(
Hi
);
in_lengths_host
[
3
]
=
static_cast
<
std
::
size_t
>
(
Wi
);
in_lengths_host
[
4
]
=
static_cast
<
std
::
size_t
>
(
C1
);
wei_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
K0
*
K1
);
wei_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
C0
);
wei_lengths_host
[
2
]
=
static_cast
<
std
::
size_t
>
(
Y
);
wei_lengths_host
[
3
]
=
static_cast
<
std
::
size_t
>
(
X
);
wei_lengths_host
[
4
]
=
static_cast
<
std
::
size_t
>
(
C1
);
out_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
N
);
out_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
K0
);
out_lengths_host
[
2
]
=
static_cast
<
std
::
size_t
>
(
Ho
);
out_lengths_host
[
3
]
=
static_cast
<
std
::
size_t
>
(
Wo
);
out_lengths_host
[
4
]
=
static_cast
<
std
::
size_t
>
(
K1
);
max_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
N
);
max_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
K0
);
max_lengths_host
[
2
]
=
static_cast
<
std
::
size_t
>
(
Ho_2
);
max_lengths_host
[
3
]
=
static_cast
<
std
::
size_t
>
(
Wo_2
);
max_lengths_host
[
4
]
=
static_cast
<
std
::
size_t
>
(
K1
);
bias_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
K0
);
bias_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
K1
);
Tensor
<
in_data_t
>
in
(
in_lengths_host
);
Tensor
<
in_data_t
>
wei
(
wei_lengths_host
);
Tensor
<
out_data_t
>
bias
(
bias_lengths_host
);
Tensor
<
out_data_t
>
out_device
(
out_lengths_host
);
Tensor
<
out_data_t
>
out_host
(
out_lengths_host
);
Tensor
<
in_data_t
>
max_device
(
max_lengths_host
);
Tensor
<
in_data_t
>
max_host
(
max_lengths_host
);
ostream_HostTensorDescriptor
(
in
.
mDesc
,
std
::
cout
<<
"in: "
);
ostream_HostTensorDescriptor
(
wei
.
mDesc
,
std
::
cout
<<
"wei: "
);
print_array
(
"InLeftPads"
,
make_tuple
(
in_left_pad_h
,
in_left_pad_w
));
print_array
(
"InRightPads"
,
make_tuple
(
in_right_pad_h
,
in_right_pad_w
));
print_array
(
"ConvStrides"
,
make_tuple
(
conv_stride_h
,
conv_stride_w
));
print_array
(
"ConvDilations"
,
make_tuple
(
conv_dilation_h
,
conv_dilation_w
));
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
();
switch
(
init_method
)
{
case
0
:
// no initialization
break
;
case
1
:
in
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
wei
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
break
;
case
2
:
in
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
wei
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
break
;
case
3
:
in
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
wei
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
break
;
case
4
:
in
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
wei
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
break
;
case
5
:
in
.
GenerateTensorValue
(
GeneratorTensor_3
<
float
>
{
0.0
,
1.0
},
num_thread
);
wei
.
GenerateTensorValue
(
GeneratorTensor_3
<
float
>
{
-
0.5
,
0.5
},
num_thread
);
break
;
default:
in
.
GenerateTensorValue
(
GeneratorTensor_2
{
1
,
5
},
num_thread
);
auto
gen_wei
=
[](
auto
...
is
)
{
return
GeneratorTensor_2
{
1
,
5
}(
is
...)
*
GeneratorTensor_Checkboard
{}(
is
...);
};
wei
.
GenerateTensorValue
(
gen_wei
,
num_thread
);
}
bias
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
auto
f_make_for_device_nchwc
=
[
&
]()
{
const
auto
in_lengths_dev
=
make_tuple
(
N
,
C0
,
Hi
,
Wi
,
C1
);
const
auto
wei_lengths_dev
=
make_tuple
(
K0
*
K1
,
C0
,
Y
,
X
,
C1
);
const
auto
max_lengths_dev
=
make_tuple
(
N
,
K0
,
Ho_2
,
Wo_2
,
K1
);
const
auto
out_lengths_dev
=
make_tuple
(
N
,
K0
,
Ho
,
Wo
,
K1
);
const
auto
conv_strides_dev
=
make_tuple
(
conv_stride_h
,
conv_stride_w
);
const
auto
conv_dilations_dev
=
make_tuple
(
conv_dilation_h
,
conv_dilation_w
);
const
auto
in_left_pads_dev
=
make_tuple
(
in_left_pad_h
,
in_left_pad_w
);
const
auto
in_right_pads_dev
=
make_tuple
(
in_right_pad_h
,
in_right_pad_w
);
return
make_tuple
(
in_lengths_dev
,
wei_lengths_dev
,
max_lengths_dev
,
out_lengths_dev
,
conv_strides_dev
,
conv_dilations_dev
,
in_left_pads_dev
,
in_right_pads_dev
);
};
#if USE_CONV_FWD_V5R1_NCHWC
if
(
algo
==
ConvForwardAlgo
::
V5R1NCHWC
)
{
const
auto
tmp
=
f_make_for_device_nchwc
();
device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1
<
in_data_t
,
acc_data_t
,
out_data_t
,
activ_type
>
(
tmp
[
I0
],
// in_lengths_dev
tmp
[
I1
],
// wei_lengths_dev
tmp
[
I2
],
// max_lengths_dev
tmp
[
I3
],
// out_lengths_dev
tmp
[
I4
],
// conv_strides_dev
tmp
[
I5
],
// conv_dilations_dev
tmp
[
I6
],
// in_left_pads_dev
tmp
[
I7
],
// in_right_pads_dev
in
,
wei
,
bias
,
out_device
,
max_device
,
nrepeat
);
}
#endif
if
(
do_verification
)
{
host_direct_convolution_maxpool_nchwc
(
in
,
wei
,
bias
,
out_host
,
max_host
,
make_tuple
(
conv_stride_h
,
conv_stride_w
),
make_tuple
(
conv_dilation_h
,
conv_dilation_w
),
make_tuple
(
in_left_pad_h
,
in_left_pad_w
),
make_tuple
(
in_right_pad_h
,
in_right_pad_w
),
activ_type
);
check_error
(
out_host
,
out_device
);
check_error
(
max_host
,
max_device
);
if
(
do_log
)
{
// LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
// LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
// LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") <<
// std::endl;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"max_host: "
,
max_host
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"max_device: "
,
max_device
.
mData
,
","
)
<<
std
::
endl
;
}
}
}
host/driver_offline/src/gemm_driver_offline.cpp
View file @
a037693f
...
...
@@ -239,14 +239,10 @@ int main(int argc, char* argv[])
using ab_data_t = float;
using acc_data_t = float;
using c_data_t = float;
#elif
0
#elif
1
using
ab_data_t
=
half_t
;
using
acc_data_t
=
float
;
using
c_data_t
=
half_t
;
#elif 1
using
ab_data_t
=
ushort
;
using
acc_data_t
=
float
;
using
c_data_t
=
ushort
;
#elif 1
using
ab_data_t
=
int8_t
;
using
acc_data_t
=
int32_t
;
...
...
host/host_tensor/include/conv_common.hpp
View file @
a037693f
...
...
@@ -74,4 +74,17 @@ calculate_convolution_flops(const InDesc&, const WeiDesc& wei_desc, const OutDes
return
std
::
size_t
(
2
)
*
N
*
K
*
Ho
*
Wo
*
C
*
Y
*
X
;
}
// Applies the activation selected by activ_type to v and returns the result
// as T.
//
//   None      -> v
//   LeakyRelu -> v when v >= 0, otherwise 0.3 * v
//   Sigmoid   -> 1 / (1 + exp(-v))
//
// Throws std::runtime_error for any other activation type.
//
// The return type is spelled out (rather than deduced) so that the three
// return statements cannot disagree on the deduced type when an intermediate
// expression is wider than T.
template <typename T>
inline T activ(T v, const ck::ActivTypeEnum_t activ_type)
{
    // Negative-side slope used by LeakyRelu.
    const T alpha = static_cast<T>(0.3);

    switch(activ_type)
    {
    case ck::ActivTypeEnum_t::None: return v;
    case ck::ActivTypeEnum_t::LeakyRelu:
        if(v >= 0)
        {
            return v;
        }
        return static_cast<T>(alpha * v);
    case ck::ActivTypeEnum_t::Sigmoid:
        // exp is left unqualified, as in the original, so overload resolution
        // is unchanged.
        return static_cast<T>(1 / (1 + exp(-v)));
    default: throw std::runtime_error("unsupported activ type");
    }
}
#endif
host/host_tensor/include/host_tensor.hpp
View file @
a037693f
...
...
@@ -257,6 +257,18 @@ struct Tensor
mDesc
.
GetLengths
()[
3
])(
num_thread
);
break
;
}
case
5
:
{
auto
f
=
[
&
](
auto
i0
,
auto
i1
,
auto
i2
,
auto
i3
,
auto
i4
)
{
(
*
this
)(
i0
,
i1
,
i2
,
i3
,
i4
)
=
g
(
i0
,
i1
,
i2
,
i3
,
i4
);
};
make_ParallelTensorFunctor
(
f
,
mDesc
.
GetLengths
()[
0
],
mDesc
.
GetLengths
()[
1
],
mDesc
.
GetLengths
()[
2
],
mDesc
.
GetLengths
()[
3
],
mDesc
.
GetLengths
()[
4
])(
num_thread
);
break
;
}
default:
throw
std
::
runtime_error
(
"unspported dimension"
);
}
}
...
...
profiler/conv_profiler.cpp
View file @
a037693f
...
...
@@ -34,14 +34,14 @@ int conv_profiler(int argc, char* argv[])
{
if
(
argc
!=
25
)
{
printf
(
"arg1: tensor operation (conv
=
Convolution)
\n
"
);
printf
(
"arg2: data type (0
=
fp32
,
1
=
fp16)
\n
"
);
printf
(
"arg3: input tensor layout (0
=
NCHW
,
1
=
NHWC)
\n
"
);
printf
(
"arg4: weight tensor layout (0
=
KCYX
,
1
=
KYXC)
\n
"
);
printf
(
"arg5: output tensor layout (0
=
NKHW
,
1
=
NHWK)
\n
"
);
printf
(
"arg6: verification (0
=
no
,
1
=
yes)
\n
"
);
printf
(
"arg7: initialization (0
=
no init
,
1
=
integer value
,
2
=
decimal value)
\n
"
);
printf
(
"arg8: print
matrix
value (0
=
no
,
1
=
yes)
\n
"
);
printf
(
"arg1: tensor operation (conv
:
Convolution)
\n
"
);
printf
(
"arg2: data type (0
:
fp32
;
1
:
fp16)
\n
"
);
printf
(
"arg3: input tensor layout (0
:
NCHW
;
1
:
NHWC)
\n
"
);
printf
(
"arg4: weight tensor layout (0
:
KCYX
;
1
:
KYXC)
\n
"
);
printf
(
"arg5: output tensor layout (0
:
NKHW
;
1
:
NHWK)
\n
"
);
printf
(
"arg6: verification (0
:
no
;
1
:
yes)
\n
"
);
printf
(
"arg7: initialization (0
:
no init
;
1
:
integer value
;
2
:
decimal value)
\n
"
);
printf
(
"arg8: print
tensor
value (0
:
no
;
1
:
yes)
\n
"
);
printf
(
"arg9: run kernel # of times (>1)
\n
"
);
printf
(
"arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx
\n
"
);
...
...
profiler/gemm_profiler.cpp
View file @
a037693f
...
...
@@ -37,12 +37,15 @@ int gemm_profiler(int argc, char* argv[])
{
if
(
argc
!=
14
)
{
printf
(
"arg1: tensor operation (gemm=GEMM)
\n
"
);
printf
(
"arg2: data type (0=fp32, 1=fp16)
\n
"
);
printf
(
"arg3: matrix layout (0=NN, 1=NT, 2=TN, 3=TT)
\n
"
);
printf
(
"arg4: verification (0=no, 1=yes)
\n
"
);
printf
(
"arg5: initialization (0=no init, 1=integer value, 2=decimal value)
\n
"
);
printf
(
"arg6: print matrix value (0=no, 1=yes)
\n
"
);
printf
(
"arg1: tensor operation (gemm: GEMM)
\n
"
);
printf
(
"arg2: data type (0: fp32; 1: fp16)
\n
"
);
printf
(
"arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];
\n
"
);
printf
(
" 1: A[m, k] * B[n, k] = C[m, n];
\n
"
);
printf
(
" 2: A[k, n] * B[k, n] = C[m, n];
\n
"
);
printf
(
" 3: A[k, n] * B[n, k] = C[m, n])
\n
"
);
printf
(
"arg4: verification (0: no; 1: yes)
\n
"
);
printf
(
"arg5: initialization (0: no init; 1: integer value; 2: decimal value)
\n
"
);
printf
(
"arg8: print tensor value (0: no; 1: yes)
\n
"
);
printf
(
"arg7: run kernel # of times (>1)
\n
"
);
printf
(
"arg8 to 13: M, N, K, StrideA, StrideB, StrideC
\n
"
);
exit
(
1
);
...
...
@@ -70,8 +73,16 @@ int gemm_profiler(int argc, char* argv[])
ck
::
half_t
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
nrepeat
,
M
,
N
,
K
,
StrideA
,
StrideB
,
StrideC
);
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
nrepeat
,
M
,
N
,
K
,
(
StrideA
<
0
)
?
K
:
StrideA
,
(
StrideB
<
0
)
?
N
:
StrideB
,
(
StrideC
<
0
)
?
N
:
StrideC
);
}
else
if
(
data_type
==
GemmDataType
::
F16_F16_F16
&&
layout
==
GemmMatrixLayout
::
MK_NK_MN
)
{
...
...
@@ -80,8 +91,16 @@ int gemm_profiler(int argc, char* argv[])
ck
::
half_t
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
nrepeat
,
M
,
N
,
K
,
StrideA
,
StrideB
,
StrideC
);
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
nrepeat
,
M
,
N
,
K
,
(
StrideA
<
0
)
?
K
:
StrideA
,
(
StrideB
<
0
)
?
K
:
StrideB
,
(
StrideC
<
0
)
?
N
:
StrideC
);
}
else
if
(
data_type
==
GemmDataType
::
F16_F16_F16
&&
layout
==
GemmMatrixLayout
::
KM_KN_MN
)
{
...
...
@@ -90,8 +109,16 @@ int gemm_profiler(int argc, char* argv[])
ck
::
half_t
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
nrepeat
,
M
,
N
,
K
,
StrideA
,
StrideB
,
StrideC
);
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
nrepeat
,
M
,
N
,
K
,
(
StrideA
<
0
)
?
M
:
StrideA
,
(
StrideB
<
0
)
?
N
:
StrideB
,
(
StrideC
<
0
)
?
N
:
StrideC
);
}
else
if
(
data_type
==
GemmDataType
::
F16_F16_F16
&&
layout
==
GemmMatrixLayout
::
KM_NK_MN
)
{
...
...
@@ -100,8 +127,16 @@ int gemm_profiler(int argc, char* argv[])
ck
::
half_t
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
nrepeat
,
M
,
N
,
K
,
StrideA
,
StrideB
,
StrideC
);
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
nrepeat
,
M
,
N
,
K
,
(
StrideA
<
0
)
?
M
:
StrideA
,
(
StrideB
<
0
)
?
K
:
StrideB
,
(
StrideC
<
0
)
?
N
:
StrideC
);
}
else
if
(
data_type
==
GemmDataType
::
F32_F32_F32
&&
layout
==
GemmMatrixLayout
::
MK_KN_MN
)
{
...
...
@@ -110,8 +145,16 @@ int gemm_profiler(int argc, char* argv[])
float
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
nrepeat
,
M
,
N
,
K
,
StrideA
,
StrideB
,
StrideC
);
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
nrepeat
,
M
,
N
,
K
,
(
StrideA
<
0
)
?
K
:
StrideA
,
(
StrideB
<
0
)
?
N
:
StrideB
,
(
StrideC
<
0
)
?
N
:
StrideC
);
}
else
if
(
data_type
==
GemmDataType
::
F32_F32_F32
&&
layout
==
GemmMatrixLayout
::
MK_NK_MN
)
{
...
...
@@ -120,8 +163,16 @@ int gemm_profiler(int argc, char* argv[])
float
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
nrepeat
,
M
,
N
,
K
,
StrideA
,
StrideB
,
StrideC
);
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
nrepeat
,
M
,
N
,
K
,
(
StrideA
<
0
)
?
K
:
StrideA
,
(
StrideB
<
0
)
?
K
:
StrideB
,
(
StrideC
<
0
)
?
N
:
StrideC
);
}
else
if
(
data_type
==
GemmDataType
::
F32_F32_F32
&&
layout
==
GemmMatrixLayout
::
KM_KN_MN
)
{
...
...
@@ -130,8 +181,16 @@ int gemm_profiler(int argc, char* argv[])
float
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
nrepeat
,
M
,
N
,
K
,
StrideA
,
StrideB
,
StrideC
);
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
nrepeat
,
M
,
N
,
K
,
(
StrideA
<
0
)
?
M
:
StrideA
,
(
StrideB
<
0
)
?
N
:
StrideB
,
(
StrideC
<
0
)
?
N
:
StrideC
);
}
else
if
(
data_type
==
GemmDataType
::
F32_F32_F32
&&
layout
==
GemmMatrixLayout
::
KM_NK_MN
)
{
...
...
@@ -140,8 +199,16 @@ int gemm_profiler(int argc, char* argv[])
float
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
nrepeat
,
M
,
N
,
K
,
StrideA
,
StrideB
,
StrideC
);
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
nrepeat
,
M
,
N
,
K
,
(
StrideA
<
0
)
?
M
:
StrideA
,
(
StrideB
<
0
)
?
K
:
StrideB
,
(
StrideC
<
0
)
?
N
:
StrideC
);
}
else
{
...
...
script/profile_gemm.sh
View file @
a037693f
...
...
@@ -18,7 +18,28 @@ REPEAT=$7
######## op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC
#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 256 256 256 256 256 256
#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 960 1024 1024 1024 1024 1024
#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1024 1024 1024
#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1920 2048 2048 2048 2048 2048
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
3840 4096 4096 4096 4096 4096
#
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 3840 4096 4096 4096 4096 4096
#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 7680 8192 8192 8192 8192 8192
#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1024 1024 1024
#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2048 2048 2048
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
960 1024 1024
-1
-1
-1
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
1920 2048 2048
-1
-1
-1
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
3840 4096 4096
-1
-1
-1
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
7680 8192 8192
-1
-1
-1
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
1024 1024 1024 1024 1024 1024
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
2048 2048 2048 2048 2048 2048
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
4096 4096 4096 4096 4096 4096
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
8192 8192 8192 8192 8192 8192
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
1024 1024 1024 1056 1056 1056
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
2048 2048 2048 2080 2080 2080
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
4096 4096 4096 4128 4128 4128
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
8192 8192 8192 8224 8224 8224
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
1024 1024 1024 1088 1088 1088
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
2048 2048 2048 2112 2112 2112
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
4096 4096 4096 4160 4160 4160
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
8192 8192 8192 8256 8256 8256
test/CMakeLists.txt
0 → 100644
View file @
a037693f
# Header search paths shared by the test executables in this directory.
# BEFORE prepends them to the directories that were already registered.
include_directories(BEFORE
    include
    ${PROJECT_SOURCE_DIR}/host/host_tensor/include
    ${PROJECT_SOURCE_DIR}/host/device/include
    ${PROJECT_SOURCE_DIR}/device_operation/include
    ${PROJECT_SOURCE_DIR}/composable_kernel/include
    ${PROJECT_SOURCE_DIR}/composable_kernel/include/utility
    ${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_description
    ${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_operation
    ${PROJECT_SOURCE_DIR}/composable_kernel/include/problem_transform
    ${PROJECT_SOURCE_DIR}/external/rocm/include
)

# Magic-number division test: one source file, linked against host_tensor.
set(MAGIC_NUMBER_DIVISION_SOURCE magic_number_division/main.cpp)

add_executable(test_magic_number_division ${MAGIC_NUMBER_DIVISION_SOURCE})
target_link_libraries(test_magic_number_division PRIVATE host_tensor)
test/magic_number_division/main.cpp
0 → 100644
View file @
a037693f
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
// Kernel: for every index below num, writes
// ck::MagicDivision::DoMagicDivision(p_dividend[i], magic_multiplier, magic_shift)
// to p_result[i].
//
// Each thread starts at its global thread id and advances by the total number
// of launched threads (grid-stride loop), so any launch configuration covers
// all num elements.
__global__ void gpu_magic_number_division(uint32_t magic_multiplier,
                                          uint32_t magic_shift,
                                          const int32_t* p_dividend,
                                          int32_t* p_result,
                                          uint64_t num)
{
    // Widen one operand before multiplying so the products are evaluated in
    // 64 bits; otherwise they are computed at the width of the built-in index
    // values (presumably 32-bit) and only widened afterwards.
    const uint64_t global_thread_num = static_cast<uint64_t>(blockDim.x) * gridDim.x;
    const uint64_t global_thread_id =
        static_cast<uint64_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    for(uint64_t data_id = global_thread_id; data_id < num; data_id += global_thread_num)
    {
        p_result[data_id] =
            ck::MagicDivision::DoMagicDivision(p_dividend[data_id], magic_multiplier, magic_shift);
    }
}
// Kernel: for every index below num, writes p_dividend[i] / divisor to
// p_result[i] using the built-in integer division. Serves as the reference
// for the magic-number variant.
//
// Each thread starts at its global thread id and advances by the total number
// of launched threads (grid-stride loop).
__global__ void
gpu_naive_division(int32_t divisor, const int32_t* p_dividend, int32_t* p_result, uint64_t num)
{
    // Widen one operand before multiplying so the products are evaluated in
    // 64 bits; otherwise they are computed at the width of the built-in index
    // values (presumably 32-bit) and only widened afterwards.
    const uint64_t global_thread_num = static_cast<uint64_t>(blockDim.x) * gridDim.x;
    const uint64_t global_thread_id =
        static_cast<uint64_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    for(uint64_t data_id = global_thread_id; data_id < num; data_id += global_thread_num)
    {
        p_result[data_id] = p_dividend[data_id] / divisor;
    }
}
// Returns the largest absolute element-wise difference between ref and
// result, or 0 when ref is empty.
//
// Elements are compared over the index range of ref; result must hold at
// least ref.size() elements.
template <typename T>
T check_error(const std::vector<T>& ref, const std::vector<T>& result)
{
    T max_diff = 0;

    for(std::size_t i = 0; i < ref.size(); ++i)
    {
        const T diff = std::abs(ref[i] - result[i]);

        if(max_diff < diff)
        {
            max_diff = diff;
        }
    }

    return max_diff;
}
// Checks ck::MagicDivision on the GPU against plain integer division.
//
// For every divisor in [1, num_divisor] the dividends 0 .. num_dividend - 1
// are divided on the device twice (naive kernel and magic-number kernel) and
// the two result arrays are compared on the host.
//
// Returns 0 when every divisor produced identical results, 1 otherwise.
int main(int, char*[])
{
    uint64_t num_divisor  = 4096;
    uint64_t num_dividend = 1L << 16;

    std::vector<int32_t> divisors_host(num_divisor);
    std::vector<int32_t> dividends_host(num_dividend);

    // generate divisor
    for(uint64_t i = 0; i < num_divisor; ++i)
    {
        divisors_host[i] = static_cast<int32_t>(i + 1);
    }

    // generate dividend: fill the whole buffer, not just the first
    // num_divisor entries
    for(uint64_t i = 0; i < num_dividend; ++i)
    {
        dividends_host[i] = static_cast<int32_t>(i);
    }

    DeviceMem dividends_dev_buf(sizeof(int32_t) * num_dividend);
    DeviceMem naive_result_dev_buf(sizeof(int32_t) * num_dividend);
    DeviceMem magic_result_dev_buf(sizeof(int32_t) * num_dividend);

    std::vector<int32_t> naive_result_host(num_dividend);
    std::vector<int32_t> magic_result_host(num_dividend);

    dividends_dev_buf.ToDevice(dividends_host.data());

    bool pass = true;

    for(std::size_t i = 0; i < num_divisor; ++i)
    {
        // run naive division on GPU
        gpu_naive_division<<<1024, 256>>>(
            divisors_host[i],
            static_cast<const int32_t*>(dividends_dev_buf.GetDeviceBuffer()),
            static_cast<int32_t*>(naive_result_dev_buf.GetDeviceBuffer()),
            num_dividend);

        // calculate magic number
        uint32_t magic_multiplier, magic_shift;

        ck::tie(magic_multiplier, magic_shift) =
            ck::MagicDivision::CalculateMagicNumbers(divisors_host[i]);

        // run magic division on GPU
        gpu_magic_number_division<<<1024, 256>>>(
            magic_multiplier,
            magic_shift,
            static_cast<const int32_t*>(dividends_dev_buf.GetDeviceBuffer()),
            static_cast<int32_t*>(magic_result_dev_buf.GetDeviceBuffer()),
            num_dividend);

        naive_result_dev_buf.FromDevice(naive_result_host.data());
        magic_result_dev_buf.FromDevice(magic_result_host.data());

        int32_t max_diff = check_error(naive_result_host, magic_result_host);

        // Keep going after a mismatch so that every divisor is exercised.
        if(max_diff != 0)
        {
            pass = false;
        }
    }

    if(pass)
    {
        std::cout << "test magic number division: Pass" << std::endl;
    }
    else
    {
        std::cout << "test magic number division: Fail" << std::endl;
    }

    // Report the outcome through the exit status so test runners can see it.
    return pass ? 0 : 1;
}
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment