gaoqiong / composable_kernel · Commits

Commit fe427fd1, authored Jul 23, 2021 by Jing Zhang
    init commit for conv+activ

parent b8bb1480
Showing 6 changed files with 901 additions and 36 deletions (+901 −36)
+711 −2    composable_kernel/include/driver/driver_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp
+1 −1      composable_kernel/include/tensor_description/dynamic_multi_index_transform_helper.hpp
+13 −0     composable_kernel/include/tensor_operation/gridwise_static_gemm_v2.hpp
+62 −29    host/driver_offline/conv_fwd_driver_offline.cpp
+7 −4      host/driver_offline/include/device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
+107 −0    host/host_tensor/include/host_conv.hpp
composable_kernel/include/driver/driver_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp (view file @ fe427fd1)

@@ -173,8 +173,8 @@ struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
             make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K0, Ho, Wo, K1)),
             make_tuple(make_merge_transform(make_tuple(K0, K1)),
                        make_pass_through_transform(N),
-                       make_pad_transform(Ho, I0, OutRightPadH),
-                       make_pad_transform(Wo, I0, OutRightPadW)),
+                       make_right_pad_transform(Ho, OutRightPadH),
+                       make_right_pad_transform(Wo, OutRightPadW)),
             make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
             make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
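This hunk swaps the generic pad transform, which took an explicit zero left pad (I0) plus OutRightPad, for the dedicated right-pad transform extended in dynamic_multi_index_transform_helper.hpp below. A minimal standalone model of what a right-pad view does to one dimension, with hypothetical sizes (not the library's implementation):

    #include <cassert>

    // Minimal model of a right-pad view: a dimension of lower length `low` is
    // exposed with upper length `low + right_pad`; upper indices map through
    // unchanged, and indices past `low` fall in the (invalid) pad region.
    struct RightPadDim
    {
        int low;       // original length, e.g. Ho
        int right_pad; // elements appended on the right, e.g. OutRightPadH
        int up_length() const { return low + right_pad; }
        bool is_valid(int up_idx) const { return up_idx < low; }
    };

    int main()
    {
        RightPadDim ho{270, 2}; // hypothetical Ho = 270 padded up to 272
        assert(ho.up_length() == 272);
        assert(ho.is_valid(269) && !ho.is_valid(271));
    }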
@@ -387,5 +387,714 @@ struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
         }
     }
 };

template <index_t BlockSize,
          typename FloatAB,
          typename FloatAcc,
          typename FloatC,
          index_t KPerBlock,
          index_t HoPerBlock,
          index_t WoPerBlock,
          index_t EPerBlock,
          index_t KPerThread,
          index_t HoPerThread,
          index_t WoPerThread,
          index_t EPerThread,
          typename ABlockTransferThreadSliceLengths_E_K,
          typename ABlockTransferThreadClusterLengths_E_K,
          index_t ABlockTransferSrcScalarPerVector_E,
          index_t ABlockTransferDstScalarPerVector_K,
          index_t BThreadTransferSrcScalarPerVector_W,
          index_t CThreadTransferDstScalarPerVector_W>
struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad_1x1
{
    template <typename... Wei,
              typename... In,
              typename... Out,
              typename ConvStrides,
              typename ConvDilations,
              typename InLeftPads,
              typename InRightPads>
    __host__ void Run(const DynamicTensorDescriptor<Wei...>& wei_k_c_y_x_global_desc,
                      const DynamicTensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
                      const DynamicTensorDescriptor<Out...>& out_n_k0_ho_wo_k1_global_desc,
                      const ConvStrides& conv_strides,
                      const ConvDilations& conv_dilations,
                      const InLeftPads& in_left_pads,
                      const InRightPads& in_right_pads_,
                      const FloatAB* __restrict__ p_wei_global,
                      const FloatAB* __restrict__ p_in_global,
                      FloatC* __restrict__ p_out_global) const
    {
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};
        constexpr auto I4 = Number<4>{};

        const auto N_  = in_n_c_hi_wi_global_desc.GetLength(I0);
        const auto C_  = in_n_c_hi_wi_global_desc.GetLength(I1);
        const auto Hi_ = in_n_c_hi_wi_global_desc.GetLength(I2);
        const auto Wi_ = in_n_c_hi_wi_global_desc.GetLength(I3);

        const auto K0_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I1);
        const auto Ho_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I2);
        const auto Wo_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I3);
        const auto K1_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I4);

        const auto K_ = wei_k_c_y_x_global_desc.GetLength(I0);
        const auto Y_ = wei_k_c_y_x_global_desc.GetLength(I2);
        const auto X_ = wei_k_c_y_x_global_desc.GetLength(I3);

        constexpr auto N  = Number<N_>{};
        constexpr auto C  = Number<C_>{};
        constexpr auto K0 = Number<K0_>{};
        constexpr auto K1 = Number<K1_>{};
        constexpr auto Hi = Number<Hi_>{};
        constexpr auto Wi = Number<Wi_>{};
        constexpr auto Ho = Number<Ho_>{};
        constexpr auto Wo = Number<Wo_>{};
        constexpr auto K  = Number<K_>{};
        constexpr auto Y  = Number<Y_>{};
        constexpr auto X  = Number<X_>{};

        const auto ConvStrideH_   = conv_strides[I0];
        const auto ConvStrideW_   = conv_strides[I1];
        const auto ConvDilationH_ = conv_dilations[I0];
        const auto ConvDilationW_ = conv_dilations[I1];

        constexpr auto ConvStrideH   = Number<ConvStrideH_>{};
        constexpr auto ConvStrideW   = Number<ConvStrideW_>{};
        constexpr auto ConvDilationH = Number<ConvDilationH_>{};
        constexpr auto ConvDilationW = Number<ConvDilationW_>{};

        constexpr auto Hop = Number<(Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock>{};
        constexpr auto Wop = Number<(Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock>{};

        constexpr auto OutRightPadH = Hop - Ho;
        constexpr auto OutRightPadW = Wop - Wo;

        const auto InLeftPadH_ = in_left_pads[I0];
        const auto InLeftPadW_ = in_left_pads[I1];

        constexpr auto InLeftPadH = Number<InLeftPadH_>{};
        constexpr auto InLeftPadW = Number<InLeftPadW_>{};

        static_assert(InLeftPadH == 0 and InLeftPadW == 0, "");

        constexpr auto in_right_pads = InRightPads{};

        const auto InRightPadH_ = in_right_pads[I0] + OutRightPadH * ConvStrideH;
        const auto InRightPadW_ = in_right_pads[I1] + OutRightPadW * ConvStrideW;

        constexpr auto InRightPadH = Number<InRightPadH_>{};
        constexpr auto InRightPadW = Number<InRightPadW_>{};

        std::cerr << "OutRightPadH = " << OutRightPadH << " OutRightPadW = " << OutRightPadW
                  << std::endl;
        std::cerr << "InRightPadH = " << InRightPadH << " InRightPadW = " << InRightPadW
                  << std::endl;

        // weight tensor
        const auto wei_e_k_global_desc = transform_dynamic_tensor_descriptor(
            make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C)),
            make_tuple(make_pass_through_transform(K), make_pass_through_transform(C)),
            make_tuple(Sequence<0>{}, Sequence<1>{}),
            make_tuple(Sequence<1>{}, Sequence<0>{}));

        static_assert(wei_e_k_global_desc.IsKnownAtCompileTime(),
                      "wrong! wei_e_k_global_desc need to known at compile-time");

        // input tensor
        const auto in_e_n_ho_wo_global_desc = transform_dynamic_tensor_descriptor(
            in_n_c_hi_wi_global_desc,
            make_tuple(make_pass_through_transform(C),
                       make_pass_through_transform(N),
                       make_right_pad_transform(Hi, InRightPadH),
                       make_right_pad_transform(Wi, InRightPadW)),
            make_tuple(Sequence<1>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));

        static_assert(in_e_n_ho_wo_global_desc.IsKnownAtCompileTime(),
                      "wrong! in_e_n_ho_wo_global_desc need to known at compile-time");

        // output tensor
        const auto out_k_n_hop_wop_global_desc = transform_dynamic_tensor_descriptor(
            make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K0, Ho, Wo, K1)),
            make_tuple(make_merge_transform(make_tuple(K0, K1)),
                       make_pass_through_transform(N),
                       make_right_pad_transform(Ho, OutRightPadH),
                       make_right_pad_transform(Wo, OutRightPadW)),
            make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));

        static_assert(out_k_n_hop_wop_global_desc.IsKnownAtCompileTime(),
                      "wrong! out_k_n_hop_wop_global_desc need to known at compile-time");

        const auto E = C;

        std::cerr << "Hop = " << Hop << " Wop = " << Wop << std::endl;

        if(!((K % KPerBlock) == 0 && (Hop % HoPerBlock) == 0 && (Wop % WoPerBlock) == 0 &&
             (E % EPerBlock) == 0))
        {
            throw std::runtime_error("wrong! GEMM size no divisible");
        }

        // hack to control index calculation when iterating over a_k_m_global tensor
        constexpr auto a_e_k_global_iterator_hacks =
            make_tuple(make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}),
                       make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}));

        constexpr auto a_e_k_global_move_slice_window_iterator_hack = Sequence<0, 0, 0>{};

        constexpr auto b_e_n_ho_wo_global_iterator_hacks =
            make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{}),
                       make_tuple(Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{}));

        constexpr auto b_e_n_ho_wo_global_move_slice_window_iterator_hack =
            Sequence<0, 0, 0, 0, 0>{};

        // hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor
        // hack for NKHW format
        constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks =
            make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{}),
                       make_tuple(Sequence<0, 2, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{}));

        // GEMM
        using gridwise_gemm = GridwiseStaticGemm_km_kn_mn_v3<
            BlockSize,
            FloatAB,
            FloatAcc,
            FloatC,
            InMemoryDataOperation::Set,
            decltype(wei_e_k_global_desc),
            decltype(in_e_n_ho_wo_global_desc),
            decltype(out_k_n_hop_wop_global_desc),
            KPerBlock,
            HoPerBlock,
            WoPerBlock,
            EPerBlock,
            KPerThread,
            HoPerThread,
            WoPerThread,
            EPerThread,
            ABlockTransferThreadSliceLengths_E_K,
            ABlockTransferThreadClusterLengths_E_K,
            Sequence<1, 0>,
            Sequence<1, 0>,
            0,
            ABlockTransferSrcScalarPerVector_E,
            ABlockTransferDstScalarPerVector_K,
            false, // don't move back src coordinate after threadwise copy
            Sequence<0, 2, 3, 1>,
            3,
            BThreadTransferSrcScalarPerVector_W,
            false, // don't move back src coordinate after threadwise copy, which will be fused
                   // with MoveSrcSliceWindow() to save addr computation
            Sequence<0, 2, 3, 1>,
            0,
            CThreadTransferDstScalarPerVector_W,
            decltype(a_e_k_global_iterator_hacks),
            decltype(b_e_n_ho_wo_global_iterator_hacks),
            decltype(c_k_n_ho_wo_global_tensor_iterator_hacks),
            decltype(a_e_k_global_move_slice_window_iterator_hack),
            decltype(b_e_n_ho_wo_global_move_slice_window_iterator_hack)>;

        const auto GridSize = (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N;

        constexpr bool has_main_k_block_loop = (E + EPerBlock) / (2 * EPerBlock) > 1;

        constexpr bool has_double_tail_k_block_loop = (E / EPerBlock) % 2 == 0;

        index_t nrepeat = 100;

        std::cout << "conv_v5r1__NCHWc" << K1 << "_n" << N << "c" << C << "h" << Hi << "w" << Wi
                  << "-k" << K << "c" << C << "y" << Y << "x" << X << "-u" << conv_strides[I0]
                  << "v" << conv_strides[I1] << "l" << conv_dilations[I0] << "j"
                  << conv_dilations[I1] << "q" << in_left_pads[I0] << "p" << in_right_pads[I0]
                  << std::endl;

        std::cout << "GridSize = " << GridSize << " BlockSize = " << BlockSize << std::endl;

        for(index_t i = 0; i < 5; ++i)
        {
            std::cout << "Start running " << nrepeat << " times..." << std::endl;

            KernelTimer timer;
            timer.Start();

            std::cout << "has_main_k_block_loop: " << has_main_k_block_loop
                      << " has_double_tail_k_block_loop: " << has_double_tail_k_block_loop
                      << std::endl;

            for(index_t j = 0; j < nrepeat; ++j)
            {
                if constexpr(has_main_k_block_loop && has_double_tail_k_block_loop)
                {
                    const auto kernel = run_gridwise_operation<gridwise_gemm,
                                                               const FloatAB*,
                                                               const FloatAB*,
                                                               FloatC*,
                                                               integral_constant<bool, true>,
                                                               integral_constant<bool, true>>;

                    launch_kernel(kernel, dim3(GridSize), dim3(BlockSize), 0, 0,
                                  p_wei_global, p_in_global, p_out_global,
                                  integral_constant<bool, true>{},
                                  integral_constant<bool, true>{});
                }
                else if constexpr(has_main_k_block_loop && !has_double_tail_k_block_loop)
                {
                    const auto kernel = run_gridwise_operation<gridwise_gemm,
                                                               const FloatAB*,
                                                               const FloatAB*,
                                                               FloatC*,
                                                               integral_constant<bool, true>,
                                                               integral_constant<bool, false>>;

                    launch_kernel(kernel, dim3(GridSize), dim3(BlockSize), 0, 0,
                                  p_wei_global, p_in_global, p_out_global,
                                  integral_constant<bool, true>{},
                                  integral_constant<bool, false>{});
                }
                else if constexpr(!has_main_k_block_loop && has_double_tail_k_block_loop)
                {
                    const auto kernel = run_gridwise_operation<gridwise_gemm,
                                                               const FloatAB*,
                                                               const FloatAB*,
                                                               FloatC*,
                                                               integral_constant<bool, false>,
                                                               integral_constant<bool, true>>;

                    launch_kernel(kernel, dim3(GridSize), dim3(BlockSize), 0, 0,
                                  p_wei_global, p_in_global, p_out_global,
                                  integral_constant<bool, false>{},
                                  integral_constant<bool, true>{});
                }
                else
                {
                    const auto kernel = run_gridwise_operation<gridwise_gemm,
                                                               const FloatAB*,
                                                               const FloatAB*,
                                                               FloatC*,
                                                               integral_constant<bool, false>,
                                                               integral_constant<bool, false>>;

                    launch_kernel(kernel, dim3(GridSize), dim3(BlockSize), 0, 0,
                                  p_wei_global, p_in_global, p_out_global,
                                  integral_constant<bool, false>{},
                                  integral_constant<bool, false>{});
                }
            }

            timer.End();

            float ave_time = timer.GetElapsedTime() / nrepeat;

            float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc,
                                                            wei_k_c_y_x_global_desc,
                                                            out_n_k0_ho_wo_k1_global_desc) /
                         (std::size_t(1000) * 1000 * 1000) / ave_time;

            std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
                      << std::endl;
        }
    }
};
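The round-up of Ho and Wo to whole block tiles is what produces the output right padding and the grid size above. A standalone check of the arithmetic, using hypothetical sizes:

    #include <cstdio>

    // Round the output extents up to multiples of the per-block tile, as the
    // driver does; the remainder becomes right padding. All sizes hypothetical.
    int main()
    {
        const int Ho = 270, Wo = 480, HoPerBlock = 16, WoPerBlock = 32;
        const int Hop = (Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock; // 272
        const int Wop = (Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock; // 480
        const int OutRightPadH = Hop - Ho;                               // 2
        const int OutRightPadW = Wop - Wo;                               // 0
        const int K = 16, KPerBlock = 16, N = 1;
        const int GridSize =
            (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N; // 1*17*15*1 = 255
        std::printf("Hop=%d Wop=%d padH=%d padW=%d grid=%d\n",
                    Hop, Wop, OutRightPadH, OutRightPadW, GridSize);
    }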
template <index_t BlockSize,
          typename FloatAB,
          typename FloatAcc,
          typename FloatC,
          index_t KPerBlock,
          index_t HoPerBlock,
          index_t WoPerBlock,
          index_t EPerBlock,
          index_t KPerThread,
          index_t HoPerThread,
          index_t WoPerThread,
          index_t EPerThread,
          typename ABlockTransferThreadSliceLengths_E_K,
          typename ABlockTransferThreadClusterLengths_E_K,
          index_t ABlockTransferSrcScalarPerVector_E,
          index_t ABlockTransferDstScalarPerVector_K,
          index_t BThreadTransferSrcScalarPerVector_W,
          index_t CThreadTransferDstScalarPerVector_W>
struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_1x1
{
    template <typename... Wei,
              typename... In,
              typename... Out,
              typename ConvStrides,
              typename ConvDilations,
              typename InLeftPads,
              typename InRightPads>
    __host__ void Run(const DynamicTensorDescriptor<Wei...>& wei_k_c_y_x_global_desc,
                      const DynamicTensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
                      const DynamicTensorDescriptor<Out...>& out_n_k0_ho_wo_k1_global_desc,
                      const ConvStrides& conv_strides,
                      const ConvDilations& conv_dilations,
                      const InLeftPads& in_left_pads,
                      const InRightPads& in_right_pads_,
                      const FloatAB* __restrict__ p_wei_global,
                      const FloatAB* __restrict__ p_in_global,
                      FloatC* __restrict__ p_out_global) const
    {
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};
        constexpr auto I4 = Number<4>{};

        const auto N_  = in_n_c_hi_wi_global_desc.GetLength(I0);
        const auto C_  = in_n_c_hi_wi_global_desc.GetLength(I1);
        const auto Hi_ = in_n_c_hi_wi_global_desc.GetLength(I2);
        const auto Wi_ = in_n_c_hi_wi_global_desc.GetLength(I3);

        const auto K0_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I1);
        const auto Ho_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I2);
        const auto Wo_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I3);
        const auto K1_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I4);

        const auto K_ = wei_k_c_y_x_global_desc.GetLength(I0);
        const auto Y_ = wei_k_c_y_x_global_desc.GetLength(I2);
        const auto X_ = wei_k_c_y_x_global_desc.GetLength(I3);

        constexpr auto N  = Number<N_>{};
        constexpr auto C  = Number<C_>{};
        constexpr auto K0 = Number<K0_>{};
        constexpr auto K1 = Number<K1_>{};
        constexpr auto Hi = Number<Hi_>{};
        constexpr auto Wi = Number<Wi_>{};
        constexpr auto Ho = Number<Ho_>{};
        constexpr auto Wo = Number<Wo_>{};
        constexpr auto K  = Number<K_>{};
        constexpr auto Y  = Number<Y_>{};
        constexpr auto X  = Number<X_>{};

        const auto ConvStrideH_   = conv_strides[I0];
        const auto ConvStrideW_   = conv_strides[I1];
        const auto ConvDilationH_ = conv_dilations[I0];
        const auto ConvDilationW_ = conv_dilations[I1];

        constexpr auto ConvStrideH   = Number<ConvStrideH_>{};
        constexpr auto ConvStrideW   = Number<ConvStrideW_>{};
        constexpr auto ConvDilationH = Number<ConvDilationH_>{};
        constexpr auto ConvDilationW = Number<ConvDilationW_>{};

        constexpr auto Hop = Number<(Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock>{};
        constexpr auto Wop = Number<(Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock>{};

        constexpr auto OutRightPadH = Hop - Ho;
        constexpr auto OutRightPadW = Wop - Wo;

        const auto InLeftPadH_ = in_left_pads[I0];
        const auto InLeftPadW_ = in_left_pads[I1];

        constexpr auto InLeftPadH = Number<InLeftPadH_>{};
        constexpr auto InLeftPadW = Number<InLeftPadW_>{};

        static_assert(InLeftPadH == 0 and InLeftPadW == 0, "");

        constexpr auto in_right_pads = InRightPads{};

        const auto InRightPadH_ = in_right_pads[I0] + OutRightPadH * ConvStrideH;
        const auto InRightPadW_ = in_right_pads[I1] + OutRightPadW * ConvStrideW;

        constexpr auto InRightPadH = Number<InRightPadH_>{};
        constexpr auto InRightPadW = Number<InRightPadW_>{};

        static_assert(OutRightPadW == 0 and OutRightPadH == 0, "");

        // weight tensor
        const auto wei_e_k_global_desc = transform_dynamic_tensor_descriptor(
            make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C)),
            make_tuple(make_pass_through_transform(K), make_pass_through_transform(C)),
            make_tuple(Sequence<0>{}, Sequence<1>{}),
            make_tuple(Sequence<1>{}, Sequence<0>{}));

        static_assert(wei_e_k_global_desc.IsKnownAtCompileTime(),
                      "wrong! wei_e_k_global_desc need to known at compile-time");

        // input tensor
        const auto in_e_n_ho_wo_global_desc = transform_dynamic_tensor_descriptor(
            in_n_c_hi_wi_global_desc,
            make_tuple(make_pass_through_transform(C),
                       make_pass_through_transform(N),
                       make_pass_through_transform(Ho),
                       make_pass_through_transform(Wo)),
            make_tuple(Sequence<1>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));

        static_assert(in_e_n_ho_wo_global_desc.IsKnownAtCompileTime(),
                      "wrong! in_e_n_ho_wo_global_desc need to known at compile-time");

        // output tensor
        const auto out_k_n_hop_wop_global_desc = transform_dynamic_tensor_descriptor(
            make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K0, Ho, Wo, K1)),
            make_tuple(make_merge_transform(make_tuple(K0, K1)),
                       make_pass_through_transform(N),
                       make_pass_through_transform(Ho),
                       make_pass_through_transform(Wo)),
            make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));

        static_assert(out_k_n_hop_wop_global_desc.IsKnownAtCompileTime(),
                      "wrong! out_k_n_hop_wop_global_desc need to known at compile-time");

        const auto E = C;

        std::cerr << "Hop = " << Hop << " Wop = " << Wop << std::endl;

        if(!((K % KPerBlock) == 0 && (Hop % HoPerBlock) == 0 && (Wop % WoPerBlock) == 0 &&
             (E % EPerBlock) == 0))
        {
            throw std::runtime_error("wrong! GEMM size no divisible");
        }

        // hack to control index calculation when iterating over a_k_m_global tensor
        constexpr auto a_e_k_global_iterator_hacks =
            make_tuple(make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}),
                       make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}));

        constexpr auto a_e_k_global_move_slice_window_iterator_hack = Sequence<0, 0, 0>{};

        constexpr auto b_e_n_ho_wo_global_iterator_hacks =
            make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{}),
                       make_tuple(Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{}));

        constexpr auto b_e_n_ho_wo_global_move_slice_window_iterator_hack =
            Sequence<0, 0, 0, 0, 0>{};

        // hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor
        // hack for NKHW format
        constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks =
            make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{}),
                       make_tuple(Sequence<0, 2, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{}));

        // GEMM
        using gridwise_gemm = GridwiseStaticGemm_km_kn_mn_v3<
            BlockSize,
            FloatAB,
            FloatAcc,
            FloatC,
            InMemoryDataOperation::Set,
            decltype(wei_e_k_global_desc),
            decltype(in_e_n_ho_wo_global_desc),
            decltype(out_k_n_hop_wop_global_desc),
            KPerBlock,
            HoPerBlock,
            WoPerBlock,
            EPerBlock,
            KPerThread,
            HoPerThread,
            WoPerThread,
            EPerThread,
            ABlockTransferThreadSliceLengths_E_K,
            ABlockTransferThreadClusterLengths_E_K,
            Sequence<1, 0>,
            Sequence<1, 0>,
            0,
            ABlockTransferSrcScalarPerVector_E,
            ABlockTransferDstScalarPerVector_K,
            false, // don't move back src coordinate after threadwise copy
            Sequence<0, 2, 3, 1>,
            3,
            BThreadTransferSrcScalarPerVector_W,
            false, // don't move back src coordinate after threadwise copy, which will be fused
                   // with MoveSrcSliceWindow() to save addr computation
            Sequence<0, 2, 3, 1>,
            0,
            CThreadTransferDstScalarPerVector_W,
            decltype(a_e_k_global_iterator_hacks),
            decltype(b_e_n_ho_wo_global_iterator_hacks),
            decltype(c_k_n_ho_wo_global_tensor_iterator_hacks),
            decltype(a_e_k_global_move_slice_window_iterator_hack),
            decltype(b_e_n_ho_wo_global_move_slice_window_iterator_hack)>;

        const auto GridSize = (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N;

        constexpr bool has_main_k_block_loop = (E + EPerBlock) / (2 * EPerBlock) > 1;

        constexpr bool has_double_tail_k_block_loop = (E / EPerBlock) % 2 == 0;

        index_t nrepeat = 100;

        std::cout << "conv_v5r1_NCHWc" << K1 << "_n" << N << "c" << C << "h" << Hi << "w" << Wi
                  << "-k" << K << "c" << C << "y" << Y << "x" << X << "-u" << conv_strides[I0]
                  << "v" << conv_strides[I1] << "l" << conv_dilations[I0] << "j"
                  << conv_dilations[I1] << "q" << in_left_pads[I0] << "p" << in_right_pads[I0]
                  << std::endl;

        std::cout << "GridSize = " << GridSize << " BlockSize = " << BlockSize << std::endl;

        for(index_t i = 0; i < 5; ++i)
        {
            std::cout << "Start running " << nrepeat << " times..." << std::endl;

            KernelTimer timer;
            timer.Start();

            std::cout << "has_main_k_block_loop: " << has_main_k_block_loop
                      << " has_double_tail_k_block_loop: " << has_double_tail_k_block_loop
                      << std::endl;

            for(index_t j = 0; j < nrepeat; ++j)
            {
                if constexpr(has_main_k_block_loop && has_double_tail_k_block_loop)
                {
                    const auto kernel = run_gridwise_operation<gridwise_gemm,
                                                               const FloatAB*,
                                                               const FloatAB*,
                                                               FloatC*,
                                                               integral_constant<bool, true>,
                                                               integral_constant<bool, true>>;

                    launch_kernel(kernel, dim3(GridSize), dim3(BlockSize), 0, 0,
                                  p_wei_global, p_in_global, p_out_global,
                                  integral_constant<bool, true>{},
                                  integral_constant<bool, true>{});
                }
                else if constexpr(has_main_k_block_loop && !has_double_tail_k_block_loop)
                {
                    const auto kernel = run_gridwise_operation<gridwise_gemm,
                                                               const FloatAB*,
                                                               const FloatAB*,
                                                               FloatC*,
                                                               integral_constant<bool, true>,
                                                               integral_constant<bool, false>>;

                    launch_kernel(kernel, dim3(GridSize), dim3(BlockSize), 0, 0,
                                  p_wei_global, p_in_global, p_out_global,
                                  integral_constant<bool, true>{},
                                  integral_constant<bool, false>{});
                }
                else if constexpr(!has_main_k_block_loop && has_double_tail_k_block_loop)
                {
                    const auto kernel = run_gridwise_operation<gridwise_gemm,
                                                               const FloatAB*,
                                                               const FloatAB*,
                                                               FloatC*,
                                                               integral_constant<bool, false>,
                                                               integral_constant<bool, true>>;

                    launch_kernel(kernel, dim3(GridSize), dim3(BlockSize), 0, 0,
                                  p_wei_global, p_in_global, p_out_global,
                                  integral_constant<bool, false>{},
                                  integral_constant<bool, true>{});
                }
                else
                {
                    const auto kernel = run_gridwise_operation<gridwise_gemm,
                                                               const FloatAB*,
                                                               const FloatAB*,
                                                               FloatC*,
                                                               integral_constant<bool, false>,
                                                               integral_constant<bool, false>>;

                    launch_kernel(kernel, dim3(GridSize), dim3(BlockSize), 0, 0,
                                  p_wei_global, p_in_global, p_out_global,
                                  integral_constant<bool, false>{},
                                  integral_constant<bool, false>{});
                }
            }

            timer.End();

            float ave_time = timer.GetElapsedTime() / nrepeat;

            float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc,
                                                            wei_k_c_y_x_global_desc,
                                                            out_n_k0_ho_wo_k1_global_desc) /
                         (std::size_t(1000) * 1000 * 1000) / ave_time;

            std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
                      << std::endl;
        }
    }
};

} // namespace ck
#endif
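Both Run() bodies end with the same four-way dispatch: the two loop-shape flags are compile-time constants, so if constexpr selects exactly one kernel instantiation per configuration and the other branches never reach the binary. A minimal sketch of the pattern with hypothetical sizes (not the library's launch machinery):

    #include <iostream>

    // Minimal model of the dispatch: both flags are known at compile time, so
    // `if constexpr` picks one instantiation and discards the other branches.
    template <bool MainLoop, bool DoubleTail>
    void run_gemm() { std::cout << MainLoop << DoubleTail << '\n'; }

    template <int E, int EPerBlock>
    void dispatch()
    {
        constexpr bool has_main = (E + EPerBlock) / (2 * EPerBlock) > 1;
        constexpr bool has_tail = (E / EPerBlock) % 2 == 0;
        if constexpr(has_main && has_tail)       run_gemm<true, true>();
        else if constexpr(has_main && !has_tail) run_gemm<true, false>();
        else if constexpr(!has_main && has_tail) run_gemm<false, true>();
        else                                     run_gemm<false, false>();
    }

    int main() { dispatch<16, 4>(); } // E/EPerBlock = 4: main loop + double tail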
composable_kernel/include/tensor_description/dynamic_multi_index_transform_helper.hpp (view file @ fe427fd1)

@@ -32,7 +32,7 @@ __host__ __device__ constexpr auto make_left_pad_transform(
     return DynamicLeftPad<LowLength, LeftPad, SkipIsValidCheck>{low_length, left_pad};
 }

-template <typename LowLength, typename RightPad, bool SkipIsValidCheck>
+template <typename LowLength, typename RightPad, bool SkipIsValidCheck = false>
 __host__ __device__ constexpr auto
 make_right_pad_transform(const LowLength& low_length,
                          const RightPad& right_pad,
 ...
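The defaulted SkipIsValidCheck is what lets the new driver code call make_right_pad_transform(Ho, OutRightPadH) with only two explicit arguments. A minimal sketch of the mechanism, using a hypothetical stand-in body:

    // Minimal model: defaulting the non-type parameter lets two-argument call
    // sites compile unchanged, while callers may still opt in explicitly.
    template <typename Low, typename Pad, bool SkipIsValidCheck = false>
    constexpr int make_right_pad_like(const Low& low, const Pad& pad)
    {
        return static_cast<int>(low) + static_cast<int>(pad); // stand-in body
    }

    static_assert(make_right_pad_like(270, 2) == 272, "default used");
    static_assert(make_right_pad_like<int, int, true>(270, 2) == 272, "explicit");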
composable_kernel/include/tensor_operation/gridwise_static_gemm_v2.hpp (view file @ fe427fd1)

@@ -346,6 +346,19 @@ struct GridwiseStaticGemm_km_kn_mn_v3
             blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);
         }

+        // activ
+        {
+            constexpr index_t activ_type = 2;
+
+            static_for<0, c_k_n_ho_wo_thread_desc.GetElementSpaceSize(), 1>{}([&](auto i) {
+                if constexpr(activ_type == 1)
+                    c_thread_buf(i) = c_thread_buf[i] >= 0 ? c_thread_buf[i] : 0.0;
+                else if constexpr(activ_type == 2)
+                    c_thread_buf(i) = 1.0 / (1.0 + exp(-c_thread_buf[i]));
+            });
+        }
+
         // output: register to global memory
         {
             // hack to control index calculation when iterating over c_k_n_ho_wo_global tensor
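This added epilogue applies the activation in registers, between the blockwise GEMM and the global store, with activ_type hard-coded to 2 (sigmoid); 1 selects ReLU. A plain-loop model of the same computation (in the kernel, static_for unrolls this over the per-thread accumulators at compile time; sizes hypothetical):

    #include <cmath>
    #include <cstdio>

    enum { kPassthrough = 0, kRelu = 1, kSigmoid = 2 };

    // Plain-loop model of the fused epilogue over an accumulator buffer.
    template <int ActivType, int N>
    void apply_activ(float (&c)[N])
    {
        for(int i = 0; i < N; ++i)
        {
            if constexpr(ActivType == kRelu)
                c[i] = c[i] >= 0 ? c[i] : 0.0f;
            else if constexpr(ActivType == kSigmoid)
                c[i] = 1.0f / (1.0f + std::exp(-c[i]));
        }
    }

    int main()
    {
        float acc[4] = {-1.0f, 0.0f, 1.0f, 2.0f};
        apply_activ<kSigmoid, 4>(acc);              // matches activ_type = 2
        for(float v : acc) std::printf("%.4f ", v); // 0.2689 0.5000 0.7311 0.8808
    }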
host/driver_offline/conv_fwd_driver_offline.cpp (view file @ fe427fd1)

@@ -103,14 +103,14 @@ int main(int argc, char* argv[])
     const bool do_log  = atoi(argv[5]);
     const int nrepeat  = atoi(argv[6]);

-#if 0
+#if 1
     constexpr index_t N  = 1;
     constexpr index_t C  = 16;
     constexpr index_t Hi = 1080;
     constexpr index_t Wi = 1920;
     constexpr index_t K  = 16;
     constexpr index_t Y  = 3;
     constexpr index_t X  = 3;
 #elif 0
     constexpr index_t N = 1;
     constexpr index_t C = 16;
@@ -127,7 +127,7 @@ int main(int argc, char* argv[])
     constexpr index_t K = 16;
     constexpr index_t Y = 3;
     constexpr index_t X = 3;
-#elif 1
+#elif 0
     constexpr index_t N  = 1;
     constexpr index_t C  = 16;
     constexpr index_t Hi = 240;
@@ -135,7 +135,7 @@ int main(int argc, char* argv[])
     constexpr index_t K = 16;
     constexpr index_t Y = 3;
     constexpr index_t X = 3;
-#elif 1
+#elif 0
     constexpr index_t N  = 1;
     constexpr index_t C  = 16;
     constexpr index_t Hi = 1080;
@@ -143,6 +143,38 @@ int main(int argc, char* argv[])
     constexpr index_t K = 16;
     constexpr index_t Y = 1;
     constexpr index_t X = 1;
+#elif 0
+    constexpr index_t N  = 1;
+    constexpr index_t C  = 16;
+    constexpr index_t Hi = 540;
+    constexpr index_t Wi = 960;
+    constexpr index_t K  = 16;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+#elif 0
+    constexpr index_t N  = 1;
+    constexpr index_t C  = 16;
+    constexpr index_t Hi = 480;
+    constexpr index_t Wi = 270;
+    constexpr index_t K  = 16;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+#elif 0
+    constexpr index_t N  = 1;
+    constexpr index_t C  = 8;
+    constexpr index_t Hi = 1080;
+    constexpr index_t Wi = 1920;
+    constexpr index_t K  = 16;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;
+#elif 0
+    constexpr index_t N  = 1;
+    constexpr index_t C  = 16;
+    constexpr index_t Hi = 1080;
+    constexpr index_t Wi = 1920;
+    constexpr index_t K  = 4;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;
 #endif

     const index_t conv_stride_h = 1;
@@ -420,17 +452,17 @@ int main(int argc, char* argv[])
 #else
         device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw
 #endif
-        <in_data_t, 8, acc_data_t, out_data_t>(
+        <in_data_t, 8, 8, acc_data_t, out_data_t>(
             tmp[I0], tmp[I1], tmp[I2], tmp[I3], tmp[I4], tmp[I5], tmp[I6],
             in, wei, out_device, nrepeat);
     }
 #endif
@@ -490,14 +522,15 @@ int main(int argc, char* argv[])
     if(do_verification)
     {
-        host_direct_convolution(in,
-                                wei,
-                                out_host,
-                                make_tuple(conv_stride_h, conv_stride_w),
-                                make_tuple(conv_dilation_h, conv_dilation_w),
-                                make_tuple(in_left_pad_h, in_left_pad_w),
-                                make_tuple(in_right_pad_h, in_right_pad_w),
-                                layout);
+        host_direct_convolution_activ(in,
+                                      wei,
+                                      out_host,
+                                      make_tuple(conv_stride_h, conv_stride_w),
+                                      make_tuple(conv_dilation_h, conv_dilation_w),
+                                      make_tuple(in_left_pad_h, in_left_pad_w),
+                                      make_tuple(in_right_pad_h, in_right_pad_w),
+                                      layout,
+                                      ActivType_t::sigmoid);

         check_error(out_host, out_device);
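Note the implicit coupling introduced here: the device epilogue in gridwise_static_gemm_v2.hpp hard-codes activ_type = 2 (sigmoid), so the host reference must be called with ActivType_t::sigmoid for check_error to pass. If the kernel were switched to ReLU (activ_type = 1), this call site would have to change to ActivType_t::relu in step.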
host/driver_offline/include/device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp (view file @ fe427fd1)

@@ -6,6 +6,7 @@
 template <typename TInWei,
           ck::index_t InWeiVectorSize,
+          ck::index_t OutVectorSize,
           typename TAcc,
           typename TOut,
           typename InLengths,
@@ -53,8 +54,8 @@ void device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
     const auto C0 = C / Number<InWeiVectorSize>{};
     const auto C1 = Number<InWeiVectorSize>{};

-    const auto K0 = K / Number<InWeiVectorSize>{};
-    const auto K1 = Number<InWeiVectorSize>{};
+    const auto K0 = K / Number<OutVectorSize>{};
+    const auto K1 = Number<OutVectorSize>{};

     Tensor<TInWei> in_n_c0_hi_wi_c1(
         HostTensorDescriptor(std::initializer_list<index_t>{N, C0, Hi, Wi, C1}));
@@ -105,7 +106,7 @@ void device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
     constexpr index_t WoPerThread = 2;
     constexpr index_t EPerThread  = EPerBlock;

-    using ABlockTransferThreadSliceLengths_E_K   = Sequence<9, 1>;
+    using ABlockTransferThreadSliceLengths_E_K   = Sequence<Y * X, 1>;
     using ABlockTransferThreadClusterLengths_E_K = Sequence<EPerBlock, KPerBlock>;

     constexpr index_t ABlockTransferSrcScalarPerVector_E = 1;
@@ -120,8 +121,10 @@ void device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
     constexpr auto conv_driver =
 #if 0
         DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
-#else
+#elif 1
         DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
+#elif 1
+        DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad_1x1
 #endif
             <BlockSize,
              typename vector_type<TInWei, InWeiVectorSize>::type,
 ...
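The new OutVectorSize parameter decouples the output split K = K0 × K1 from the input/weight channel split C = C0 × C1, so the output vector width can now differ from the input width. A quick compile-time check of the tiling, with hypothetical sizes:

    // Hypothetical sizes: the input channels and output channels may now be
    // vectorized with different widths, as long as each split tiles exactly.
    constexpr int K = 16, C = 16, InWeiVectorSize = 4, OutVectorSize = 8;
    constexpr int C0 = C / InWeiVectorSize, C1 = InWeiVectorSize; // 4 x 4
    constexpr int K0 = K / OutVectorSize,   K1 = OutVectorSize;   // 2 x 8
    static_assert(C0 * C1 == C && K0 * K1 == K, "vector split must tile exactly");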
host/host_tensor/include/host_conv.hpp (view file @ fe427fd1)

 #pragma once
 #include "host_tensor.hpp"

+typedef enum
+{
+    passthrough = 0,
+    relu,
+    sigmoid
+} ActivType_t;
+
 template <typename TIn,
           typename TWei,
           typename TOut,
 ...

@@ -88,6 +95,106 @@ void host_direct_convolution(const Tensor<TIn>& in,
    }
}

template <typename T>
inline auto activ(T v, const ActivType_t activ_type)
{
    switch(activ_type)
    {
    case passthrough: return v;
    case relu: return (v >= 0 ? v : 0);
    case sigmoid: return (1 / (1 + exp(-v)));
    default: throw std::runtime_error("unsupported activ type"); break;
    }
}
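One caveat worth noting: activ relies on auto return-type deduction, and its branches return T, the common type of T and int, and double respectively; for T = float those types disagree, so deduction would fail to compile. The driver only calls it with double (v is accumulated as double below), which is why it works here. A sketch of a variant pinned to double throughout, as an editorial suggestion rather than part of this commit:

    #include <cmath>
    #include <stdexcept>

    // Variant that computes in double so every return statement agrees for
    // any arithmetic T (the commit's version deduces `auto` across branches).
    template <typename T>
    inline double activ_d(T v_in, const ActivType_t activ_type)
    {
        const double v = static_cast<double>(v_in);
        switch(activ_type)
        {
        case passthrough: return v;
        case relu: return v >= 0 ? v : 0.0;
        case sigmoid: return 1.0 / (1.0 + std::exp(-v));
        default: throw std::runtime_error("unsupported activ type");
        }
    }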
template <typename TIn,
          typename TWei,
          typename TOut,
          typename ConvStrides,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads>
void host_direct_convolution_activ(const Tensor<TIn>& in,
                                   const Tensor<TWei>& wei,
                                   Tensor<TOut>& out,
                                   const ConvStrides& conv_strides,
                                   const ConvDilations& conv_dilations,
                                   const InLeftPads& in_left_pads,
                                   const InRightPads& in_right_pads,
                                   const ConvTensorLayout layout = ConvTensorLayout::NCHW,
                                   const ActivType_t activ_type = ActivType_t::passthrough)
{
    using namespace ck;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
        double v = 0;
        for(int c = 0; c < wei.mDesc.GetLengths()[1]; ++c)
        {
            for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
            {
                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
                for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
                {
                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
                    if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
                       wi < in.mDesc.GetLengths()[3])
                    {
                        v += static_cast<const double>(in(n, c, hi, wi)) *
                             static_cast<const double>(wei(k, c, y, x));
                    }
                }
            }
        }
        out(n, k, ho, wo) = activ(v, activ_type);
    };

    auto f_nhwc = [&](auto n, auto ho, auto wo, auto k) {
        double v = 0;
        for(int c = 0; c < wei.mDesc.GetLengths()[3]; ++c)
        {
            for(int y = 0; y < wei.mDesc.GetLengths()[1]; ++y)
            {
                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
                for(int x = 0; x < wei.mDesc.GetLengths()[2]; ++x)
                {
                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
                    if(hi >= 0 && hi < in.mDesc.GetLengths()[1] && wi >= 0 &&
                       wi < in.mDesc.GetLengths()[2])
                    {
                        v += static_cast<const double>(in(n, hi, wi, c)) *
                             static_cast<const double>(wei(k, y, x, c));
                    }
                }
            }
        }
        out(n, k, ho, wo) = activ(v, activ_type);
    };

    switch(layout)
    {
    case ConvTensorLayout::NCHW:
        make_ParallelTensorFunctor(f_nchw,
                                   out.mDesc.GetLengths()[0],
                                   out.mDesc.GetLengths()[1],
                                   out.mDesc.GetLengths()[2],
                                   out.mDesc.GetLengths()[3])(
            std::thread::hardware_concurrency());
        break;
    case ConvTensorLayout::NHWC:
        make_ParallelTensorFunctor(f_nhwc,
                                   out.mDesc.GetLengths()[0],
                                   out.mDesc.GetLengths()[1],
                                   out.mDesc.GetLengths()[2],
                                   out.mDesc.GetLengths()[3])(
            std::thread::hardware_concurrency());
        break;
    default: throw std::runtime_error("wrong! not supported layout");
    }
}
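A schematic host-side usage of the new reference, mirroring the driver's verification path; the tensor shapes and conv parameters below are hypothetical placeholders:

    // Hypothetical NCHW shapes: 1x16x32x32 input, 16x16x3x3 weights.
    Tensor<float> in(HostTensorDescriptor(std::initializer_list<index_t>{1, 16, 32, 32}));
    Tensor<float> wei(HostTensorDescriptor(std::initializer_list<index_t>{16, 16, 3, 3}));
    Tensor<float> out(HostTensorDescriptor(std::initializer_list<index_t>{1, 16, 32, 32}));

    host_direct_convolution_activ(in, wei, out,
                                  make_tuple(1, 1), // strides
                                  make_tuple(1, 1), // dilations
                                  make_tuple(1, 1), // left pads
                                  make_tuple(1, 1), // right pads
                                  ConvTensorLayout::NCHW,
                                  ActivType_t::sigmoid);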
template <typename TIn,
          typename TWei,
          typename TOut,
          typename InLeftPads,
          typename InRightPads>
void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
                                   const Tensor<TWei>& wei_kcyx,
 ...