Repository: gaoqiong/composable_kernel_ROCM — Commits

Commit 07a673c6, authored Apr 14, 2022 by carlushuang

    Merge remote-tracking branch 'origin/develop' into cpu_avx2

Parents: c0f698d5, ac0d8066

Changes: 307 files in the merge; showing 20 changed files with 778 additions and 220 deletions (+778 −220).
Changed files:

library/include/ck/library/obselete_driver_offline/driver_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp   +2 −2
library/include/ck/library/obselete_driver_offline/driver_gemm_dlops_v1r2.hpp   +1 −1
library/include/ck/library/obselete_driver_offline/driver_gemm_dlops_v1r3.hpp   +1 −1
library/include/ck/library/obselete_driver_offline/driver_gemm_xdlops_v2r3.hpp   +1 −1
library/include/ck/library/obselete_driver_offline/driver_gemm_xdlops_v2r4.hpp   +1 −1
library/include/ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp   +3 −3
library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp   +6 −6
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp   +26 −26
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call.hpp   +26 −26
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp   +29 −29
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce.hpp   +26 −26
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp   +26 −26
library/include/ck/library/utility/check_err.hpp   +40 −35
library/include/ck/library/utility/conv_fwd_util.hpp   +554 −0
library/src/host_tensor/host_tensor.cpp   +1 −12
library/src/obselete_driver_offline/conv_add_fwd_driver_offline_nchwc.cpp   +6 −4
library/src/obselete_driver_offline/conv_bwd_driver_offline.cpp   +3 −1
library/src/obselete_driver_offline/conv_fwd_driver_offline.cpp   +3 −1
library/src/obselete_driver_offline/conv_fwd_driver_offline_nchwc.cpp   +7 −5
library/src/obselete_driver_offline/conv_maxpool_fwd_driver_offline_nchwc.cpp   +16 −14
library/include/ck/library/obselete_driver_offline/driver_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp

@@ -27,7 +27,7 @@ template <ck::index_t BlockSize,
           ck::index_t ABlockTransferDstScalarPerVector_E2,
           ck::index_t BThreadTransferSrcScalarPerVector_E2,
           ck::index_t CThreadTransferDstScalarPerVector_K,
-          ck::ActivTypeEnum_t activ_type>
+          ck::ActivTypeEnum activ_type>
 struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_maxpool
 {
     template <typename... Wei, ...

@@ -305,7 +305,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0...
           FloatAB,
           FloatAcc,
           FloatC,
-          InMemoryDataOperationEnum_t::Set,
+          InMemoryDataOperationEnum::Set,
           decltype(a_e0_e1_k_e2_grid_desc),
           decltype(b_e0_e1_n_ho_wo_e2_grid_desc),
           decltype(c_k_n_hop_wop_grid_desc),
           ...
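The substantive change in this and the next four drivers is the same one-token rename: the legacy `_t` spellings of ck's scoped enums are dropped. A minimal sketch of the pattern (enumerator list abbreviated; the alias line is an assumption about how the old spelling was defined):

    enum class InMemoryDataOperationEnum { Set, AtomicAdd /* ... */ };
    // before: using InMemoryDataOperationEnum_t = InMemoryDataOperationEnum;
    //         kernel<..., InMemoryDataOperationEnum_t::Set, ...>
    // after the rename, call sites name the enum directly:
    constexpr auto op = InMemoryDataOperationEnum::Set;

Because the alias and the enum named the same type, the change is spelling-only; no overloads or template instantiations are affected.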
library/include/ck/library/obselete_driver_offline/driver_gemm_dlops_v1r2.hpp

@@ -10,7 +10,7 @@ template <ck::index_t BlockSize,
           typename FloatAB,
           typename FloatAcc,
           typename FloatC,
-          ck::InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
+          ck::InMemoryDataOperationEnum CGlobalMemoryDataOperation,
           typename AKMGridDesc,
           typename BKNGridDesc,
           typename CMNGridDesc,
           ...
library/include/ck/library/obselete_driver_offline/driver_gemm_dlops_v1r3.hpp

@@ -10,7 +10,7 @@ template <ck::index_t BlockSize,
           typename FloatAB,
           typename FloatAcc,
           typename FloatC,
-          ck::InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
+          ck::InMemoryDataOperationEnum CGlobalMemoryDataOperation,
           typename AK0MK1GridDesc,
           typename BK0NK1GridDesc,
           typename CMNGridDesc,
           ...
library/include/ck/library/obselete_driver_offline/driver_gemm_xdlops_v2r3.hpp

@@ -11,7 +11,7 @@ template <ck::index_t BlockSize,
           typename FloatAB,
           typename FloatAcc,
           typename FloatC,
-          ck::InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
+          ck::InMemoryDataOperationEnum CGlobalMemoryDataOperation,
           typename AGridDesc_K0_M_K1,
           typename BGridDesc_K0_N_K1,
           typename CMNGridDesc,
           ...
library/include/ck/library/obselete_driver_offline/driver_gemm_xdlops_v2r4.hpp

@@ -10,7 +10,7 @@ template <ck::index_t BlockSize,
           typename FloatAB,
           typename FloatAcc,
           typename FloatC,
-          ck::InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
+          ck::InMemoryDataOperationEnum CGlobalMemoryDataOperation,
           typename ABK0MK1GridDesc,
           typename BBK0NK1GridDesc,
           typename CMNGridDesc,
           ...
library/include/ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp

@@ -17,7 +17,7 @@ template <typename InDataType,
           typename InElementwiseOperation,
           typename WeiElementwiseOperation,
           typename OutElementwiseOperation>
-struct ReferenceConvWrw : public device::BaseOperator
+struct ReferenceConvBwdWeight : public device::BaseOperator
 {
     // Argument
     struct Argument : public device::BaseArgument
     ...

@@ -62,7 +62,7 @@ struct ReferenceConvWrw : public device::BaseOperator
     // Invoker
     struct Invoker : public device::BaseInvoker
     {
-        using Argument = ReferenceConvWrw::Argument;
+        using Argument = ReferenceConvBwdWeight::Argument;

         float Run(const Argument& arg)
         {
         ...

@@ -163,7 +163,7 @@ struct ReferenceConvWrw : public device::BaseOperator
         auto str = std::stringstream();
         // clang-format off
-        str << "ReferenceConvFwd"
+        str << "ReferenceConvBwdWeight"
             << std::endl;
         // clang-format on
         ...
library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp

@@ -18,8 +18,8 @@ template <typename InDataType,
           typename InElementwiseOperation,
           typename WeiElementwiseOperation,
           typename OutElementwiseOperation,
-          ck::index_t NumDimSpatial = 2,
-          typename std::enable_if<NumDimSpatial >= 1 && NumDimSpatial <= 3, bool>::type = false>
+          ck::index_t NumDimSpatial = 2,
+          typename ck::enable_if<NumDimSpatial >= 1 && NumDimSpatial <= 3, bool>::type = false>
 struct ReferenceConvBwdData : public device::BaseOperator
 {
     // Argument
     ...

@@ -71,7 +71,7 @@ struct ReferenceConvBwdData : public device::BaseOperator
     {
         if constexpr(NumDimSpatial == 1)
         {
-            auto f_nchw = [&](auto n, auto c, auto wi) {
+            auto f_ncw = [&](auto n, auto c, auto wi) {
                 std::size_t K  = arg.weight_.mDesc.GetLengths()[0];
                 std::size_t X  = arg.weight_.mDesc.GetLengths()[2];
                 std::size_t Wo = arg.output_.mDesc.GetLengths()[2];
                 ...

@@ -108,7 +108,7 @@ struct ReferenceConvBwdData : public device::BaseOperator
                 arg.input_(n, c, wi) = ck::type_convert<InDataType>(v_in);
             };
-            make_ParallelTensorFunctor(f_nchw,
+            make_ParallelTensorFunctor(f_ncw,
                                        arg.input_.mDesc.GetLengths()[0],
                                        arg.input_.mDesc.GetLengths()[1],
                                        arg.input_.mDesc.GetLengths()[2])(
             ...

@@ -182,7 +182,7 @@ struct ReferenceConvBwdData : public device::BaseOperator
         }
         else if constexpr(NumDimSpatial == 3)
         {
-            auto f_nchw = [&](auto n, auto c, auto di, auto hi, auto wi) {
+            auto f_ncdhw = [&](auto n, auto c, auto di, auto hi, auto wi) {
                 std::size_t K = arg.weight_.mDesc.GetLengths()[0];
                 std::size_t Z = arg.weight_.mDesc.GetLengths()[2];
                 std::size_t Y = arg.weight_.mDesc.GetLengths()[3];
                 ...

@@ -252,7 +252,7 @@ struct ReferenceConvBwdData : public device::BaseOperator
                 arg.input_(n, c, di, hi, wi) = ck::type_convert<InDataType>(v_in);
             };
-            make_ParallelTensorFunctor(f_nchw,
+            make_ParallelTensorFunctor(f_ncdhw,
                                        arg.input_.mDesc.GetLengths()[0],
                                        arg.input_.mDesc.GetLengths()[1],
                                        arg.input_.mDesc.GetLengths()[2],
                                        ...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp

@@ -47,7 +47,7 @@ using reduce_configuration_2_instances_blockwise = std::tuple<
     >;
 #endif

-template <typename AccDataType, ReduceTensorOp_t ReduceOpId>
+template <typename AccDataType, ReduceTensorOp ReduceOpId>
 using deviceReduceBlockWisePtrType = DeviceReducePtr<
     typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation,
     typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation>;

@@ -57,9 +57,9 @@ template <typename InDataType,
           typename OutDataType,
           int Rank,
           int NumReduceDim,
-          ReduceTensorOp_t ReduceOpId,
-          NanPropagation_t NanOpt,
-          ReduceTensorIndices_t IndicesOpt>
+          ReduceTensorOp ReduceOpId,
+          NanPropagation NanOpt,
+          ReduceTensorIndices IndicesOpt>
 void add_device_reduce_instance_blockwise(
     std::vector<deviceReduceBlockWisePtrType<AccDataType, ReduceOpId>>& device_op_instances)
 {
 ...

@@ -71,11 +71,11 @@ void add_device_reduce_instance_blockwise(
         AccElementwiseOperation;
     constexpr bool Indexable =
-        (ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX ||
-         ReduceOpId == ReduceTensorOp_t::AMAX);
-    constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES);
-    constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true;
+        (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
+         ReduceOpId == ReduceTensorOp::AMAX);
+    constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES);
+    constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;

     static_for<0, std::tuple_size<reduce_configuration_1_instances>::value, 1>{}([&](auto i) {
         using cfg1 =
         ...

@@ -123,15 +123,15 @@ void add_device_reduce_instance_blockwise(
                                IndicesOpt>(                                   \
         std::vector<deviceReduceBlockWisePtrType<compT, ReduceOpId>> & device_op_instances)

-#define ADD_BLOCKWISE_INST_BY_ID(                                             \
-    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)     \
-    ADD_BLOCKWISE_INST_BY_TYPE(inT,                                           \
-                               compT,                                         \
-                               outT,                                          \
-                               static_cast<ReduceTensorOp_t>(ReduceOpId),     \
-                               static_cast<NanPropagation_t>(NanOpt),         \
-                               static_cast<ReduceTensorIndices_t>(IndicesOpt), \
-                               Rank,                                          \
+#define ADD_BLOCKWISE_INST_BY_ID(                                            \
+    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)    \
+    ADD_BLOCKWISE_INST_BY_TYPE(inT,                                          \
+                               compT,                                        \
+                               outT,                                         \
+                               static_cast<ReduceTensorOp>(ReduceOpId),      \
+                               static_cast<NanPropagation>(NanOpt),          \
+                               static_cast<ReduceTensorIndices>(IndicesOpt), \
+                               Rank,                                         \
                                NumReduceDim)

 #define ADD_BLOCKWISE_INST_REF_BY_TYPE(                                      \
 ...

@@ -150,15 +150,15 @@ void add_device_reduce_instance_blockwise(
                                    AccElementwiseOperation>> &               \
         device_op_instances)

-#define ADD_BLOCKWISE_INST_REF_BY_ID(                                        \
-    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)    \
-    ADD_BLOCKWISE_INST_REF_BY_TYPE(inT,                                      \
-                                   compT,                                    \
-                                   outT,                                     \
-                                   static_cast<ReduceTensorOp_t>(ReduceOpId), \
-                                   static_cast<NanPropagation_t>(NanOpt),    \
-                                   static_cast<ReduceTensorIndices_t>(IndicesOpt), \
-                                   Rank,                                     \
+#define ADD_BLOCKWISE_INST_REF_BY_ID(                                        \
+    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)    \
+    ADD_BLOCKWISE_INST_REF_BY_TYPE(inT,                                      \
+                                   compT,                                    \
+                                   outT,                                     \
+                                   static_cast<ReduceTensorOp>(ReduceOpId),  \
+                                   static_cast<NanPropagation>(NanOpt),      \
+                                   static_cast<ReduceTensorIndices>(IndicesOpt), \
+                                   Rank,                                     \
                                    NumReduceDim)
 } // namespace device_reduce_instance
 ...
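The *_BY_ID macro variants let instantiation lists be written with plain integer ids and convert them back to the scoped enum types via static_cast (which is also why the casts had to lose their `_t`). A hedged usage sketch; the id values here are hypothetical and the real numbering comes from the ReduceTensorOp/NanPropagation/ReduceTensorIndices definitions:

    // Hypothetical ids (0, 0, 0): expands to
    // ADD_BLOCKWISE_INST_BY_TYPE(float, float, float,
    //                            static_cast<ReduceTensorOp>(0),
    //                            static_cast<NanPropagation>(0),
    //                            static_cast<ReduceTensorIndices>(0),
    //                            4, 3)
    ADD_BLOCKWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 3);

The same pattern repeats in the four sibling reduce headers below.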
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call.hpp

@@ -34,7 +34,7 @@ using reduce_configuration_2_instances_blockwise_second_call = std::tuple<
     >;
 #endif

-template <typename AccDataType, ReduceTensorOp_t ReduceOpId>
+template <typename AccDataType, ReduceTensorOp ReduceOpId>
 using deviceReduceBlockWiseSecondCallPtrType = DeviceReducePtr<
     typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::InElementwiseOperation,
     typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::AccElementwiseOperation>;

@@ -44,9 +44,9 @@ template <typename InDataType,
           typename OutDataType,
           int Rank,
           int NumReduceDim,
-          ReduceTensorOp_t ReduceOpId,
-          NanPropagation_t NanOpt,
-          ReduceTensorIndices_t IndicesOpt>
+          ReduceTensorOp ReduceOpId,
+          NanPropagation NanOpt,
+          ReduceTensorIndices IndicesOpt>
 void add_device_reduce_instance_blockwise_second_call(
     std::vector<deviceReduceBlockWiseSecondCallPtrType<AccDataType, ReduceOpId>>&
         device_op_instances)
 ...

@@ -60,11 +60,11 @@ void add_device_reduce_instance_blockwise_second_call(
         AccElementwiseOperation;
     constexpr bool Indexable =
-        (ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX ||
-         ReduceOpId == ReduceTensorOp_t::AMAX);
-    constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES);
-    constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true;
+        (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
+         ReduceOpId == ReduceTensorOp::AMAX);
+    constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES);
+    constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;

     static_assert(std::is_same<InDataType, AccDataType>::value,
                   "InDataType and AccDataType should be the same to use "
                   ...

@@ -117,15 +117,15 @@ void add_device_reduce_instance_blockwise_second_call(
         std::vector<deviceReduceBlockWiseSecondCallPtrType<compT, ReduceOpId>> & \
         device_op_instances)

-#define ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(                                    \
-    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)        \
-    ADD_BLOCKWISE_SECOND_CALL_INST_BY_TYPE(inT,                                  \
-                                           compT,                                \
-                                           outT,                                 \
-                                           static_cast<ReduceTensorOp_t>(ReduceOpId), \
-                                           static_cast<NanPropagation_t>(NanOpt), \
-                                           static_cast<ReduceTensorIndices_t>(IndicesOpt), \
-                                           Rank,                                 \
+#define ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(                                    \
+    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)        \
+    ADD_BLOCKWISE_SECOND_CALL_INST_BY_TYPE(inT,                                  \
+                                           compT,                                \
+                                           outT,                                 \
+                                           static_cast<ReduceTensorOp>(ReduceOpId), \
+                                           static_cast<NanPropagation>(NanOpt),  \
+                                           static_cast<ReduceTensorIndices>(IndicesOpt), \
+                                           Rank,                                 \
                                            NumReduceDim)

 #define ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_TYPE(                              \
 ...

@@ -145,15 +145,15 @@ void add_device_reduce_instance_blockwise_second_call(
                                                AccElementwiseOperation>> &       \
         device_op_instances)

-#define ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(                                \
-    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)        \
-    ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_TYPE(inT,                              \
-                                               compT,                            \
-                                               outT,                             \
-                                               static_cast<ReduceTensorOp_t>(ReduceOpId), \
-                                               static_cast<NanPropagation_t>(NanOpt), \
-                                               static_cast<ReduceTensorIndices_t>(IndicesOpt), \
-                                               Rank,                             \
+#define ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(                                \
+    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)        \
+    ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_TYPE(inT,                              \
+                                               compT,                            \
+                                               outT,                             \
+                                               static_cast<ReduceTensorOp>(ReduceOpId), \
+                                               static_cast<NanPropagation>(NanOpt), \
+                                               static_cast<ReduceTensorIndices>(IndicesOpt), \
+                                               Rank,                             \
                                                NumReduceDim)
 } // namespace device_reduce_instance
 ...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp

@@ -47,7 +47,7 @@ using reduce_configuration_2_instances_multiblock_atomic_add = std::tuple<
     >;
 #endif

-template <typename AccDataType, ReduceTensorOp_t ReduceOperation>
+template <typename AccDataType, ReduceTensorOp ReduceOperation>
 using deviceReduceMultiBlockAtomicAddPtrType = DeviceReducePtr<
     typename reduce_unary_operator<AccDataType, ReduceOperation, true, true>::InElementwiseOperation,
     ...

@@ -59,9 +59,9 @@ template <typename InDataType,
           typename OutDataType,
           int Rank,
           int NumReduceDim,
-          ReduceTensorOp_t ReduceOpId,
-          NanPropagation_t NanOpt,
-          ReduceTensorIndices_t IndicesOpt>
+          ReduceTensorOp ReduceOpId,
+          NanPropagation NanOpt,
+          ReduceTensorIndices IndicesOpt>
 void add_device_reduce_instance_multiblock_atomic_add(
     std::vector<deviceReduceMultiBlockAtomicAddPtrType<AccDataType, ReduceOpId>>&
         device_op_instances)
 ...

@@ -74,18 +74,18 @@ void add_device_reduce_instance_multiblock_atomic_add(
         AccElementwiseOperation;
     constexpr bool Indexable =
-        (ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX ||
-         ReduceOpId == ReduceTensorOp_t::AMAX);
-    constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES);
-    constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true;
+        (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
+         ReduceOpId == ReduceTensorOp::AMAX);
+    constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES);
+    constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;

-    static_assert(IndicesOpt == ReduceTensorIndices_t::NO_INDICES,
+    static_assert(IndicesOpt == ReduceTensorIndices::NO_INDICES,
                   "AtomicAdd can only be used with reduction operations without indices!");

     constexpr bool op_acceptable =
-        (ReduceOpId == ReduceTensorOp_t::ADD || ReduceOpId == ReduceTensorOp_t::MUL ||
-         ReduceOpId == ReduceTensorOp_t::AVG || ReduceOpId == ReduceTensorOp_t::NORM1);
+        (ReduceOpId == ReduceTensorOp::ADD || ReduceOpId == ReduceTensorOp::MUL ||
+         ReduceOpId == ReduceTensorOp::AVG || ReduceOpId == ReduceTensorOp::NORM1);

     constexpr bool out_type_acceptable =
         (std::is_same<OutDataType, float>::value || std::is_same<OutDataType, double>::value);
 ...

@@ -144,15 +144,15 @@ void add_device_reduce_instance_multiblock_atomic_add(
         std::vector<deviceReduceMultiBlockAtomicAddPtrType<compT, ReduceOpId>> & \
         device_op_instances)

-#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(                                    \
-    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)        \
-    ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_TYPE(inT,                                  \
-                                           compT,                                \
-                                           outT,                                 \
-                                           static_cast<ReduceTensorOp_t>(ReduceOpId), \
-                                           static_cast<NanPropagation_t>(NanOpt), \
-                                           static_cast<ReduceTensorIndices_t>(IndicesOpt), \
-                                           Rank,                                 \
+#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(                                    \
+    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)        \
+    ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_TYPE(inT,                                  \
+                                           compT,                                \
+                                           outT,                                 \
+                                           static_cast<ReduceTensorOp>(ReduceOpId), \
+                                           static_cast<NanPropagation>(NanOpt),  \
+                                           static_cast<ReduceTensorIndices>(IndicesOpt), \
+                                           Rank,                                 \
                                            NumReduceDim)

 #define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE(                              \
 ...

@@ -171,15 +171,15 @@ void add_device_reduce_instance_multiblock_atomic_add(
                                                AccElementwiseOperation>> &       \
         device_op_instances)

-#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(                                \
-    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)        \
-    ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE(inT,                              \
-                                               compT,                            \
-                                               outT,                             \
-                                               static_cast<ReduceTensorOp_t>(ReduceOpId), \
-                                               static_cast<NanPropagation_t>(NanOpt), \
-                                               static_cast<ReduceTensorIndices_t>(IndicesOpt), \
-                                               Rank,                             \
+#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(                                \
+    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)        \
+    ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE(inT,                              \
+                                               compT,                            \
+                                               outT,                             \
+                                               static_cast<ReduceTensorOp>(ReduceOpId), \
+                                               static_cast<NanPropagation>(NanOpt), \
+                                               static_cast<ReduceTensorIndices>(IndicesOpt), \
+                                               Rank,                             \
                                                NumReduceDim)
 } // namespace device_reduce_instance
 ...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce.hpp

@@ -46,7 +46,7 @@ using reduce_configuration_2_instances_multiblock_partial_reduce = std::tuple<
     >;
 #endif

-template <typename AccDataType, ReduceTensorOp_t ReduceOpId>
+template <typename AccDataType, ReduceTensorOp ReduceOpId>
 using deviceReduceMultiBlockPartialReducePtrType = DeviceReducePtr<
     typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::InElementwiseOperation,
     typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::AccElementwiseOperation>;

@@ -56,9 +56,9 @@ template <typename InDataType,
           typename OutDataType,
           int Rank,
           int NumReduceDim,
-          ReduceTensorOp_t ReduceOpId,
-          NanPropagation_t NanOpt,
-          ReduceTensorIndices_t IndicesOpt>
+          ReduceTensorOp ReduceOpId,
+          NanPropagation NanOpt,
+          ReduceTensorIndices IndicesOpt>
 void add_device_reduce_instance_multiblock_partial_reduce(
     std::vector<deviceReduceMultiBlockPartialReducePtrType<AccDataType, ReduceOpId>>&
         device_op_instances)
 ...

@@ -72,11 +72,11 @@ void add_device_reduce_instance_multiblock_partial_reduce(
         AccElementwiseOperation;
     constexpr bool Indexable =
-        (ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX ||
-         ReduceOpId == ReduceTensorOp_t::AMAX);
-    constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES);
-    constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true;
+        (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
+         ReduceOpId == ReduceTensorOp::AMAX);
+    constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES);
+    constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;

     static_for<0, std::tuple_size<reduce_configuration_1_instances>::value, 1>{}([&](auto i) {
         using cfg1 =
         ...

@@ -126,15 +126,15 @@ void add_device_reduce_instance_multiblock_partial_reduce(
         std::vector<deviceReduceMultiBlockPartialReducePtrType<compT, ReduceOpId>> & \
         device_op_instances)

-#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(                                \
-    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)        \
-    ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_TYPE(inT,                              \
-                                               compT,                            \
-                                               outT,                             \
-                                               static_cast<ReduceTensorOp_t>(ReduceOpId), \
-                                               static_cast<NanPropagation_t>(NanOpt), \
-                                               static_cast<ReduceTensorIndices_t>(IndicesOpt), \
-                                               Rank,                             \
+#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(                                \
+    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)        \
+    ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_TYPE(inT,                              \
+                                               compT,                            \
+                                               outT,                             \
+                                               static_cast<ReduceTensorOp>(ReduceOpId), \
+                                               static_cast<NanPropagation>(NanOpt), \
+                                               static_cast<ReduceTensorIndices>(IndicesOpt), \
+                                               Rank,                             \
                                                NumReduceDim)

 #define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_TYPE(                          \
 ...

@@ -154,15 +154,15 @@ void add_device_reduce_instance_multiblock_partial_reduce(
                                                    AccElementwiseOperation>> &   \
         device_op_instances)

-#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(                            \
-    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)        \
-    ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_TYPE(inT,                          \
-                                                   compT,                        \
-                                                   outT,                         \
-                                                   static_cast<ReduceTensorOp_t>(ReduceOpId), \
-                                                   static_cast<NanPropagation_t>(NanOpt), \
-                                                   static_cast<ReduceTensorIndices_t>(IndicesOpt), \
-                                                   Rank,                         \
+#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(                            \
+    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)        \
+    ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_TYPE(inT,                          \
+                                                   compT,                        \
+                                                   outT,                         \
+                                                   static_cast<ReduceTensorOp>(ReduceOpId), \
+                                                   static_cast<NanPropagation>(NanOpt), \
+                                                   static_cast<ReduceTensorIndices>(IndicesOpt), \
+                                                   Rank,                         \
                                                    NumReduceDim)
 } // namespace device_reduce_instance
 ...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp

@@ -47,7 +47,7 @@ using reduce_configuration_2_instances_threadwise = std::tuple<
     >;
 #endif

-template <typename AccDataType, ReduceTensorOp_t ReduceOpId>
+template <typename AccDataType, ReduceTensorOp ReduceOpId>
 using deviceReduceThreadWisePtrType = DeviceReducePtr<
     typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation,
     typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation>;

@@ -57,9 +57,9 @@ template <typename InDataType,
           typename OutDataType,
           int Rank,
           int NumReduceDim,
-          ReduceTensorOp_t ReduceOpId,
-          NanPropagation_t NanOpt,
-          ReduceTensorIndices_t IndicesOpt>
+          ReduceTensorOp ReduceOpId,
+          NanPropagation NanOpt,
+          ReduceTensorIndices IndicesOpt>
 void add_device_reduce_instance_threadwise(
     std::vector<deviceReduceThreadWisePtrType<AccDataType, ReduceOpId>>& device_op_instances)
 {
 ...

@@ -71,11 +71,11 @@ void add_device_reduce_instance_threadwise(
         AccElementwiseOperation;
     constexpr bool Indexable =
-        (ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX ||
-         ReduceOpId == ReduceTensorOp_t::AMAX);
-    constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES);
-    constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true;
+        (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
+         ReduceOpId == ReduceTensorOp::AMAX);
+    constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES);
+    constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;

     using cfg1 = ReductionConfiguration_1<256, 256, 1>;
     ...

@@ -119,15 +119,15 @@ void add_device_reduce_instance_threadwise(
                                IndicesOpt>(                                  \
         std::vector<deviceReduceThreadWisePtrType<compT, ReduceOpId>> & device_op_instances)

-#define ADD_THREADWISE_INST_BY_ID(                                           \
-    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)    \
-    ADD_THREADWISE_INST_BY_TYPE(inT,                                         \
-                                compT,                                       \
-                                outT,                                        \
-                                static_cast<ReduceTensorOp_t>(ReduceOpId),   \
-                                static_cast<NanPropagation_t>(NanOpt),       \
-                                static_cast<ReduceTensorIndices_t>(IndicesOpt), \
-                                Rank,                                        \
+#define ADD_THREADWISE_INST_BY_ID(                                           \
+    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)    \
+    ADD_THREADWISE_INST_BY_TYPE(inT,                                         \
+                                compT,                                       \
+                                outT,                                        \
+                                static_cast<ReduceTensorOp>(ReduceOpId),     \
+                                static_cast<NanPropagation>(NanOpt),         \
+                                static_cast<ReduceTensorIndices>(IndicesOpt), \
+                                Rank,                                        \
                                 NumReduceDim)

 #define ADD_THREADWISE_INST_REF_BY_TYPE(                                     \
 ...

@@ -146,15 +146,15 @@ void add_device_reduce_instance_threadwise(
                                     AccElementwiseOperation>> &              \
         device_op_instances)

-#define ADD_THREADWISE_INST_REF_BY_ID(                                       \
-    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)    \
-    ADD_THREADWISE_INST_REF_BY_TYPE(inT,                                     \
-                                    compT,                                   \
-                                    outT,                                    \
-                                    static_cast<ReduceTensorOp_t>(ReduceOpId), \
-                                    static_cast<NanPropagation_t>(NanOpt),   \
-                                    static_cast<ReduceTensorIndices_t>(IndicesOpt), \
-                                    Rank,                                    \
+#define ADD_THREADWISE_INST_REF_BY_ID(                                       \
+    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)    \
+    ADD_THREADWISE_INST_REF_BY_TYPE(inT,                                     \
+                                    compT,                                   \
+                                    outT,                                    \
+                                    static_cast<ReduceTensorOp>(ReduceOpId), \
+                                    static_cast<NanPropagation>(NanOpt),     \
+                                    static_cast<ReduceTensorIndices>(IndicesOpt), \
+                                    Rank,                                    \
                                     NumReduceDim)
 } // namespace device_reduce_instance
 ...
test/include/test_util.hpp → library/include/ck/library/utility/check_err.hpp

-#ifndef TEST_UTIL_HPP
-#define TEST_UTIL_HPP
+#ifndef CHECK_ERR_HPP
+#define CHECK_ERR_HPP

 #include <algorithm>
 #include <cmath>
 #include <cstdlib>
 #include <half.hpp>
 #include <iostream>
 #include <iomanip>
 #include <iterator>
 ...

@@ -13,16 +14,17 @@
 #include "data_type.hpp"

-namespace test {
+namespace ck {
+namespace utils {

 template <typename T>
-typename std::enable_if<std::is_floating_point<T>::value && !std::is_same<T, ck::half_t>::value,
+typename std::enable_if<std::is_floating_point<T>::value && !std::is_same<T, half_t>::value,
                         bool>::type
 check_err(const std::vector<T>& out,
           const std::vector<T>& ref,
-          const std::string& msg,
+          const std::string& msg = "Error: Incorrect results!",
           double rtol = 1e-5,
           double atol = 1e-8)
 {
     if(out.size() != ref.size())
     {
     ...

@@ -60,13 +62,12 @@ check_err(const std::vector<T>& out,
 }

 template <typename T>
-typename std::enable_if<std::is_same<T, ck::bhalf_t>::value || std::is_same<T, ck::half_t>::value,
-                        bool>::type
+typename std::enable_if<std::is_same<T, bhalf_t>::value, bool>::type
 check_err(const std::vector<T>& out,
           const std::vector<T>& ref,
-          const std::string& msg,
-          double rtol = 1e-5,
-          double atol = 1e-8)
+          const std::string& msg = "Error: Incorrect results!",
+          double rtol = 1e-3,
+          double atol = 1e-3)
 {
     if(out.size() != ref.size())
     {
     ...

@@ -77,14 +78,15 @@ check_err(const std::vector<T>& out,
     }
     bool res{true};
-    int err_count  = 0;
-    double err     = 0;
-    double max_err = ck::type_convert<float>(ck::NumericLimits<T>::Min());
+    int err_count  = 0;
+    double err     = 0;
+    // TODO: This is a hack. We should have proper specialization for bhalf_t data type.
+    double max_err = std::numeric_limits<float>::min();
     for(std::size_t i = 0; i < ref.size(); ++i)
     {
-        float o = ck::type_convert<float>(out[i]);
-        float r = ck::type_convert<float>(ref[i]);
-        err     = std::abs(o - r);
+        double o = type_convert<float>(out[i]);
+        double r = type_convert<float>(ref[i]);
+        err      = std::abs(o - r);
         if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
         {
             max_err = err > max_err ? err : max_err;
             ...

@@ -105,11 +107,14 @@ check_err(const std::vector<T>& out,
     return res;
 }

-bool check_err(const std::vector<ck::half_t>& out,
-               const std::vector<ck::half_t>& ref,
-               const std::string& msg,
-               ck::half_t rtol = static_cast<ck::half_t>(1e-3f),
-               ck::half_t atol = static_cast<ck::half_t>(1e-3f))
+template <typename T>
+typename std::enable_if<std::is_same<T, half_t>::value || std::is_same<T, half_float::half>::value,
+                        bool>::type
+check_err(const std::vector<T>& out,
+          const std::vector<T>& ref,
+          const std::string& msg = "Error: Incorrect results!",
+          double rtol = 1e-3,
+          double atol = 1e-3)
 {
     if(out.size() != ref.size())
     {
     ...

@@ -122,20 +127,20 @@ bool check_err(const std::vector<ck::half_t>& out,
     bool res{true};
     int err_count = 0;
     double err    = 0;
-    double max_err = std::numeric_limits<ck::half_t>::min();
+    double max_err = std::numeric_limits<T>::min();
     for(std::size_t i = 0; i < ref.size(); ++i)
     {
-        double out_ = double(out[i]);
-        double ref_ = double(ref[i]);
-        err         = std::abs(out_ - ref_);
-        if(err > atol + rtol * std::abs(ref_) || !std::isfinite(out_) || !std::isfinite(ref_))
+        double o = type_convert<float>(out[i]);
+        double r = type_convert<float>(ref[i]);
+        err      = std::abs(o - r);
+        if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
         {
             max_err = err > max_err ? err : max_err;
             err_count++;
             if(err_count < 5)
             {
                 std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
-                          << i << "]: " << out_ << "!=" << ref_ << std::endl
+                          << i << "]: " << o << " != " << r << std::endl
                           << msg << std::endl;
             }
             res = false;
             ...

@@ -149,13 +154,12 @@ bool check_err(const std::vector<ck::half_t>& out,
 }

 template <typename T>
-typename std::enable_if<std::is_integral<T>::value && !std::is_same<T, ck::bhalf_t>::value,
-                        bool>::type
+typename std::enable_if<std::is_integral<T>::value && !std::is_same<T, bhalf_t>::value,
+                        bool>::type
 check_err(const std::vector<T>& out,
           const std::vector<T>& ref,
-          const std::string& msg,
+          const std::string& msg = "Error: Incorrect results!",
           double = 0,
           double = 0)
 {
     if(out.size() != ref.size())
     {
     ...

@@ -178,7 +182,8 @@ check_err(const std::vector<T>& out,
     return true;
 }

-} // namespace test
+} // namespace utils
+} // namespace ck

 template <typename T>
 std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
 ...
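All check_err overloads share one acceptance test: element i fails when |out[i] − ref[i]| > atol + rtol * |ref[i]|, or when either value is non-finite. A small sketch of the contract (values invented for illustration):

    // With the floating-point overload's defaults rtol = 1e-5, atol = 1e-8:
    std::vector<float> out{1.0f, 2.0f, 3.00002f};
    std::vector<float> ref{1.0f, 2.0f, 3.0f};
    // |3.00002 - 3.0| = 2e-5 <= 1e-8 + 1e-5 * 3.0 ≈ 3.0e-5, so this passes.
    bool ok = ck::utils::check_err(out, ref); // msg now defaults to "Error: Incorrect results!"

Note the looser rtol = atol = 1e-3 defaults on the half/bhalf overloads, which reflect those types' coarser precision.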
include/ck/tensor_operation/gpu/device/conv_utils.hpp → library/include/ck/library/utility/conv_fwd_util.hpp

--- Removed: include/ck/tensor_operation/gpu/device/conv_utils.hpp

#ifndef CONV_UTILS_HPP
#define CONV_UTILS_HPP

#include <cstdlib>
#include <functional>
#include <iterator>
#include <numeric>
#include <sstream>
#include <type_traits>
#include <vector>

#include "config.hpp"
#include "host_tensor.hpp"
#include "tensor_layout.hpp"

namespace ck {
namespace conv_util {

/**
 * @brief Calculate number of FLOPs for Convolution
 *
 * @param[in] N                      Batch size.
 * @param[in] C                      Number of input channels.
 * @param[in] K                      Number of output channels.
 * @param[in] filter_spatial_lengths Filter spatial dimensions lengths.
 * @param[in] output_spatial_lengths Convolution output spatial dimensions
 *                                   lengths.
 *
 * @return The number of flops.
 */
std::size_t GetFlops(ck::index_t N,
                     ck::index_t C,
                     ck::index_t K,
                     const std::vector<ck::index_t>& filter_spatial_lengths,
                     const std::vector<ck::index_t>& output_spatial_lengths)
{
    // 2 * N * K * <output spatial lengths product> * C * <filter spatial lengths product>
    return static_cast<std::size_t>(2) * N * K *
           std::accumulate(std::begin(output_spatial_lengths),
                           std::end(output_spatial_lengths),
                           static_cast<std::size_t>(1),
                           std::multiplies<std::size_t>()) *
           C *
           std::accumulate(std::begin(filter_spatial_lengths),
                           std::end(filter_spatial_lengths),
                           static_cast<std::size_t>(1),
                           std::multiplies<std::size_t>());
}

/**
 * @brief Calculate number of bytes read/write by convolution algorithm.
 *
 * @param[in] N                      Batch size.
 * @param[in] C                      Number of input channels.
 * @param[in] K                      Number of output channels.
 * @param[in] input_spatial_lengths  Input spatial dimensions lengths.
 * @param[in] filter_spatial_lengths Filter spatial dimensions lengths.
 * @param[in] output_spatial_lengths Output spatial dimensions lengths
 *
 * @tparam InDataType  Input tensor data type.
 * @tparam WeiDataType Weights tensor data type.
 * @tparam OutDataType Output tensor data type.
 *
 * @return The number of used bytes.
 */
template <typename InDataType  = float,
          typename WeiDataType = InDataType,
          typename OutDataType = InDataType>
std::size_t GetBtype(ck::index_t N,
                     ck::index_t C,
                     ck::index_t K,
                     const std::vector<ck::index_t>& input_spatial_lengths,
                     const std::vector<ck::index_t>& filter_spatial_lengths,
                     const std::vector<ck::index_t>& output_spatial_lengths)
{
    // sizeof(InDataType) * (N * C * <input spatial lengths product>) +
    // sizeof(WeiDataType) * (K * C * <filter spatial lengths product>) +
    // sizeof(OutDataType) * (N * K * <output spatial lengths product>);
    return sizeof(InDataType) *
               (N * C *
                std::accumulate(std::begin(input_spatial_lengths),
                                std::end(input_spatial_lengths),
                                static_cast<std::size_t>(1),
                                std::multiplies<std::size_t>())) +
           sizeof(WeiDataType) *
               (K * C *
                std::accumulate(std::begin(filter_spatial_lengths),
                                std::end(filter_spatial_lengths),
                                static_cast<std::size_t>(1),
                                std::multiplies<std::size_t>())) +
           sizeof(OutDataType) *
               (N * K *
                std::accumulate(std::begin(output_spatial_lengths),
                                std::end(output_spatial_lengths),
                                static_cast<std::size_t>(1),
                                std::multiplies<std::size_t>()));
}

struct ConvParams
{
    ConvParams()
        : num_dim_spatial(2),
          N(128),
          K(256),
          C(192),
          filter_spatial_lengths(2, 3),
          input_spatial_lengths(2, 71),
          conv_filter_strides(2, 2),
          conv_filter_dilations(2, 1),
          input_left_pads(2, 1),
          input_right_pads(2, 1)
    {
    }

    ConvParams(ck::index_t n_dim_spatial,
               ck::index_t n,
               ck::index_t k,
               ck::index_t c,
               std::vector<ck::index_t> filter_lengths,
               std::vector<ck::index_t> input_lengths,
               std::vector<ck::index_t> conv_strides,
               std::vector<ck::index_t> conv_dilations,
               std::vector<ck::index_t> left_pads,
               std::vector<ck::index_t> right_pads)
        : num_dim_spatial(n_dim_spatial),
          N(n),
          K(k),
          C(c),
          filter_spatial_lengths(filter_lengths),
          input_spatial_lengths(input_lengths),
          conv_filter_strides(conv_strides),
          conv_filter_dilations(conv_dilations),
          input_left_pads(left_pads),
          input_right_pads(right_pads)
    {
    }

    ck::index_t num_dim_spatial;
    ck::index_t N;
    ck::index_t K;
    ck::index_t C;
    std::vector<ck::index_t> filter_spatial_lengths;
    std::vector<ck::index_t> input_spatial_lengths;
    std::vector<ck::index_t> conv_filter_strides;
    std::vector<ck::index_t> conv_filter_dilations;
    std::vector<ck::index_t> input_left_pads;
    std::vector<ck::index_t> input_right_pads;

    std::vector<ck::index_t> GetOutputSpatialLengths() const
    {
        std::vector<ck::index_t> out_spatial_len(num_dim_spatial, 0);
        for(ck::index_t i = 0; i < num_dim_spatial; ++i)
        {
            // XEff = (X - 1) * conv_dilation_w + 1;
            // Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
            const ck::index_t idx_eff =
                (filter_spatial_lengths[i] - 1) * conv_filter_dilations[i] + 1;
            out_spatial_len[i] =
                (input_spatial_lengths[i] + input_left_pads[i] + input_right_pads[i] - idx_eff) /
                    conv_filter_strides[i] +
                1;
        }
        return out_spatial_len;
    }
};

/**
 * @brief Gets the host tensor descriptor.
 *
 * @param[in] dims   The tensor dimensions lengths. Always in NCHW format.
 * @param[in] layout The tensor data layout.
 *
 * @tparam TensorLayout Layout type.
 *
 * @return The host tensor descriptor object.
 */
template <typename TensorLayout>
HostTensorDescriptor GetHostTensorDescriptor(const std::vector<std::size_t>& dims,
                                             const TensorLayout& layout)
{
    std::size_t C = dims[1];
    // 1D
    if constexpr(std::is_same<TensorLayout, ck::tensor_layout::convolution::NCW>::value ||
                 std::is_same<TensorLayout, ck::tensor_layout::convolution::KCX>::value ||
                 std::is_same<TensorLayout, ck::tensor_layout::convolution::NKW>::value)
    {
        return HostTensorDescriptor(dims, std::vector<std::size_t>({C * dims[2], dims[2], 1}));
    }
    else if constexpr(std::is_same<TensorLayout, ck::tensor_layout::convolution::NWC>::value ||
                      std::is_same<TensorLayout, ck::tensor_layout::convolution::KXC>::value ||
                      std::is_same<TensorLayout, ck::tensor_layout::convolution::NWK>::value)
    {
        return HostTensorDescriptor(dims, std::vector<std::size_t>({C * dims[2], 1, C}));
    }
    // 2D
    else if constexpr(std::is_same<TensorLayout, ck::tensor_layout::convolution::NCHW>::value ||
                      std::is_same<TensorLayout, ck::tensor_layout::convolution::KCYX>::value ||
                      std::is_same<TensorLayout, ck::tensor_layout::convolution::NKHW>::value)
    {
        return HostTensorDescriptor(
            dims, std::vector<std::size_t>{C * dims[2] * dims[3], dims[2] * dims[3], dims[3], 1});
    }
    else if constexpr(std::is_same<TensorLayout, ck::tensor_layout::convolution::NHWC>::value ||
                      std::is_same<TensorLayout, ck::tensor_layout::convolution::KYXC>::value ||
                      std::is_same<TensorLayout, ck::tensor_layout::convolution::NHWK>::value)
    {
        return HostTensorDescriptor(
            dims, std::vector<std::size_t>{C * dims[2] * dims[3], 1, dims[3] * C, C});
    }
    // 3D
    else if constexpr(std::is_same<TensorLayout, ck::tensor_layout::convolution::NCDHW>::value ||
                      std::is_same<TensorLayout, ck::tensor_layout::convolution::KCZYX>::value ||
                      std::is_same<TensorLayout, ck::tensor_layout::convolution::NKDHW>::value)
    {
        return HostTensorDescriptor(dims,
                                    std::vector<std::size_t>{C * dims[2] * dims[3] * dims[4],
                                                             dims[2] * dims[3] * dims[4],
                                                             dims[3] * dims[4],
                                                             dims[4],
                                                             1});
    }
    else if constexpr(std::is_same<TensorLayout, ck::tensor_layout::convolution::NDHWC>::value ||
                      std::is_same<TensorLayout, ck::tensor_layout::convolution::KZYXC>::value ||
                      std::is_same<TensorLayout, ck::tensor_layout::convolution::NDHWK>::value)
    {
        return HostTensorDescriptor(dims,
                                    std::vector<std::size_t>{C * dims[2] * dims[3] * dims[4],
                                                             1,
                                                             dims[3] * dims[4] * C,
                                                             dims[4] * C,
                                                             C});
    }

    std::stringstream err_msg;
    err_msg << "Unsupported data layout provided: " << layout << "!";
    throw std::runtime_error(err_msg.str());
}

} // namespace conv_util
} // namespace ck
#endif
+++ Added: library/include/ck/library/utility/conv_fwd_util.hpp

#ifndef CONV_FWD_UTIL_HPP
#define CONV_FWD_UTIL_HPP

#include <algorithm>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <numeric>
#include <sstream>
#include <random>
#include <tuple>
#include <type_traits>
#include <vector>

#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "device_conv_fwd.hpp"
#include "device_tensor.hpp"
#include "element_wise_operation.hpp"
#include "host_tensor.hpp"
#include "reference_conv_fwd.hpp"
#include "tensor_layout.hpp"

namespace ck {
namespace utils {
namespace conv {

using DeviceConvFwdNoOpPtr =
    ck::tensor_operation::device::DeviceConvFwdPtr<ck::tensor_operation::element_wise::PassThrough,
                                                   ck::tensor_operation::element_wise::PassThrough,
                                                   ck::tensor_operation::element_wise::PassThrough>;

/**
 * @brief Calculate number of FLOPs for Convolution
 * (doc comment identical to GetFlops above)
 */
std::size_t get_flops(ck::index_t N,
                      ck::index_t C,
                      ck::index_t K,
                      const std::vector<ck::index_t>& filter_spatial_lengths,
                      const std::vector<ck::index_t>& output_spatial_lengths)
{
    // 2 * N * K * <output spatial lengths product> * C * <filter spatial lengths product>
    // (body identical to GetFlops above)
}
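As a worked example of the formula, the default ConvParams shape defined below (N = 128, C = 192, K = 256, 3x3 filter, 71x71 input, stride 2, dilation 1, pads 1, giving a 36x36 output) yields

    2 * 128 * 256 * (36 * 36) * 192 * (3 * 3) = 146,767,085,568 ≈ 1.47e11 FLOPs

per forward pass.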
/**
 * @brief Calculate number of bytes read/write by convolution algorithm.
 * (doc comment identical to GetBtype above)
 */
template <typename InDataType  = float,
          typename WeiDataType = InDataType,
          typename OutDataType = InDataType>
std::size_t get_btype(ck::index_t N,
                      ck::index_t C,
                      ck::index_t K,
                      const std::vector<ck::index_t>& input_spatial_lengths,
                      const std::vector<ck::index_t>& filter_spatial_lengths,
                      const std::vector<ck::index_t>& output_spatial_lengths)
{
    // sizeof(InDataType) * (N * C * <input spatial lengths product>) +
    // sizeof(WeiDataType) * (K * C * <filter spatial lengths product>) +
    // sizeof(OutDataType) * (N * K * <output spatial lengths product>);
    // (body identical to GetBtype above)
}
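For the same default shape with all three tensors in float (4 bytes per element), get_btype counts

    4 * (128 * 192 * 71 * 71) ≈ 495.6 MB for the input,
    4 * (256 * 192 * 3 * 3)   ≈   1.8 MB for the weights,
    4 * (128 * 256 * 36 * 36) ≈ 169.9 MB for the output,

about 667 MB of traffic in total.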
struct ConvParams
{
    ConvParams()
        : num_dim_spatial(2),
          N(128),
          K(256),
          C(192),
          filter_spatial_lengths(2, 3),
          input_spatial_lengths(2, 71),
          conv_filter_strides(2, 2),
          conv_filter_dilations(2, 1),
          input_left_pads(2, 1),
          input_right_pads(2, 1)
    {
    }

    ConvParams(ck::index_t n_dim,
               ck::index_t n_batch,
               ck::index_t n_out_channels,
               ck::index_t n_in_channels,
               const std::vector<ck::index_t>& filters_len,
               const std::vector<ck::index_t>& input_len,
               const std::vector<ck::index_t>& strides,
               const std::vector<ck::index_t>& dilations,
               const std::vector<ck::index_t>& left_pads,
               const std::vector<ck::index_t>& right_pads)
        : num_dim_spatial(n_dim),
          N(n_batch),
          K(n_out_channels),
          C(n_in_channels),
          filter_spatial_lengths(filters_len),
          input_spatial_lengths(input_len),
          conv_filter_strides(strides),
          conv_filter_dilations(dilations),
          input_left_pads(left_pads),
          input_right_pads(right_pads)
    {
        if(filter_spatial_lengths.size() != num_dim_spatial ||
           input_spatial_lengths.size() != num_dim_spatial ||
           conv_filter_strides.size() != num_dim_spatial ||
           conv_filter_dilations.size() != num_dim_spatial ||
           input_left_pads.size() != num_dim_spatial ||
           input_right_pads.size() != num_dim_spatial)
        {
            throw(
                std::runtime_error("ConvParams::GetOutputSpatialLengths: "
                                   "parameter size is different from number of declared dimensions!"));
        }
    }

    ck::index_t num_dim_spatial;
    ck::index_t N;
    ck::index_t K;
    ck::index_t C;
    std::vector<ck::index_t> filter_spatial_lengths;
    std::vector<ck::index_t> input_spatial_lengths;
    std::vector<ck::index_t> conv_filter_strides;
    std::vector<ck::index_t> conv_filter_dilations;
    std::vector<ck::index_t> input_left_pads;
    std::vector<ck::index_t> input_right_pads;

    std::vector<ck::index_t> GetOutputSpatialLengths() const
    {
        if(filter_spatial_lengths.size() != num_dim_spatial ||
           input_spatial_lengths.size() != num_dim_spatial ||
           conv_filter_strides.size() != num_dim_spatial ||
           conv_filter_dilations.size() != num_dim_spatial ||
           input_left_pads.size() != num_dim_spatial ||
           input_right_pads.size() != num_dim_spatial)
        {
            throw(
                std::runtime_error("ConvParams::GetOutputSpatialLengths: "
                                   "parameter size is different from number of declared dimensions!"));
        }

        std::vector<ck::index_t> out_spatial_len(num_dim_spatial, 0);
        for(ck::index_t i = 0; i < num_dim_spatial; ++i)
        {
            // XEff = (X - 1) * conv_dilation_w + 1;
            // Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
            const ck::index_t idx_eff =
                (filter_spatial_lengths[i] - 1) * conv_filter_dilations[i] + 1;
            out_spatial_len[i] =
                (input_spatial_lengths[i] + input_left_pads[i] + input_right_pads[i] - idx_eff) /
                    conv_filter_strides[i] +
                1;
        }
        return out_spatial_len;
    }
};
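Worked through for the defaults above (Wi = 71, X = 3, dilation 1, pads 1/1, stride 2): XEff = (3 − 1) * 1 + 1 = 3, so Wo = (71 + 1 + 1 − 3) / 2 + 1 = 36 in each spatial dimension; this 36x36 output is the shape used in the FLOP and byte counts above.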
/**
 * @brief Gets the host tensor descriptor.
 *
 * @param[in] dims   The tensor dimensions lengths. Always in NCHW format.
 * @param[in] layout The tensor data layout.
 *
 * @tparam TensorLayout Layout type.
 *
 * @return The host tensor descriptor object.
 */
template <typename TensorLayout>
HostTensorDescriptor get_host_tensor_descriptor(const std::vector<std::size_t>& dims,
                                                const TensorLayout& layout)
{
    std::size_t C = dims[1];
    // 1D / 2D / 3D branches identical to GetHostTensorDescriptor above, except the
    // NDHWC/KZYXC/NDHWK strides are written with the channel factor first:
    //     {C * dims[2] * dims[3] * dims[4], 1, C * dims[3] * dims[4], C * dims[4], C}
    // (same values, commuted multiplications).
    ...
    std::stringstream err_msg;
    err_msg << "Unsupported data layout provided: " << layout << "!";
    throw std::runtime_error(err_msg.str());
}
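A short usage sketch (shape reused from the defaults; dims are always the NCHW-ordered lengths, and the layout tag picks the stride pattern, e.g. NHWC keeps channels innermost with stride 1):

    namespace tl = ck::tensor_layout::convolution;
    std::vector<std::size_t> dims{128, 192, 71, 71}; // {N, C, H, W}
    // NHWC strides: {C*H*W, 1, W*C, C} = {967872, 1, 13632, 192}
    auto desc = ck::utils::conv::get_host_tensor_descriptor(dims, tl::NHWC{});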
template <typename InDataType  = float,
          typename WeiDataType = float,
          typename OutDataType = float,
          typename InLayout    = ck::tensor_layout::convolution::NHWC,
          typename WeiLayout   = ck::tensor_layout::convolution::KYXC,
          typename OutLayout   = ck::tensor_layout::convolution::NHWK>
auto get_host_tensors(const ConvParams& params, bool init = true)
{
    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N),
                                        static_cast<std::size_t>(params.C)};
    input_dims.insert(std::end(input_dims),
                      std::begin(params.input_spatial_lengths),
                      std::end(params.input_spatial_lengths));

    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K),
                                         static_cast<std::size_t>(params.C)};
    filter_dims.insert(std::end(filter_dims),
                       std::begin(params.filter_spatial_lengths),
                       std::end(params.filter_spatial_lengths));

    const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N),
                                         static_cast<std::size_t>(params.K)};
    output_dims.insert(std::end(output_dims),
                       std::begin(output_spatial_lengths),
                       std::end(output_spatial_lengths));

    Tensor<InDataType> input(ck::utils::conv::get_host_tensor_descriptor(input_dims, InLayout{}));
    Tensor<WeiDataType> weights(
        ck::utils::conv::get_host_tensor_descriptor(filter_dims, WeiLayout{}));
    Tensor<OutDataType> host_output(
        ck::utils::conv::get_host_tensor_descriptor(output_dims, OutLayout{}));
    Tensor<OutDataType> device_output(
        ck::utils::conv::get_host_tensor_descriptor(output_dims, OutLayout{}));

    if(init)
    {
        std::mt19937 gen(11939);
        if constexpr(std::is_same<InDataType, uint8_t>::value)
        {
            std::uniform_int_distribution<> dis(-5, 5);
            std::generate(
                input.begin(), input.end(), [&dis, &gen]() { return InDataType(dis(gen)); });
            std::generate(
                weights.begin(), weights.end(), [&dis, &gen]() { return WeiDataType(dis(gen)); });
        }
        else
        {
            std::uniform_real_distribution<> dis(0.f, 1.f);
            std::generate(
                input.begin(), input.end(), [&dis, &gen]() { return InDataType(dis(gen)); });
            std::generate(
                weights.begin(), weights.end(), [&dis, &gen]() { return WeiDataType(dis(gen)); });
        }
        std::fill(host_output.begin(), host_output.end(), OutDataType(0.f));
        std::fill(device_output.begin(), device_output.end(), OutDataType(0.f));
    }

    return std::make_tuple(input, weights, host_output, device_output);
}
HostTensorDescriptor get_output_host_tensor_descriptor(const std::vector<std::size_t>& dims,
                                                       int num_dim_spatial = 2)
{
    namespace tl = ck::tensor_layout::convolution;

    switch(num_dim_spatial)
    {
    case 3: {
        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NDHWK{});
    }
    case 2: {
        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NHWK{});
    }
    case 1: {
        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NWK{});
    }
    default: {
        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
    }
    }
}

HostTensorDescriptor get_filters_host_tensor_descriptor(const std::vector<std::size_t>& dims,
                                                        int num_dim_spatial = 2)
{
    namespace tl = ck::tensor_layout::convolution;

    switch(num_dim_spatial)
    {
    case 3: {
        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::KZYXC{});
    }
    case 2: {
        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::KYXC{});
    }
    case 1: {
        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::KXC{});
    }
    default: {
        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
    }
    }
}

HostTensorDescriptor get_input_host_tensor_descriptor(const std::vector<std::size_t>& dims,
                                                      int num_dim_spatial = 2)
{
    namespace tl = ck::tensor_layout::convolution;

    switch(num_dim_spatial)
    {
    case 3: {
        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NDHWC{});
    }
    case 2: {
        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NHWC{});
    }
    case 1: {
        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NWC{});
    }
    default: {
        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
    }
    }
}
template <ck::index_t NDim,
          typename InDataType  = float,
          typename WeiDataType = float,
          typename OutDataType = float>
void run_reference_convolution_forward(const ConvParams& params,
                                       const Tensor<InDataType>& input,
                                       const Tensor<WeiDataType>& weights,
                                       Tensor<OutDataType>& output)
{
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;

    auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<InDataType,
                                                                 WeiDataType,
                                                                 OutDataType,
                                                                 PassThrough,
                                                                 PassThrough,
                                                                 PassThrough,
                                                                 NDim>();
    auto ref_invoker  = ref_conv.MakeInvoker();
    auto ref_argument = ref_conv.MakeArgument(input,
                                              weights,
                                              output,
                                              params.conv_filter_strides,
                                              params.conv_filter_dilations,
                                              params.input_left_pads,
                                              params.input_right_pads,
                                              PassThrough{},
                                              PassThrough{},
                                              PassThrough{});
    ref_invoker.Run(ref_argument);
}
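run_reference_convolution_forward simply wires the ConvParams strides, dilations, and paddings into a host-side ReferenceConvFwd with PassThrough element-wise operators on all three tensors. Calling it for a 2-D problem with the default float types reduces to one line:

    // Sketch: fill host_output with the CPU reference result.
    run_reference_convolution_forward<2>(params, input, weights, host_output);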
template <ck::index_t NDim,
          typename InDataType  = float,
          typename WeiDataType = float,
          typename OutDataType = float,
          template <ck::index_t, typename, typename, typename>
          class DeviceConvNDFwdInstance>
void run_convolution_forward(const ConvParams& params,
                             const Tensor<InDataType>& input,
                             const Tensor<WeiDataType>& weights,
                             Tensor<OutDataType>& output)
{
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;

    DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());
    DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace());
    DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpace());

    in_device_buf.ToDevice(input.mData.data());
    wei_device_buf.ToDevice(weights.mData.data());

    const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();

    auto conv     = DeviceConvNDFwdInstance<NDim, InDataType, WeiDataType, OutDataType>();
    auto invoker  = conv.MakeInvoker();
    auto argument = conv.MakeArgument(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                                      static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                                      static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
                                      params.N,
                                      params.K,
                                      params.C,
                                      params.input_spatial_lengths,
                                      params.filter_spatial_lengths,
                                      output_spatial_lengths,
                                      params.conv_filter_strides,
                                      params.conv_filter_dilations,
                                      params.input_left_pads,
                                      params.input_right_pads,
                                      PassThrough{},
                                      PassThrough{},
                                      PassThrough{});

    if(!conv.IsSupportedArgument(argument))
    {
        throw std::runtime_error(
            "Error! device_conv with the specified compilation parameters does "
            "not support this Conv problem");
    }

    invoker.Run(argument);
    out_device_buf.FromDevice(output.mData.data());
}
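run_convolution_forward owns the whole device round trip: allocate buffers, upload input and weights, build the op from the supplied instance template, reject unsupported arguments, run, and download the result. A sketch of an instantiation; MyDeviceConvNDFwdInstance is a placeholder for a concrete device-op template over <NDim, InDataType, WeiDataType, OutDataType>, since none is named in this excerpt:

    // Hypothetical device instance template.
    run_convolution_forward<2, float, float, float, MyDeviceConvNDFwdInstance>(
        params, input, weights, device_output);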
template <ck::index_t NDim,
          typename InDataType  = float,
          typename WeiDataType = float,
          typename OutDataType = float>
bool run_convolution_forward_instances(const ConvParams& params,
                                       const std::vector<DeviceConvFwdNoOpPtr>& conv_ptrs,
                                       const Tensor<InDataType>& input,
                                       const Tensor<WeiDataType>& weights,
                                       Tensor<OutDataType>& output,
                                       const Tensor<OutDataType>& host_output)
{
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;

    DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());
    DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace());
    DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpace());

    in_device_buf.ToDevice(input.mData.data());
    wei_device_buf.ToDevice(weights.mData.data());

    const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();

    bool res{true};
    for(auto& conv_ptr : conv_ptrs)
    {
        auto invoker  = conv_ptr->MakeInvokerPointer();
        auto argument = conv_ptr->MakeArgumentPointer(
            static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
            static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
            static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
            params.N,
            params.K,
            params.C,
            params.input_spatial_lengths,
            params.filter_spatial_lengths,
            output_spatial_lengths,
            params.conv_filter_strides,
            params.conv_filter_dilations,
            params.input_left_pads,
            params.input_right_pads,
            PassThrough{},
            PassThrough{},
            PassThrough{});

        if(conv_ptr->IsSupportedArgument(argument.get()))
        {
            float atol{1e-5f};
            float rtol{1e-4f};
            if constexpr(std::is_same_v<InDataType, ck::half_t>)
            {
                atol = 1e-4f;
                rtol = 2.5e-3f;
            }

            invoker->Run(argument.get());
            out_device_buf.FromDevice(output.mData.data());
            res = res && ck::utils::check_err(
                             output.mData, host_output.mData, "Error: incorrect results!", atol, rtol);
            hipGetErrorString(
                hipMemset(out_device_buf.GetDeviceBuffer(), 0, out_device_buf.mMemSize));
        }
    }
    return res;
}

} // namespace conv
} // namespace utils
} // namespace ck

#endif
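Taken together, the helpers support a verify-everything flow: compute the CPU reference once, then let run_convolution_forward_instances replay every registered device instance against it, accumulating a single pass/fail flag. A minimal sketch under the assumption of a 2-D float problem (how conv_ptrs is populated is outside this excerpt):

    // Sketch: reference once, then check all device instances against it.
    bool verify_all_instances(const ConvParams& params,
                              const std::vector<DeviceConvFwdNoOpPtr>& conv_ptrs,
                              const Tensor<float>& input,
                              const Tensor<float>& weights,
                              Tensor<float>& device_output,
                              Tensor<float>& host_output)
    {
        run_reference_convolution_forward<2>(params, input, weights, host_output);
        return run_convolution_forward_instances<2>(
            params, conv_ptrs, input, weights, device_output, host_output);
    }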
library/src/host_tensor/host_tensor.cpp
@@ -65,21 +65,10 @@ void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream
 }

 #if 1
-// FIXME: remove
-float bf16_to_f32_(ck::bhalf_t src_val)
-{
-    union
-    {
-        uint32_t int32;
-        float fp32;
-    } u = {uint32_t(src_val) << 16};
-    return u.fp32;
-}
-
 // FIXME: remove
 void bf16_to_f32_(const Tensor<ck::bhalf_t>& src, Tensor<float>& dst)
 {
     for(int i = 0; i < src.mData.size(); ++i)
-        dst.mData[i] = bf16_to_f32_(src.mData[i]);
+        dst.mData[i] = ck::type_convert<float>(src.mData[i]);
 }
 #endif
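The deleted scalar helper widened a bfloat16 bit pattern by shifting it into the upper half of a 32-bit float, which is what ck::type_convert<float> now does centrally. For reference, a self-contained sketch of the same trick using std::memcpy instead of the removed union (the well-defined way to type-pun in C++); bf16_bits_to_f32 is an illustrative name, not part of the library:

    #include <cstdint>
    #include <cstring>

    // bf16 -> f32: the 16 stored bits become the sign/exponent/high-mantissa
    // bits of the float; the low 16 mantissa bits are zero-filled.
    float bf16_bits_to_f32(std::uint16_t src_val)
    {
        std::uint32_t bits = std::uint32_t(src_val) << 16;
        float f;
        std::memcpy(&f, &bits, sizeof(f));
        return f;
    }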
library/src/obselete_driver_offline/conv_add_fwd_driver_offline_nchwc.cpp
@@ -4,6 +4,8 @@
 #include <cstdlib>
 #include <stdlib.h>
 #include <half.hpp>
+#include "check_err.hpp"
 #include "config.hpp"
 #include "debug.hpp"
 #include "print.hpp"
@@ -39,7 +41,7 @@ void host_direct_convolution_add_nchwc(const Tensor<TIn>& in,
                                        const ConvDilations& conv_dilations,
                                        const InLeftPads& in_left_pads,
                                        const InRightPads&,
-                                       const ck::ActivTypeEnum_t activ_type)
+                                       const ck::ActivTypeEnum activ_type)
 {
     using namespace ck;
@@ -117,7 +119,7 @@ int main(int argc, char* argv[])
         exit(1);
     }
-    constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;
+    constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;
     const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
     const bool do_verification = std::stoi(argv[2]);
@@ -167,7 +169,7 @@ int main(int argc, char* argv[])
     const bool do_log  = std::stoi(argv[4]);
     const int nrepeat  = std::stoi(argv[5]);
-    constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;
+    constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;
 #if 0
     constexpr auto N = Number<1>{};
@@ -401,7 +403,7 @@ int main(int argc, char* argv[])
                           make_tuple(in_right_pad_h, in_right_pad_w),
                           activ_type);
-        check_error(add_host, add_device);
+        ck::utils::check_err(add_device.mData, add_host.mData);
         if(do_log)
         {
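This rename pattern repeats in the drivers below: the old check_error(host, device) helper is replaced by ck::utils::check_err, which compares the raw data vectors (device result first, reference second) and also accepts an optional message and tolerances, as seen earlier in conv_fwd_util.hpp. A hedged sketch of the fuller form; the default tolerances implied by the two-argument driver calls are assumed to exist:

    // Sketch: explicit message plus the tolerances used in conv_fwd_util.hpp.
    bool ok = ck::utils::check_err(
        out_device.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f);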
library/src/obselete_driver_offline/conv_bwd_driver_offline.cpp
@@ -4,6 +4,8 @@
 #include <cstdlib>
 #include <stdlib.h>
 #include <half.hpp>
+#include "check_err.hpp"
 #include "config.hpp"
 #include "debug.hpp"
 #include "print.hpp"
@@ -473,7 +475,7 @@ int main(int argc, char* argv[])
                           make_tuple(in_right_pad_h, in_right_pad_w),
                           layout);
-        check_error(in_host, in_device);
+        ck::utils::check_err(in_device.mData, in_host.mData);
         if(do_log)
         {
library/src/obselete_driver_offline/conv_fwd_driver_offline.cpp
@@ -4,6 +4,8 @@
 #include <cstdlib>
 #include <stdlib.h>
 #include <half.hpp>
+#include "check_err.hpp"
 #include "config.hpp"
 #include "debug.hpp"
 #include "print.hpp"
@@ -534,7 +536,7 @@ int main(int argc, char* argv[])
                           make_tuple(in_right_pad_h, in_right_pad_w),
                           layout);
-        check_error(out_host, out_device);
+        ck::utils::check_err(out_device.mData, out_host.mData);
         if(do_log)
         {
library/src/obselete_driver_offline/conv_fwd_driver_offline_nchwc.cpp
@@ -4,6 +4,8 @@
 #include <cstdlib>
 #include <stdlib.h>
 #include <half.hpp>
+#include "check_err.hpp"
 #include "config.hpp"
 #include "debug.hpp"
 #include "print.hpp"
@@ -37,7 +39,7 @@ void host_direct_convolution_nchwc(const Tensor<TIn>& in,
                                    const ConvDilations& conv_dilations,
                                    const InLeftPads& in_left_pads,
                                    const InRightPads&,
-                                   const ck::ActivTypeEnum_t activ_type)
+                                   const ck::ActivTypeEnum activ_type)
 {
     using namespace ck;
@@ -102,7 +104,7 @@ int main(int argc, char* argv[])
         exit(1);
     }
-    constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;
+    constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;
     const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
     const bool do_verification = std::stoi(argv[2]);
@@ -149,8 +151,8 @@ int main(int argc, char* argv[])
     const bool do_log  = std::stoi(argv[4]);
     const int nrepeat  = std::stoi(argv[5]);
-    // constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::Sigmoid;
-    constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;
+    // constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::Sigmoid;
+    constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;
 #if 0
     constexpr auto N = Number<1>{};
@@ -377,7 +379,7 @@ int main(int argc, char* argv[])
                           make_tuple(in_right_pad_h, in_right_pad_w),
                           activ_type);
-        check_error(out_host, out_device);
+        ck::utils::check_err(out_device.mData, out_host.mData);
         if(do_log)
         {
library/src/obselete_driver_offline/conv_maxpool_fwd_driver_offline_nchwc.cpp
@@ -4,6 +4,8 @@
 #include <cstdlib>
 #include <stdlib.h>
 #include <half.hpp>
+#include "check_err.hpp"
 #include "config.hpp"
 #include "debug.hpp"
 #include "print.hpp"
@@ -38,7 +40,7 @@ void host_direct_convolution_maxpool_nchwc(const Tensor<TIn>& in,
                                            const ConvDilations& conv_dilations,
                                            const InLeftPads& in_left_pads,
                                            const InRightPads&,
-                                           const ck::ActivTypeEnum_t activ_type)
+                                           const ck::ActivTypeEnum activ_type)
 {
     using namespace ck;
@@ -126,7 +128,7 @@ int main(int argc, char* argv[])
         exit(1);
     }
-    constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;
+    constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;
     const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
     const bool do_verification = std::stoi(argv[2]);
@@ -176,18 +178,18 @@ int main(int argc, char* argv[])
     const bool do_log  = std::stoi(argv[4]);
     const int nrepeat  = std::stoi(argv[5]);
-    constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;
+    constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;
 #if 1
-    constexpr auto N  = Number<1>{};
-    constexpr auto Hi = Number<1080>{};
-    constexpr auto Wi = Number<1920>{};
-    constexpr auto Y  = Number<3>{};
-    constexpr auto X  = Number<3>{};
-    constexpr auto C0 = Number<2>{};
-    constexpr auto C1 = Number<8>{};
-    constexpr auto K0 = Number<2>{};
-    constexpr auto K1 = Number<8>{};
+    constexpr auto N  = Number<1>{};
+    constexpr auto Hi = Number<1080>{};
+    constexpr auto Wi = Number<1920>{};
+    constexpr auto Y  = Number<3>{};
+    constexpr auto X  = Number<3>{};
+    constexpr auto C0 = Number<2>{};
+    constexpr auto C1 = Number<8>{};
+    constexpr auto K0 = Number<2>{};
+    constexpr auto K1 = Number<8>{};
 #elif 0
     constexpr auto N  = Number<1>{};
    constexpr auto Hi = Number<1080>{};
@@ -397,8 +399,8 @@ int main(int argc, char* argv[])
                           make_tuple(in_right_pad_h, in_right_pad_w),
                           activ_type);
-        check_error(out_host, out_device);
-        check_error(max_host, max_device);
+        ck::utils::check_err(out_device.mData, out_host.mData);
+        ck::utils::check_err(max_device.mData, max_host.mData);
         if(do_log)
         {