Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
yangql
composable_kernel
Commits
07a673c6
Commit
07a673c6
authored
Apr 14, 2022
by
carlushuang
Browse files
Merge remote-tracking branch 'origin/develop' into cpu_avx2
parents
c0f698d5
ac0d8066
Changes
307
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
778 additions
and
220 deletions
+778
-220
library/include/ck/library/obselete_driver_offline/driver_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
...ward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
+2
-2
library/include/ck/library/obselete_driver_offline/driver_gemm_dlops_v1r2.hpp
...ibrary/obselete_driver_offline/driver_gemm_dlops_v1r2.hpp
+1
-1
library/include/ck/library/obselete_driver_offline/driver_gemm_dlops_v1r3.hpp
...ibrary/obselete_driver_offline/driver_gemm_dlops_v1r3.hpp
+1
-1
library/include/ck/library/obselete_driver_offline/driver_gemm_xdlops_v2r3.hpp
...brary/obselete_driver_offline/driver_gemm_xdlops_v2r3.hpp
+1
-1
library/include/ck/library/obselete_driver_offline/driver_gemm_xdlops_v2r4.hpp
...brary/obselete_driver_offline/driver_gemm_xdlops_v2r4.hpp
+1
-1
library/include/ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp
...e_tensor_operation/cpu/reference_conv_backward_weight.hpp
+3
-3
library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp
...eference_tensor_operation/cpu/reference_conv_bwd_data.hpp
+6
-6
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp
..._instance/gpu/reduce/device_reduce_instance_blockwise.hpp
+26
-26
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call.hpp
...u/reduce/device_reduce_instance_blockwise_second_call.hpp
+26
-26
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp
...u/reduce/device_reduce_instance_multiblock_atomic_add.hpp
+29
-29
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce.hpp
...duce/device_reduce_instance_multiblock_partial_reduce.hpp
+26
-26
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp
...instance/gpu/reduce/device_reduce_instance_threadwise.hpp
+26
-26
library/include/ck/library/utility/check_err.hpp
library/include/ck/library/utility/check_err.hpp
+40
-35
library/include/ck/library/utility/conv_fwd_util.hpp
library/include/ck/library/utility/conv_fwd_util.hpp
+554
-0
library/src/host_tensor/host_tensor.cpp
library/src/host_tensor/host_tensor.cpp
+1
-12
library/src/obselete_driver_offline/conv_add_fwd_driver_offline_nchwc.cpp
...lete_driver_offline/conv_add_fwd_driver_offline_nchwc.cpp
+6
-4
library/src/obselete_driver_offline/conv_bwd_driver_offline.cpp
...y/src/obselete_driver_offline/conv_bwd_driver_offline.cpp
+3
-1
library/src/obselete_driver_offline/conv_fwd_driver_offline.cpp
...y/src/obselete_driver_offline/conv_fwd_driver_offline.cpp
+3
-1
library/src/obselete_driver_offline/conv_fwd_driver_offline_nchwc.cpp
...obselete_driver_offline/conv_fwd_driver_offline_nchwc.cpp
+7
-5
library/src/obselete_driver_offline/conv_maxpool_fwd_driver_offline_nchwc.cpp
..._driver_offline/conv_maxpool_fwd_driver_offline_nchwc.cpp
+16
-14
No files found.
library/include/ck/library/obselete_driver_offline/driver_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
View file @
07a673c6
...
@@ -27,7 +27,7 @@ template <ck::index_t BlockSize,
...
@@ -27,7 +27,7 @@ template <ck::index_t BlockSize,
ck
::
index_t
ABlockTransferDstScalarPerVector_E2
,
ck
::
index_t
ABlockTransferDstScalarPerVector_E2
,
ck
::
index_t
BThreadTransferSrcScalarPerVector_E2
,
ck
::
index_t
BThreadTransferSrcScalarPerVector_E2
,
ck
::
index_t
CThreadTransferDstScalarPerVector_K
,
ck
::
index_t
CThreadTransferDstScalarPerVector_K
,
ck
::
ActivTypeEnum
_t
activ_type
>
ck
::
ActivTypeEnum
activ_type
>
struct
DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_maxpool
struct
DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_maxpool
{
{
template
<
typename
...
Wei
,
template
<
typename
...
Wei
,
...
@@ -305,7 +305,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
...
@@ -305,7 +305,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
FloatAB
,
FloatAB
,
FloatAcc
,
FloatAcc
,
FloatC
,
FloatC
,
InMemoryDataOperationEnum
_t
::
Set
,
InMemoryDataOperationEnum
::
Set
,
decltype
(
a_e0_e1_k_e2_grid_desc
),
decltype
(
a_e0_e1_k_e2_grid_desc
),
decltype
(
b_e0_e1_n_ho_wo_e2_grid_desc
),
decltype
(
b_e0_e1_n_ho_wo_e2_grid_desc
),
decltype
(
c_k_n_hop_wop_grid_desc
),
decltype
(
c_k_n_hop_wop_grid_desc
),
...
...
library/include/ck/library/obselete_driver_offline/driver_gemm_dlops_v1r2.hpp
View file @
07a673c6
...
@@ -10,7 +10,7 @@ template <ck::index_t BlockSize,
...
@@ -10,7 +10,7 @@ template <ck::index_t BlockSize,
typename
FloatAB
,
typename
FloatAB
,
typename
FloatAcc
,
typename
FloatAcc
,
typename
FloatC
,
typename
FloatC
,
ck
::
InMemoryDataOperationEnum
_t
CGlobalMemoryDataOperation
,
ck
::
InMemoryDataOperationEnum
CGlobalMemoryDataOperation
,
typename
AKMGridDesc
,
typename
AKMGridDesc
,
typename
BKNGridDesc
,
typename
BKNGridDesc
,
typename
CMNGridDesc
,
typename
CMNGridDesc
,
...
...
library/include/ck/library/obselete_driver_offline/driver_gemm_dlops_v1r3.hpp
View file @
07a673c6
...
@@ -10,7 +10,7 @@ template <ck::index_t BlockSize,
...
@@ -10,7 +10,7 @@ template <ck::index_t BlockSize,
typename
FloatAB
,
typename
FloatAB
,
typename
FloatAcc
,
typename
FloatAcc
,
typename
FloatC
,
typename
FloatC
,
ck
::
InMemoryDataOperationEnum
_t
CGlobalMemoryDataOperation
,
ck
::
InMemoryDataOperationEnum
CGlobalMemoryDataOperation
,
typename
AK0MK1GridDesc
,
typename
AK0MK1GridDesc
,
typename
BK0NK1GridDesc
,
typename
BK0NK1GridDesc
,
typename
CMNGridDesc
,
typename
CMNGridDesc
,
...
...
library/include/ck/library/obselete_driver_offline/driver_gemm_xdlops_v2r3.hpp
View file @
07a673c6
...
@@ -11,7 +11,7 @@ template <ck::index_t BlockSize,
...
@@ -11,7 +11,7 @@ template <ck::index_t BlockSize,
typename
FloatAB
,
typename
FloatAB
,
typename
FloatAcc
,
typename
FloatAcc
,
typename
FloatC
,
typename
FloatC
,
ck
::
InMemoryDataOperationEnum
_t
CGlobalMemoryDataOperation
,
ck
::
InMemoryDataOperationEnum
CGlobalMemoryDataOperation
,
typename
AGridDesc_K0_M_K1
,
typename
AGridDesc_K0_M_K1
,
typename
BGridDesc_K0_N_K
,
typename
BGridDesc_K0_N_K
,
typename
CMNGridDesc
,
typename
CMNGridDesc
,
...
...
library/include/ck/library/obselete_driver_offline/driver_gemm_xdlops_v2r4.hpp
View file @
07a673c6
...
@@ -10,7 +10,7 @@ template <ck::index_t BlockSize,
...
@@ -10,7 +10,7 @@ template <ck::index_t BlockSize,
typename
FloatAB
,
typename
FloatAB
,
typename
FloatAcc
,
typename
FloatAcc
,
typename
FloatC
,
typename
FloatC
,
ck
::
InMemoryDataOperationEnum
_t
CGlobalMemoryDataOperation
,
ck
::
InMemoryDataOperationEnum
CGlobalMemoryDataOperation
,
typename
ABK0MK1GridDesc
,
typename
ABK0MK1GridDesc
,
typename
BBK0NK1GridDesc
,
typename
BBK0NK1GridDesc
,
typename
CMNGridDesc
,
typename
CMNGridDesc
,
...
...
library/include/ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp
View file @
07a673c6
...
@@ -17,7 +17,7 @@ template <typename InDataType,
...
@@ -17,7 +17,7 @@ template <typename InDataType,
typename
InElementwiseOperation
,
typename
InElementwiseOperation
,
typename
WeiElementwiseOperation
,
typename
WeiElementwiseOperation
,
typename
OutElementwiseOperation
>
typename
OutElementwiseOperation
>
struct
ReferenceConv
Wrw
:
public
device
::
BaseOperator
struct
ReferenceConv
BwdWeight
:
public
device
::
BaseOperator
{
{
// Argument
// Argument
struct
Argument
:
public
device
::
BaseArgument
struct
Argument
:
public
device
::
BaseArgument
...
@@ -62,7 +62,7 @@ struct ReferenceConvWrw : public device::BaseOperator
...
@@ -62,7 +62,7 @@ struct ReferenceConvWrw : public device::BaseOperator
// Invoker
// Invoker
struct
Invoker
:
public
device
::
BaseInvoker
struct
Invoker
:
public
device
::
BaseInvoker
{
{
using
Argument
=
ReferenceConv
Wrw
::
Argument
;
using
Argument
=
ReferenceConv
BwdWeight
::
Argument
;
float
Run
(
const
Argument
&
arg
)
float
Run
(
const
Argument
&
arg
)
{
{
...
@@ -163,7 +163,7 @@ struct ReferenceConvWrw : public device::BaseOperator
...
@@ -163,7 +163,7 @@ struct ReferenceConvWrw : public device::BaseOperator
auto
str
=
std
::
stringstream
();
auto
str
=
std
::
stringstream
();
// clang-format off
// clang-format off
str
<<
"ReferenceConv
F
wd"
str
<<
"ReferenceConv
B
wd
Weight
"
<<
std
::
endl
;
<<
std
::
endl
;
// clang-format on
// clang-format on
...
...
library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp
View file @
07a673c6
...
@@ -19,7 +19,7 @@ template <typename InDataType,
...
@@ -19,7 +19,7 @@ template <typename InDataType,
typename
WeiElementwiseOperation
,
typename
WeiElementwiseOperation
,
typename
OutElementwiseOperation
,
typename
OutElementwiseOperation
,
ck
::
index_t
NumDimSpatial
=
2
,
ck
::
index_t
NumDimSpatial
=
2
,
typename
std
::
enable_if
<
NumDimSpatial
>
=
1
&&
NumDimSpatial
<=
3
,
bool
>::
type
=
false
>
typename
ck
::
enable_if
<
NumDimSpatial
>
=
1
&&
NumDimSpatial
<=
3
,
bool
>::
type
=
false
>
struct
ReferenceConvBwdData
:
public
device
::
BaseOperator
struct
ReferenceConvBwdData
:
public
device
::
BaseOperator
{
{
// Argument
// Argument
...
@@ -71,7 +71,7 @@ struct ReferenceConvBwdData : public device::BaseOperator
...
@@ -71,7 +71,7 @@ struct ReferenceConvBwdData : public device::BaseOperator
{
{
if
constexpr
(
NumDimSpatial
==
1
)
if
constexpr
(
NumDimSpatial
==
1
)
{
{
auto
f_nc
h
w
=
[
&
](
auto
n
,
auto
c
,
auto
wi
)
{
auto
f_ncw
=
[
&
](
auto
n
,
auto
c
,
auto
wi
)
{
std
::
size_t
K
=
arg
.
weight_
.
mDesc
.
GetLengths
()[
0
];
std
::
size_t
K
=
arg
.
weight_
.
mDesc
.
GetLengths
()[
0
];
std
::
size_t
X
=
arg
.
weight_
.
mDesc
.
GetLengths
()[
2
];
std
::
size_t
X
=
arg
.
weight_
.
mDesc
.
GetLengths
()[
2
];
std
::
size_t
Wo
=
arg
.
output_
.
mDesc
.
GetLengths
()[
2
];
std
::
size_t
Wo
=
arg
.
output_
.
mDesc
.
GetLengths
()[
2
];
...
@@ -108,7 +108,7 @@ struct ReferenceConvBwdData : public device::BaseOperator
...
@@ -108,7 +108,7 @@ struct ReferenceConvBwdData : public device::BaseOperator
arg
.
input_
(
n
,
c
,
wi
)
=
ck
::
type_convert
<
InDataType
>
(
v_in
);
arg
.
input_
(
n
,
c
,
wi
)
=
ck
::
type_convert
<
InDataType
>
(
v_in
);
};
};
make_ParallelTensorFunctor
(
f_nc
h
w
,
make_ParallelTensorFunctor
(
f_ncw
,
arg
.
input_
.
mDesc
.
GetLengths
()[
0
],
arg
.
input_
.
mDesc
.
GetLengths
()[
0
],
arg
.
input_
.
mDesc
.
GetLengths
()[
1
],
arg
.
input_
.
mDesc
.
GetLengths
()[
1
],
arg
.
input_
.
mDesc
.
GetLengths
()[
2
])(
arg
.
input_
.
mDesc
.
GetLengths
()[
2
])(
...
@@ -182,7 +182,7 @@ struct ReferenceConvBwdData : public device::BaseOperator
...
@@ -182,7 +182,7 @@ struct ReferenceConvBwdData : public device::BaseOperator
}
}
else
if
constexpr
(
NumDimSpatial
==
3
)
else
if
constexpr
(
NumDimSpatial
==
3
)
{
{
auto
f_nchw
=
[
&
](
auto
n
,
auto
c
,
auto
di
,
auto
hi
,
auto
wi
)
{
auto
f_nc
d
hw
=
[
&
](
auto
n
,
auto
c
,
auto
di
,
auto
hi
,
auto
wi
)
{
std
::
size_t
K
=
arg
.
weight_
.
mDesc
.
GetLengths
()[
0
];
std
::
size_t
K
=
arg
.
weight_
.
mDesc
.
GetLengths
()[
0
];
std
::
size_t
Z
=
arg
.
weight_
.
mDesc
.
GetLengths
()[
2
];
std
::
size_t
Z
=
arg
.
weight_
.
mDesc
.
GetLengths
()[
2
];
std
::
size_t
Y
=
arg
.
weight_
.
mDesc
.
GetLengths
()[
3
];
std
::
size_t
Y
=
arg
.
weight_
.
mDesc
.
GetLengths
()[
3
];
...
@@ -252,7 +252,7 @@ struct ReferenceConvBwdData : public device::BaseOperator
...
@@ -252,7 +252,7 @@ struct ReferenceConvBwdData : public device::BaseOperator
arg
.
input_
(
n
,
c
,
di
,
hi
,
wi
)
=
ck
::
type_convert
<
InDataType
>
(
v_in
);
arg
.
input_
(
n
,
c
,
di
,
hi
,
wi
)
=
ck
::
type_convert
<
InDataType
>
(
v_in
);
};
};
make_ParallelTensorFunctor
(
f_nchw
,
make_ParallelTensorFunctor
(
f_nc
d
hw
,
arg
.
input_
.
mDesc
.
GetLengths
()[
0
],
arg
.
input_
.
mDesc
.
GetLengths
()[
0
],
arg
.
input_
.
mDesc
.
GetLengths
()[
1
],
arg
.
input_
.
mDesc
.
GetLengths
()[
1
],
arg
.
input_
.
mDesc
.
GetLengths
()[
2
],
arg
.
input_
.
mDesc
.
GetLengths
()[
2
],
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp
View file @
07a673c6
...
@@ -47,7 +47,7 @@ using reduce_configuration_2_instances_blockwise = std::tuple<
...
@@ -47,7 +47,7 @@ using reduce_configuration_2_instances_blockwise = std::tuple<
>
;
>
;
#endif
#endif
template
<
typename
AccDataType
,
ReduceTensorOp
_t
ReduceOpId
>
template
<
typename
AccDataType
,
ReduceTensorOp
ReduceOpId
>
using
deviceReduceBlockWisePtrType
=
DeviceReducePtr
<
using
deviceReduceBlockWisePtrType
=
DeviceReducePtr
<
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
true
>::
InElementwiseOperation
,
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
true
>::
InElementwiseOperation
,
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
true
>::
AccElementwiseOperation
>
;
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
true
>::
AccElementwiseOperation
>
;
...
@@ -57,9 +57,9 @@ template <typename InDataType,
...
@@ -57,9 +57,9 @@ template <typename InDataType,
typename
OutDataType
,
typename
OutDataType
,
int
Rank
,
int
Rank
,
int
NumReduceDim
,
int
NumReduceDim
,
ReduceTensorOp
_t
ReduceOpId
,
ReduceTensorOp
ReduceOpId
,
NanPropagation
_t
NanOpt
,
NanPropagation
NanOpt
,
ReduceTensorIndices
_t
IndicesOpt
>
ReduceTensorIndices
IndicesOpt
>
void
add_device_reduce_instance_blockwise
(
void
add_device_reduce_instance_blockwise
(
std
::
vector
<
deviceReduceBlockWisePtrType
<
AccDataType
,
ReduceOpId
>>&
device_op_instances
)
std
::
vector
<
deviceReduceBlockWisePtrType
<
AccDataType
,
ReduceOpId
>>&
device_op_instances
)
{
{
...
@@ -71,11 +71,11 @@ void add_device_reduce_instance_blockwise(
...
@@ -71,11 +71,11 @@ void add_device_reduce_instance_blockwise(
AccElementwiseOperation
;
AccElementwiseOperation
;
constexpr
bool
Indexable
=
constexpr
bool
Indexable
=
(
ReduceOpId
==
ReduceTensorOp
_t
::
MIN
||
ReduceOpId
==
ReduceTensorOp
_t
::
MAX
||
(
ReduceOpId
==
ReduceTensorOp
::
MIN
||
ReduceOpId
==
ReduceTensorOp
::
MAX
||
ReduceOpId
==
ReduceTensorOp
_t
::
AMAX
);
ReduceOpId
==
ReduceTensorOp
::
AMAX
);
constexpr
bool
NeedIndices
=
Indexable
&&
(
IndicesOpt
!=
ReduceTensorIndices
_t
::
NO_INDICES
);
constexpr
bool
NeedIndices
=
Indexable
&&
(
IndicesOpt
!=
ReduceTensorIndices
::
NO_INDICES
);
constexpr
bool
PropagateNan
=
(
NanOpt
==
NanPropagation
_t
::
NOT_PROPAGATE_NAN
)
?
false
:
true
;
constexpr
bool
PropagateNan
=
(
NanOpt
==
NanPropagation
::
NOT_PROPAGATE_NAN
)
?
false
:
true
;
static_for
<
0
,
std
::
tuple_size
<
reduce_configuration_1_instances
>::
value
,
1
>
{}([
&
](
auto
i
)
{
static_for
<
0
,
std
::
tuple_size
<
reduce_configuration_1_instances
>::
value
,
1
>
{}([
&
](
auto
i
)
{
using
cfg1
=
using
cfg1
=
...
@@ -128,9 +128,9 @@ void add_device_reduce_instance_blockwise(
...
@@ -128,9 +128,9 @@ void add_device_reduce_instance_blockwise(
ADD_BLOCKWISE_INST_BY_TYPE(inT, \
ADD_BLOCKWISE_INST_BY_TYPE(inT, \
compT, \
compT, \
outT, \
outT, \
static_cast<ReduceTensorOp
_t
>(ReduceOpId), \
static_cast<ReduceTensorOp>(ReduceOpId), \
static_cast<NanPropagation
_t
>(NanOpt), \
static_cast<NanPropagation>(NanOpt), \
static_cast<ReduceTensorIndices
_t
>(IndicesOpt), \
static_cast<ReduceTensorIndices>(IndicesOpt), \
Rank, \
Rank, \
NumReduceDim)
NumReduceDim)
...
@@ -155,9 +155,9 @@ void add_device_reduce_instance_blockwise(
...
@@ -155,9 +155,9 @@ void add_device_reduce_instance_blockwise(
ADD_BLOCKWISE_INST_REF_BY_TYPE(inT, \
ADD_BLOCKWISE_INST_REF_BY_TYPE(inT, \
compT, \
compT, \
outT, \
outT, \
static_cast<ReduceTensorOp
_t
>(ReduceOpId), \
static_cast<ReduceTensorOp>(ReduceOpId), \
static_cast<NanPropagation
_t
>(NanOpt), \
static_cast<NanPropagation>(NanOpt), \
static_cast<ReduceTensorIndices
_t
>(IndicesOpt), \
static_cast<ReduceTensorIndices>(IndicesOpt), \
Rank, \
Rank, \
NumReduceDim)
NumReduceDim)
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call.hpp
View file @
07a673c6
...
@@ -34,7 +34,7 @@ using reduce_configuration_2_instances_blockwise_second_call = std::tuple<
...
@@ -34,7 +34,7 @@ using reduce_configuration_2_instances_blockwise_second_call = std::tuple<
>
;
>
;
#endif
#endif
template
<
typename
AccDataType
,
ReduceTensorOp
_t
ReduceOpId
>
template
<
typename
AccDataType
,
ReduceTensorOp
ReduceOpId
>
using
deviceReduceBlockWiseSecondCallPtrType
=
DeviceReducePtr
<
using
deviceReduceBlockWiseSecondCallPtrType
=
DeviceReducePtr
<
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
false
,
true
>::
InElementwiseOperation
,
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
false
,
true
>::
InElementwiseOperation
,
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
false
,
true
>::
AccElementwiseOperation
>
;
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
false
,
true
>::
AccElementwiseOperation
>
;
...
@@ -44,9 +44,9 @@ template <typename InDataType,
...
@@ -44,9 +44,9 @@ template <typename InDataType,
typename
OutDataType
,
typename
OutDataType
,
int
Rank
,
int
Rank
,
int
NumReduceDim
,
int
NumReduceDim
,
ReduceTensorOp
_t
ReduceOpId
,
ReduceTensorOp
ReduceOpId
,
NanPropagation
_t
NanOpt
,
NanPropagation
NanOpt
,
ReduceTensorIndices
_t
IndicesOpt
>
ReduceTensorIndices
IndicesOpt
>
void
add_device_reduce_instance_blockwise_second_call
(
void
add_device_reduce_instance_blockwise_second_call
(
std
::
vector
<
deviceReduceBlockWiseSecondCallPtrType
<
AccDataType
,
ReduceOpId
>>&
std
::
vector
<
deviceReduceBlockWiseSecondCallPtrType
<
AccDataType
,
ReduceOpId
>>&
device_op_instances
)
device_op_instances
)
...
@@ -60,11 +60,11 @@ void add_device_reduce_instance_blockwise_second_call(
...
@@ -60,11 +60,11 @@ void add_device_reduce_instance_blockwise_second_call(
AccElementwiseOperation
;
AccElementwiseOperation
;
constexpr
bool
Indexable
=
constexpr
bool
Indexable
=
(
ReduceOpId
==
ReduceTensorOp
_t
::
MIN
||
ReduceOpId
==
ReduceTensorOp
_t
::
MAX
||
(
ReduceOpId
==
ReduceTensorOp
::
MIN
||
ReduceOpId
==
ReduceTensorOp
::
MAX
||
ReduceOpId
==
ReduceTensorOp
_t
::
AMAX
);
ReduceOpId
==
ReduceTensorOp
::
AMAX
);
constexpr
bool
NeedIndices
=
Indexable
&&
(
IndicesOpt
!=
ReduceTensorIndices
_t
::
NO_INDICES
);
constexpr
bool
NeedIndices
=
Indexable
&&
(
IndicesOpt
!=
ReduceTensorIndices
::
NO_INDICES
);
constexpr
bool
PropagateNan
=
(
NanOpt
==
NanPropagation
_t
::
NOT_PROPAGATE_NAN
)
?
false
:
true
;
constexpr
bool
PropagateNan
=
(
NanOpt
==
NanPropagation
::
NOT_PROPAGATE_NAN
)
?
false
:
true
;
static_assert
(
std
::
is_same
<
InDataType
,
AccDataType
>::
value
,
static_assert
(
std
::
is_same
<
InDataType
,
AccDataType
>::
value
,
"InDataType and AccDataType should be the same to use "
"InDataType and AccDataType should be the same to use "
...
@@ -122,9 +122,9 @@ void add_device_reduce_instance_blockwise_second_call(
...
@@ -122,9 +122,9 @@ void add_device_reduce_instance_blockwise_second_call(
ADD_BLOCKWISE_SECOND_CALL_INST_BY_TYPE(inT, \
ADD_BLOCKWISE_SECOND_CALL_INST_BY_TYPE(inT, \
compT, \
compT, \
outT, \
outT, \
static_cast<ReduceTensorOp
_t
>(ReduceOpId), \
static_cast<ReduceTensorOp>(ReduceOpId), \
static_cast<NanPropagation
_t
>(NanOpt), \
static_cast<NanPropagation>(NanOpt), \
static_cast<ReduceTensorIndices
_t
>(IndicesOpt), \
static_cast<ReduceTensorIndices>(IndicesOpt), \
Rank, \
Rank, \
NumReduceDim)
NumReduceDim)
...
@@ -150,9 +150,9 @@ void add_device_reduce_instance_blockwise_second_call(
...
@@ -150,9 +150,9 @@ void add_device_reduce_instance_blockwise_second_call(
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_TYPE(inT, \
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_TYPE(inT, \
compT, \
compT, \
outT, \
outT, \
static_cast<ReduceTensorOp
_t
>(ReduceOpId), \
static_cast<ReduceTensorOp>(ReduceOpId), \
static_cast<NanPropagation
_t
>(NanOpt), \
static_cast<NanPropagation>(NanOpt), \
static_cast<ReduceTensorIndices
_t
>(IndicesOpt), \
static_cast<ReduceTensorIndices>(IndicesOpt), \
Rank, \
Rank, \
NumReduceDim)
NumReduceDim)
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp
View file @
07a673c6
...
@@ -47,7 +47,7 @@ using reduce_configuration_2_instances_multiblock_atomic_add = std::tuple<
...
@@ -47,7 +47,7 @@ using reduce_configuration_2_instances_multiblock_atomic_add = std::tuple<
>
;
>
;
#endif
#endif
template
<
typename
AccDataType
,
ReduceTensorOp
_t
ReduceOperation
>
template
<
typename
AccDataType
,
ReduceTensorOp
ReduceOperation
>
using
deviceReduceMultiBlockAtomicAddPtrType
=
using
deviceReduceMultiBlockAtomicAddPtrType
=
DeviceReducePtr
<
typename
reduce_unary_operator
<
AccDataType
,
ReduceOperation
,
true
,
true
>::
DeviceReducePtr
<
typename
reduce_unary_operator
<
AccDataType
,
ReduceOperation
,
true
,
true
>::
InElementwiseOperation
,
InElementwiseOperation
,
...
@@ -59,9 +59,9 @@ template <typename InDataType,
...
@@ -59,9 +59,9 @@ template <typename InDataType,
typename
OutDataType
,
typename
OutDataType
,
int
Rank
,
int
Rank
,
int
NumReduceDim
,
int
NumReduceDim
,
ReduceTensorOp
_t
ReduceOpId
,
ReduceTensorOp
ReduceOpId
,
NanPropagation
_t
NanOpt
,
NanPropagation
NanOpt
,
ReduceTensorIndices
_t
IndicesOpt
>
ReduceTensorIndices
IndicesOpt
>
void
add_device_reduce_instance_multiblock_atomic_add
(
void
add_device_reduce_instance_multiblock_atomic_add
(
std
::
vector
<
deviceReduceMultiBlockAtomicAddPtrType
<
AccDataType
,
ReduceOpId
>>&
std
::
vector
<
deviceReduceMultiBlockAtomicAddPtrType
<
AccDataType
,
ReduceOpId
>>&
device_op_instances
)
device_op_instances
)
...
@@ -74,18 +74,18 @@ void add_device_reduce_instance_multiblock_atomic_add(
...
@@ -74,18 +74,18 @@ void add_device_reduce_instance_multiblock_atomic_add(
AccElementwiseOperation
;
AccElementwiseOperation
;
constexpr
bool
Indexable
=
constexpr
bool
Indexable
=
(
ReduceOpId
==
ReduceTensorOp
_t
::
MIN
||
ReduceOpId
==
ReduceTensorOp
_t
::
MAX
||
(
ReduceOpId
==
ReduceTensorOp
::
MIN
||
ReduceOpId
==
ReduceTensorOp
::
MAX
||
ReduceOpId
==
ReduceTensorOp
_t
::
AMAX
);
ReduceOpId
==
ReduceTensorOp
::
AMAX
);
constexpr
bool
NeedIndices
=
Indexable
&&
(
IndicesOpt
!=
ReduceTensorIndices
_t
::
NO_INDICES
);
constexpr
bool
NeedIndices
=
Indexable
&&
(
IndicesOpt
!=
ReduceTensorIndices
::
NO_INDICES
);
constexpr
bool
PropagateNan
=
(
NanOpt
==
NanPropagation
_t
::
NOT_PROPAGATE_NAN
)
?
false
:
true
;
constexpr
bool
PropagateNan
=
(
NanOpt
==
NanPropagation
::
NOT_PROPAGATE_NAN
)
?
false
:
true
;
static_assert
(
IndicesOpt
==
ReduceTensorIndices
_t
::
NO_INDICES
,
static_assert
(
IndicesOpt
==
ReduceTensorIndices
::
NO_INDICES
,
"AtomicAdd can only be used with reduction operations without indices!"
);
"AtomicAdd can only be used with reduction operations without indices!"
);
constexpr
bool
op_acceptable
=
constexpr
bool
op_acceptable
=
(
ReduceOpId
==
ReduceTensorOp
_t
::
ADD
||
ReduceOpId
==
ReduceTensorOp
_t
::
MUL
||
(
ReduceOpId
==
ReduceTensorOp
::
ADD
||
ReduceOpId
==
ReduceTensorOp
::
MUL
||
ReduceOpId
==
ReduceTensorOp
_t
::
AVG
||
ReduceOpId
==
ReduceTensorOp
_t
::
NORM1
);
ReduceOpId
==
ReduceTensorOp
::
AVG
||
ReduceOpId
==
ReduceTensorOp
::
NORM1
);
constexpr
bool
out_type_acceptable
=
constexpr
bool
out_type_acceptable
=
(
std
::
is_same
<
OutDataType
,
float
>::
value
||
std
::
is_same
<
OutDataType
,
double
>::
value
);
(
std
::
is_same
<
OutDataType
,
float
>::
value
||
std
::
is_same
<
OutDataType
,
double
>::
value
);
...
@@ -149,9 +149,9 @@ void add_device_reduce_instance_multiblock_atomic_add(
...
@@ -149,9 +149,9 @@ void add_device_reduce_instance_multiblock_atomic_add(
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_TYPE(inT, \
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_TYPE(inT, \
compT, \
compT, \
outT, \
outT, \
static_cast<ReduceTensorOp
_t
>(ReduceOpId), \
static_cast<ReduceTensorOp>(ReduceOpId), \
static_cast<NanPropagation
_t
>(NanOpt), \
static_cast<NanPropagation>(NanOpt), \
static_cast<ReduceTensorIndices
_t
>(IndicesOpt), \
static_cast<ReduceTensorIndices>(IndicesOpt), \
Rank, \
Rank, \
NumReduceDim)
NumReduceDim)
...
@@ -176,9 +176,9 @@ void add_device_reduce_instance_multiblock_atomic_add(
...
@@ -176,9 +176,9 @@ void add_device_reduce_instance_multiblock_atomic_add(
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE(inT, \
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE(inT, \
compT, \
compT, \
outT, \
outT, \
static_cast<ReduceTensorOp
_t
>(ReduceOpId), \
static_cast<ReduceTensorOp>(ReduceOpId), \
static_cast<NanPropagation
_t
>(NanOpt), \
static_cast<NanPropagation>(NanOpt), \
static_cast<ReduceTensorIndices
_t
>(IndicesOpt), \
static_cast<ReduceTensorIndices>(IndicesOpt), \
Rank, \
Rank, \
NumReduceDim)
NumReduceDim)
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce.hpp
View file @
07a673c6
...
@@ -46,7 +46,7 @@ using reduce_configuration_2_instances_multiblock_partial_reduce = std::tuple<
...
@@ -46,7 +46,7 @@ using reduce_configuration_2_instances_multiblock_partial_reduce = std::tuple<
>
;
>
;
#endif
#endif
template
<
typename
AccDataType
,
ReduceTensorOp
_t
ReduceOpId
>
template
<
typename
AccDataType
,
ReduceTensorOp
ReduceOpId
>
using
deviceReduceMultiBlockPartialReducePtrType
=
DeviceReducePtr
<
using
deviceReduceMultiBlockPartialReducePtrType
=
DeviceReducePtr
<
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
false
>::
InElementwiseOperation
,
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
false
>::
InElementwiseOperation
,
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
false
>::
AccElementwiseOperation
>
;
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
false
>::
AccElementwiseOperation
>
;
...
@@ -56,9 +56,9 @@ template <typename InDataType,
...
@@ -56,9 +56,9 @@ template <typename InDataType,
typename
OutDataType
,
typename
OutDataType
,
int
Rank
,
int
Rank
,
int
NumReduceDim
,
int
NumReduceDim
,
ReduceTensorOp
_t
ReduceOpId
,
ReduceTensorOp
ReduceOpId
,
NanPropagation
_t
NanOpt
,
NanPropagation
NanOpt
,
ReduceTensorIndices
_t
IndicesOpt
>
ReduceTensorIndices
IndicesOpt
>
void
add_device_reduce_instance_multiblock_partial_reduce
(
void
add_device_reduce_instance_multiblock_partial_reduce
(
std
::
vector
<
deviceReduceMultiBlockPartialReducePtrType
<
AccDataType
,
ReduceOpId
>>&
std
::
vector
<
deviceReduceMultiBlockPartialReducePtrType
<
AccDataType
,
ReduceOpId
>>&
device_op_instances
)
device_op_instances
)
...
@@ -72,11 +72,11 @@ void add_device_reduce_instance_multiblock_partial_reduce(
...
@@ -72,11 +72,11 @@ void add_device_reduce_instance_multiblock_partial_reduce(
AccElementwiseOperation
;
AccElementwiseOperation
;
constexpr
bool
Indexable
=
constexpr
bool
Indexable
=
(
ReduceOpId
==
ReduceTensorOp
_t
::
MIN
||
ReduceOpId
==
ReduceTensorOp
_t
::
MAX
||
(
ReduceOpId
==
ReduceTensorOp
::
MIN
||
ReduceOpId
==
ReduceTensorOp
::
MAX
||
ReduceOpId
==
ReduceTensorOp
_t
::
AMAX
);
ReduceOpId
==
ReduceTensorOp
::
AMAX
);
constexpr
bool
NeedIndices
=
Indexable
&&
(
IndicesOpt
!=
ReduceTensorIndices
_t
::
NO_INDICES
);
constexpr
bool
NeedIndices
=
Indexable
&&
(
IndicesOpt
!=
ReduceTensorIndices
::
NO_INDICES
);
constexpr
bool
PropagateNan
=
(
NanOpt
==
NanPropagation
_t
::
NOT_PROPAGATE_NAN
)
?
false
:
true
;
constexpr
bool
PropagateNan
=
(
NanOpt
==
NanPropagation
::
NOT_PROPAGATE_NAN
)
?
false
:
true
;
static_for
<
0
,
std
::
tuple_size
<
reduce_configuration_1_instances
>::
value
,
1
>
{}([
&
](
auto
i
)
{
static_for
<
0
,
std
::
tuple_size
<
reduce_configuration_1_instances
>::
value
,
1
>
{}([
&
](
auto
i
)
{
using
cfg1
=
using
cfg1
=
...
@@ -131,9 +131,9 @@ void add_device_reduce_instance_multiblock_partial_reduce(
...
@@ -131,9 +131,9 @@ void add_device_reduce_instance_multiblock_partial_reduce(
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_TYPE(inT, \
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_TYPE(inT, \
compT, \
compT, \
outT, \
outT, \
static_cast<ReduceTensorOp
_t
>(ReduceOpId), \
static_cast<ReduceTensorOp>(ReduceOpId), \
static_cast<NanPropagation
_t
>(NanOpt), \
static_cast<NanPropagation>(NanOpt), \
static_cast<ReduceTensorIndices
_t
>(IndicesOpt), \
static_cast<ReduceTensorIndices>(IndicesOpt), \
Rank, \
Rank, \
NumReduceDim)
NumReduceDim)
...
@@ -159,9 +159,9 @@ void add_device_reduce_instance_multiblock_partial_reduce(
...
@@ -159,9 +159,9 @@ void add_device_reduce_instance_multiblock_partial_reduce(
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_TYPE(inT, \
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_TYPE(inT, \
compT, \
compT, \
outT, \
outT, \
static_cast<ReduceTensorOp
_t
>(ReduceOpId), \
static_cast<ReduceTensorOp>(ReduceOpId), \
static_cast<NanPropagation
_t
>(NanOpt), \
static_cast<NanPropagation>(NanOpt), \
static_cast<ReduceTensorIndices
_t
>(IndicesOpt), \
static_cast<ReduceTensorIndices>(IndicesOpt), \
Rank, \
Rank, \
NumReduceDim)
NumReduceDim)
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp
View file @
07a673c6
...
@@ -47,7 +47,7 @@ using reduce_configuration_2_instances_threadwise = std::tuple<
...
@@ -47,7 +47,7 @@ using reduce_configuration_2_instances_threadwise = std::tuple<
>
;
>
;
#endif
#endif
template
<
typename
AccDataType
,
ReduceTensorOp
_t
ReduceOpId
>
template
<
typename
AccDataType
,
ReduceTensorOp
ReduceOpId
>
using
deviceReduceThreadWisePtrType
=
DeviceReducePtr
<
using
deviceReduceThreadWisePtrType
=
DeviceReducePtr
<
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
true
>::
InElementwiseOperation
,
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
true
>::
InElementwiseOperation
,
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
true
>::
AccElementwiseOperation
>
;
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
true
>::
AccElementwiseOperation
>
;
...
@@ -57,9 +57,9 @@ template <typename InDataType,
...
@@ -57,9 +57,9 @@ template <typename InDataType,
typename
OutDataType
,
typename
OutDataType
,
int
Rank
,
int
Rank
,
int
NumReduceDim
,
int
NumReduceDim
,
ReduceTensorOp
_t
ReduceOpId
,
ReduceTensorOp
ReduceOpId
,
NanPropagation
_t
NanOpt
,
NanPropagation
NanOpt
,
ReduceTensorIndices
_t
IndicesOpt
>
ReduceTensorIndices
IndicesOpt
>
void
add_device_reduce_instance_threadwise
(
void
add_device_reduce_instance_threadwise
(
std
::
vector
<
deviceReduceThreadWisePtrType
<
AccDataType
,
ReduceOpId
>>&
device_op_instances
)
std
::
vector
<
deviceReduceThreadWisePtrType
<
AccDataType
,
ReduceOpId
>>&
device_op_instances
)
{
{
...
@@ -71,11 +71,11 @@ void add_device_reduce_instance_threadwise(
...
@@ -71,11 +71,11 @@ void add_device_reduce_instance_threadwise(
AccElementwiseOperation
;
AccElementwiseOperation
;
constexpr
bool
Indexable
=
constexpr
bool
Indexable
=
(
ReduceOpId
==
ReduceTensorOp
_t
::
MIN
||
ReduceOpId
==
ReduceTensorOp
_t
::
MAX
||
(
ReduceOpId
==
ReduceTensorOp
::
MIN
||
ReduceOpId
==
ReduceTensorOp
::
MAX
||
ReduceOpId
==
ReduceTensorOp
_t
::
AMAX
);
ReduceOpId
==
ReduceTensorOp
::
AMAX
);
constexpr
bool
NeedIndices
=
Indexable
&&
(
IndicesOpt
!=
ReduceTensorIndices
_t
::
NO_INDICES
);
constexpr
bool
NeedIndices
=
Indexable
&&
(
IndicesOpt
!=
ReduceTensorIndices
::
NO_INDICES
);
constexpr
bool
PropagateNan
=
(
NanOpt
==
NanPropagation
_t
::
NOT_PROPAGATE_NAN
)
?
false
:
true
;
constexpr
bool
PropagateNan
=
(
NanOpt
==
NanPropagation
::
NOT_PROPAGATE_NAN
)
?
false
:
true
;
using
cfg1
=
ReductionConfiguration_1
<
256
,
256
,
1
>
;
using
cfg1
=
ReductionConfiguration_1
<
256
,
256
,
1
>
;
...
@@ -124,9 +124,9 @@ void add_device_reduce_instance_threadwise(
...
@@ -124,9 +124,9 @@ void add_device_reduce_instance_threadwise(
ADD_THREADWISE_INST_BY_TYPE(inT, \
ADD_THREADWISE_INST_BY_TYPE(inT, \
compT, \
compT, \
outT, \
outT, \
static_cast<ReduceTensorOp
_t
>(ReduceOpId), \
static_cast<ReduceTensorOp>(ReduceOpId), \
static_cast<NanPropagation
_t
>(NanOpt), \
static_cast<NanPropagation>(NanOpt), \
static_cast<ReduceTensorIndices
_t
>(IndicesOpt), \
static_cast<ReduceTensorIndices>(IndicesOpt), \
Rank, \
Rank, \
NumReduceDim)
NumReduceDim)
...
@@ -151,9 +151,9 @@ void add_device_reduce_instance_threadwise(
...
@@ -151,9 +151,9 @@ void add_device_reduce_instance_threadwise(
ADD_THREADWISE_INST_REF_BY_TYPE(inT, \
ADD_THREADWISE_INST_REF_BY_TYPE(inT, \
compT, \
compT, \
outT, \
outT, \
static_cast<ReduceTensorOp
_t
>(ReduceOpId), \
static_cast<ReduceTensorOp>(ReduceOpId), \
static_cast<NanPropagation
_t
>(NanOpt), \
static_cast<NanPropagation>(NanOpt), \
static_cast<ReduceTensorIndices
_t
>(IndicesOpt), \
static_cast<ReduceTensorIndices>(IndicesOpt), \
Rank, \
Rank, \
NumReduceDim)
NumReduceDim)
...
...
test
/include/
test_util
.hpp
→
library
/include/
ck/library/utility/check_err
.hpp
View file @
07a673c6
#ifndef
TEST_UTIL
_HPP
#ifndef
CHECK_ERR
_HPP
#define
TEST_UTIL
_HPP
#define
CHECK_ERR
_HPP
#include <algorithm>
#include <algorithm>
#include <cmath>
#include <cmath>
#include <cstdlib>
#include <cstdlib>
#include <half.hpp>
#include <iostream>
#include <iostream>
#include <iomanip>
#include <iomanip>
#include <iterator>
#include <iterator>
...
@@ -13,14 +14,15 @@
...
@@ -13,14 +14,15 @@
#include "data_type.hpp"
#include "data_type.hpp"
namespace
test
{
namespace
ck
{
namespace
utils
{
template
<
typename
T
>
template
<
typename
T
>
typename
std
::
enable_if
<
std
::
is_floating_point
<
T
>::
value
&&
!
std
::
is_same
<
T
,
ck
::
half_t
>::
value
,
typename
std
::
enable_if
<
std
::
is_floating_point
<
T
>::
value
&&
!
std
::
is_same
<
T
,
half_t
>::
value
,
bool
>::
type
bool
>::
type
check_err
(
const
std
::
vector
<
T
>&
out
,
check_err
(
const
std
::
vector
<
T
>&
out
,
const
std
::
vector
<
T
>&
ref
,
const
std
::
vector
<
T
>&
ref
,
const
std
::
string
&
msg
,
const
std
::
string
&
msg
=
"Error: Incorrect results!"
,
double
rtol
=
1e-5
,
double
rtol
=
1e-5
,
double
atol
=
1e-8
)
double
atol
=
1e-8
)
{
{
...
@@ -60,13 +62,12 @@ check_err(const std::vector<T>& out,
...
@@ -60,13 +62,12 @@ check_err(const std::vector<T>& out,
}
}
template
<
typename
T
>
template
<
typename
T
>
typename
std
::
enable_if
<
std
::
is_same
<
T
,
ck
::
bhalf_t
>::
value
||
std
::
is_same
<
T
,
ck
::
half_t
>::
value
,
typename
std
::
enable_if
<
std
::
is_same
<
T
,
bhalf_t
>::
value
,
bool
>::
type
bool
>::
type
check_err
(
const
std
::
vector
<
T
>&
out
,
check_err
(
const
std
::
vector
<
T
>&
out
,
const
std
::
vector
<
T
>&
ref
,
const
std
::
vector
<
T
>&
ref
,
const
std
::
string
&
msg
,
const
std
::
string
&
msg
=
"Error: Incorrect results!"
,
double
rtol
=
1e-
5
,
double
rtol
=
1e-
3
,
double
atol
=
1e-
8
)
double
atol
=
1e-
3
)
{
{
if
(
out
.
size
()
!=
ref
.
size
())
if
(
out
.
size
()
!=
ref
.
size
())
{
{
...
@@ -79,11 +80,12 @@ check_err(const std::vector<T>& out,
...
@@ -79,11 +80,12 @@ check_err(const std::vector<T>& out,
bool
res
{
true
};
bool
res
{
true
};
int
err_count
=
0
;
int
err_count
=
0
;
double
err
=
0
;
double
err
=
0
;
double
max_err
=
ck
::
type_convert
<
float
>
(
ck
::
NumericLimits
<
T
>::
Min
());
// TODO: This is a hack. We should have proper specialization for bhalf_t data type.
double
max_err
=
std
::
numeric_limits
<
float
>::
min
();
for
(
std
::
size_t
i
=
0
;
i
<
ref
.
size
();
++
i
)
for
(
std
::
size_t
i
=
0
;
i
<
ref
.
size
();
++
i
)
{
{
float
o
=
ck
::
type_convert
<
float
>
(
out
[
i
]);
double
o
=
type_convert
<
float
>
(
out
[
i
]);
float
r
=
ck
::
type_convert
<
float
>
(
ref
[
i
]);
double
r
=
type_convert
<
float
>
(
ref
[
i
]);
err
=
std
::
abs
(
o
-
r
);
err
=
std
::
abs
(
o
-
r
);
if
(
err
>
atol
+
rtol
*
std
::
abs
(
r
)
||
!
std
::
isfinite
(
o
)
||
!
std
::
isfinite
(
r
))
if
(
err
>
atol
+
rtol
*
std
::
abs
(
r
)
||
!
std
::
isfinite
(
o
)
||
!
std
::
isfinite
(
r
))
{
{
...
@@ -105,11 +107,14 @@ check_err(const std::vector<T>& out,
...
@@ -105,11 +107,14 @@ check_err(const std::vector<T>& out,
return
res
;
return
res
;
}
}
bool
check_err
(
const
std
::
vector
<
ck
::
half_t
>&
out
,
template
<
typename
T
>
const
std
::
vector
<
ck
::
half_t
>&
ref
,
typename
std
::
enable_if
<
std
::
is_same
<
T
,
half_t
>::
value
||
std
::
is_same
<
T
,
half_float
::
half
>::
value
,
const
std
::
string
&
msg
,
bool
>::
type
ck
::
half_t
rtol
=
static_cast
<
ck
::
half_t
>
(
1e-3
f
),
check_err
(
const
std
::
vector
<
T
>&
out
,
ck
::
half_t
atol
=
static_cast
<
ck
::
half_t
>
(
1e-3
f
))
const
std
::
vector
<
T
>&
ref
,
const
std
::
string
&
msg
=
"Error: Incorrect results!"
,
double
rtol
=
1e-3
,
double
atol
=
1e-3
)
{
{
if
(
out
.
size
()
!=
ref
.
size
())
if
(
out
.
size
()
!=
ref
.
size
())
{
{
...
@@ -122,20 +127,20 @@ bool check_err(const std::vector<ck::half_t>& out,
...
@@ -122,20 +127,20 @@ bool check_err(const std::vector<ck::half_t>& out,
bool
res
{
true
};
bool
res
{
true
};
int
err_count
=
0
;
int
err_count
=
0
;
double
err
=
0
;
double
err
=
0
;
double
max_err
=
std
::
numeric_limits
<
ck
::
half_t
>::
min
();
double
max_err
=
std
::
numeric_limits
<
T
>::
min
();
for
(
std
::
size_t
i
=
0
;
i
<
ref
.
size
();
++
i
)
for
(
std
::
size_t
i
=
0
;
i
<
ref
.
size
();
++
i
)
{
{
double
o
ut_
=
double
(
out
[
i
]);
double
o
=
type_convert
<
float
>
(
out
[
i
]);
double
r
ef_
=
double
(
ref
[
i
]);
double
r
=
type_convert
<
float
>
(
ref
[
i
]);
err
=
std
::
abs
(
o
ut_
-
r
ef_
);
err
=
std
::
abs
(
o
-
r
);
if
(
err
>
atol
+
rtol
*
std
::
abs
(
r
ef_
)
||
!
std
::
isfinite
(
o
ut_
)
||
!
std
::
isfinite
(
r
ef_
))
if
(
err
>
atol
+
rtol
*
std
::
abs
(
r
)
||
!
std
::
isfinite
(
o
)
||
!
std
::
isfinite
(
r
))
{
{
max_err
=
err
>
max_err
?
err
:
max_err
;
max_err
=
err
>
max_err
?
err
:
max_err
;
err_count
++
;
err_count
++
;
if
(
err_count
<
5
)
if
(
err_count
<
5
)
{
{
std
::
cout
<<
std
::
setw
(
12
)
<<
std
::
setprecision
(
7
)
<<
"out["
<<
i
<<
"] != ref["
std
::
cout
<<
std
::
setw
(
12
)
<<
std
::
setprecision
(
7
)
<<
"out["
<<
i
<<
"] != ref["
<<
i
<<
"]: "
<<
o
ut_
<<
"!="
<<
r
ef_
<<
std
::
endl
<<
i
<<
"]: "
<<
o
<<
"
!=
"
<<
r
<<
std
::
endl
<<
msg
<<
std
::
endl
;
<<
msg
<<
std
::
endl
;
}
}
res
=
false
;
res
=
false
;
...
@@ -149,11 +154,10 @@ bool check_err(const std::vector<ck::half_t>& out,
...
@@ -149,11 +154,10 @@ bool check_err(const std::vector<ck::half_t>& out,
}
}
template
<
typename
T
>
template
<
typename
T
>
typename
std
::
enable_if
<
std
::
is_integral
<
T
>::
value
&&
!
std
::
is_same
<
T
,
ck
::
bhalf_t
>::
value
,
typename
std
::
enable_if
<
std
::
is_integral
<
T
>::
value
&&
!
std
::
is_same
<
T
,
bhalf_t
>::
value
,
bool
>::
type
bool
>::
type
check_err
(
const
std
::
vector
<
T
>&
out
,
check_err
(
const
std
::
vector
<
T
>&
out
,
const
std
::
vector
<
T
>&
ref
,
const
std
::
vector
<
T
>&
ref
,
const
std
::
string
&
msg
,
const
std
::
string
&
msg
=
"Error: Incorrect results!"
,
double
=
0
,
double
=
0
,
double
=
0
)
double
=
0
)
{
{
...
@@ -178,7 +182,8 @@ check_err(const std::vector<T>& out,
...
@@ -178,7 +182,8 @@ check_err(const std::vector<T>& out,
return
true
;
return
true
;
}
}
}
// namespace test
}
// namespace utils
}
// namespace ck
template
<
typename
T
>
template
<
typename
T
>
std
::
ostream
&
operator
<<
(
std
::
ostream
&
os
,
const
std
::
vector
<
T
>&
v
)
std
::
ostream
&
operator
<<
(
std
::
ostream
&
os
,
const
std
::
vector
<
T
>&
v
)
...
...
include/ck/
tensor_operation/gpu/device
/conv_util
s
.hpp
→
library/
include/ck/
library/utility
/conv_
fwd_
util.hpp
View file @
07a673c6
#ifndef CONV_UTIL
S
_HPP
#ifndef CONV_
FWD_
UTIL_HPP
#define CONV_UTIL
S
_HPP
#define CONV_
FWD_
UTIL_HPP
#include <algorithm>
#include <cstdlib>
#include <cstdlib>
#include <functional>
#include <functional>
#include <iterator>
#include <iterator>
#include <numeric>
#include <numeric>
#include <sstream>
#include <sstream>
#include <random>
#include <tuple>
#include <type_traits>
#include <type_traits>
#include <vector>
#include <vector>
#include "check_err.hpp"
#include "config.hpp"
#include "config.hpp"
#include "device.hpp"
#include "device_conv_fwd.hpp"
#include "device_tensor.hpp"
#include "element_wise_operation.hpp"
#include "host_tensor.hpp"
#include "host_tensor.hpp"
#include "reference_conv_fwd.hpp"
#include "tensor_layout.hpp"
#include "tensor_layout.hpp"
namespace
ck
{
namespace
ck
{
namespace
conv_util
{
namespace
utils
{
namespace
conv
{
using
DeviceConvFwdNoOpPtr
=
ck
::
tensor_operation
::
device
::
DeviceConvFwdPtr
<
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
>
;
/**
/**
* @brief Calculate number of FLOPs for Convolution
* @brief Calculate number of FLOPs for Convolution
...
@@ -28,7 +43,7 @@ namespace conv_util {
...
@@ -28,7 +43,7 @@ namespace conv_util {
*
*
* @return The number of flops.
* @return The number of flops.
*/
*/
std
::
size_t
G
et
F
lops
(
ck
::
index_t
N
,
std
::
size_t
g
et
_f
lops
(
ck
::
index_t
N
,
ck
::
index_t
C
,
ck
::
index_t
C
,
ck
::
index_t
K
,
ck
::
index_t
K
,
const
std
::
vector
<
ck
::
index_t
>&
filter_spatial_lengths
,
const
std
::
vector
<
ck
::
index_t
>&
filter_spatial_lengths
,
...
@@ -66,7 +81,7 @@ std::size_t GetFlops(ck::index_t N,
...
@@ -66,7 +81,7 @@ std::size_t GetFlops(ck::index_t N,
template
<
typename
InDataType
=
float
,
template
<
typename
InDataType
=
float
,
typename
WeiDataType
=
InDataType
,
typename
WeiDataType
=
InDataType
,
typename
OutDataType
=
InDataType
>
typename
OutDataType
=
InDataType
>
std
::
size_t
G
et
B
type
(
ck
::
index_t
N
,
std
::
size_t
g
et
_b
type
(
ck
::
index_t
N
,
ck
::
index_t
C
,
ck
::
index_t
C
,
ck
::
index_t
K
,
ck
::
index_t
K
,
const
std
::
vector
<
ck
::
index_t
>&
input_spatial_lengths
,
const
std
::
vector
<
ck
::
index_t
>&
input_spatial_lengths
,
...
@@ -108,27 +123,38 @@ struct ConvParams
...
@@ -108,27 +123,38 @@ struct ConvParams
input_right_pads
(
2
,
1
)
input_right_pads
(
2
,
1
)
{
{
}
}
ConvParams
(
ck
::
index_t
n_dim_spatial
,
ck
::
index_t
n
,
ConvParams
(
ck
::
index_t
n_dim
,
ck
::
index_t
k
,
ck
::
index_t
n_batch
,
ck
::
index_t
c
,
ck
::
index_t
n_out_channels
,
std
::
vector
<
ck
::
index_t
>
filter_lengths
,
ck
::
index_t
n_in_channels
,
std
::
vector
<
ck
::
index_t
>
input_lengths
,
const
std
::
vector
<
ck
::
index_t
>&
filters_len
,
std
::
vector
<
ck
::
index_t
>
conv_strides
,
const
std
::
vector
<
ck
::
index_t
>&
input_len
,
std
::
vector
<
ck
::
index_t
>
conv_dilations
,
const
std
::
vector
<
ck
::
index_t
>&
strides
,
std
::
vector
<
ck
::
index_t
>
left_pads
,
const
std
::
vector
<
ck
::
index_t
>&
dilations
,
std
::
vector
<
ck
::
index_t
>
right_pads
)
const
std
::
vector
<
ck
::
index_t
>&
left_pads
,
:
num_dim_spatial
(
n_dim_spatial
),
const
std
::
vector
<
ck
::
index_t
>&
right_pads
)
N
(
n
),
:
num_dim_spatial
(
n_dim
),
K
(
k
),
N
(
n_batch
),
C
(
c
),
K
(
n_out_channels
),
filter_spatial_lengths
(
filter_lengths
),
C
(
n_in_channels
),
input_spatial_lengths
(
input_lengths
),
filter_spatial_lengths
(
filters_len
),
conv_filter_strides
(
conv_strides
),
input_spatial_lengths
(
input_len
),
conv_filter_dilations
(
conv_dilations
),
conv_filter_strides
(
strides
),
conv_filter_dilations
(
dilations
),
input_left_pads
(
left_pads
),
input_left_pads
(
left_pads
),
input_right_pads
(
right_pads
)
input_right_pads
(
right_pads
)
{
{
if
(
filter_spatial_lengths
.
size
()
!=
num_dim_spatial
||
input_spatial_lengths
.
size
()
!=
num_dim_spatial
||
conv_filter_strides
.
size
()
!=
num_dim_spatial
||
conv_filter_dilations
.
size
()
!=
num_dim_spatial
||
input_left_pads
.
size
()
!=
num_dim_spatial
||
input_right_pads
.
size
()
!=
num_dim_spatial
)
{
throw
(
std
::
runtime_error
(
"ConvParams::GetOutputSpatialLengths: "
"parameter size is different from number of declared dimensions!"
));
}
}
}
ck
::
index_t
num_dim_spatial
;
ck
::
index_t
num_dim_spatial
;
...
@@ -147,6 +173,17 @@ struct ConvParams
...
@@ -147,6 +173,17 @@ struct ConvParams
std
::
vector
<
ck
::
index_t
>
GetOutputSpatialLengths
()
const
std
::
vector
<
ck
::
index_t
>
GetOutputSpatialLengths
()
const
{
{
if
(
filter_spatial_lengths
.
size
()
!=
num_dim_spatial
||
input_spatial_lengths
.
size
()
!=
num_dim_spatial
||
conv_filter_strides
.
size
()
!=
num_dim_spatial
||
conv_filter_dilations
.
size
()
!=
num_dim_spatial
||
input_left_pads
.
size
()
!=
num_dim_spatial
||
input_right_pads
.
size
()
!=
num_dim_spatial
)
{
throw
(
std
::
runtime_error
(
"ConvParams::GetOutputSpatialLengths: "
"parameter size is different from number of declared dimensions!"
));
}
std
::
vector
<
ck
::
index_t
>
out_spatial_len
(
num_dim_spatial
,
0
);
std
::
vector
<
ck
::
index_t
>
out_spatial_len
(
num_dim_spatial
,
0
);
for
(
ck
::
index_t
i
=
0
;
i
<
num_dim_spatial
;
++
i
)
for
(
ck
::
index_t
i
=
0
;
i
<
num_dim_spatial
;
++
i
)
{
{
...
@@ -174,7 +211,7 @@ struct ConvParams
...
@@ -174,7 +211,7 @@ struct ConvParams
* @return The host tensor descriptor object.
* @return The host tensor descriptor object.
*/
*/
template
<
typename
TensorLayout
>
template
<
typename
TensorLayout
>
HostTensorDescriptor
G
et
H
ost
T
ensor
D
escriptor
(
const
std
::
vector
<
std
::
size_t
>&
dims
,
HostTensorDescriptor
g
et
_h
ost
_t
ensor
_d
escriptor
(
const
std
::
vector
<
std
::
size_t
>&
dims
,
const
TensorLayout
&
layout
)
const
TensorLayout
&
layout
)
{
{
std
::
size_t
C
=
dims
[
1
];
std
::
size_t
C
=
dims
[
1
];
...
@@ -228,7 +265,7 @@ HostTensorDescriptor GetHostTensorDescriptor(const std::vector<std::size_t>& dim
...
@@ -228,7 +265,7 @@ HostTensorDescriptor GetHostTensorDescriptor(const std::vector<std::size_t>& dim
return
HostTensorDescriptor
(
return
HostTensorDescriptor
(
dims
,
dims
,
std
::
vector
<
std
::
size_t
>
{
std
::
vector
<
std
::
size_t
>
{
C
*
dims
[
2
]
*
dims
[
3
]
*
dims
[
4
],
1
,
dims
[
3
]
*
dims
[
4
]
*
C
,
dims
[
4
]
*
C
,
C
});
C
*
dims
[
2
]
*
dims
[
3
]
*
dims
[
4
],
1
,
C
*
dims
[
3
]
*
dims
[
4
]
,
C
*
dims
[
4
],
C
});
}
}
std
::
stringstream
err_msg
;
std
::
stringstream
err_msg
;
...
@@ -236,7 +273,282 @@ HostTensorDescriptor GetHostTensorDescriptor(const std::vector<std::size_t>& dim
...
@@ -236,7 +273,282 @@ HostTensorDescriptor GetHostTensorDescriptor(const std::vector<std::size_t>& dim
throw
std
::
runtime_error
(
err_msg
.
str
());
throw
std
::
runtime_error
(
err_msg
.
str
());
}
}
}
// namespace conv_util
template
<
typename
InDataType
=
float
,
typename
WeiDataType
=
float
,
typename
OutDataType
=
float
,
typename
InLayout
=
ck
::
tensor_layout
::
convolution
::
NHWC
,
typename
WeiLayout
=
ck
::
tensor_layout
::
convolution
::
KYXC
,
typename
OutLayout
=
ck
::
tensor_layout
::
convolution
::
NHWK
>
auto
get_host_tensors
(
const
ConvParams
&
params
,
bool
init
=
true
)
{
std
::
vector
<
std
::
size_t
>
input_dims
{
static_cast
<
std
::
size_t
>
(
params
.
N
),
static_cast
<
std
::
size_t
>
(
params
.
C
)};
input_dims
.
insert
(
std
::
end
(
input_dims
),
std
::
begin
(
params
.
input_spatial_lengths
),
std
::
end
(
params
.
input_spatial_lengths
));
std
::
vector
<
std
::
size_t
>
filter_dims
{
static_cast
<
std
::
size_t
>
(
params
.
K
),
static_cast
<
std
::
size_t
>
(
params
.
C
)};
filter_dims
.
insert
(
std
::
end
(
filter_dims
),
std
::
begin
(
params
.
filter_spatial_lengths
),
std
::
end
(
params
.
filter_spatial_lengths
));
const
std
::
vector
<
ck
::
index_t
>&
output_spatial_lengths
=
params
.
GetOutputSpatialLengths
();
std
::
vector
<
std
::
size_t
>
output_dims
{
static_cast
<
std
::
size_t
>
(
params
.
N
),
static_cast
<
std
::
size_t
>
(
params
.
K
)};
output_dims
.
insert
(
std
::
end
(
output_dims
),
std
::
begin
(
output_spatial_lengths
),
std
::
end
(
output_spatial_lengths
));
Tensor
<
InDataType
>
input
(
ck
::
utils
::
conv
::
get_host_tensor_descriptor
(
input_dims
,
InLayout
{}));
Tensor
<
WeiDataType
>
weights
(
ck
::
utils
::
conv
::
get_host_tensor_descriptor
(
filter_dims
,
WeiLayout
{}));
Tensor
<
OutDataType
>
host_output
(
ck
::
utils
::
conv
::
get_host_tensor_descriptor
(
output_dims
,
OutLayout
{}));
Tensor
<
OutDataType
>
device_output
(
ck
::
utils
::
conv
::
get_host_tensor_descriptor
(
output_dims
,
OutLayout
{}));
if
(
init
)
{
std
::
mt19937
gen
(
11939
);
if
constexpr
(
std
::
is_same
<
InDataType
,
uint8_t
>::
value
)
{
std
::
uniform_int_distribution
<>
dis
(
-
5
,
5
);
std
::
generate
(
input
.
begin
(),
input
.
end
(),
[
&
dis
,
&
gen
]()
{
return
InDataType
(
dis
(
gen
));
});
std
::
generate
(
weights
.
begin
(),
weights
.
end
(),
[
&
dis
,
&
gen
]()
{
return
WeiDataType
(
dis
(
gen
));
});
}
else
{
std
::
uniform_real_distribution
<>
dis
(
0.
f
,
1.
f
);
std
::
generate
(
input
.
begin
(),
input
.
end
(),
[
&
dis
,
&
gen
]()
{
return
InDataType
(
dis
(
gen
));
});
std
::
generate
(
weights
.
begin
(),
weights
.
end
(),
[
&
dis
,
&
gen
]()
{
return
WeiDataType
(
dis
(
gen
));
});
}
std
::
fill
(
host_output
.
begin
(),
host_output
.
end
(),
OutDataType
(
0.
f
));
std
::
fill
(
device_output
.
begin
(),
device_output
.
end
(),
OutDataType
(
0.
f
));
}
return
std
::
make_tuple
(
input
,
weights
,
host_output
,
device_output
);
}
/**
 * @brief Create the host tensor descriptor of a convolution output tensor.
 *
 * Declared inline because this non-template function is defined in a header;
 * without it every translation unit including the header would emit its own
 * definition and linking would fail with duplicate symbols.
 *
 * @param dims             Tensor dimensions forwarded to get_host_tensor_descriptor.
 * @param num_dim_spatial  Number of spatial dimensions: 1 (NWK), 2 (NHWK) or 3 (NDHWK).
 *
 * @return The host tensor descriptor for the selected layout.
 *
 * @throws std::runtime_error if num_dim_spatial is not 1, 2 or 3.
 */
inline HostTensorDescriptor
get_output_host_tensor_descriptor(const std::vector<std::size_t>& dims, int num_dim_spatial = 2)
{
    namespace tl = ck::tensor_layout::convolution;

    switch(num_dim_spatial)
    {
    case 3: {
        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NDHWK{});
    }
    case 2: {
        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NHWK{});
    }
    case 1: {
        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NWK{});
    }
    default: {
        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
    }
    }
}
/**
 * @brief Create the host tensor descriptor of a convolution filter (weights) tensor.
 *
 * Declared inline because this non-template function is defined in a header;
 * without it every translation unit including the header would emit its own
 * definition and linking would fail with duplicate symbols.
 *
 * @param dims             Tensor dimensions forwarded to get_host_tensor_descriptor.
 * @param num_dim_spatial  Number of spatial dimensions: 1 (KXC), 2 (KYXC) or 3 (KZYXC).
 *
 * @return The host tensor descriptor for the selected layout.
 *
 * @throws std::runtime_error if num_dim_spatial is not 1, 2 or 3.
 */
inline HostTensorDescriptor
get_filters_host_tensor_descriptor(const std::vector<std::size_t>& dims, int num_dim_spatial = 2)
{
    namespace tl = ck::tensor_layout::convolution;

    switch(num_dim_spatial)
    {
    case 3: {
        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::KZYXC{});
    }
    case 2: {
        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::KYXC{});
    }
    case 1: {
        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::KXC{});
    }
    default: {
        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
    }
    }
}
/**
 * @brief Create the host tensor descriptor of a convolution input tensor.
 *
 * Declared inline because this non-template function is defined in a header;
 * without it every translation unit including the header would emit its own
 * definition and linking would fail with duplicate symbols.
 *
 * @param dims             Tensor dimensions forwarded to get_host_tensor_descriptor.
 * @param num_dim_spatial  Number of spatial dimensions: 1 (NWC), 2 (NHWC) or 3 (NDHWC).
 *
 * @return The host tensor descriptor for the selected layout.
 *
 * @throws std::runtime_error if num_dim_spatial is not 1, 2 or 3.
 */
inline HostTensorDescriptor
get_input_host_tensor_descriptor(const std::vector<std::size_t>& dims, int num_dim_spatial = 2)
{
    namespace tl = ck::tensor_layout::convolution;

    switch(num_dim_spatial)
    {
    case 3: {
        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NDHWC{});
    }
    case 2: {
        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NHWC{});
    }
    case 1: {
        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NWC{});
    }
    default: {
        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
    }
    }
}
template
<
ck
::
index_t
NDim
,
typename
InDataType
=
float
,
typename
WeiDataType
=
float
,
typename
OutDataType
=
float
>
void
run_reference_convolution_forward
(
const
ConvParams
&
params
,
const
Tensor
<
InDataType
>&
input
,
const
Tensor
<
WeiDataType
>&
weights
,
Tensor
<
OutDataType
>&
output
)
{
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
auto
ref_conv
=
ck
::
tensor_operation
::
host
::
ReferenceConvFwd
<
InDataType
,
WeiDataType
,
OutDataType
,
PassThrough
,
PassThrough
,
PassThrough
,
NDim
>
();
auto
ref_invoker
=
ref_conv
.
MakeInvoker
();
auto
ref_argument
=
ref_conv
.
MakeArgument
(
input
,
weights
,
output
,
params
.
conv_filter_strides
,
params
.
conv_filter_dilations
,
params
.
input_left_pads
,
params
.
input_right_pads
,
PassThrough
{},
PassThrough
{},
PassThrough
{});
ref_invoker
.
Run
(
ref_argument
);
}
template
<
ck
::
index_t
NDim
,
typename
InDataType
=
float
,
typename
WeiDataType
=
float
,
typename
OutDataType
=
float
,
template
<
ck
::
index_t
,
typename
,
typename
,
typename
>
class
DeviceConvNDFwdInstance
>
void
run_convolution_forward
(
const
ConvParams
&
params
,
const
Tensor
<
InDataType
>&
input
,
const
Tensor
<
WeiDataType
>&
weights
,
Tensor
<
OutDataType
>&
output
)
{
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
DeviceMem
in_device_buf
(
sizeof
(
InDataType
)
*
input
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_device_buf
(
sizeof
(
WeiDataType
)
*
weights
.
mDesc
.
GetElementSpace
());
DeviceMem
out_device_buf
(
sizeof
(
OutDataType
)
*
output
.
mDesc
.
GetElementSpace
());
in_device_buf
.
ToDevice
(
input
.
mData
.
data
());
wei_device_buf
.
ToDevice
(
weights
.
mData
.
data
());
const
std
::
vector
<
ck
::
index_t
>&
output_spatial_lengths
=
params
.
GetOutputSpatialLengths
();
auto
conv
=
DeviceConvNDFwdInstance
<
NDim
,
InDataType
,
WeiDataType
,
OutDataType
>
();
auto
invoker
=
conv
.
MakeInvoker
();
auto
argument
=
conv
.
MakeArgument
(
static_cast
<
InDataType
*>
(
in_device_buf
.
GetDeviceBuffer
()),
static_cast
<
WeiDataType
*>
(
wei_device_buf
.
GetDeviceBuffer
()),
static_cast
<
OutDataType
*>
(
out_device_buf
.
GetDeviceBuffer
()),
params
.
N
,
params
.
K
,
params
.
C
,
params
.
input_spatial_lengths
,
params
.
filter_spatial_lengths
,
output_spatial_lengths
,
params
.
conv_filter_strides
,
params
.
conv_filter_dilations
,
params
.
input_left_pads
,
params
.
input_right_pads
,
PassThrough
{},
PassThrough
{},
PassThrough
{});
if
(
!
conv
.
IsSupportedArgument
(
argument
))
{
throw
std
::
runtime_error
(
"Error! device_conv with the specified compilation parameters does "
"not support this Conv problem"
);
}
invoker
.
Run
(
argument
);
out_device_buf
.
FromDevice
(
output
.
mData
.
data
());
}
template
<
ck
::
index_t
NDim
,
typename
InDataType
=
float
,
typename
WeiDataType
=
float
,
typename
OutDataType
=
float
>
bool
run_convolution_forward_instances
(
const
ConvParams
&
params
,
const
std
::
vector
<
DeviceConvFwdNoOpPtr
>&
conv_ptrs
,
const
Tensor
<
InDataType
>&
input
,
const
Tensor
<
WeiDataType
>&
weights
,
Tensor
<
OutDataType
>&
output
,
const
Tensor
<
OutDataType
>&
host_output
)
{
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
DeviceMem
in_device_buf
(
sizeof
(
InDataType
)
*
input
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_device_buf
(
sizeof
(
WeiDataType
)
*
weights
.
mDesc
.
GetElementSpace
());
DeviceMem
out_device_buf
(
sizeof
(
OutDataType
)
*
output
.
mDesc
.
GetElementSpace
());
in_device_buf
.
ToDevice
(
input
.
mData
.
data
());
wei_device_buf
.
ToDevice
(
weights
.
mData
.
data
());
const
std
::
vector
<
ck
::
index_t
>&
output_spatial_lengths
=
params
.
GetOutputSpatialLengths
();
bool
res
{
true
};
for
(
auto
&
conv_ptr
:
conv_ptrs
)
{
auto
invoker
=
conv_ptr
->
MakeInvokerPointer
();
auto
argument
=
conv_ptr
->
MakeArgumentPointer
(
static_cast
<
InDataType
*>
(
in_device_buf
.
GetDeviceBuffer
()),
static_cast
<
WeiDataType
*>
(
wei_device_buf
.
GetDeviceBuffer
()),
static_cast
<
OutDataType
*>
(
out_device_buf
.
GetDeviceBuffer
()),
params
.
N
,
params
.
K
,
params
.
C
,
params
.
input_spatial_lengths
,
params
.
filter_spatial_lengths
,
output_spatial_lengths
,
params
.
conv_filter_strides
,
params
.
conv_filter_dilations
,
params
.
input_left_pads
,
params
.
input_right_pads
,
PassThrough
{},
PassThrough
{},
PassThrough
{});
if
(
conv_ptr
->
IsSupportedArgument
(
argument
.
get
()))
{
float
atol
{
1e-5
f
};
float
rtol
{
1e-4
f
};
if
constexpr
(
std
::
is_same_v
<
InDataType
,
ck
::
half_t
>
)
{
atol
=
1e-4
f
;
rtol
=
2.5e-3
f
;
}
invoker
->
Run
(
argument
.
get
());
out_device_buf
.
FromDevice
(
output
.
mData
.
data
());
res
=
res
&&
ck
::
utils
::
check_err
(
output
.
mData
,
host_output
.
mData
,
"Error: incorrect results!"
,
atol
,
rtol
);
hipGetErrorString
(
hipMemset
(
out_device_buf
.
GetDeviceBuffer
(),
0
,
out_device_buf
.
mMemSize
));
}
}
return
res
;
}
}
// namespace conv
}
// namespace utils
}
// namespace ck
}
// namespace ck
#endif
#endif
library/src/host_tensor/host_tensor.cpp
View file @
07a673c6
...
@@ -65,21 +65,10 @@ void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream
...
@@ -65,21 +65,10 @@ void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream
}
}
#if 1
#if 1
// FIXME: remove
float
bf16_to_f32_
(
ck
::
bhalf_t
src_val
)
{
union
{
uint32_t
int32
;
float
fp32
;
}
u
=
{
uint32_t
(
src_val
)
<<
16
};
return
u
.
fp32
;
}
// FIXME: remove
// FIXME: remove
void
bf16_to_f32_
(
const
Tensor
<
ck
::
bhalf_t
>&
src
,
Tensor
<
float
>&
dst
)
void
bf16_to_f32_
(
const
Tensor
<
ck
::
bhalf_t
>&
src
,
Tensor
<
float
>&
dst
)
{
{
for
(
int
i
=
0
;
i
<
src
.
mData
.
size
();
++
i
)
for
(
int
i
=
0
;
i
<
src
.
mData
.
size
();
++
i
)
dst
.
mData
[
i
]
=
bf16_to_f32_
(
src
.
mData
[
i
]);
dst
.
mData
[
i
]
=
ck
::
type_convert
<
float
>
(
src
.
mData
[
i
]);
}
}
#endif
#endif
library/src/obselete_driver_offline/conv_add_fwd_driver_offline_nchwc.cpp
View file @
07a673c6
...
@@ -4,6 +4,8 @@
...
@@ -4,6 +4,8 @@
#include <cstdlib>
#include <cstdlib>
#include <stdlib.h>
#include <stdlib.h>
#include <half.hpp>
#include <half.hpp>
#include "check_err.hpp"
#include "config.hpp"
#include "config.hpp"
#include "debug.hpp"
#include "debug.hpp"
#include "print.hpp"
#include "print.hpp"
...
@@ -39,7 +41,7 @@ void host_direct_convolution_add_nchwc(const Tensor<TIn>& in,
...
@@ -39,7 +41,7 @@ void host_direct_convolution_add_nchwc(const Tensor<TIn>& in,
const
ConvDilations
&
conv_dilations
,
const
ConvDilations
&
conv_dilations
,
const
InLeftPads
&
in_left_pads
,
const
InLeftPads
&
in_left_pads
,
const
InRightPads
&
,
const
InRightPads
&
,
const
ck
::
ActivTypeEnum
_t
activ_type
)
const
ck
::
ActivTypeEnum
activ_type
)
{
{
using
namespace
ck
;
using
namespace
ck
;
...
@@ -117,7 +119,7 @@ int main(int argc, char* argv[])
...
@@ -117,7 +119,7 @@ int main(int argc, char* argv[])
exit
(
1
);
exit
(
1
);
}
}
constexpr
ck
::
ActivTypeEnum
_t
activ_type
=
ActivTypeEnum
_t
::
LeakyRelu
;
constexpr
ck
::
ActivTypeEnum
activ_type
=
ActivTypeEnum
::
LeakyRelu
;
const
ConvForwardAlgo
algo
=
static_cast
<
ConvForwardAlgo
>
(
std
::
stoi
(
argv
[
1
]));
const
ConvForwardAlgo
algo
=
static_cast
<
ConvForwardAlgo
>
(
std
::
stoi
(
argv
[
1
]));
const
bool
do_verification
=
std
::
stoi
(
argv
[
2
]);
const
bool
do_verification
=
std
::
stoi
(
argv
[
2
]);
...
@@ -167,7 +169,7 @@ int main(int argc, char* argv[])
...
@@ -167,7 +169,7 @@ int main(int argc, char* argv[])
const
bool
do_log
=
std
::
stoi
(
argv
[
4
]);
const
bool
do_log
=
std
::
stoi
(
argv
[
4
]);
const
int
nrepeat
=
std
::
stoi
(
argv
[
5
]);
const
int
nrepeat
=
std
::
stoi
(
argv
[
5
]);
constexpr
ck
::
ActivTypeEnum
_t
activ_type
=
ActivTypeEnum
_t
::
LeakyRelu
;
constexpr
ck
::
ActivTypeEnum
activ_type
=
ActivTypeEnum
::
LeakyRelu
;
#if 0
#if 0
constexpr auto N = Number<1>{};
constexpr auto N = Number<1>{};
...
@@ -401,7 +403,7 @@ int main(int argc, char* argv[])
...
@@ -401,7 +403,7 @@ int main(int argc, char* argv[])
make_tuple
(
in_right_pad_h
,
in_right_pad_w
),
make_tuple
(
in_right_pad_h
,
in_right_pad_w
),
activ_type
);
activ_type
);
check_err
or
(
add_
host
,
add_device
);
ck
::
utils
::
check_err
(
add_
device
.
mData
,
add_host
.
mData
);
if
(
do_log
)
if
(
do_log
)
{
{
...
...
library/src/obselete_driver_offline/conv_bwd_driver_offline.cpp
View file @
07a673c6
...
@@ -4,6 +4,8 @@
...
@@ -4,6 +4,8 @@
#include <cstdlib>
#include <cstdlib>
#include <stdlib.h>
#include <stdlib.h>
#include <half.hpp>
#include <half.hpp>
#include "check_err.hpp"
#include "config.hpp"
#include "config.hpp"
#include "debug.hpp"
#include "debug.hpp"
#include "print.hpp"
#include "print.hpp"
...
@@ -473,7 +475,7 @@ int main(int argc, char* argv[])
...
@@ -473,7 +475,7 @@ int main(int argc, char* argv[])
make_tuple
(
in_right_pad_h
,
in_right_pad_w
),
make_tuple
(
in_right_pad_h
,
in_right_pad_w
),
layout
);
layout
);
check_err
or
(
in_
host
,
in_device
);
ck
::
utils
::
check_err
(
in_
device
.
mData
,
in_host
.
mData
);
if
(
do_log
)
if
(
do_log
)
{
{
...
...
library/src/obselete_driver_offline/conv_fwd_driver_offline.cpp
View file @
07a673c6
...
@@ -4,6 +4,8 @@
...
@@ -4,6 +4,8 @@
#include <cstdlib>
#include <cstdlib>
#include <stdlib.h>
#include <stdlib.h>
#include <half.hpp>
#include <half.hpp>
#include "check_err.hpp"
#include "config.hpp"
#include "config.hpp"
#include "debug.hpp"
#include "debug.hpp"
#include "print.hpp"
#include "print.hpp"
...
@@ -534,7 +536,7 @@ int main(int argc, char* argv[])
...
@@ -534,7 +536,7 @@ int main(int argc, char* argv[])
make_tuple
(
in_right_pad_h
,
in_right_pad_w
),
make_tuple
(
in_right_pad_h
,
in_right_pad_w
),
layout
);
layout
);
check_err
or
(
out_
host
,
out_device
);
ck
::
utils
::
check_err
(
out_
device
.
mData
,
out_host
.
mData
);
if
(
do_log
)
if
(
do_log
)
{
{
...
...
library/src/obselete_driver_offline/conv_fwd_driver_offline_nchwc.cpp
View file @
07a673c6
...
@@ -4,6 +4,8 @@
...
@@ -4,6 +4,8 @@
#include <cstdlib>
#include <cstdlib>
#include <stdlib.h>
#include <stdlib.h>
#include <half.hpp>
#include <half.hpp>
#include "check_err.hpp"
#include "config.hpp"
#include "config.hpp"
#include "debug.hpp"
#include "debug.hpp"
#include "print.hpp"
#include "print.hpp"
...
@@ -37,7 +39,7 @@ void host_direct_convolution_nchwc(const Tensor<TIn>& in,
...
@@ -37,7 +39,7 @@ void host_direct_convolution_nchwc(const Tensor<TIn>& in,
const
ConvDilations
&
conv_dilations
,
const
ConvDilations
&
conv_dilations
,
const
InLeftPads
&
in_left_pads
,
const
InLeftPads
&
in_left_pads
,
const
InRightPads
&
,
const
InRightPads
&
,
const
ck
::
ActivTypeEnum
_t
activ_type
)
const
ck
::
ActivTypeEnum
activ_type
)
{
{
using
namespace
ck
;
using
namespace
ck
;
...
@@ -102,7 +104,7 @@ int main(int argc, char* argv[])
...
@@ -102,7 +104,7 @@ int main(int argc, char* argv[])
exit
(
1
);
exit
(
1
);
}
}
constexpr
ck
::
ActivTypeEnum
_t
activ_type
=
ActivTypeEnum
_t
::
LeakyRelu
;
constexpr
ck
::
ActivTypeEnum
activ_type
=
ActivTypeEnum
::
LeakyRelu
;
const
ConvForwardAlgo
algo
=
static_cast
<
ConvForwardAlgo
>
(
std
::
stoi
(
argv
[
1
]));
const
ConvForwardAlgo
algo
=
static_cast
<
ConvForwardAlgo
>
(
std
::
stoi
(
argv
[
1
]));
const
bool
do_verification
=
std
::
stoi
(
argv
[
2
]);
const
bool
do_verification
=
std
::
stoi
(
argv
[
2
]);
...
@@ -149,8 +151,8 @@ int main(int argc, char* argv[])
...
@@ -149,8 +151,8 @@ int main(int argc, char* argv[])
const
bool
do_log
=
std
::
stoi
(
argv
[
4
]);
const
bool
do_log
=
std
::
stoi
(
argv
[
4
]);
const
int
nrepeat
=
std
::
stoi
(
argv
[
5
]);
const
int
nrepeat
=
std
::
stoi
(
argv
[
5
]);
// constexpr ck::ActivTypeEnum
_t
activ_type = ActivTypeEnum
_t
::Sigmoid;
// constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::Sigmoid;
constexpr
ck
::
ActivTypeEnum
_t
activ_type
=
ActivTypeEnum
_t
::
LeakyRelu
;
constexpr
ck
::
ActivTypeEnum
activ_type
=
ActivTypeEnum
::
LeakyRelu
;
#if 0
#if 0
constexpr auto N = Number<1>{};
constexpr auto N = Number<1>{};
...
@@ -377,7 +379,7 @@ int main(int argc, char* argv[])
...
@@ -377,7 +379,7 @@ int main(int argc, char* argv[])
make_tuple
(
in_right_pad_h
,
in_right_pad_w
),
make_tuple
(
in_right_pad_h
,
in_right_pad_w
),
activ_type
);
activ_type
);
check_err
or
(
out_
host
,
out_device
);
ck
::
utils
::
check_err
(
out_
device
.
mData
,
out_host
.
mData
);
if
(
do_log
)
if
(
do_log
)
{
{
...
...
library/src/obselete_driver_offline/conv_maxpool_fwd_driver_offline_nchwc.cpp
View file @
07a673c6
...
@@ -4,6 +4,8 @@
...
@@ -4,6 +4,8 @@
#include <cstdlib>
#include <cstdlib>
#include <stdlib.h>
#include <stdlib.h>
#include <half.hpp>
#include <half.hpp>
#include "check_err.hpp"
#include "config.hpp"
#include "config.hpp"
#include "debug.hpp"
#include "debug.hpp"
#include "print.hpp"
#include "print.hpp"
...
@@ -38,7 +40,7 @@ void host_direct_convolution_maxpool_nchwc(const Tensor<TIn>& in,
...
@@ -38,7 +40,7 @@ void host_direct_convolution_maxpool_nchwc(const Tensor<TIn>& in,
const
ConvDilations
&
conv_dilations
,
const
ConvDilations
&
conv_dilations
,
const
InLeftPads
&
in_left_pads
,
const
InLeftPads
&
in_left_pads
,
const
InRightPads
&
,
const
InRightPads
&
,
const
ck
::
ActivTypeEnum
_t
activ_type
)
const
ck
::
ActivTypeEnum
activ_type
)
{
{
using
namespace
ck
;
using
namespace
ck
;
...
@@ -126,7 +128,7 @@ int main(int argc, char* argv[])
...
@@ -126,7 +128,7 @@ int main(int argc, char* argv[])
exit
(
1
);
exit
(
1
);
}
}
constexpr
ck
::
ActivTypeEnum
_t
activ_type
=
ActivTypeEnum
_t
::
LeakyRelu
;
constexpr
ck
::
ActivTypeEnum
activ_type
=
ActivTypeEnum
::
LeakyRelu
;
const
ConvForwardAlgo
algo
=
static_cast
<
ConvForwardAlgo
>
(
std
::
stoi
(
argv
[
1
]));
const
ConvForwardAlgo
algo
=
static_cast
<
ConvForwardAlgo
>
(
std
::
stoi
(
argv
[
1
]));
const
bool
do_verification
=
std
::
stoi
(
argv
[
2
]);
const
bool
do_verification
=
std
::
stoi
(
argv
[
2
]);
...
@@ -176,7 +178,7 @@ int main(int argc, char* argv[])
...
@@ -176,7 +178,7 @@ int main(int argc, char* argv[])
const
bool
do_log
=
std
::
stoi
(
argv
[
4
]);
const
bool
do_log
=
std
::
stoi
(
argv
[
4
]);
const
int
nrepeat
=
std
::
stoi
(
argv
[
5
]);
const
int
nrepeat
=
std
::
stoi
(
argv
[
5
]);
constexpr
ck
::
ActivTypeEnum
_t
activ_type
=
ActivTypeEnum
_t
::
LeakyRelu
;
constexpr
ck
::
ActivTypeEnum
activ_type
=
ActivTypeEnum
::
LeakyRelu
;
#if 1
#if 1
constexpr
auto
N
=
Number
<
1
>
{};
constexpr
auto
N
=
Number
<
1
>
{};
...
@@ -397,8 +399,8 @@ int main(int argc, char* argv[])
...
@@ -397,8 +399,8 @@ int main(int argc, char* argv[])
make_tuple
(
in_right_pad_h
,
in_right_pad_w
),
make_tuple
(
in_right_pad_h
,
in_right_pad_w
),
activ_type
);
activ_type
);
check_err
or
(
out_
host
,
out_device
);
ck
::
utils
::
check_err
(
out_
device
.
mData
,
out_host
.
mData
);
check_err
or
(
max_
host
,
max_device
);
ck
::
utils
::
check_err
(
max_
device
.
mData
,
max_host
.
mData
);
if
(
do_log
)
if
(
do_log
)
{
{
...
...
Prev
1
…
5
6
7
8
9
10
11
12
13
…
16
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment