Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
a3b4c5cb
Commit
a3b4c5cb
authored
Jun 03, 2022
by
wangshaojie6
Browse files
merge develop branch and add gridwise pipeline v3
parents
48918ab9
1677cf70
Changes
361
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
101 additions
and
896 deletions
+101
-896
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.hpp
.../gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.hpp
+0
-2
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call.hpp
...u/reduce/device_reduce_instance_blockwise_second_call.hpp
+0
-165
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f16_f16_f16.hpp
...ice_reduce_instance_blockwise_second_call_f16_f16_f16.hpp
+0
-47
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_b16.hpp
...ice_reduce_instance_blockwise_second_call_f32_f32_b16.hpp
+0
-60
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f16.hpp
...ice_reduce_instance_blockwise_second_call_f32_f32_f16.hpp
+0
-35
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f32.hpp
...ice_reduce_instance_blockwise_second_call_f32_f32_f32.hpp
+0
-59
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f32.hpp
...ice_reduce_instance_blockwise_second_call_f64_f64_f32.hpp
+0
-35
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f64.hpp
...ice_reduce_instance_blockwise_second_call_f64_f64_f64.hpp
+0
-59
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i32_i32_i8.hpp
...vice_reduce_instance_blockwise_second_call_i32_i32_i8.hpp
+0
-31
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i8_i8_i8.hpp
...device_reduce_instance_blockwise_second_call_i8_i8_i8.hpp
+0
-47
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp
...nstance/gpu/reduce/device_reduce_instance_impl_common.hpp
+0
-14
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp
...u/reduce/device_reduce_instance_multiblock_atomic_add.hpp
+70
-53
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp
...ice_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp
+1
-2
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp
...ice_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp
+1
-2
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp
...ice_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp
+0
-2
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp
...ice_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp
+0
-2
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp
...ice_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp
+29
-0
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce.hpp
...duce/device_reduce_instance_multiblock_partial_reduce.hpp
+0
-174
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.hpp
...reduce_instance_multiblock_partial_reduce_b16_f32_b16.hpp
+0
-60
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.hpp
...reduce_instance_multiblock_partial_reduce_f16_f16_f16.hpp
+0
-47
No files found.
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.hpp
View file @
a3b4c5cb
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I8_I8_HPP
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I8_I8_HPP
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I8_I8_HPP
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I8_I8_HPP
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_blockwise.hpp"
#include "device_reduce_instance_blockwise.hpp"
namespace
ck
{
namespace
ck
{
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call.hpp
deleted
100644 → 0
View file @
48918ab9
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_HPP
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_HPP
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_impl_common.hpp"
#include "device_reduce_blockwise_second_call.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_instance
{
#ifdef QUICK_REDUCE_TEST
using
reduce_configuration_2_instances_blockwise_second_call
=
std
::
tuple
<
// clang-format off
// InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize
ReductionConfiguration_2
<
1
,
2
,
1
,
1
,
2
>
,
ReductionConfiguration_2
<
1
,
1
,
1
,
1
,
3
>
// clang-format on
>
;
#else
using
reduce_configuration_2_instances_blockwise_second_call
=
std
::
tuple
<
// clang-format off
// InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize
ReductionConfiguration_2
<
1
,
4
,
1
,
1
,
8
>
,
ReductionConfiguration_2
<
1
,
4
,
1
,
1
,
4
>
,
ReductionConfiguration_2
<
1
,
2
,
1
,
1
,
2
>
,
ReductionConfiguration_2
<
1
,
1
,
1
,
1
,
3
>
,
ReductionConfiguration_2
<
1
,
1
,
1
,
1
,
5
>
,
ReductionConfiguration_2
<
1
,
1
,
1
,
1
,
7
>
,
ReductionConfiguration_2
<
1
,
1
,
1
,
1
,
11
>
// clang-format on
>
;
#endif
template
<
typename
AccDataType
,
ReduceTensorOp
ReduceOpId
>
using
deviceReduceBlockWiseSecondCallPtrType
=
DeviceReducePtr
<
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
false
,
true
>::
InElementwiseOperation
,
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
false
,
true
>::
AccElementwiseOperation
>
;
template
<
typename
InDataType
,
typename
AccDataType
,
typename
OutDataType
,
int
Rank
,
int
NumReduceDim
,
ReduceTensorOp
ReduceOpId
,
NanPropagation
NanOpt
,
ReduceTensorIndices
IndicesOpt
>
void
add_device_reduce_instance_blockwise_second_call
(
std
::
vector
<
deviceReduceBlockWiseSecondCallPtrType
<
AccDataType
,
ReduceOpId
>>&
device_op_instances
)
{
using
ReduceOperation
=
typename
reduce_binary_operator
<
AccDataType
,
ReduceOpId
>::
opType
;
using
InElementwiseOperation
=
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
false
,
true
>::
InElementwiseOperation
;
using
AccElementwiseOperation
=
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
false
,
true
>::
AccElementwiseOperation
;
constexpr
bool
Indexable
=
(
ReduceOpId
==
ReduceTensorOp
::
MIN
||
ReduceOpId
==
ReduceTensorOp
::
MAX
||
ReduceOpId
==
ReduceTensorOp
::
AMAX
);
constexpr
bool
NeedIndices
=
Indexable
&&
(
IndicesOpt
!=
ReduceTensorIndices
::
NO_INDICES
);
constexpr
bool
PropagateNan
=
(
NanOpt
==
NanPropagation
::
NOT_PROPAGATE_NAN
)
?
false
:
true
;
static_assert
(
std
::
is_same
<
InDataType
,
AccDataType
>::
value
,
"InDataType and AccDataType should be the same to use "
"add_device_reduce_instance_blockwise_second_call!"
);
static_for
<
0
,
std
::
tuple_size
<
reduce_configuration_1_instances
>::
value
,
1
>
{}([
&
](
auto
i
)
{
using
cfg1
=
remove_cvref_t
<
decltype
(
std
::
get
<
i
.
value
>
(
reduce_configuration_1_instances
{}))
>
;
static_for
<
0
,
std
::
tuple_size
<
reduce_configuration_2_instances_blockwise_second_call
>::
value
,
1
>
{}([
&
](
auto
j
)
{
using
cfg2
=
remove_cvref_t
<
decltype
(
std
::
get
<
j
.
value
>
(
reduce_configuration_2_instances_blockwise_second_call
{}))
>
;
using
ReduceOpInstance
=
DeviceReduceBlockWiseSecondCall
<
InDataType
,
AccDataType
,
OutDataType
,
Rank
,
NumReduceDim
,
ReduceOperation
,
InElementwiseOperation
,
AccElementwiseOperation
,
PropagateNan
,
NeedIndices
,
cfg1
::
BlockSize_
,
cfg1
::
MThreadClusterSize_
,
cfg1
::
KThreadClusterSize_
,
cfg2
::
MThreadSliceSize_
,
cfg2
::
KThreadSliceSize_
,
cfg2
::
InSrcVectorDim_
,
cfg2
::
InSrcVectorSize_
,
cfg2
::
OutDstVectorSize_
>
;
device_op_instances
.
push_back
(
std
::
make_unique
<
ReduceOpInstance
>
(
ReduceOpInstance
{}));
});
});
};
#define ADD_BLOCKWISE_SECOND_CALL_INST_BY_TYPE( \
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
template void add_device_reduce_instance_blockwise_second_call<inT, \
compT, \
outT, \
Rank, \
NumReduceDim, \
ReduceOpId, \
NanOpt, \
IndicesOpt>( \
std::vector<deviceReduceBlockWiseSecondCallPtrType<compT, ReduceOpId>> & \
device_op_instances)
#define ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID( \
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
ADD_BLOCKWISE_SECOND_CALL_INST_BY_TYPE(inT, \
compT, \
outT, \
static_cast<ReduceTensorOp>(ReduceOpId), \
static_cast<NanPropagation>(NanOpt), \
static_cast<ReduceTensorIndices>(IndicesOpt), \
Rank, \
NumReduceDim)
#define ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_TYPE( \
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
extern template void add_device_reduce_instance_blockwise_second_call<inT, \
compT, \
outT, \
Rank, \
NumReduceDim, \
ReduceOpId, \
NanOpt, \
IndicesOpt>( \
std::vector< \
DeviceReducePtr<typename reduce_unary_operator<compT, ReduceOpId, false, true>:: \
InElementwiseOperation, \
typename reduce_unary_operator<compT, ReduceOpId, false, true>:: \
AccElementwiseOperation>> & \
device_op_instances)
#define ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID( \
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_TYPE(inT, \
compT, \
outT, \
static_cast<ReduceTensorOp>(ReduceOpId), \
static_cast<NanPropagation>(NanOpt), \
static_cast<ReduceTensorIndices>(IndicesOpt), \
Rank, \
NumReduceDim)
}
// namespace device_reduce_instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f16_f16_f16.hpp
deleted
100644 → 0
View file @
48918ab9
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F16_F16_F16_HPP
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F16_F16_F16_HPP
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_blockwise_second_call.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
0
,
4
,
3
);
// for MIN
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
0
,
4
,
3
);
// for MAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
0
,
4
,
3
);
// for AMAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
1
,
4
,
3
);
// for MIN
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
1
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
1
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
1
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
1
,
4
,
3
);
// for MAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
1
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
1
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
1
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
4
,
3
);
// for AMAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
2
,
1
);
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_b16.hpp
deleted
100644 → 0
View file @
48918ab9
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F32_F32_B16_HPP
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F32_F32_B16_HPP
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_blockwise_second_call.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
bhalf_t
,
0
,
0
,
0
,
4
,
3
);
// for ADD
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
bhalf_t
,
0
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
bhalf_t
,
0
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
bhalf_t
,
0
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
bhalf_t
,
5
,
0
,
0
,
4
,
3
);
// for AVG
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
bhalf_t
,
5
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
bhalf_t
,
5
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
bhalf_t
,
5
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
bhalf_t
,
7
,
0
,
0
,
4
,
3
);
// for NORM2
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
bhalf_t
,
7
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
bhalf_t
,
7
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
bhalf_t
,
7
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
bhalf_t
,
2
,
0
,
0
,
4
,
3
);
// for MIN
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
bhalf_t
,
2
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
bhalf_t
,
2
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
bhalf_t
,
2
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
bhalf_t
,
3
,
0
,
0
,
4
,
3
);
// for MAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
bhalf_t
,
3
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
bhalf_t
,
3
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
bhalf_t
,
3
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
bhalf_t
,
4
,
0
,
0
,
4
,
3
);
// for AMAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
bhalf_t
,
4
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
bhalf_t
,
4
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
bhalf_t
,
4
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
bhalf_t
,
2
,
0
,
1
,
4
,
3
);
// for MIN
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
bhalf_t
,
2
,
0
,
1
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
bhalf_t
,
2
,
0
,
1
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
bhalf_t
,
2
,
0
,
1
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
bhalf_t
,
3
,
0
,
1
,
4
,
3
);
// for MAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
bhalf_t
,
3
,
0
,
1
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
bhalf_t
,
3
,
0
,
1
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
bhalf_t
,
3
,
0
,
1
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
bhalf_t
,
4
,
0
,
1
,
4
,
3
);
// for AMAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
bhalf_t
,
4
,
0
,
1
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
bhalf_t
,
4
,
0
,
1
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
bhalf_t
,
4
,
0
,
1
,
2
,
1
);
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f16.hpp
deleted
100644 → 0
View file @
48918ab9
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F32_F32_F16_HPP
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F32_F32_F16_HPP
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_blockwise_second_call.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
half_t
,
0
,
0
,
0
,
4
,
3
);
// for ADD
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
half_t
,
0
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
half_t
,
0
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
half_t
,
0
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
half_t
,
5
,
0
,
0
,
4
,
3
);
// for AVG
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
half_t
,
5
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
half_t
,
5
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
half_t
,
5
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
half_t
,
7
,
0
,
0
,
4
,
3
);
// for NORM2
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
half_t
,
7
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
half_t
,
7
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
half_t
,
7
,
0
,
0
,
2
,
1
);
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f32.hpp
deleted
100644 → 0
View file @
48918ab9
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F32_F32_F32_HPP
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F32_F32_F32_HPP
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_blockwise_second_call.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
0
,
0
,
0
,
4
,
3
);
// for ADD
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
0
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
0
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
0
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
5
,
0
,
0
,
4
,
3
);
// for AVG
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
5
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
5
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
5
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
7
,
0
,
0
,
4
,
3
);
// for NORM2
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
7
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
7
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
7
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
0
,
4
,
3
);
// for MIN
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
0
,
4
,
3
);
// for MAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
0
,
4
,
3
);
// for AMAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
1
,
4
,
3
);
// for MIN
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
1
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
1
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
1
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
1
,
4
,
3
);
// for MAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
1
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
1
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
1
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
1
,
4
,
3
);
// for AMAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
1
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
1
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
1
,
2
,
1
);
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f32.hpp
deleted
100644 → 0
View file @
48918ab9
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F64_F64_F32_HPP
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F64_F64_F32_HPP
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_blockwise_second_call.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
float
,
0
,
0
,
0
,
4
,
3
);
// for ADD
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
float
,
0
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
float
,
0
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
float
,
0
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
float
,
5
,
0
,
0
,
4
,
3
);
// for AVG
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
float
,
5
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
float
,
5
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
float
,
5
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
float
,
7
,
0
,
0
,
4
,
3
);
// for NORM2
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
float
,
7
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
float
,
7
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
float
,
7
,
0
,
0
,
2
,
1
);
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f64.hpp
deleted
100644 → 0
View file @
48918ab9
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F64_F64_F64_HPP
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F64_F64_F64_HPP
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_blockwise_second_call.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
4
,
3
);
// for ADD
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
4
,
3
);
// for AVG
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
4
,
3
);
// for NORM2
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
4
,
3
);
// for MIN
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
4
,
3
);
// for MAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
4
,
3
);
// for AMAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
4
,
3
);
// for MIN
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
4
,
3
);
// for MAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
4
,
3
);
// for AMAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
2
,
1
);
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i32_i32_i8.hpp
deleted
100644 → 0
View file @
48918ab9
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_I32_I32_I8_HPP
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_I32_I32_I8_HPP
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_blockwise_second_call.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
int32_t
,
int32_t
,
int8_t
,
0
,
0
,
0
,
4
,
3
);
// for ADD
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
int32_t
,
int32_t
,
int8_t
,
0
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
int32_t
,
int32_t
,
int8_t
,
0
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
int32_t
,
int32_t
,
int8_t
,
0
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
int32_t
,
int32_t
,
int8_t
,
5
,
0
,
0
,
4
,
3
);
// for AVG
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
int32_t
,
int32_t
,
int8_t
,
5
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
int32_t
,
int32_t
,
int8_t
,
5
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
int32_t
,
int32_t
,
int8_t
,
5
,
0
,
0
,
2
,
1
);
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i8_i8_i8.hpp
deleted
100644 → 0
View file @
48918ab9
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_I8_I8_I8_HPP
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_I8_I8_I8_HPP
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_blockwise_second_call.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
2
,
0
,
0
,
4
,
3
);
// for MIN
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
2
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
2
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
2
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
3
,
0
,
0
,
4
,
3
);
// for MAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
3
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
3
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
3
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
4
,
0
,
0
,
4
,
3
);
// for AMAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
4
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
4
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
4
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
2
,
0
,
1
,
4
,
3
);
// for MIN
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
2
,
0
,
1
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
2
,
0
,
1
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
2
,
0
,
1
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
3
,
0
,
1
,
4
,
3
);
// for MAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
3
,
0
,
1
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
3
,
0
,
1
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
3
,
0
,
1
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
4
,
0
,
1
,
4
,
3
);
// for AMAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
4
,
0
,
1
,
4
,
4
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
4
,
0
,
1
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
4
,
0
,
1
,
2
,
1
);
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp
View file @
a3b4c5cb
...
@@ -30,20 +30,6 @@ struct ReductionConfiguration_2
...
@@ -30,20 +30,6 @@ struct ReductionConfiguration_2
static
constexpr
int
KThreadSliceSize_
=
KThreadSliceSize
;
static
constexpr
int
KThreadSliceSize_
=
KThreadSliceSize
;
};
};
using
reduce_configuration_1_instances
=
std
::
tuple
<
// clang-format off
// BlockSize | MThreadClusterSize | KThreadClusterSize
ReductionConfiguration_1
<
256
,
128
,
2
>
,
ReductionConfiguration_1
<
256
,
64
,
4
>
,
ReductionConfiguration_1
<
256
,
32
,
8
>
,
ReductionConfiguration_1
<
256
,
16
,
16
>
,
ReductionConfiguration_1
<
256
,
8
,
32
>
,
ReductionConfiguration_1
<
256
,
4
,
64
>
,
ReductionConfiguration_1
<
256
,
2
,
128
>
,
ReductionConfiguration_1
<
256
,
1
,
256
>
// clang-format on
>
;
#define QUICK_REDUCE_TEST 1
#define QUICK_REDUCE_TEST 1
}
// namespace device_reduce_instance
}
// namespace device_reduce_instance
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp
View file @
a3b4c5cb
...
@@ -3,13 +3,27 @@
...
@@ -3,13 +3,27 @@
#include "reduction_operator_mapping.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_impl_common.hpp"
#include "device_reduce_instance_impl_common.hpp"
#include "device_reduce_multiblock
_atomic_add
.hpp"
#include "device_reduce_multiblock.hpp"
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
namespace
device
{
namespace
device
{
namespace
device_reduce_instance
{
namespace
device_reduce_instance
{
using
reduce_configuration_1_instances_multiblock_atomic_add
=
std
::
tuple
<
// clang-format off
// BlockSize | MThreadClusterSize | KThreadClusterSize
ReductionConfiguration_1
<
256
,
128
,
2
>
,
ReductionConfiguration_1
<
256
,
64
,
4
>
,
ReductionConfiguration_1
<
256
,
32
,
8
>
,
ReductionConfiguration_1
<
256
,
16
,
16
>
,
ReductionConfiguration_1
<
256
,
8
,
32
>
,
ReductionConfiguration_1
<
256
,
4
,
64
>
,
ReductionConfiguration_1
<
256
,
2
,
128
>
,
ReductionConfiguration_1
<
256
,
1
,
256
>
// clang-format on
>
;
#ifdef QUICK_REDUCE_TEST
#ifdef QUICK_REDUCE_TEST
using
reduce_configuration_2_instances_multiblock_atomic_add
=
std
::
tuple
<
using
reduce_configuration_2_instances_multiblock_atomic_add
=
std
::
tuple
<
// clang-format off
// clang-format off
...
@@ -60,8 +74,8 @@ template <typename InDataType,
...
@@ -60,8 +74,8 @@ template <typename InDataType,
int
Rank
,
int
Rank
,
int
NumReduceDim
,
int
NumReduceDim
,
ReduceTensorOp
ReduceOpId
,
ReduceTensorOp
ReduceOpId
,
Nan
Propagat
ion
NanOpt
,
bool
Propagat
eNan
,
ReduceTensorIndices
IndicesOpt
>
bool
UseIndex
>
void
add_device_reduce_instance_multiblock_atomic_add
(
void
add_device_reduce_instance_multiblock_atomic_add
(
std
::
vector
<
deviceReduceMultiBlockAtomicAddPtrType
<
AccDataType
,
ReduceOpId
>>&
std
::
vector
<
deviceReduceMultiBlockAtomicAddPtrType
<
AccDataType
,
ReduceOpId
>>&
device_op_instances
)
device_op_instances
)
...
@@ -76,12 +90,10 @@ void add_device_reduce_instance_multiblock_atomic_add(
...
@@ -76,12 +90,10 @@ void add_device_reduce_instance_multiblock_atomic_add(
constexpr
bool
Indexable
=
constexpr
bool
Indexable
=
(
ReduceOpId
==
ReduceTensorOp
::
MIN
||
ReduceOpId
==
ReduceTensorOp
::
MAX
||
(
ReduceOpId
==
ReduceTensorOp
::
MIN
||
ReduceOpId
==
ReduceTensorOp
::
MAX
||
ReduceOpId
==
ReduceTensorOp
::
AMAX
);
ReduceOpId
==
ReduceTensorOp
::
AMAX
);
constexpr
bool
NeedIndices
=
Indexable
&&
(
IndicesOpt
!=
ReduceTensorIndices
::
NO_INDICES
);
constexpr
bool
OutputIndex
=
Indexable
&&
UseIndex
;
constexpr
bool
PropagateNan
=
(
NanOpt
==
NanPropagation
::
NOT_PROPAGATE_NAN
)
?
false
:
true
;
static_assert
(
IndicesOpt
==
ReduceTensorIndices
::
NO_INDICES
,
static_assert
(
UseIndex
==
false
,
"AtomicAdd can only be used with reduction operations
without indices
!"
);
"AtomicAdd can only be used with reduction operations
using no index
!"
);
constexpr
bool
op_acceptable
=
constexpr
bool
op_acceptable
=
(
ReduceOpId
==
ReduceTensorOp
::
ADD
||
ReduceOpId
==
ReduceTensorOp
::
MUL
||
(
ReduceOpId
==
ReduceTensorOp
::
ADD
||
ReduceOpId
==
ReduceTensorOp
::
MUL
||
...
@@ -94,9 +106,11 @@ void add_device_reduce_instance_multiblock_atomic_add(
...
@@ -94,9 +106,11 @@ void add_device_reduce_instance_multiblock_atomic_add(
return
;
return
;
else
else
{
{
static_for
<
0
,
std
::
tuple_size
<
reduce_configuration_1_instances
>::
value
,
1
>
{}([
&
](
auto
i
)
{
static_for
<
0
,
using
cfg1
=
std
::
tuple_size
<
reduce_configuration_1_instances_multiblock_atomic_add
>::
value
,
remove_cvref_t
<
decltype
(
std
::
get
<
i
.
value
>
(
reduce_configuration_1_instances
{}))
>
;
1
>
{}([
&
](
auto
i
)
{
using
cfg1
=
remove_cvref_t
<
decltype
(
std
::
get
<
i
.
value
>
(
reduce_configuration_1_instances_multiblock_atomic_add
{}))
>
;
static_for
<
static_for
<
0
,
0
,
...
@@ -105,24 +119,27 @@ void add_device_reduce_instance_multiblock_atomic_add(
...
@@ -105,24 +119,27 @@ void add_device_reduce_instance_multiblock_atomic_add(
using
cfg2
=
remove_cvref_t
<
decltype
(
using
cfg2
=
remove_cvref_t
<
decltype
(
std
::
get
<
j
.
value
>
(
reduce_configuration_2_instances_multiblock_atomic_add
{}))
>
;
std
::
get
<
j
.
value
>
(
reduce_configuration_2_instances_multiblock_atomic_add
{}))
>
;
using
ReduceOpInstance
=
DeviceReduceMultiBlockAtomicAdd
<
InDataType
,
using
ReduceOpInstance
=
AccDataType
,
DeviceReduceMultiBlock
<
InDataType
,
OutDataType
,
AccDataType
,
Rank
,
OutDataType
,
NumReduceDim
,
Rank
,
ReduceOperation
,
NumReduceDim
,
InElementwiseOperation
,
ReduceOperation
,
AccElementwiseOperation
,
InElementwiseOperation
,
PropagateNan
,
AccElementwiseOperation
,
NeedIndices
,
InMemoryDataOperationEnum
::
AtomicAdd
,
cfg1
::
BlockSize_
,
PropagateNan
,
cfg1
::
MThreadClusterSize_
,
OutputIndex
,
cfg1
::
KThreadClusterSize_
,
false
,
// HaveIndexInputIfOutputIndex
cfg2
::
MThreadSliceSize_
,
cfg1
::
BlockSize_
,
cfg2
::
KThreadSliceSize_
,
cfg1
::
MThreadClusterSize_
,
cfg2
::
InSrcVectorDim_
,
cfg1
::
KThreadClusterSize_
,
cfg2
::
InSrcVectorSize_
,
cfg2
::
MThreadSliceSize_
,
cfg2
::
OutDstVectorSize_
>
;
cfg2
::
KThreadSliceSize_
,
cfg2
::
InSrcVectorDim_
,
cfg2
::
InSrcVectorSize_
,
cfg2
::
OutDstVectorSize_
>
;
device_op_instances
.
push_back
(
device_op_instances
.
push_back
(
std
::
make_unique
<
ReduceOpInstance
>
(
ReduceOpInstance
{}));
std
::
make_unique
<
ReduceOpInstance
>
(
ReduceOpInstance
{}));
...
@@ -132,54 +149,54 @@ void add_device_reduce_instance_multiblock_atomic_add(
...
@@ -132,54 +149,54 @@ void add_device_reduce_instance_multiblock_atomic_add(
};
};
#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_TYPE( \
#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_TYPE( \
inT, compT, outT, ReduceOpId,
NanOpt, IndicesOpt
, Rank, NumReduceDim)
\
inT, compT, outT, ReduceOpId,
PropagateNan, UseIndex
, Rank, NumReduceDim) \
template void add_device_reduce_instance_multiblock_atomic_add<inT, \
template void add_device_reduce_instance_multiblock_atomic_add<inT, \
compT, \
compT, \
outT, \
outT, \
Rank, \
Rank, \
NumReduceDim, \
NumReduceDim, \
ReduceOpId, \
ReduceOpId, \
NanOpt,
\
PropagateNan,
\
IndicesOpt>(
\
UseIndex>(
\
std::vector<deviceReduceMultiBlockAtomicAddPtrType<compT, ReduceOpId>> & \
std::vector<deviceReduceMultiBlockAtomicAddPtrType<compT, ReduceOpId>> & \
device_op_instances)
device_op_instances)
#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(
\
#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID( \
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)
\
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_TYPE(inT,
\
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_TYPE(inT, \
compT,
\
compT, \
outT,
\
outT, \
static_cast<ReduceTensorOp>(ReduceOpId),
\
static_cast<ReduceTensorOp>(ReduceOpId), \
static_cast<
NanPropagation
>(NanOpt), \
static_cast<
bool
>(NanOpt),
\
static_cast<
ReduceTensorIndices
>(IndicesOpt), \
static_cast<
bool
>(IndicesOpt),
\
Rank,
\
Rank, \
NumReduceDim)
NumReduceDim)
#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE( \
#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE( \
inT, compT, outT, ReduceOpId,
NanOpt, IndicesOpt
, Rank, NumReduceDim)
\
inT, compT, outT, ReduceOpId,
PropagateNan, UseIndex
, Rank, NumReduceDim) \
extern template void add_device_reduce_instance_multiblock_atomic_add<inT, \
extern template void add_device_reduce_instance_multiblock_atomic_add<inT, \
compT, \
compT, \
outT, \
outT, \
Rank, \
Rank, \
NumReduceDim, \
NumReduceDim, \
ReduceOpId, \
ReduceOpId, \
NanOpt,
\
PropagateNan,
\
IndicesOpt>(
\
UseIndex>(
\
std::vector<DeviceReducePtr< \
std::vector<DeviceReducePtr< \
typename reduce_unary_operator<compT, ReduceOpId, true, true>::InElementwiseOperation, \
typename reduce_unary_operator<compT, ReduceOpId, true, true>::InElementwiseOperation, \
typename reduce_unary_operator<compT, ReduceOpId, true, true>:: \
typename reduce_unary_operator<compT, ReduceOpId, true, true>:: \
AccElementwiseOperation>> & \
AccElementwiseOperation>> & \
device_op_instances)
device_op_instances)
#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(
\
#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID( \
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)
\
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE(inT,
\
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE(inT, \
compT,
\
compT, \
outT,
\
outT, \
static_cast<ReduceTensorOp>(ReduceOpId),
\
static_cast<ReduceTensorOp>(ReduceOpId), \
static_cast<
NanPropagation
>(NanOpt), \
static_cast<
bool
>(NanOpt),
\
static_cast<
ReduceTensorIndices
>(IndicesOpt), \
static_cast<
bool
>(IndicesOpt),
\
Rank,
\
Rank, \
NumReduceDim)
NumReduceDim)
}
// namespace device_reduce_instance
}
// namespace device_reduce_instance
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp
View file @
a3b4c5cb
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_B16_F32_F32_HPP
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_B16_F32_F32_HPP
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_B16_F32_F32_HPP
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_B16_F32_F32_HPP
#include "reduction_enums.hpp"
#include "data_type.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_multiblock_atomic_add.hpp"
#include "device_reduce_instance_multiblock_atomic_add.hpp"
namespace
ck
{
namespace
ck
{
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp
View file @
a3b4c5cb
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F16_F32_F32_HPP
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F16_F32_F32_HPP
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F16_F32_F32_HPP
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F16_F32_F32_HPP
#include "reduction_enums.hpp"
#include "data_type.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_multiblock_atomic_add.hpp"
#include "device_reduce_instance_multiblock_atomic_add.hpp"
namespace
ck
{
namespace
ck
{
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp
View file @
a3b4c5cb
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F32_F32_HPP
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F32_F32_HPP
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F32_F32_HPP
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F32_F32_HPP
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_multiblock_atomic_add.hpp"
#include "device_reduce_instance_multiblock_atomic_add.hpp"
namespace
ck
{
namespace
ck
{
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp
View file @
a3b4c5cb
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F64_F32_HPP
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F64_F32_HPP
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F64_F32_HPP
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F64_F32_HPP
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_multiblock_atomic_add.hpp"
#include "device_reduce_instance_multiblock_atomic_add.hpp"
namespace
ck
{
namespace
ck
{
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp
0 → 100644
View file @
a3b4c5cb
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F64_F64_F64_HPP
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F64_F64_F64_HPP
#include "device_reduce_instance_multiblock_atomic_add.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
4
,
3
);
// for ADD
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
4
,
4
);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
4
,
3
);
// for AVG
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
4
,
4
);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
2
,
1
);
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce.hpp
deleted
100644 → 0
View file @
48918ab9
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_HPP
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_HPP
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_impl_common.hpp"
#include "device_reduce_multiblock_partial_reduce.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_instance
{
#ifdef QUICK_REDUCE_TEST
using
reduce_configuration_2_instances_multiblock_partial_reduce
=
std
::
tuple
<
// clang-format off
// InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize
ReductionConfiguration_2
<
0
,
1
,
1
,
2
,
1
>
,
ReductionConfiguration_2
<
1
,
2
,
1
,
1
,
2
>
,
ReductionConfiguration_2
<
0
,
1
,
1
,
3
,
1
>
,
ReductionConfiguration_2
<
1
,
1
,
1
,
1
,
3
>
// clang-format on
>
;
#else
using
reduce_configuration_2_instances_multiblock_partial_reduce
=
std
::
tuple
<
// clang-format off
// InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize
ReductionConfiguration_2
<
0
,
4
,
1
,
8
,
1
>
,
ReductionConfiguration_2
<
0
,
4
,
1
,
4
,
1
>
,
ReductionConfiguration_2
<
0
,
2
,
1
,
2
,
1
>
,
ReductionConfiguration_2
<
1
,
4
,
1
,
1
,
8
>
,
ReductionConfiguration_2
<
1
,
4
,
1
,
1
,
4
>
,
ReductionConfiguration_2
<
1
,
2
,
1
,
1
,
2
>
,
// special instances
ReductionConfiguration_2
<
0
,
1
,
1
,
3
,
1
>
,
ReductionConfiguration_2
<
0
,
1
,
1
,
5
,
1
>
,
ReductionConfiguration_2
<
0
,
1
,
1
,
7
,
1
>
,
ReductionConfiguration_2
<
0
,
1
,
1
,
11
,
1
>
,
ReductionConfiguration_2
<
0
,
1
,
1
,
1
,
3
>
,
ReductionConfiguration_2
<
0
,
1
,
1
,
1
,
5
>
,
ReductionConfiguration_2
<
0
,
1
,
1
,
1
,
7
>
,
ReductionConfiguration_2
<
0
,
1
,
1
,
1
,
11
>
// clang-format on
>
;
#endif
template
<
typename
AccDataType
,
ReduceTensorOp
ReduceOpId
>
using
deviceReduceMultiBlockPartialReducePtrType
=
DeviceReducePtr
<
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
false
>::
InElementwiseOperation
,
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
false
>::
AccElementwiseOperation
>
;
template
<
typename
InDataType
,
typename
AccDataType
,
typename
OutDataType
,
int
Rank
,
int
NumReduceDim
,
ReduceTensorOp
ReduceOpId
,
NanPropagation
NanOpt
,
ReduceTensorIndices
IndicesOpt
>
void
add_device_reduce_instance_multiblock_partial_reduce
(
std
::
vector
<
deviceReduceMultiBlockPartialReducePtrType
<
AccDataType
,
ReduceOpId
>>&
device_op_instances
)
{
using
ReduceOperation
=
typename
reduce_binary_operator
<
AccDataType
,
ReduceOpId
>::
opType
;
using
InElementwiseOperation
=
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
false
>::
InElementwiseOperation
;
using
AccElementwiseOperation
=
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
false
>::
AccElementwiseOperation
;
constexpr
bool
Indexable
=
(
ReduceOpId
==
ReduceTensorOp
::
MIN
||
ReduceOpId
==
ReduceTensorOp
::
MAX
||
ReduceOpId
==
ReduceTensorOp
::
AMAX
);
constexpr
bool
NeedIndices
=
Indexable
&&
(
IndicesOpt
!=
ReduceTensorIndices
::
NO_INDICES
);
constexpr
bool
PropagateNan
=
(
NanOpt
==
NanPropagation
::
NOT_PROPAGATE_NAN
)
?
false
:
true
;
static_for
<
0
,
std
::
tuple_size
<
reduce_configuration_1_instances
>::
value
,
1
>
{}([
&
](
auto
i
)
{
using
cfg1
=
remove_cvref_t
<
decltype
(
std
::
get
<
i
.
value
>
(
reduce_configuration_1_instances
{}))
>
;
static_for
<
0
,
std
::
tuple_size
<
reduce_configuration_2_instances_multiblock_partial_reduce
>::
value
,
1
>
{}([
&
](
auto
j
)
{
using
cfg2
=
remove_cvref_t
<
decltype
(
std
::
get
<
j
.
value
>
(
reduce_configuration_2_instances_multiblock_partial_reduce
{}))
>
;
using
ReduceOpInstance
=
DeviceReduceMultiBlockPartialReduce
<
InDataType
,
AccDataType
,
OutDataType
,
Rank
,
NumReduceDim
,
ReduceOperation
,
InElementwiseOperation
,
AccElementwiseOperation
,
PropagateNan
,
NeedIndices
,
cfg1
::
BlockSize_
,
cfg1
::
MThreadClusterSize_
,
cfg1
::
KThreadClusterSize_
,
cfg2
::
MThreadSliceSize_
,
cfg2
::
KThreadSliceSize_
,
cfg2
::
InSrcVectorDim_
,
cfg2
::
InSrcVectorSize_
,
cfg2
::
OutDstVectorSize_
>
;
device_op_instances
.
push_back
(
std
::
make_unique
<
ReduceOpInstance
>
(
ReduceOpInstance
{}));
});
});
};
#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_TYPE( \
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
template void add_device_reduce_instance_multiblock_partial_reduce<inT, \
compT, \
outT, \
Rank, \
NumReduceDim, \
ReduceOpId, \
NanOpt, \
IndicesOpt>( \
std::vector<deviceReduceMultiBlockPartialReducePtrType<compT, ReduceOpId>> & \
device_op_instances)
#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID( \
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_TYPE(inT, \
compT, \
outT, \
static_cast<ReduceTensorOp>(ReduceOpId), \
static_cast<NanPropagation>(NanOpt), \
static_cast<ReduceTensorIndices>(IndicesOpt), \
Rank, \
NumReduceDim)
#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_TYPE( \
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
extern template void add_device_reduce_instance_multiblock_partial_reduce<inT, \
compT, \
outT, \
Rank, \
NumReduceDim, \
ReduceOpId, \
NanOpt, \
IndicesOpt>( \
std::vector< \
DeviceReducePtr<typename reduce_unary_operator<compT, ReduceOpId, true, false>:: \
InElementwiseOperation, \
typename reduce_unary_operator<compT, ReduceOpId, true, false>:: \
AccElementwiseOperation>> & \
device_op_instances)
#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID( \
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_TYPE(inT, \
compT, \
outT, \
static_cast<ReduceTensorOp>(ReduceOpId), \
static_cast<NanPropagation>(NanOpt), \
static_cast<ReduceTensorIndices>(IndicesOpt), \
Rank, \
NumReduceDim)
}
// namespace device_reduce_instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.hpp
deleted
100644 → 0
View file @
48918ab9
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_B16_F32_B16_HPP
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_B16_F32_B16_HPP
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
bhalf_t
,
float
,
bhalf_t
,
0
,
0
,
0
,
4
,
3
);
// for ADD
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
bhalf_t
,
float
,
bhalf_t
,
0
,
0
,
0
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
bhalf_t
,
float
,
bhalf_t
,
0
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
bhalf_t
,
float
,
bhalf_t
,
0
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
bhalf_t
,
float
,
bhalf_t
,
5
,
0
,
0
,
4
,
3
);
// for AVG
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
bhalf_t
,
float
,
bhalf_t
,
5
,
0
,
0
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
bhalf_t
,
float
,
bhalf_t
,
5
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
bhalf_t
,
float
,
bhalf_t
,
5
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
bhalf_t
,
float
,
bhalf_t
,
7
,
0
,
0
,
4
,
3
);
// for NORM2
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
bhalf_t
,
float
,
bhalf_t
,
7
,
0
,
0
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
bhalf_t
,
float
,
bhalf_t
,
7
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
bhalf_t
,
float
,
bhalf_t
,
7
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
bhalf_t
,
float
,
bhalf_t
,
2
,
0
,
0
,
4
,
3
);
// for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
bhalf_t
,
float
,
bhalf_t
,
2
,
0
,
0
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
bhalf_t
,
float
,
bhalf_t
,
2
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
bhalf_t
,
float
,
bhalf_t
,
2
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
bhalf_t
,
float
,
bhalf_t
,
3
,
0
,
0
,
4
,
3
);
// for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
bhalf_t
,
float
,
bhalf_t
,
3
,
0
,
0
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
bhalf_t
,
float
,
bhalf_t
,
3
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
bhalf_t
,
float
,
bhalf_t
,
3
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
bhalf_t
,
float
,
bhalf_t
,
4
,
0
,
0
,
4
,
3
);
// for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
bhalf_t
,
float
,
bhalf_t
,
4
,
0
,
0
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
bhalf_t
,
float
,
bhalf_t
,
4
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
bhalf_t
,
float
,
bhalf_t
,
4
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
bhalf_t
,
float
,
bhalf_t
,
2
,
0
,
1
,
4
,
3
);
// for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
bhalf_t
,
float
,
bhalf_t
,
2
,
0
,
1
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
bhalf_t
,
float
,
bhalf_t
,
2
,
0
,
1
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
bhalf_t
,
float
,
bhalf_t
,
2
,
0
,
1
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
bhalf_t
,
float
,
bhalf_t
,
3
,
0
,
1
,
4
,
3
);
// for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
bhalf_t
,
float
,
bhalf_t
,
3
,
0
,
1
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
bhalf_t
,
float
,
bhalf_t
,
3
,
0
,
1
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
bhalf_t
,
float
,
bhalf_t
,
3
,
0
,
1
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
bhalf_t
,
float
,
bhalf_t
,
4
,
0
,
1
,
4
,
3
);
// for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
bhalf_t
,
float
,
bhalf_t
,
4
,
0
,
1
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
bhalf_t
,
float
,
bhalf_t
,
4
,
0
,
1
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
bhalf_t
,
float
,
bhalf_t
,
4
,
0
,
1
,
2
,
1
);
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.hpp
deleted
100644 → 0
View file @
48918ab9
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F16_F16_F16_HPP
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F16_F16_F16_HPP
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
0
,
4
,
3
);
// for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
0
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
0
,
4
,
3
);
// for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
0
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
0
,
4
,
3
);
// for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
0
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
1
,
4
,
3
);
// for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
1
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
1
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
1
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
1
,
4
,
3
);
// for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
1
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
1
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
1
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
4
,
3
);
// for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
2
,
1
);
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
#endif
Prev
1
…
6
7
8
9
10
11
12
13
14
…
19
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment