Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
963e4a71
Unverified
Commit
963e4a71
authored
Oct 27, 2022
by
rocking5566
Committed by
GitHub
Oct 27, 2022
Browse files
Merge branch 'develop' into conv_quant_int8
parents
ad29b25b
0ee3aea1
Changes
193
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
487 additions
and
162 deletions
+487
-162
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp
...u/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp
+0
-46
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.hpp
...uce/device_reduce_instance_blockwise_f16_f16_f16_amax.hpp
+31
-0
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.hpp
...duce/device_reduce_instance_blockwise_f16_f16_f16_max.hpp
+31
-0
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.hpp
...duce/device_reduce_instance_blockwise_f16_f16_f16_min.hpp
+31
-0
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.hpp
...duce/device_reduce_instance_blockwise_f16_f32_f16_add.hpp
+27
-0
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.hpp
...duce/device_reduce_instance_blockwise_f16_f32_f16_avg.hpp
+27
-0
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.hpp
...ce/device_reduce_instance_blockwise_f16_f32_f16_norm2.hpp
+27
-0
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp
...u/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp
+0
-58
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.hpp
...duce/device_reduce_instance_blockwise_f32_f32_f32_add.hpp
+27
-0
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.hpp
...uce/device_reduce_instance_blockwise_f32_f32_f32_amax.hpp
+31
-0
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.hpp
...duce/device_reduce_instance_blockwise_f32_f32_f32_avg.hpp
+27
-0
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.hpp
...duce/device_reduce_instance_blockwise_f32_f32_f32_max.hpp
+31
-0
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.hpp
...duce/device_reduce_instance_blockwise_f32_f32_f32_min.hpp
+31
-0
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.hpp
...ce/device_reduce_instance_blockwise_f32_f32_f32_norm2.hpp
+27
-0
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.hpp
...duce/device_reduce_instance_blockwise_f32_f64_f32_add.hpp
+27
-0
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.hpp
...duce/device_reduce_instance_blockwise_f32_f64_f32_avg.hpp
+27
-0
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.hpp
...ce/device_reduce_instance_blockwise_f32_f64_f32_norm2.hpp
+27
-0
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp
...u/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp
+0
-58
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.hpp
...duce/device_reduce_instance_blockwise_f64_f64_f64_add.hpp
+27
-0
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.hpp
...uce/device_reduce_instance_blockwise_f64_f64_f64_amax.hpp
+31
-0
No files found.
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp
deleted
100644 → 0
View file @
ad29b25b
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// clang-format off
// References to the blockwise reduce instances for half_t input, accumulation and output.
// Argument order of every invocation below:
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim

// MIN (ReduceOpId 2), IndicesOpt 0
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1);

// MAX (ReduceOpId 3), IndicesOpt 0
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1);

// AMAX (ReduceOpId 4), IndicesOpt 0
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1);

// MIN (ReduceOpId 2), IndicesOpt 1
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1);

// MAX (ReduceOpId 3), IndicesOpt 1
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1);

// AMAX (ReduceOpId 4), IndicesOpt 1
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
// clang-format on

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.hpp
0 → 100644
View file @
963e4a71
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// clang-format off
// Explicit instantiation declarations of add_device_reduce_instance_blockwise for the
// F16/F16/F16 AMAX reduction. `extern template` keeps including translation units from
// instantiating these specializations implicitly.
// Template argument order:
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex

// UseIndex = false
extern template void add_device_reduce_instance_blockwise<F16, F16, F16, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, UnaryAbs, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F16, F16, F16, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, UnaryAbs, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F16, F16, F16, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, UnaryAbs, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F16, F16, F16, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, UnaryAbs, PassThrough>>&);

// UseIndex = true
extern template void add_device_reduce_instance_blockwise<F16, F16, F16, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 3, UnaryAbs, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F16, F16, F16, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 4, UnaryAbs, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F16, F16, F16, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 1, UnaryAbs, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F16, F16, F16, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<2, 1, UnaryAbs, PassThrough>>&);
// clang-format on

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.hpp
0 → 100644
View file @
963e4a71
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// clang-format off
// Explicit instantiation declarations of add_device_reduce_instance_blockwise for the
// F16/F16/F16 MAX reduction. `extern template` keeps including translation units from
// instantiating these specializations implicitly.
// Template argument order:
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex

// UseIndex = false
extern template void add_device_reduce_instance_blockwise<F16, F16, F16, 4, 3, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F16, F16, F16, 4, 4, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F16, F16, F16, 4, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F16, F16, F16, 2, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);

// UseIndex = true
extern template void add_device_reduce_instance_blockwise<F16, F16, F16, 4, 3, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F16, F16, F16, 4, 4, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F16, F16, F16, 4, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F16, F16, F16, 2, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
// clang-format on

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.hpp
0 → 100644
View file @
963e4a71
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// clang-format off
// Explicit instantiation declarations of add_device_reduce_instance_blockwise for the
// F16/F16/F16 MIN reduction. `extern template` keeps including translation units from
// instantiating these specializations implicitly.
// Template argument order:
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex

// UseIndex = false
extern template void add_device_reduce_instance_blockwise<F16, F16, F16, 4, 3, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F16, F16, F16, 4, 4, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F16, F16, F16, 4, 1, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F16, F16, F16, 2, 1, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);

// UseIndex = true
extern template void add_device_reduce_instance_blockwise<F16, F16, F16, 4, 3, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F16, F16, F16, 4, 4, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F16, F16, F16, 4, 1, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F16, F16, F16, 2, 1, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
// clang-format on

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.hpp
0 → 100644
View file @
963e4a71
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// clang-format off
// Explicit instantiation declarations of add_device_reduce_instance_blockwise for the
// ADD reduction with F16 input/output and F32 accumulation. `extern template` keeps
// including translation units from instantiating these specializations implicitly.
// Template argument order:
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
extern template void add_device_reduce_instance_blockwise<F16, F32, F16, 4, 3, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F16, F32, F16, 4, 4, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F16, F32, F16, 4, 1, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F16, F32, F16, 2, 1, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
// clang-format on

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.hpp
0 → 100644
View file @
963e4a71
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// clang-format off
// Explicit instantiation declarations of add_device_reduce_instance_blockwise for the
// AVG reduction (ReduceAdd followed by UnaryDivide on the accumulator) with F16
// input/output and F32 accumulation. `extern template` keeps including translation
// units from instantiating these specializations implicitly.
// Template argument order:
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
extern template void add_device_reduce_instance_blockwise<F16, F32, F16, 4, 3, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, UnaryDivide>>&);
extern template void add_device_reduce_instance_blockwise<F16, F32, F16, 4, 4, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, UnaryDivide>>&);
extern template void add_device_reduce_instance_blockwise<F16, F32, F16, 4, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, UnaryDivide>>&);
extern template void add_device_reduce_instance_blockwise<F16, F32, F16, 2, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, UnaryDivide>>&);
// clang-format on

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_
i8_i32_i8
.hpp
→
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_
f16_f32_f16_norm2
.hpp
View file @
963e4a71
...
@@ -4,6 +4,7 @@
...
@@ -4,6 +4,7 @@
#pragma once
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
...
@@ -13,15 +14,11 @@ namespace device {
...
@@ -13,15 +14,11 @@ namespace device {
namespace
instance
{
namespace
instance
{
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
ADD_BLOCKWISE_INST_REF_BY_ID
(
int8_t
,
int32_t
,
int8_t
,
0
,
0
,
0
,
4
,
3
);
// for ADD
extern
template
void
add_device_reduce_instance_blockwise
<
F16
,
F32
,
F16
,
4
,
3
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
UnarySquare
,
UnarySqrt
>>&
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
int8_t
,
int32_t
,
int8_t
,
0
,
0
,
0
,
4
,
4
);
extern
template
void
add_device_reduce_instance_blockwise
<
F16
,
F32
,
F16
,
4
,
4
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
UnarySquare
,
UnarySqrt
>>&
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
int8_t
,
int32_t
,
int8_t
,
0
,
0
,
0
,
4
,
1
);
extern
template
void
add_device_reduce_instance_blockwise
<
F16
,
F32
,
F16
,
4
,
1
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
UnarySquare
,
UnarySqrt
>>&
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
int8_t
,
int32_t
,
int8_t
,
0
,
0
,
0
,
2
,
1
);
extern
template
void
add_device_reduce_instance_blockwise
<
F16
,
F32
,
F16
,
2
,
1
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
UnarySquare
,
UnarySqrt
>>&
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
int8_t
,
int32_t
,
int8_t
,
5
,
0
,
0
,
4
,
3
);
// for AVG
ADD_BLOCKWISE_INST_REF_BY_ID
(
int8_t
,
int32_t
,
int8_t
,
5
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
int8_t
,
int32_t
,
int8_t
,
5
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
int8_t
,
int32_t
,
int8_t
,
5
,
0
,
0
,
2
,
1
);
// clang-format on
// clang-format on
}
// namespace instance
}
// namespace instance
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp
deleted
100644 → 0
View file @
ad29b25b
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_BLOCKWISE_INST_REF_BY_ID
(
float
,
float
,
float
,
0
,
0
,
0
,
4
,
3
);
// for ADD
ADD_BLOCKWISE_INST_REF_BY_ID
(
float
,
float
,
float
,
0
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
float
,
float
,
float
,
0
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
float
,
float
,
float
,
0
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
float
,
float
,
float
,
5
,
0
,
0
,
4
,
3
);
// for AVG
ADD_BLOCKWISE_INST_REF_BY_ID
(
float
,
float
,
float
,
5
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
float
,
float
,
float
,
5
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
float
,
float
,
float
,
5
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
float
,
float
,
float
,
7
,
0
,
0
,
4
,
3
);
// for NORM2
ADD_BLOCKWISE_INST_REF_BY_ID
(
float
,
float
,
float
,
7
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
float
,
float
,
float
,
7
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
float
,
float
,
float
,
7
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
0
,
4
,
3
);
// for MIN
ADD_BLOCKWISE_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
0
,
4
,
3
);
// for MAX
ADD_BLOCKWISE_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
0
,
4
,
3
);
// for AMAX
ADD_BLOCKWISE_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
1
,
4
,
3
);
// for MIN
ADD_BLOCKWISE_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
1
,
4
,
4
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
1
,
4
,
1
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
1
,
2
,
1
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
1
,
4
,
3
);
// for MAX
ADD_BLOCKWISE_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
1
,
4
,
4
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
1
,
4
,
1
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
1
,
2
,
1
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
1
,
4
,
3
);
// for AMAX
ADD_BLOCKWISE_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
1
,
4
,
4
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
1
,
4
,
1
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
1
,
2
,
1
);
// clang-format on
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.hpp
0 → 100644
View file @
963e4a71
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// clang-format off
// Explicit instantiation declarations of add_device_reduce_instance_blockwise for the
// F32/F32/F32 ADD reduction. `extern template` keeps including translation units from
// instantiating these specializations implicitly.
// Template argument order:
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
extern template void add_device_reduce_instance_blockwise<F32, F32, F32, 4, 3, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F32, F32, F32, 4, 4, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F32, F32, F32, 4, 1, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F32, F32, F32, 2, 1, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
// clang-format on

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.hpp
0 → 100644
View file @
963e4a71
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// clang-format off
// Explicit instantiation declarations of add_device_reduce_instance_blockwise for the
// F32/F32/F32 AMAX reduction. `extern template` keeps including translation units from
// instantiating these specializations implicitly.
// Template argument order:
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex

// UseIndex = false
extern template void add_device_reduce_instance_blockwise<F32, F32, F32, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, UnaryAbs, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F32, F32, F32, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, UnaryAbs, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F32, F32, F32, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, UnaryAbs, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F32, F32, F32, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, UnaryAbs, PassThrough>>&);

// UseIndex = true
extern template void add_device_reduce_instance_blockwise<F32, F32, F32, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 3, UnaryAbs, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F32, F32, F32, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 4, UnaryAbs, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F32, F32, F32, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 1, UnaryAbs, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F32, F32, F32, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<2, 1, UnaryAbs, PassThrough>>&);
// clang-format on

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.hpp
0 → 100644
View file @
963e4a71
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// clang-format off
// Explicit instantiation declarations of add_device_reduce_instance_blockwise for the
// F32/F32/F32 AVG reduction (ReduceAdd followed by UnaryDivide on the accumulator).
// `extern template` keeps including translation units from instantiating these
// specializations implicitly.
// Template argument order:
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
extern template void add_device_reduce_instance_blockwise<F32, F32, F32, 4, 3, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, UnaryDivide>>&);
extern template void add_device_reduce_instance_blockwise<F32, F32, F32, 4, 4, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, UnaryDivide>>&);
extern template void add_device_reduce_instance_blockwise<F32, F32, F32, 4, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, UnaryDivide>>&);
extern template void add_device_reduce_instance_blockwise<F32, F32, F32, 2, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, UnaryDivide>>&);
// clang-format on

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.hpp
0 → 100644
View file @
963e4a71
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// clang-format off
// Explicit instantiation declarations of add_device_reduce_instance_blockwise for the
// F32/F32/F32 MAX reduction. `extern template` keeps including translation units from
// instantiating these specializations implicitly.
// Template argument order:
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex

// UseIndex = false
extern template void add_device_reduce_instance_blockwise<F32, F32, F32, 4, 3, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F32, F32, F32, 4, 4, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F32, F32, F32, 4, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F32, F32, F32, 2, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);

// UseIndex = true
extern template void add_device_reduce_instance_blockwise<F32, F32, F32, 4, 3, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F32, F32, F32, 4, 4, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F32, F32, F32, 4, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F32, F32, F32, 2, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
// clang-format on

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.hpp
0 → 100644
View file @
963e4a71
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Explicit-instantiation declarations (`extern template`) of
// add_device_reduce_instance_blockwise for F32 in / F32 accumulate / F32 out
// with ReduceMin. Including this header therefore does not implicitly
// instantiate these specializations in the including translation unit.
// NOTE(review): the matching explicit instantiation definitions are presumably
// provided by a separate source file - confirm against the build.
//
// The first four declarations have UseIndex = false, the last four UseIndex = true;
// each group covers the (Rank, NumReduceDim) pairs (4,3), (4,4), (4,1), (2,1).

// clang-format off
// Template-argument order:
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
extern template void add_device_reduce_instance_blockwise<F32, F32, F32, 4, 3, ReduceMin, PassThrough, PassThrough, false, false>(
    std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F32, F32, F32, 4, 4, ReduceMin, PassThrough, PassThrough, false, false>(
    std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F32, F32, F32, 4, 1, ReduceMin, PassThrough, PassThrough, false, false>(
    std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F32, F32, F32, 2, 1, ReduceMin, PassThrough, PassThrough, false, false>(
    std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F32, F32, F32, 4, 3, ReduceMin, PassThrough, PassThrough, false, true>(
    std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F32, F32, F32, 4, 4, ReduceMin, PassThrough, PassThrough, false, true>(
    std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F32, F32, F32, 4, 1, ReduceMin, PassThrough, PassThrough, false, true>(
    std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F32, F32, F32, 2, 1, ReduceMin, PassThrough, PassThrough, false, true>(
    std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
// clang-format on

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.hpp
0 → 100644
View file @
963e4a71
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Explicit-instantiation declarations (`extern template`) of
// add_device_reduce_instance_blockwise for the norm2 header: F32 in /
// F32 accumulate / F32 out, ReduceAdd with UnarySquare as the input
// element-wise op and UnarySqrt as the accumulator element-wise op.
// NOTE(review): the matching explicit instantiation definitions are presumably
// provided by a separate source file - confirm against the build.
//
// Covers the (Rank, NumReduceDim) pairs (4,3), (4,4), (4,1), (2,1), all with
// PropagateNan = false and UseIndex = false.

// clang-format off
// Template-argument order:
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
extern template void add_device_reduce_instance_blockwise<F32, F32, F32, 4, 3, ReduceAdd, UnarySquare, UnarySqrt, false, false>(
    std::vector<DeviceReducePtr<4, 3, UnarySquare, UnarySqrt>>&);
extern template void add_device_reduce_instance_blockwise<F32, F32, F32, 4, 4, ReduceAdd, UnarySquare, UnarySqrt, false, false>(
    std::vector<DeviceReducePtr<4, 4, UnarySquare, UnarySqrt>>&);
extern template void add_device_reduce_instance_blockwise<F32, F32, F32, 4, 1, ReduceAdd, UnarySquare, UnarySqrt, false, false>(
    std::vector<DeviceReducePtr<4, 1, UnarySquare, UnarySqrt>>&);
extern template void add_device_reduce_instance_blockwise<F32, F32, F32, 2, 1, ReduceAdd, UnarySquare, UnarySqrt, false, false>(
    std::vector<DeviceReducePtr<2, 1, UnarySquare, UnarySqrt>>&);
// clang-format on

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.hpp
0 → 100644
View file @
963e4a71
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Explicit-instantiation declarations (`extern template`) of
// add_device_reduce_instance_blockwise for the add header: F32 in /
// F64 accumulate / F32 out, ReduceAdd with PassThrough for both
// element-wise ops.
// NOTE(review): the matching explicit instantiation definitions are presumably
// provided by a separate source file - confirm against the build.
//
// Covers the (Rank, NumReduceDim) pairs (4,3), (4,4), (4,1), (2,1), all with
// PropagateNan = false and UseIndex = false.

// clang-format off
// Template-argument order:
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
extern template void add_device_reduce_instance_blockwise<F32, F64, F32, 4, 3, ReduceAdd, PassThrough, PassThrough, false, false>(
    std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F32, F64, F32, 4, 4, ReduceAdd, PassThrough, PassThrough, false, false>(
    std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F32, F64, F32, 4, 1, ReduceAdd, PassThrough, PassThrough, false, false>(
    std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F32, F64, F32, 2, 1, ReduceAdd, PassThrough, PassThrough, false, false>(
    std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
// clang-format on

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.hpp
0 → 100644
View file @
963e4a71
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Explicit-instantiation declarations (`extern template`) of
// add_device_reduce_instance_blockwise for the avg header: F32 in /
// F64 accumulate / F32 out, ReduceAdd with PassThrough as the input
// element-wise op and UnaryDivide as the accumulator element-wise op.
// NOTE(review): the matching explicit instantiation definitions are presumably
// provided by a separate source file - confirm against the build.
//
// Covers the (Rank, NumReduceDim) pairs (4,3), (4,4), (4,1), (2,1), all with
// PropagateNan = false and UseIndex = false.

// clang-format off
// Template-argument order:
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
extern template void add_device_reduce_instance_blockwise<F32, F64, F32, 4, 3, ReduceAdd, PassThrough, UnaryDivide, false, false>(
    std::vector<DeviceReducePtr<4, 3, PassThrough, UnaryDivide>>&);
extern template void add_device_reduce_instance_blockwise<F32, F64, F32, 4, 4, ReduceAdd, PassThrough, UnaryDivide, false, false>(
    std::vector<DeviceReducePtr<4, 4, PassThrough, UnaryDivide>>&);
extern template void add_device_reduce_instance_blockwise<F32, F64, F32, 4, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(
    std::vector<DeviceReducePtr<4, 1, PassThrough, UnaryDivide>>&);
extern template void add_device_reduce_instance_blockwise<F32, F64, F32, 2, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(
    std::vector<DeviceReducePtr<2, 1, PassThrough, UnaryDivide>>&);
// clang-format on

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f
16_f32_f16
.hpp
→
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f
32_f64_f32_norm2
.hpp
View file @
963e4a71
...
@@ -4,6 +4,7 @@
...
@@ -4,6 +4,7 @@
#pragma once
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
...
@@ -13,19 +14,11 @@ namespace device {
...
@@ -13,19 +14,11 @@ namespace device {
namespace
instance
{
namespace
instance
{
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
ADD_BLOCKWISE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
0
,
0
,
0
,
4
,
3
);
// for ADD
extern
template
void
add_device_reduce_instance_blockwise
<
F32
,
F64
,
F32
,
4
,
3
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
UnarySquare
,
UnarySqrt
>>&
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
0
,
0
,
0
,
4
,
4
);
extern
template
void
add_device_reduce_instance_blockwise
<
F32
,
F64
,
F32
,
4
,
4
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
UnarySquare
,
UnarySqrt
>>&
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
0
,
0
,
0
,
4
,
1
);
extern
template
void
add_device_reduce_instance_blockwise
<
F32
,
F64
,
F32
,
4
,
1
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
UnarySquare
,
UnarySqrt
>>&
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
0
,
0
,
0
,
2
,
1
);
extern
template
void
add_device_reduce_instance_blockwise
<
F32
,
F64
,
F32
,
2
,
1
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
UnarySquare
,
UnarySqrt
>>&
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
5
,
0
,
0
,
4
,
3
);
// for AVG
ADD_BLOCKWISE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
5
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
5
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
5
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
7
,
0
,
0
,
4
,
3
);
// for NORM2
ADD_BLOCKWISE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
7
,
0
,
0
,
4
,
4
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
7
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
7
,
0
,
0
,
2
,
1
);
// clang-format on
// clang-format on
}
// namespace instance
}
// namespace instance
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp
deleted
100644 → 0
View file @
ad29b25b
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Blockwise reduce instance references for double in / double accumulate /
// double out, one ADD_BLOCKWISE_INST_REF_BY_ID invocation per configuration.
// NOTE(review): the macro is not defined in this header; it presumably comes
// from device_reduce_instance_blockwise.hpp - confirm its expansion there.
//
// Each reduce-op group covers the (Rank, NumReduceDim) pairs
// (4,3), (4,4), (4,1), (2,1). MIN / MAX / AMAX appear twice: once with
// IndicesOpt = 0 and once with IndicesOpt = 1.

// clang-format off
// Macro-argument order:
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1);
// clang-format on

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.hpp
0 → 100644
View file @
963e4a71
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Explicit-instantiation declarations (`extern template`) of
// add_device_reduce_instance_blockwise for the add header: F64 in /
// F64 accumulate / F64 out, ReduceAdd with PassThrough for both
// element-wise ops.
// NOTE(review): the matching explicit instantiation definitions are presumably
// provided by a separate source file - confirm against the build.
//
// Covers the (Rank, NumReduceDim) pairs (4,3), (4,4), (4,1), (2,1), all with
// PropagateNan = false and UseIndex = false.

// clang-format off
// Template-argument order:
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
extern template void add_device_reduce_instance_blockwise<F64, F64, F64, 4, 3, ReduceAdd, PassThrough, PassThrough, false, false>(
    std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F64, F64, F64, 4, 4, ReduceAdd, PassThrough, PassThrough, false, false>(
    std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F64, F64, F64, 4, 1, ReduceAdd, PassThrough, PassThrough, false, false>(
    std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F64, F64, F64, 2, 1, ReduceAdd, PassThrough, PassThrough, false, false>(
    std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
// clang-format on

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.hpp
0 → 100644
View file @
963e4a71
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Explicit-instantiation declarations (`extern template`) of
// add_device_reduce_instance_blockwise for the amax header: F64 in /
// F64 accumulate / F64 out, ReduceAMax with UnaryAbs as the input
// element-wise op and PassThrough as the accumulator element-wise op.
// NOTE(review): the matching explicit instantiation definitions are presumably
// provided by a separate source file - confirm against the build.
//
// The first four declarations have UseIndex = false, the last four UseIndex = true;
// each group covers the (Rank, NumReduceDim) pairs (4,3), (4,4), (4,1), (2,1).

// clang-format off
// Template-argument order:
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
extern template void add_device_reduce_instance_blockwise<F64, F64, F64, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, false>(
    std::vector<DeviceReducePtr<4, 3, UnaryAbs, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F64, F64, F64, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, false>(
    std::vector<DeviceReducePtr<4, 4, UnaryAbs, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F64, F64, F64, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(
    std::vector<DeviceReducePtr<4, 1, UnaryAbs, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F64, F64, F64, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(
    std::vector<DeviceReducePtr<2, 1, UnaryAbs, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F64, F64, F64, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>(
    std::vector<DeviceReducePtr<4, 3, UnaryAbs, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F64, F64, F64, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, true>(
    std::vector<DeviceReducePtr<4, 4, UnaryAbs, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F64, F64, F64, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(
    std::vector<DeviceReducePtr<4, 1, UnaryAbs, PassThrough>>&);
extern template void add_device_reduce_instance_blockwise<F64, F64, F64, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(
    std::vector<DeviceReducePtr<2, 1, UnaryAbs, PassThrough>>&);
// clang-format on

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
Prev
1
2
3
4
5
6
…
10
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment