Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
963e4a71
Unverified
Commit
963e4a71
authored
Oct 27, 2022
by
rocking5566
Committed by
GitHub
Oct 27, 2022
Browse files
Merge branch 'develop' into conv_quant_int8
parents
ad29b25b
0ee3aea1
Changes
193
Hide whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
377 additions
and
145 deletions
+377
-145
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.cpp
...ce/device_reduce_instance_threadwise_f64_f64_f64_amax.cpp
+27
-0
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.cpp
...uce/device_reduce_instance_threadwise_f64_f64_f64_avg.cpp
+23
-0
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.cpp
...uce/device_reduce_instance_threadwise_f64_f64_f64_max.cpp
+27
-0
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.cpp
...uce/device_reduce_instance_threadwise_f64_f64_f64_min.cpp
+27
-0
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.cpp
...e/device_reduce_instance_threadwise_f64_f64_f64_norm2.cpp
+23
-0
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.cpp
...educe/device_reduce_instance_threadwise_i8_i32_i8_add.cpp
+25
-0
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.cpp
...educe/device_reduce_instance_threadwise_i8_i32_i8_avg.cpp
+24
-0
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.cpp
...gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.cpp
+0
-43
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.cpp
...educe/device_reduce_instance_threadwise_i8_i8_i8_amax.cpp
+28
-0
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.cpp
...reduce/device_reduce_instance_threadwise_i8_i8_i8_max.cpp
+28
-0
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.cpp
...reduce/device_reduce_instance_threadwise_i8_i8_i8_min.cpp
+28
-0
profiler/include/profile_reduce_impl.hpp
profiler/include/profile_reduce_impl.hpp
+110
-95
script/process_perf_data.py
script/process_perf_data.py
+7
-7
No files found.
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.cpp
0 → 100644
View file @
963e4a71
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Explicit instantiations of add_device_reduce_instance_threadwise for
// F64 input / F64 accumulation / F64 output with the AMAX reduction
// (ReduceAMax, UnaryAbs applied on input, PassThrough on the accumulator).
//
// Template argument order:
//   InDataType, AccDataType, OutDataType, Rank, NumReduceDim, ReduceOperation,
//   InElementwiseOp, AccElementwiseOp, PropagateNan, UseIndex

// clang-format off
// PropagateNan = false, UseIndex = false
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, UnaryAbs, PassThrough>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, UnaryAbs, PassThrough>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, UnaryAbs, PassThrough>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, UnaryAbs, PassThrough>>&);

// PropagateNan = false, UseIndex = true
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 3, UnaryAbs, PassThrough>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 4, UnaryAbs, PassThrough>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 1, UnaryAbs, PassThrough>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<2, 1, UnaryAbs, PassThrough>>&);
// clang-format on

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.cpp
0 → 100644
View file @
963e4a71
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Explicit instantiations of add_device_reduce_instance_threadwise for
// F64 input / F64 accumulation / F64 output with the AVG reduction
// (ReduceAdd, PassThrough on input, UnaryDivide on the accumulator).
//
// Template argument order:
//   InDataType, AccDataType, OutDataType, Rank, NumReduceDim, ReduceOperation,
//   InElementwiseOp, AccElementwiseOp, PropagateNan, UseIndex

// clang-format off
// PropagateNan = false, UseIndex = false
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, UnaryDivide>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, UnaryDivide>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, UnaryDivide>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, UnaryDivide>>&);
// clang-format on

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.cpp
0 → 100644
View file @
963e4a71
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Explicit instantiations of add_device_reduce_instance_threadwise for
// F64 input / F64 accumulation / F64 output with the MAX reduction
// (ReduceMax, PassThrough on both input and accumulator).
//
// Template argument order:
//   InDataType, AccDataType, OutDataType, Rank, NumReduceDim, ReduceOperation,
//   InElementwiseOp, AccElementwiseOp, PropagateNan, UseIndex

// clang-format off
// PropagateNan = false, UseIndex = false
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);

// PropagateNan = false, UseIndex = true
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
// clang-format on

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.cpp
0 → 100644
View file @
963e4a71
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Explicit instantiations of add_device_reduce_instance_threadwise for
// F64 input / F64 accumulation / F64 output with the MIN reduction
// (ReduceMin, PassThrough on both input and accumulator).
//
// Template argument order:
//   InDataType, AccDataType, OutDataType, Rank, NumReduceDim, ReduceOperation,
//   InElementwiseOp, AccElementwiseOp, PropagateNan, UseIndex

// clang-format off
// PropagateNan = false, UseIndex = false
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);

// PropagateNan = false, UseIndex = true
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
// clang-format on

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.cpp
0 → 100644
View file @
963e4a71
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Explicit instantiations of add_device_reduce_instance_threadwise for
// F64 input / F64 accumulation / F64 output with the NORM2 reduction
// (ReduceAdd, UnarySquare on input, UnarySqrt on the accumulator).
//
// Template argument order:
//   InDataType, AccDataType, OutDataType, Rank, NumReduceDim, ReduceOperation,
//   InElementwiseOp, AccElementwiseOp, PropagateNan, UseIndex

// clang-format off
// PropagateNan = false, UseIndex = false
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<4, 3, UnarySquare, UnarySqrt>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<4, 4, UnarySquare, UnarySqrt>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<4, 1, UnarySquare, UnarySqrt>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<2, 1, UnarySquare, UnarySqrt>>&);
// clang-format on

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.cpp
→
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8
_add
.cpp
View file @
963e4a71
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace
ck
{
...
...
@@ -9,15 +10,11 @@ namespace device {
namespace
instance
{
// clang-format off
// Explicit instantiations for I8 input / I32 accumulation / I8 output with the
// ADD reduction (ReduceAdd, PassThrough on both input and accumulator).
// These replace the former ADD_THREADWISE_INST_BY_ID(...) macro invocations;
// the AVG variants are no longer listed in this translation unit.
//
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
template void add_device_reduce_instance_threadwise<I8, I32, I8, 4, 3, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I32, I8, 4, 4, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I32, I8, 4, 1, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I32, I8, 2, 1, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
// clang-format on
...
...
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.cpp
0 → 100644
View file @
963e4a71
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Explicit instantiations of add_device_reduce_instance_threadwise for
// I8 input / I32 accumulation / I8 output with the AVG reduction
// (ReduceAdd, PassThrough on input, UnaryDivide on the accumulator).
//
// Template argument order:
//   InDataType, AccDataType, OutDataType, Rank, NumReduceDim, ReduceOperation,
//   InElementwiseOp, AccElementwiseOp, PropagateNan, UseIndex

// clang-format off
// PropagateNan = false, UseIndex = false
template void add_device_reduce_instance_threadwise<I8, I32, I8, 4, 3, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, UnaryDivide>>&);
template void add_device_reduce_instance_threadwise<I8, I32, I8, 4, 4, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, UnaryDivide>>&);
template void add_device_reduce_instance_threadwise<I8, I32, I8, 4, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, UnaryDivide>>&);
template void add_device_reduce_instance_threadwise<I8, I32, I8, 2, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, UnaryDivide>>&);
// clang-format on

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.cpp
deleted
100644 → 0
View file @
ad29b25b
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Threadwise reduce instances for int8_t input / accumulation / output,
// registered through the ADD_THREADWISE_INST_BY_ID macro.
//
// Macro argument order:
//   InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim

// clang-format off
// IndicesOpt = 0
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1);

// IndicesOpt = 1
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1);
// clang-format on

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.cpp
0 → 100644
View file @
963e4a71
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Explicit instantiations of add_device_reduce_instance_threadwise for
// I8 input / I8 accumulation / I8 output with the AMAX reduction
// (ReduceAMax, UnaryAbs applied on input, PassThrough on the accumulator).
//
// Template argument order:
//   InDataType, AccDataType, OutDataType, Rank, NumReduceDim, ReduceOperation,
//   InElementwiseOp, AccElementwiseOp, PropagateNan, UseIndex

// clang-format off
// PropagateNan = false, UseIndex = false
template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, UnaryAbs, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, UnaryAbs, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, UnaryAbs, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I8, I8, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, UnaryAbs, PassThrough>>&);

// PropagateNan = false, UseIndex = true
template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 3, UnaryAbs, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 4, UnaryAbs, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 1, UnaryAbs, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I8, I8, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<2, 1, UnaryAbs, PassThrough>>&);
// clang-format on

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.cpp
0 → 100644
View file @
963e4a71
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Explicit instantiations of add_device_reduce_instance_threadwise for
// I8 input / I8 accumulation / I8 output with the MAX reduction
// (ReduceMax, PassThrough on both input and accumulator).
//
// Template argument order:
//   InDataType, AccDataType, OutDataType, Rank, NumReduceDim, ReduceOperation,
//   InElementwiseOp, AccElementwiseOp, PropagateNan, UseIndex

// clang-format off
// PropagateNan = false, UseIndex = false
template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 3, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 4, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I8, I8, 2, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);

// PropagateNan = false, UseIndex = true
template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 3, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 4, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I8, I8, 2, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
// clang-format on

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.cpp
0 → 100644
View file @
963e4a71
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Explicit instantiations of add_device_reduce_instance_threadwise for
// I8 input / I8 accumulation / I8 output with the MIN reduction
// (ReduceMin, PassThrough on both input and accumulator).
//
// Template argument order:
//   InDataType, AccDataType, OutDataType, Rank, NumReduceDim, ReduceOperation,
//   InElementwiseOp, AccElementwiseOp, PropagateNan, UseIndex

// clang-format off
// PropagateNan = false, UseIndex = false
template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 3, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 4, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 1, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I8, I8, 2, 1, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);

// PropagateNan = false, UseIndex = true
template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 3, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 4, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 1, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I8, I8, 2, 1, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
// clang-format on

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
profiler/include/profile_reduce_impl.hpp
View file @
963e4a71
...
...
@@ -18,57 +18,61 @@ namespace tensor_operation {
namespace
device
{
namespace
instance
{
template
<
int
Rank
,
int
NumReduceDim
,
int
ReduceOpId
,
bool
PropagateNan
,
bool
UseIndex
>
template
<
index_t
Rank
,
index_t
NumReduceDim
,
ReduceTensorOp
ReduceOpId
,
bool
PropagateNan
,
bool
UseIndex
>
struct
ReduceDescription
{
static
constexpr
int
Rank_
=
Rank
;
static
constexpr
int
NumReduceDim_
=
NumReduceDim
;
static
constexpr
int
ReduceOpId_
=
ReduceOpId
;
static
constexpr
int
PropagateNan_
=
PropagateNan
;
static
constexpr
int
UseIndex_
=
UseIndex
;
static
constexpr
in
dex_
t
Rank_
=
Rank
;
static
constexpr
in
dex_
t
NumReduceDim_
=
NumReduceDim
;
static
constexpr
ReduceTensorOp
ReduceOpId_
=
ReduceOpId
;
static
constexpr
bool
PropagateNan_
=
PropagateNan
;
static
constexpr
bool
UseIndex_
=
UseIndex
;
};
// Tuple listing every ReduceDescription configuration enumerated here.
// Entries are <Rank, NumReduceDim, ReduceOpId, PropagateNan, UseIndex>; each
// operation is listed for the (4,3), (4,4), (4,1) and (2,1) rank/reduce-dim
// pairs. The operation is named via ReduceTensorOp enumerators rather than
// raw integer IDs.
// clang-format off
using reduce_description_instances = std::tuple<
    // UseIndex = false
    ReduceDescription<4, 3, ReduceTensorOp::ADD, false, false>, // for ADD
    ReduceDescription<4, 4, ReduceTensorOp::ADD, false, false>,
    ReduceDescription<4, 1, ReduceTensorOp::ADD, false, false>,
    ReduceDescription<2, 1, ReduceTensorOp::ADD, false, false>,

    ReduceDescription<4, 3, ReduceTensorOp::AVG, false, false>, // for AVG
    ReduceDescription<4, 4, ReduceTensorOp::AVG, false, false>,
    ReduceDescription<4, 1, ReduceTensorOp::AVG, false, false>,
    ReduceDescription<2, 1, ReduceTensorOp::AVG, false, false>,

    ReduceDescription<4, 3, ReduceTensorOp::NORM2, false, false>, // for NORM2
    ReduceDescription<4, 4, ReduceTensorOp::NORM2, false, false>,
    ReduceDescription<4, 1, ReduceTensorOp::NORM2, false, false>,
    ReduceDescription<2, 1, ReduceTensorOp::NORM2, false, false>,

    ReduceDescription<4, 3, ReduceTensorOp::MIN, false, false>, // for MIN
    ReduceDescription<4, 4, ReduceTensorOp::MIN, false, false>,
    ReduceDescription<4, 1, ReduceTensorOp::MIN, false, false>,
    ReduceDescription<2, 1, ReduceTensorOp::MIN, false, false>,

    ReduceDescription<4, 3, ReduceTensorOp::MAX, false, false>, // for MAX
    ReduceDescription<4, 4, ReduceTensorOp::MAX, false, false>,
    ReduceDescription<4, 1, ReduceTensorOp::MAX, false, false>,
    ReduceDescription<2, 1, ReduceTensorOp::MAX, false, false>,

    ReduceDescription<4, 3, ReduceTensorOp::AMAX, false, false>, // for AMAX
    ReduceDescription<4, 4, ReduceTensorOp::AMAX, false, false>,
    ReduceDescription<4, 1, ReduceTensorOp::AMAX, false, false>,
    ReduceDescription<2, 1, ReduceTensorOp::AMAX, false, false>,

    // UseIndex = true
    ReduceDescription<4, 3, ReduceTensorOp::MIN, false, true>, // for MIN
    ReduceDescription<4, 4, ReduceTensorOp::MIN, false, true>,
    ReduceDescription<4, 1, ReduceTensorOp::MIN, false, true>,
    ReduceDescription<2, 1, ReduceTensorOp::MIN, false, true>,

    ReduceDescription<4, 3, ReduceTensorOp::MAX, false, true>, // for MAX
    ReduceDescription<4, 4, ReduceTensorOp::MAX, false, true>,
    ReduceDescription<4, 1, ReduceTensorOp::MAX, false, true>,
    ReduceDescription<2, 1, ReduceTensorOp::MAX, false, true>,

    ReduceDescription<4, 3, ReduceTensorOp::AMAX, false, true>, // for AMAX
    ReduceDescription<4, 4, ReduceTensorOp::AMAX, false, true>,
    ReduceDescription<4, 1, ReduceTensorOp::AMAX, false, true>,
    ReduceDescription<2, 1, ReduceTensorOp::AMAX, false, true>>;
// clang-format on
template
<
typename
DescriptionType
>
bool
description_match
(
const
DescriptionType
&
description
,
...
...
@@ -78,9 +82,8 @@ bool description_match(const DescriptionType& description,
bool
PropagateNan
,
bool
UseIndex
)
{
if
(
description
.
Rank_
!=
Rank
||
description
.
ReduceOpId_
!=
static_cast
<
int
>
(
ReduceOpId
)
||
description
.
PropagateNan_
!=
static_cast
<
int
>
(
PropagateNan
)
||
description
.
UseIndex_
!=
static_cast
<
int
>
(
UseIndex
))
if
(
description
.
Rank_
!=
Rank
||
description
.
ReduceOpId_
!=
ReduceOpId
||
description
.
PropagateNan_
!=
PropagateNan
||
description
.
UseIndex_
!=
UseIndex
)
return
(
false
);
if
(
DescriptionType
::
NumReduceDim_
!=
reduceDims
.
size
())
...
...
@@ -99,11 +102,10 @@ bool description_match(const DescriptionType& description,
namespace
ck
{
namespace
profiler
{
template
<
index_t
Rank
,
index_t
NumReduceDim
>
static
inline
std
::
vector
<
int
>
get_invariant_dims
(
const
std
::
vector
<
int
>&
reduceDims
)
template
<
int
Rank
,
int
NumReduceDim
>
static
inline
std
::
array
<
int
,
Rank
-
NumReduceDim
>
get_invariant_dims
(
const
std
::
array
<
int
,
NumReduceDim
>&
reduceDims
)
{
assert
(
NumReduceDim
==
reduceDims
.
size
());
int
reduceFlag
=
0
;
// flag the bits for the reduceDims
...
...
@@ -112,13 +114,15 @@ static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduce
reduceFlag
|=
1
<<
reduceDims
[
i
];
};
std
::
vector
<
int
>
invariantDims
;
std
::
array
<
int
,
Rank
-
NumReduceDim
>
invariantDims
;
// collect invariant dimensions
int
dim
=
0
;
for
(
int
i
=
0
;
i
<
Rank
;
i
++
)
if
((
reduceFlag
&
(
1
<<
i
))
==
0
)
{
invariantDims
.
push_back
(
i
);
invariantDims
[
dim
]
=
i
;
dim
++
;
};
return
invariantDims
;
...
...
@@ -137,7 +141,7 @@ bool profile_reduce_impl_impl(bool do_verification,
bool
do_dumpout
,
bool
time_kernel
,
const
std
::
vector
<
size_t
>&
inLengths
,
const
std
::
vector
<
int
>&
reduceDims
,
const
std
::
array
<
int
,
NumReduceDim
>&
reduceDims
,
float
alpha
,
float
beta
)
{
...
...
@@ -145,6 +149,8 @@ bool profile_reduce_impl_impl(bool do_verification,
using
namespace
ck
::
tensor_operation
::
device
::
instance
;
using
ck
::
host_common
::
dumpBufferToFile
;
constexpr
index_t
NumOutDim
=
(
Rank
-
NumReduceDim
==
0
)
?
1
:
Rank
-
NumReduceDim
;
constexpr
bool
op_support_indices
=
(
ReduceOpId
==
ReduceTensorOp
::
MIN
||
ReduceOpId
==
ReduceTensorOp
::
MAX
||
ReduceOpId
==
ReduceTensorOp
::
AMAX
);
...
...
@@ -279,28 +285,32 @@ bool profile_reduce_impl_impl(bool do_verification,
reduce_unary_operator
<
ReduceOpId
,
true
,
true
>::
GetElementwiseOperator
(
static_cast
<
int32_t
>
(
reduce_total_length
));
using
DeviceReduceInstPtr
0
=
DeviceReducePtr
<
InElementwiseOperation
,
AccElementwiseOperation
>
;
using
DeviceReduceInstPtr
=
DeviceReducePtr
<
Rank
,
NumReduceDim
,
InElementwiseOperation
,
AccElementwiseOperation
>
;
std
::
vector
<
DeviceReduceInstPtr
0
>
reduce
0
_ptrs
;
std
::
vector
<
DeviceReduceInstPtr
>
reduce_ptrs
;
add_device_reduce_instance_threadwise
<
InDataType
,
AccDataType
,
OutDataType
,
Rank
,
NumReduceDim
,
ReduceOpId
,
ReduceOperation
,
InElementwiseOperation
,
AccElementwiseOperation
,
PropagateNan
,
UseIndex
>
(
reduce
0
_ptrs
);
UseIndex
>
(
reduce_ptrs
);
add_device_reduce_instance_blockwise
<
InDataType
,
AccDataType
,
OutDataType
,
Rank
,
NumReduceDim
,
ReduceOpId
,
ReduceOperation
,
InElementwiseOperation
,
AccElementwiseOperation
,
PropagateNan
,
UseIndex
>
(
reduce
0
_ptrs
);
UseIndex
>
(
reduce_ptrs
);
if
constexpr
(
use_atomic_add
)
{
...
...
@@ -309,12 +319,14 @@ bool profile_reduce_impl_impl(bool do_verification,
OutDataType
,
Rank
,
NumReduceDim
,
ReduceOpId
,
ReduceOperation
,
InElementwiseOperation
,
AccElementwiseOperation
,
PropagateNan
,
UseIndex
>
(
reduce
0
_ptrs
);
UseIndex
>
(
reduce_ptrs
);
}
if
(
reduce
0
_ptrs
.
empty
())
if
(
reduce_ptrs
.
empty
())
{
throw
std
::
runtime_error
(
"Wrong! No device REDUCE instance found"
);
};
...
...
@@ -342,22 +354,22 @@ bool profile_reduce_impl_impl(bool do_verification,
acc_elementwise_op
);
};
std
::
vector
<
ck
::
index_t
>
i_i
nLengths
;
std
::
vector
<
ck
::
index_t
>
i_i
nStrides
;
std
::
vector
<
ck
::
index_t
>
i_o
utLengths
;
std
::
vector
<
ck
::
index_t
>
i_o
utStrides
;
std
::
array
<
index_t
,
Rank
>
arrI
nLengths
;
std
::
array
<
index_t
,
Rank
>
arrI
nStrides
;
std
::
array
<
index_t
,
NumOutDim
>
arrO
utLengths
;
std
::
array
<
index_t
,
NumOutDim
>
arrO
utStrides
;
i_inLengths
.
assign
(
inLengths
.
begin
(),
inLengths
.
end
());
i_inStrides
.
assign
(
inStrides
.
begin
(),
inStrides
.
end
());
i_outLengths
.
assign
(
outLengths
.
begin
(),
outLengths
.
end
());
i_outStrides
.
assign
(
outStrides
.
begin
(),
outStrides
.
end
());
std
::
copy
(
inLengths
.
begin
(),
inLengths
.
end
()
,
arrInLengths
.
begin
()
);
std
::
copy
(
inStrides
.
begin
(),
inStrides
.
end
()
,
arrInStrides
.
begin
()
);
std
::
copy
(
outLengths
.
begin
(),
outLengths
.
end
()
,
arrOutLengths
.
begin
()
);
std
::
copy
(
outStrides
.
begin
(),
outStrides
.
end
()
,
arrOutStrides
.
begin
()
);
for
(
auto
&
reduce_ptr
:
reduce
0
_ptrs
)
for
(
auto
&
reduce_ptr
:
reduce_ptrs
)
{
auto
argument_ptr
=
reduce_ptr
->
MakeArgumentPointer
(
i_i
nLengths
,
i_i
nStrides
,
i_o
utLengths
,
i_o
utStrides
,
auto
argument_ptr
=
reduce_ptr
->
MakeArgumentPointer
(
arrI
nLengths
,
arrI
nStrides
,
arrO
utLengths
,
arrO
utStrides
,
reduceDims
,
alpha
,
beta
,
...
...
@@ -478,22 +490,25 @@ bool profile_reduce_impl(bool do_verification,
descType
{},
inLengths
.
size
(),
reduceDims
,
ReduceOpId
,
PropagateNan
,
UseIndex
))
return
;
pass
=
pass
&&
profile_reduce_impl_impl
<
InDataType
,
AccDataType
,
OutDataType
,
descType
::
Rank_
,
descType
::
NumReduceDim_
,
static_cast
<
ReduceTensorOp
>
(
descType
::
ReduceOpId_
),
static_cast
<
bool
>
(
descType
::
PropagateNan_
),
static_cast
<
bool
>
(
descType
::
UseIndex_
)
>
(
do_verification
,
init_method
,
do_dumpout
,
time_kernel
,
inLengths
,
reduceDims
,
alpha
,
beta
);
std
::
array
<
ck
::
index_t
,
descType
::
NumReduceDim_
>
arrReduceDims
;
std
::
copy
(
reduceDims
.
begin
(),
reduceDims
.
end
(),
arrReduceDims
.
begin
());
pass
=
pass
&&
profile_reduce_impl_impl
<
InDataType
,
AccDataType
,
OutDataType
,
descType
::
Rank_
,
descType
::
NumReduceDim_
,
static_cast
<
ReduceTensorOp
>
(
descType
::
ReduceOpId_
),
descType
::
PropagateNan_
,
descType
::
UseIndex_
>
(
do_verification
,
init_method
,
do_dumpout
,
time_kernel
,
inLengths
,
arrReduceDims
,
alpha
,
beta
);
matched
=
true
;
});
...
...
script/process_perf_data.py
View file @
963e4a71
...
...
@@ -81,7 +81,7 @@ def parse_logfile(logfile):
StrideA
=
[]
StrideB
=
[]
StrideC
=
[]
if
'perf_gemm'
in
logfile
:
if
'perf_gemm
.log
'
in
logfile
:
for
line
in
open
(
logfile
):
if
'Best Perf'
in
line
:
lst
=
line
.
split
()
...
...
@@ -120,14 +120,14 @@ def parse_logfile(logfile):
res
=
[
x
for
_
,
x
in
sorted
(
zip
(
tests
,
tflops
))]
#sorted_kernels = [x for _,x in sorted(zip(tests,kernels))]
test_list
=
list
(
range
(
1
,
len
(
tests
)
+
1
))
#parse conv_fwd performance tests:
elif
'conv_fwd'
in
logfile
:
#parse conv_fwd
and conv_bwd
performance tests:
elif
'conv_fwd'
in
logfile
or
'conv_bwd_data'
in
logfile
:
for
line
in
open
(
logfile
):
if
'tflops:'
in
line
:
lst
=
line
.
split
()
res
.
append
(
lst
[
1
])
#parse all other performance tests:
elif
'resnet50'
in
logfile
or
'batched_gemm'
in
logfile
or
'grouped_gemm'
in
logfile
or
'conv_bwd_data'
in
logfile
or
'gemm_bilinear'
in
logfile
or
'reduction'
in
logfile
:
elif
'resnet50'
in
logfile
or
'batched_gemm'
in
logfile
or
'grouped_gemm'
in
logfile
or
'gemm_bilinear'
in
logfile
or
'reduction'
in
logfile
:
for
line
in
open
(
logfile
):
if
'Best Perf'
in
line
:
lst
=
line
.
split
()
...
...
@@ -149,7 +149,7 @@ def store_new_test_result(table_name, test_results, testlist, branch_name, node_
df
=
pd
.
DataFrame
(
data
=
[
params
],
columns
=
[
'Branch_ID'
,
'Node_ID'
,
'GPU_arch'
,
'Compute Units'
,
'ROCM_version'
,
'HIP_version'
,
'Environment'
,
'Datetime'
])
df_add
=
pd
.
DataFrame
(
data
=
[
test_results
],
columns
=
testlist
)
df
=
pd
.
concat
([
df
,
df_add
],
axis
=
1
)
print
(
"new test results dataframe:"
,
df
)
#
print("new test results dataframe:",df)
df
.
to_sql
(
table_name
,
connection
,
if_exists
=
'append'
,
index
=
False
)
return
0
...
...
@@ -165,7 +165,7 @@ def compare_test_to_baseline(baseline,test,testlist):
print
(
"test # "
,
i
,
"shows regression by {:.3f}%"
.
format
(
(
float
(
test
[
i
])
-
base_list
[
i
])
/
base_list
[
i
]
*
100
))
regression
=
1
ave_perf
=
ave_perf
+
float
(
test
[
i
])
/
base_list
[
i
]
if
base_list
[
i
]
>
0
:
ave_perf
=
ave_perf
+
float
(
test
[
i
])
/
base_list
[
i
]
if
regression
==
0
:
print
(
"no regressions found"
)
ave_perf
=
ave_perf
/
len
(
base_list
)
...
...
@@ -248,7 +248,7 @@ def main():
conn
=
sqlEngine
.
connect
()
#save gemm performance tests:
if
'perf_gemm'
in
filename
:
if
'perf_gemm
.log
'
in
filename
:
#write the ck_gemm_test_params table only needed once the test set changes
#post_test_params(test_list,conn)
for
i
in
range
(
1
,
len
(
results
)
+
1
):
...
...
Prev
1
…
6
7
8
9
10
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment