gaoqiong / composable_kernel · Commits · 5d015452

Commit 5d015452, authored Jul 06, 2022 by Chaitanya Inumella

Rebased the hipTENSOR development branch with the contraction branch

Parents: b7fa6bb1, ed3feb4d

Changes: 425. Showing 20 changed files with 498 additions and 444 deletions (+498, -444).
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp  (+9 / -9)
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp  (+9 / -8)
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp  (+9 / -8)
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp  (+9 / -8)
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp  (+28 / -34)
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp  (+9 / -9)
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp  (+9 / -9)
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp  (+9 / -9)
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp  (+9 / -8)
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp  (+9 / -8)
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp  (+9 / -8)
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp  (+9 / -8)
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp  (+9 / -8)
library/include/ck/library/utility/check_err.hpp  (+210 / -195)
library/include/ck/library/utility/conv_util.hpp  (+39 / -36)
library/include/ck/library/utility/fill.hpp  (+43 / -29)
library/include/ck/library/utility/op_instance_engine.hpp  (+29 / -11)
library/src/host_tensor/CMakeLists.txt  (+8 / -16)
library/src/host_tensor/device_memory.cpp  (+28 / -0)
library/src/host_tensor/host_tensor.cpp  (+5 / -23)
Too many changes to show. To preserve performance only 425 of 425+ files are displayed.
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp

-#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F16_F32_F32_HPP
-#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F16_F32_F32_HPP
-#include "data_type.hpp"
-#include "device_reduce_instance_multiblock_atomic_add.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_reduce_instance {
+namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ...
@@ -21,10 +24,7 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 5, 0, 0, 4, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 5, 0, 0, 2, 1);
 // clang-format on
-} // namespace device_reduce_instance
+} // namespace instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
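
For orientation, the macro arguments above line up with the column comment that precedes them. The mapping below is an interpretation of that header, not text taken from the diff:

// ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 5, 0, 0, 2, 1) reads as:
//   InDataType  = half_t    AccDataType  = float    OutDataType = float
//   ReduceOpId  = 5         NanPropaOpt  = 0        IndicesOpt  = 0
//   Rank        = 2         NumReduceDim = 1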
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp

-#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F32_F32_HPP
-#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F32_F32_HPP
-#include "device_reduce_instance_multiblock_atomic_add.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_reduce_instance {
+namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ...
@@ -20,10 +24,7 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1);
 // clang-format on
-} // namespace device_reduce_instance
+} // namespace instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp

-#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F64_F32_HPP
-#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F64_F32_HPP
-#include "device_reduce_instance_multiblock_atomic_add.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_reduce_instance {
+namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ...
@@ -20,10 +24,7 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 5, 0, 0, 2, 1);
 // clang-format on
-} // namespace device_reduce_instance
+} // namespace instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp

-#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F64_F64_F64_HPP
-#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F64_F64_F64_HPP
-#include "device_reduce_instance_multiblock_atomic_add.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_reduce_instance {
+namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ...
@@ -20,10 +24,7 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1);
 // clang-format on
-} // namespace device_reduce_instance
+} // namespace instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp

-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_HPP
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_HPP
-#include "reduction_operator_mapping.hpp"
-#include "device_reduce_instance_impl_common.hpp"
-#include "device_reduce_threadwise.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
+#include "ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_reduce_instance {
+namespace instance {
 #ifdef QUICK_REDUCE_TEST
 using reduce_configuration_2_instances_threadwise = std::tuple<
 ...
@@ -47,10 +49,10 @@ using reduce_configuration_2_instances_threadwise = std::tuple<
     >;
 #endif
-template <typename AccDataType, ReduceTensorOp ReduceOpId>
+template <ReduceTensorOp ReduceOpId>
 using deviceReduceThreadWisePtrType = DeviceReducePtr<
-    typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation,
-    typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation>;
+    typename reduce_unary_operator<ReduceOpId, true, true>::InElementwiseOperation,
+    typename reduce_unary_operator<ReduceOpId, true, true>::AccElementwiseOperation>;
 template <typename InDataType,
           typename AccDataType,
 ...
@@ -61,14 +63,13 @@ template <typename InDataType,
           bool PropagateNan,
           bool UseIndex>
 void add_device_reduce_instance_threadwise(
-    std::vector<deviceReduceThreadWisePtrType<AccDataType, ReduceOpId>>& device_op_instances)
+    std::vector<deviceReduceThreadWisePtrType<ReduceOpId>>& device_op_instances)
 {
-    using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;
+    using ReduceOperation = typename reduce_binary_operator<ReduceOpId>::opType;
     using InElementwiseOperation =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation;
+        typename reduce_unary_operator<ReduceOpId, true, true>::InElementwiseOperation;
     using AccElementwiseOperation =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation;
+        typename reduce_unary_operator<ReduceOpId, true, true>::AccElementwiseOperation;
     constexpr bool Indexable =
         (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
 ...
@@ -114,7 +115,7 @@ void add_device_reduce_instance_threadwise(
             ReduceOpId, \
             PropagateNan, \
             UseIndex>( \
-            std::vector<deviceReduceThreadWisePtrType<compT, ReduceOpId>> & device_op_instances)
+            std::vector<deviceReduceThreadWisePtrType<ReduceOpId>> & device_op_instances)
 #define ADD_THREADWISE_INST_BY_ID( \
     inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
 ...
@@ -127,21 +128,17 @@ void add_device_reduce_instance_threadwise(
             Rank, \
             NumReduceDim)
 #define ADD_THREADWISE_INST_REF_BY_TYPE( \
     inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \
     extern template void add_device_reduce_instance_threadwise<inT, \
                                                                compT, \
                                                                outT, \
                                                                Rank, \
                                                                NumReduceDim, \
                                                                ReduceOpId, \
                                                                PropagateNan, \
                                                                UseIndex>( \
-        std::vector<DeviceReducePtr< \
-            typename reduce_unary_operator<compT, ReduceOpId, true, true>::InElementwiseOperation, \
-            typename reduce_unary_operator<compT, ReduceOpId, true, true>:: \
-                AccElementwiseOperation>> & \
-            device_op_instances)
+        std::vector<deviceReduceThreadWisePtrType<ReduceOpId>> & device_op_instances)
 #define ADD_THREADWISE_INST_REF_BY_ID( \
     inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
 ...
@@ -154,10 +151,7 @@ void add_device_reduce_instance_threadwise(
             Rank, \
             NumReduceDim)
-} // namespace device_reduce_instance
+} // namespace instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp

-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_B16_F32_B16_HPP
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_B16_F32_B16_HPP
-#include "data_type.hpp"
-#include "device_reduce_instance_threadwise.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_reduce_instance {
+namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ...
@@ -50,10 +53,7 @@ ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1);
 // clang-format on
-} // namespace device_reduce_instance
+} // namespace instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp

-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F16_F16_HPP
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F16_F16_HPP
-#include "data_type.hpp"
-#include "device_reduce_instance_threadwise.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_reduce_instance {
+namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ...
@@ -37,10 +40,7 @@ ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
 // clang-format on
-} // namespace device_reduce_instance
+} // namespace instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp

-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F32_F16_HPP
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F32_F16_HPP
-#include "data_type.hpp"
-#include "device_reduce_instance_threadwise.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_reduce_instance {
+namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ...
@@ -25,10 +28,7 @@ ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1);
 // clang-format on
-} // namespace device_reduce_instance
+} // namespace instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp

-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F32_F32_HPP
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F32_F32_HPP
-#include "device_reduce_instance_threadwise.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_reduce_instance {
+namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ...
@@ -48,10 +52,7 @@ ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1);
 // clang-format on
-} // namespace device_reduce_instance
+} // namespace instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp

-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F64_F32_HPP
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F64_F32_HPP
-#include "device_reduce_instance_threadwise.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_reduce_instance {
+namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ...
@@ -24,10 +28,7 @@ ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 2, 1);
 // clang-format on
-} // namespace device_reduce_instance
+} // namespace instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp

-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F64_F64_F64_HPP
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_F64_F64_F64_HPP
-#include "device_reduce_instance_threadwise.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_reduce_instance {
+namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ...
@@ -48,10 +52,7 @@ ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1);
 // clang-format on
-} // namespace device_reduce_instance
+} // namespace instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp

-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I32_I8_HPP
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I32_I8_HPP
-#include "device_reduce_instance_threadwise.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_reduce_instance {
+namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ...
@@ -20,10 +24,7 @@ ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1);
 // clang-format on
-} // namespace device_reduce_instance
+} // namespace instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp

-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I8_I8_HPP
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I8_I8_HPP
-#include "device_reduce_instance_threadwise.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_reduce_instance {
+namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ...
@@ -36,10 +40,7 @@ ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1);
 // clang-format on
-} // namespace device_reduce_instance
+} // namespace instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
library/include/ck/library/utility/check_err.hpp

-#ifndef CHECK_ERR_HPP
-#define CHECK_ERR_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
 #include <algorithm>
 #include <cmath>
 #include <cstdlib>
-#include <half.hpp>
 #include <iostream>
 #include <iomanip>
 #include <iterator>
 #include <limits>
 #include <type_traits>
 #include <vector>
-#include "data_type.hpp"
+#include "ck/utility/data_type.hpp"
 namespace ck {
 namespace utils {
 template <typename T>
 typename std::enable_if<std::is_floating_point<T>::value && !std::is_same<T, half_t>::value,
                         bool>::type
 check_err(const std::vector<T>& out,
           const std::vector<T>& ref,
           const std::string& msg = "Error: Incorrect results!",
           double rtol            = 1e-5,
           double atol            = 3e-6)
 {
     if(out.size() != ref.size())
     {
         std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
                   << std::endl
                   << msg << std::endl;
         return false;
     }
     bool res{true};
     int err_count  = 0;
     double err     = 0;
     double max_err = std::numeric_limits<double>::min();
     for(std::size_t i = 0; i < ref.size(); ++i)
     {
         err = std::abs(out[i] - ref[i]);
         if(err > atol + rtol * std::abs(ref[i]) || !std::isfinite(out[i]) || !std::isfinite(ref[i]))
         {
             max_err = err > max_err ? err : max_err;
             err_count++;
             if(err_count < 5)
             {
                 std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
                           << i << "]: " << out[i] << " != " << ref[i] << std::endl
                           << msg << std::endl;
             }
             res = false;
         }
     }
     if(!res)
     {
         std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
     }
     return res;
 }
 template <typename T>
 typename std::enable_if<std::is_same<T, bhalf_t>::value, bool>::type
 check_err(const std::vector<T>& out,
           const std::vector<T>& ref,
           const std::string& msg = "Error: Incorrect results!",
           double rtol            = 1e-3,
           double atol            = 1e-3)
 {
     if(out.size() != ref.size())
     {
         std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
                   << std::endl
                   << msg << std::endl;
         return false;
     }
     bool res{true};
     int err_count = 0;
     double err    = 0;
     // TODO: This is a hack. We should have proper specialization for bhalf_t data type.
     double max_err = std::numeric_limits<float>::min();
     for(std::size_t i = 0; i < ref.size(); ++i)
     {
         double o = type_convert<float>(out[i]);
         double r = type_convert<float>(ref[i]);
         err      = std::abs(o - r);
         if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
         {
             max_err = err > max_err ? err : max_err;
             err_count++;
             if(err_count < 5)
             {
                 std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
                           << i << "]: " << o << " != " << r << std::endl
                           << msg << std::endl;
             }
             res = false;
         }
     }
     if(!res)
     {
         std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
     }
     return res;
 }
 template <typename T>
-typename std::enable_if<std::is_same<T, half_t>::value || std::is_same<T, half_float::half>::value,
-                        bool>::type
+typename std::enable_if<std::is_same<T, half_t>::value, bool>::type
 check_err(const std::vector<T>& out,
           const std::vector<T>& ref,
           const std::string& msg = "Error: Incorrect results!",
           double rtol            = 1e-3,
           double atol            = 1e-3)
 {
     if(out.size() != ref.size())
     {
         std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
                   << std::endl
                   << msg << std::endl;
         return false;
     }
     bool res{true};
     int err_count  = 0;
     double err     = 0;
     double max_err = std::numeric_limits<T>::min();
     for(std::size_t i = 0; i < ref.size(); ++i)
     {
         double o = type_convert<float>(out[i]);
         double r = type_convert<float>(ref[i]);
         err      = std::abs(o - r);
         if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
         {
             max_err = err > max_err ? err : max_err;
             err_count++;
             if(err_count < 5)
             {
                 std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
                           << i << "]: " << o << " != " << r << std::endl
                           << msg << std::endl;
             }
             res = false;
         }
     }
     if(!res)
     {
         std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
     }
     return res;
 }
 template <typename T>
 typename std::enable_if<std::is_integral<T>::value && !std::is_same<T, bhalf_t>::value, bool>::type
 check_err(const std::vector<T>& out,
           const std::vector<T>& ref,
           const std::string& msg = "Error: Incorrect results!",
           double                 = 0,
-          double                 = 0)
+          double atol            = 0)
 {
     if(out.size() != ref.size())
     {
         std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
                   << std::endl
                   << msg << std::endl;
         return false;
     }
-    for(std::size_t i = 0; i < ref.size(); ++i)
-    {
-        if(out[i] != ref[i])
-        {
-            std::cout << "out[" << i << "] != ref[" << i << "]: " << static_cast<int>(out[i])
-                      << " != " << static_cast<int>(ref[i]) << std::endl
-                      << msg << std::endl;
-            return false;
-        }
-    }
-    return true;
+    bool res{true};
+    int err_count   = 0;
+    int64_t err     = 0;
+    int64_t max_err = std::numeric_limits<int64_t>::min();
+    for(std::size_t i = 0; i < ref.size(); ++i)
+    {
+        int64_t o = out[i];
+        int64_t r = ref[i];
+        err       = std::abs(o - r);
+        if(err > atol)
+        {
+            max_err = err > max_err ? err : max_err;
+            err_count++;
+            if(err_count < 5)
+            {
+                std::cout << "out[" << i << "] != ref[" << i << "]: " << static_cast<int>(out[i])
+                          << " != " << static_cast<int>(ref[i]) << std::endl
+                          << msg << std::endl;
+            }
+            res = false;
+        }
+    }
+    if(!res)
+    {
+        std::cout << "max err: " << max_err << std::endl;
+    }
+    return res;
 }
 } // namespace utils
 } // namespace ck
 template <typename T>
 std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
 {
     std::copy(std::begin(v), std::end(v), std::ostream_iterator<T>(os, " "));
     return os;
 }
-#endif
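
The floating-point overloads above flag an element when |out[i] - ref[i]| exceeds atol + rtol * |ref[i]| or when either value is non-finite, and return false if any element fails. A minimal usage sketch, assuming the relocated header path introduced by this commit is on the include path (the tensors and message below are made up):

#include <vector>

#include "ck/library/utility/check_err.hpp"

int main()
{
    std::vector<float> ref{1.0f, 2.0f, 3.0f};
    std::vector<float> out{1.0f, 2.0f, 3.000001f};

    // Uses the float overload's default tolerances: rtol = 1e-5, atol = 3e-6.
    bool ok = ck::utils::check_err(out, ref, "Error: verification failed!");
    return ok ? 0 : 1;
}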
library/include/ck/library/utility/conv_util.hpp

+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #include <cstdlib>
 ...
@@ -9,17 +12,17 @@
 #include <type_traits>
 #include <vector>
-#include "check_err.hpp"
-#include "config.hpp"
-#include "device.hpp"
-#include "device_conv_fwd.hpp"
-#include "device_tensor.hpp"
-#include "element_wise_operation.hpp"
-#include "fill.hpp"
-#include "host_tensor.hpp"
-#include "op_instance_engine.hpp"
-#include "reference_conv_fwd.hpp"
-#include "tensor_layout.hpp"
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/fill.hpp"
+#include "ck/library/utility/op_instance_engine.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
 namespace ck {
 namespace tensor_operation {
 ...
@@ -28,15 +31,15 @@ namespace device {
 using DeviceConvFwdNoOpPtr = DeviceConvFwdPtr<element_wise::PassThrough,
                                               element_wise::PassThrough,
                                               element_wise::PassThrough>;
-namespace device_conv1d_fwd_instance {
+namespace instance {
 void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_bf16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
 void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
 void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f32_instances(std::vector<DeviceConvFwdNoOpPtr>&);
 void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instances(std::vector<DeviceConvFwdNoOpPtr>&);
-} // namespace device_conv1d_fwd_instance
+} // namespace instance
-namespace device_conv2d_fwd_instance {
+namespace instance {
 void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
 void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
 ...
@@ -45,15 +48,15 @@ void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(
 void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(std::vector<DeviceConvFwdNoOpPtr>&);
 void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(std::vector<DeviceConvFwdNoOpPtr>&);
-} // namespace device_conv2d_fwd_instance
+} // namespace instance
-namespace device_conv3d_fwd_instance {
+namespace instance {
 void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
 void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
 void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f32_instances(std::vector<DeviceConvFwdNoOpPtr>&);
 void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instances(std::vector<DeviceConvFwdNoOpPtr>&);
-} // namespace device_conv3d_fwd_instance
+} // namespace instance
 } // namespace device
 } // namespace tensor_operation
 ...
@@ -292,17 +295,17 @@ struct ConvolutionFwdInstances<float, float, float>
     std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
     if constexpr(NumDimSpatial == 1)
     {
-        ck::tensor_operation::device::device_conv1d_fwd_instance::
+        ck::tensor_operation::device::instance::
             add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f32_instances(conv_ptrs);
     }
     else if constexpr(NumDimSpatial == 2)
     {
-        ck::tensor_operation::device::device_conv2d_fwd_instance::
+        ck::tensor_operation::device::instance::
             add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs);
     }
     else if constexpr(NumDimSpatial == 3)
     {
-        ck::tensor_operation::device::device_conv3d_fwd_instance::
+        ck::tensor_operation::device::instance::
             add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f32_instances(conv_ptrs);
     }
     return conv_ptrs;
 ...
@@ -319,20 +322,20 @@ struct ConvolutionFwdInstances<half_t, half_t, half_t>
     std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
     if constexpr(NumDimSpatial == 1)
     {
-        ck::tensor_operation::device::device_conv1d_fwd_instance::
+        ck::tensor_operation::device::instance::
             add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f16_instances(conv_ptrs);
         return conv_ptrs;
     }
     else if constexpr(NumDimSpatial == 2)
     {
-        ck::tensor_operation::device::device_conv2d_fwd_instance::
+        ck::tensor_operation::device::instance::
             add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
-        ck::tensor_operation::device::device_conv2d_fwd_instance::
+        ck::tensor_operation::device::instance::
             add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
     }
     else if constexpr(NumDimSpatial == 3)
     {
-        ck::tensor_operation::device::device_conv3d_fwd_instance::
+        ck::tensor_operation::device::instance::
             add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f16_instances(conv_ptrs);
     }
     return conv_ptrs;
 ...
@@ -349,17 +352,17 @@ struct ConvolutionFwdInstances<bhalf_t, bhalf_t, bhalf_t>
     std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
     if constexpr(NumDimSpatial == 1)
     {
-        ck::tensor_operation::device::device_conv1d_fwd_instance::
+        ck::tensor_operation::device::instance::
             add_device_conv1d_fwd_xdl_nwc_kxc_nwk_bf16_instances(conv_ptrs);
     }
     else if constexpr(NumDimSpatial == 2)
     {
-        ck::tensor_operation::device::device_conv2d_fwd_instance::
+        ck::tensor_operation::device::instance::
             add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs);
     }
     else if constexpr(NumDimSpatial == 3)
     {
-        ck::tensor_operation::device::device_conv3d_fwd_instance::
+        ck::tensor_operation::device::instance::
             add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(conv_ptrs);
     }
     return conv_ptrs;
 ...
@@ -376,17 +379,17 @@ struct ConvolutionFwdInstances<int8_t, int8_t, int8_t>
     std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
     if constexpr(NumDimSpatial == 1)
     {
-        ck::tensor_operation::device::device_conv1d_fwd_instance::
+        ck::tensor_operation::device::instance::
             add_device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instances(conv_ptrs);
     }
     else if constexpr(NumDimSpatial == 2)
    {
-        ck::tensor_operation::device::device_conv2d_fwd_instance::
+        ck::tensor_operation::device::instance::
             add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(conv_ptrs);
     }
     else if constexpr(NumDimSpatial == 3)
     {
-        ck::tensor_operation::device::device_conv3d_fwd_instance::
+        ck::tensor_operation::device::instance::
             add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instances(conv_ptrs);
     }
     return conv_ptrs;
 ...
@@ -402,8 +405,8 @@ template <typename InDataType,
           typename InElementwiseOp  = ck::tensor_operation::element_wise::PassThrough,
           typename WeiElementwiseOp = ck::tensor_operation::element_wise::PassThrough,
           typename OutElementwiseOp = ck::tensor_operation::element_wise::PassThrough,
-          typename InputInitFun     = FillUniform<InDataType>,
-          typename WeightsInitFun   = FillUniform<WeiDataType>>
+          typename InputInitFun     = FillUniformDistribution<InDataType>,
+          typename WeightsInitFun   = FillUniformDistribution<WeiDataType>>
 class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType, WeiDataType>
 {
     using DeviceConvFwdOp = tensor_operation::device::
 ...
@@ -422,8 +425,8 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType,
     ConvFwdOpInstance(const ConvParams& params,
                       bool do_init = true,
-                      const InputInitFun& input_init_f     = InputInitFun{},
-                      const WeightsInitFun& weights_init_f = WeightsInitFun{})
+                      const InputInitFun& input_init_f     = InputInitFun(),
+                      const WeightsInitFun& weights_init_f = WeightsInitFun())
         : BaseType(),
           params_{params},
           output_spatial_lengths_{params.GetOutputSpatialLengths()},
 ...
@@ -560,8 +563,8 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType,
     const ConvParams& params_;
     const std::vector<ck::index_t> output_spatial_lengths_;
     const bool do_init_;
-    const InputInitFun& input_init_f_;
-    const WeightsInitFun& weights_init_f_;
+    InputInitFun input_init_f_;
+    WeightsInitFun weights_init_f_;
 };
 } // namespace conv
 ...
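
After this rename, the per-dimensionality factory namespaces all collapse into ck::tensor_operation::device::instance. A small sketch of collecting forward-convolution instances through the new namespace, assuming conv_util.hpp above is included and the instance library is linked (the choice of the f32 2-D factory is arbitrary):

#include <vector>

#include "ck/library/utility/conv_util.hpp"

int main()
{
    // DeviceConvFwdNoOpPtr is the PassThrough-only pointer alias declared above.
    std::vector<ck::tensor_operation::device::DeviceConvFwdNoOpPtr> conv_ptrs;

    // Previously reached via device_conv2d_fwd_instance::, now via instance::.
    ck::tensor_operation::device::instance::add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(
        conv_ptrs);

    return conv_ptrs.empty() ? 1 : 0;
}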
library/include/ck/library/utility/fill.hpp

+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #include <algorithm>
+#include <cmath>
 #include <random>
-#include "data_type.hpp"
+#include "ck/utility/data_type.hpp"
 namespace ck {
 namespace utils {
-// template <typename T, class Enable = void>
-// struct FillUniform;
-
-// TODO: what's wrong with this specialization???
-// err: segmentation fault in mt19937 - infinite loop like.
-// template <typename T>
-// struct FillUniform<T, typename std::enable_if<std::is_integral<T>::value &&
-//                                               !std::is_same<T, bhalf_t>::value>::type>
-// {
-//     int a_{0};
-//     int b_{5};
-//     // T a_ = T{0};
-//     // T b_ = T{5};
-//     template <typename ForwardIter>
-//     void operator()(ForwardIter first, ForwardIter last) const
-//     {
-//         std::mt19937 gen{11939};
-//         std::uniform_int_distribution<int> dis(a_, b_);
-//         std::generate(first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
-//     }
-// };
-
-// struct FillUniform<T, typename std::enable_if<std::is_floating_point<T>::value ||
-//                                               std::is_same<T, bhalf_t>::value>::type>
 template <typename T>
-struct FillUniform
+struct FillUniformDistribution
 {
-    float a_{0};
-    float b_{5};
+    float a_{-5.f};
+    float b_{5.f};
     template <typename ForwardIter>
     void operator()(ForwardIter first, ForwardIter last) const
     {
-        std::mt19937 gen{11939};
-        std::uniform_real_distribution<> dis(a_, b_);
+        std::mt19937 gen(11939);
+        std::uniform_real_distribution<float> dis(a_, b_);
         std::generate(first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
     }
 };
+
+// Normally FillUniformDistributionIntegerValue should use std::uniform_int_distribution as below.
+// However this produces segfaults in std::mt19937 which look like inifite loop.
+// template <typename T>
+// struct FillUniformDistributionIntegerValue
+// {
+//     int a_{-5};
+//     int b_{5};
+//
+//     template <typename ForwardIter>
+//     void operator()(ForwardIter first, ForwardIter last) const
+//     {
+//         std::mt19937 gen(11939);
+//         std::uniform_int_distribution<int> dis(a_, b_);
+//         std::generate(
+//             first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
+//     }
+// };
+
+// Workaround for uniform_int_distribution not working as expected. See note above.
+template <typename T>
+struct FillUniformDistributionIntegerValue
+{
+    float a_{-5.f};
+    float b_{5.f};
+
+    template <typename ForwardIter>
+    void operator()(ForwardIter first, ForwardIter last) const
+    {
+        std::mt19937 gen(11939);
+        std::uniform_real_distribution<float> dis(a_, b_);
+        std::generate(
+            first, last, [&dis, &gen]() { return ck::type_convert<T>(std::round(dis(gen))); });
+    }
+};
 ...
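
As a quick illustration of the fillers above (a sketch; the buffer size is arbitrary), FillUniformDistribution produces a deterministic sequence because it seeds std::mt19937 with a fixed constant, so repeated runs initialize identical data:

#include <vector>

#include "ck/library/utility/fill.hpp"

int main()
{
    std::vector<float> input(16);

    // Fill with floats drawn from the functor's default range (a_ = -5.f, b_ = 5.f);
    // the fixed seed (11939) makes the sequence reproducible across runs.
    ck::utils::FillUniformDistribution<float>{}(input.begin(), input.end());

    return 0;
}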
library/include/ck/library/utility/op_instance_engine.hpp
View file @
5d015452
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
 #pragma once

 #include <cstdlib>
+#include <iostream>
 #include <limits>
 #include <memory>
 #include <stdexcept>
...
@@ -8,9 +12,12 @@
 #include <utility>
 #include <vector>

-#include "check_err.hpp"
-#include "device_base.hpp"
-#include "functional2.hpp"
+#include "ck/utility/functional2.hpp"
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"

 namespace ck {
 namespace utils {
...
@@ -78,7 +85,8 @@ class OpInstanceRunEngine
     template <typename ReferenceOp = std::function<void()>>
     OpInstanceRunEngine(const OpInstanceT& op_instance,
-                        const ReferenceOp& reference_op = ReferenceOp{})
+                        const ReferenceOp& reference_op = ReferenceOp{},
+                        bool do_verification = true)
         : op_instance_{op_instance}
     {
         in_tensors_ = op_instance_.GetInputTensors();
...
@@ -88,8 +96,11 @@ class OpInstanceRunEngine
                           const Tensor<InArgTypes>&..., Tensor<OutDataType>&>)
         {
             ref_output_ = op_instance_.GetOutputTensor();
-            CallRefOpUnpackArgs(reference_op, std::make_index_sequence<kNInArgs_>{});
+            if(do_verification)
+            {
+                CallRefOpUnpackArgs(reference_op, std::make_index_sequence<kNInArgs_>{});
+            }
         }
         AllocateDeviceInputTensors(std::make_index_sequence<kNInArgs_>{});
         out_device_buffer_ =
...
@@ -110,6 +121,7 @@ class OpInstanceRunEngine
                 op_ptr.get(), in_device_buffers_, out_device_buffer_);
             if(op_ptr->IsSupportedArgument(argument.get()))
             {
+                std::cout << "Testing instance: " << op_ptr->GetTypeString() << std::endl;
                 invoker->Run(argument.get());
                 out_device_buffer_->FromDevice(out_tensor_->mData.data());
                 if(!ref_output_)
...
@@ -119,9 +131,16 @@ class OpInstanceRunEngine
                         " You have to provide reference function.");
                 }
                 // TODO: enable flexible use of custom check_error functions
-                res = res && check_err(out_tensor_->mData, ref_output_->mData);
+                bool inst_res = CheckErr(out_tensor_->mData, ref_output_->mData);
+                std::cout << (inst_res ? "SUCCESS" : "FAILURE") << std::endl;
+                res = res && inst_res;
                 out_device_buffer_->SetZero();
             }
+            else
+            {
+                std::cout << "Given conv problem is not supported by instance:\n\t>>>>"
+                          << op_ptr->GetTypeString() << std::endl;
+            }
         }
         return res;
     }
...
@@ -132,7 +151,6 @@ class OpInstanceRunEngine
                      bool do_verification = false,
                      bool do_log = false)
     {
-        bool res{true};
         ProfileBestConfig best_config;
         for(auto& op_ptr : op_ptrs)
...
@@ -153,7 +171,7 @@ class OpInstanceRunEngine
                 std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec
                           << " GB/s, " << op_name << std::endl;
-                if(tflops < best_config.best_tflops)
+                if(avg_time < best_config.best_avg_time)
                 {
                     best_config.best_op_name = op_name;
                     best_config.best_tflops  = tflops;
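For reference, the "ms / TFlops / GB/s" figures printed above are conventionally derived from the averaged kernel time and the problem's operation and byte counts. A small, self-contained sketch of that arithmetic follows; the function name and the flop/num_bytes inputs are assumptions supplied by the caller, not values taken from this file.

#include <cstddef>
#include <utility>

// avg_time_ms: averaged kernel time in milliseconds.
// flop:        floating-point operations performed by the kernel.
// num_bytes:   bytes read plus bytes written.
inline std::pair<float, float> perf_metrics(std::size_t flop, std::size_t num_bytes, float avg_time_ms)
{
    const float tflops     = static_cast<float>(flop) / 1.0e9f / avg_time_ms;      // TFlop/s
    const float gb_per_sec = static_cast<float>(num_bytes) / 1.0e6f / avg_time_ms; // GB/s
    return {tflops, gb_per_sec};
}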
...
@@ -171,7 +189,7 @@ class OpInstanceRunEngine
                         " You have to provide reference function.");
                     }
                     // TODO: enable flexible use of custom check_error functions
-                    res = res && CheckErr(out_tensor_->mData, ref_output_->mData);
+                    CheckErr(out_tensor_->mData, ref_output_->mData);
                     if(do_log) {}
                 }
...
@@ -223,7 +241,7 @@ class OpInstanceRunEngine
     template <typename T>
     bool CheckErr(const std::vector<T>& dev_out, const std::vector<T>& ref_out) const
     {
-        return ck::utils::check_err(dev_out, ref_out, "Error: incorrect results!", atol_, rtol_);
+        return ck::utils::check_err(dev_out, ref_out, "Error: incorrect results!", rtol_, atol_);
     }
 };
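The argument swap in CheckErr above suggests that check_err expects the relative tolerance before the absolute one. As background, a mixed relative/absolute comparison of this kind is conventionally implemented along the following lines; this is a generic sketch with assumed default tolerances, not the actual body of ck::utils::check_err.

#include <cmath>
#include <cstddef>
#include <vector>

// An element passes if |out - ref| <= atol + rtol * |ref|; the whole vector
// passes only if every element does.
template <typename T>
bool allclose(const std::vector<T>& out, const std::vector<T>& ref,
              double rtol = 1e-5, double atol = 3e-6)
{
    if(out.size() != ref.size())
        return false;
    for(std::size_t i = 0; i < out.size(); ++i)
    {
        const double o = static_cast<double>(out[i]);
        const double r = static_cast<double>(ref[i]);
        if(std::abs(o - r) > atol + rtol * std::abs(r))
            return false;
    }
    return true;
}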
...
library/src/host_tensor/CMakeLists.txt
View file @ 5d015452
 ## host_tensor
-include_directories(BEFORE
-    ${PROJECT_SOURCE_DIR}/include/ck
-    ${PROJECT_SOURCE_DIR}/include/ck/utility
-    ${PROJECT_SOURCE_DIR}/library/include/ck/library/host_tensor
-)
 set(HOST_TENSOR_SOURCE
-    device.cpp
+    device_memory.cpp
     host_tensor.cpp
 )
...
@@ -17,22 +11,20 @@ target_compile_features(host_tensor PUBLIC)
 set_target_properties(host_tensor PROPERTIES POSITION_INDEPENDENT_CODE ON)
 target_include_directories(host_tensor SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
 target_include_directories(host_tensor PUBLIC
     "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck>"
     "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/utility>"
     "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/host_tensor>"
 )
-install(TARGETS host_tensor
-        EXPORT host_tensorTargets
-        LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
-        ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
-        RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
-        INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
-)
-install(EXPORT host_tensorTargets
+rocm_install(
+    TARGETS host_tensor
+    EXPORT host_tensorTargets
+)
+rocm_install(
+    EXPORT host_tensorTargets
     FILE composable_kernelhost_tensorTargets.cmake
     NAMESPACE composable_kernel::
     DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
 )
...
library/src/host_tensor/device.cpp → library/src/host_tensor/device_memory.cpp
View file @ 5d015452
#include "device.hpp"
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/device_utility/hip_check_error.hpp"
#include "ck/library/host_tensor/device_memory.hpp"
DeviceMem
::
DeviceMem
(
std
::
size_t
mem_size
)
:
mMemSize
(
mem_size
)
DeviceMem
::
DeviceMem
(
std
::
size_t
mem_size
)
:
mMemSize
(
mem_size
)
{
{
...
@@ -22,49 +26,3 @@ void DeviceMem::FromDevice(void* p)
...
@@ -22,49 +26,3 @@ void DeviceMem::FromDevice(void* p)
void
DeviceMem
::
SetZero
()
{
hip_check_error
(
hipMemset
(
mpDeviceBuf
,
0
,
mMemSize
));
}
void
DeviceMem
::
SetZero
()
{
hip_check_error
(
hipMemset
(
mpDeviceBuf
,
0
,
mMemSize
));
}
DeviceMem
::~
DeviceMem
()
{
hip_check_error
(
hipFree
(
mpDeviceBuf
));
}
DeviceMem
::~
DeviceMem
()
{
hip_check_error
(
hipFree
(
mpDeviceBuf
));
}
struct
KernelTimerImpl
{
KernelTimerImpl
()
{
hip_check_error
(
hipEventCreate
(
&
mStart
));
hip_check_error
(
hipEventCreate
(
&
mEnd
));
}
~
KernelTimerImpl
()
{
hip_check_error
(
hipEventDestroy
(
mStart
));
hip_check_error
(
hipEventDestroy
(
mEnd
));
}
void
Start
()
{
hip_check_error
(
hipDeviceSynchronize
());
hip_check_error
(
hipEventRecord
(
mStart
,
nullptr
));
}
void
End
()
{
hip_check_error
(
hipEventRecord
(
mEnd
,
nullptr
));
hip_check_error
(
hipEventSynchronize
(
mEnd
));
}
float
GetElapsedTime
()
const
{
float
time
;
hip_check_error
(
hipEventElapsedTime
(
&
time
,
mStart
,
mEnd
));
return
time
;
}
hipEvent_t
mStart
,
mEnd
;
};
KernelTimer
::
KernelTimer
()
:
impl
(
new
KernelTimerImpl
())
{}
KernelTimer
::~
KernelTimer
()
{}
void
KernelTimer
::
Start
()
{
impl
->
Start
();
}
void
KernelTimer
::
End
()
{
impl
->
End
();
}
float
KernelTimer
::
GetElapsedTime
()
const
{
return
impl
->
GetElapsedTime
();
}
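The KernelTimer implementation removed here follows the standard hipEvent timing pattern. For readers unfamiliar with it, here is a standalone sketch of the same measurement flow; the function signature and stream handling are illustrative, and error codes are ignored for brevity.

#include <hip/hip_runtime.h>

// Record an event before and after the timed work, wait for the second event,
// then read the elapsed time in milliseconds - the same steps KernelTimerImpl performs.
template <typename LaunchFn>
float time_gpu_work(LaunchFn&& launch, hipStream_t stream = nullptr)
{
    hipEvent_t start, stop;
    (void)hipEventCreate(&start);
    (void)hipEventCreate(&stop);

    (void)hipDeviceSynchronize();     // keep earlier work out of the measurement
    (void)hipEventRecord(start, stream);
    launch(stream);                   // enqueue the kernel(s) to be timed
    (void)hipEventRecord(stop, stream);
    (void)hipEventSynchronize(stop);  // wait for the timed work to finish

    float ms = 0.f;
    (void)hipEventElapsedTime(&ms, start, stop);

    (void)hipEventDestroy(start);
    (void)hipEventDestroy(stop);
    return ms;
}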
library/src/host_tensor/host_tensor.cpp
View file @ 5d015452
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
 #include <cassert>

-#include "host_tensor.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"

 void HostTensorDescriptor::CalculateStrides()
 {
...
@@ -50,25 +54,3 @@ std::ostream& operator<<(std::ostream& os, const HostTensorDescriptor& desc)
     return os;
 }

-void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream& os)
-{
-    os << "dim " << desc.GetNumOfDimension() << ", ";
-
-    os << "lengths {";
-    LogRange(os, desc.GetLengths(), ", ");
-    os << "}, ";
-
-    os << "strides {";
-    LogRange(os, desc.GetStrides(), ", ");
-    os << "}" << std::endl;
-}
-
-#if 1
-// FIXME: remove
-void bf16_to_f32_(const Tensor<ck::bhalf_t>& src, Tensor<float>& dst)
-{
-    for(std::size_t i = 0; i < src.mData.size(); ++i)
-        dst.mData[i] = ck::type_convert<float>(src.mData[i]);
-}
-#endif