Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
b79df771
Commit
b79df771
authored
Jul 12, 2022
by
carlushuang
Browse files
Merge remote-tracking branch 'origin/develop' into cpu_avx2
parents
05d38218
63914743
Changes
450
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
489 additions
and
572 deletions
+489
-572
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp
...ice_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp
+9
-8
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp
...ice_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp
+9
-8
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp
...ice_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp
+9
-8
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp
...instance/gpu/reduce/device_reduce_instance_threadwise.hpp
+28
-34
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp
.../reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp
+9
-9
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp
.../reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp
+9
-9
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp
.../reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp
+9
-9
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp
.../reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp
+9
-8
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp
.../reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp
+9
-8
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp
.../reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp
+9
-8
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp
...pu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp
+9
-8
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp
...gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp
+9
-8
library/include/ck/library/utility/check_err.hpp
library/include/ck/library/utility/check_err.hpp
+210
-195
library/include/ck/library/utility/conv_util.hpp
library/include/ck/library/utility/conv_util.hpp
+39
-36
library/include/ck/library/utility/fill.hpp
library/include/ck/library/utility/fill.hpp
+43
-29
library/include/ck/library/utility/op_instance_engine.hpp
library/include/ck/library/utility/op_instance_engine.hpp
+29
-11
library/src/host_tensor/CMakeLists.txt
library/src/host_tensor/CMakeLists.txt
+8
-16
library/src/host_tensor/device.cpp
library/src/host_tensor/device.cpp
+0
-137
library/src/host_tensor/device_memory.cpp
library/src/host_tensor/device_memory.cpp
+28
-0
library/src/host_tensor/host_tensor.cpp
library/src/host_tensor/host_tensor.cpp
+5
-23
No files found.
Too many changes to show.
To preserve performance only
450 of 450+
files are displayed.
Plain diff
Email patch
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp
View file @
b79df771
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F32_F32_HPP
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F32_F32_HPP
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "device_reduce_instance_multiblock_atomic_add.hpp"
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_
instance
{
namespace
instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
...
...
@@ -20,10 +24,7 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 1);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
float
,
float
,
float
,
5
,
0
,
0
,
2
,
1
);
// clang-format on
}
// namespace
device_reduce_
instance
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp
View file @
b79df771
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F64_F32_HPP
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F64_F32_HPP
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "device_reduce_instance_multiblock_atomic_add.hpp"
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_
instance
{
namespace
instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
...
...
@@ -20,10 +24,7 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 1);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
float
,
double
,
float
,
5
,
0
,
0
,
2
,
1
);
// clang-format on
}
// namespace
device_reduce_
instance
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp
View file @
b79df771
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F64_F64_F64_HPP
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F64_F64_F64_HPP
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "device_reduce_instance_multiblock_atomic_add.hpp"
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_
instance
{
namespace
instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
...
...
@@ -20,10 +24,7 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 1);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
2
,
1
);
// clang-format on
}
// namespace
device_reduce_
instance
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp
View file @
b79df771
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_HPP
#define DEVICE_REDUCE_INSTANCE_THREADWISE_HPP
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_impl_common.hpp"
#include "device_reduce_threadwise.hpp"
#pragma once
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_
instance
{
namespace
instance
{
#ifdef QUICK_REDUCE_TEST
using
reduce_configuration_2_instances_threadwise
=
std
::
tuple
<
...
...
@@ -47,10 +49,10 @@ using reduce_configuration_2_instances_threadwise = std::tuple<
>
;
#endif
template
<
typename
AccDataType
,
ReduceTensorOp
ReduceOpId
>
template
<
ReduceTensorOp
ReduceOpId
>
using
deviceReduceThreadWisePtrType
=
DeviceReducePtr
<
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
true
>::
InElementwiseOperation
,
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
true
>::
AccElementwiseOperation
>
;
typename
reduce_unary_operator
<
ReduceOpId
,
true
,
true
>::
InElementwiseOperation
,
typename
reduce_unary_operator
<
ReduceOpId
,
true
,
true
>::
AccElementwiseOperation
>
;
template
<
typename
InDataType
,
typename
AccDataType
,
...
...
@@ -61,14 +63,13 @@ template <typename InDataType,
bool
PropagateNan
,
bool
UseIndex
>
void
add_device_reduce_instance_threadwise
(
std
::
vector
<
deviceReduceThreadWisePtrType
<
AccDataType
,
ReduceOpId
>>&
device_op_instances
)
std
::
vector
<
deviceReduceThreadWisePtrType
<
ReduceOpId
>>&
device_op_instances
)
{
using
ReduceOperation
=
typename
reduce_binary_operator
<
AccDataType
,
ReduceOpId
>::
opType
;
using
ReduceOperation
=
typename
reduce_binary_operator
<
ReduceOpId
>::
opType
;
using
InElementwiseOperation
=
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
true
>::
InElementwiseOperation
;
typename
reduce_unary_operator
<
ReduceOpId
,
true
,
true
>::
InElementwiseOperation
;
using
AccElementwiseOperation
=
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
true
>::
AccElementwiseOperation
;
typename
reduce_unary_operator
<
ReduceOpId
,
true
,
true
>::
AccElementwiseOperation
;
constexpr
bool
Indexable
=
(
ReduceOpId
==
ReduceTensorOp
::
MIN
||
ReduceOpId
==
ReduceTensorOp
::
MAX
||
...
...
@@ -114,7 +115,7 @@ void add_device_reduce_instance_threadwise(
ReduceOpId, \
PropagateNan, \
UseIndex>( \
std::vector<deviceReduceThreadWisePtrType<
compT,
ReduceOpId>> & device_op_instances)
std::vector<deviceReduceThreadWisePtrType<ReduceOpId>> & device_op_instances)
#define ADD_THREADWISE_INST_BY_ID( \
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
...
...
@@ -127,21 +128,17 @@ void add_device_reduce_instance_threadwise(
Rank, \
NumReduceDim)
#define ADD_THREADWISE_INST_REF_BY_TYPE( \
inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \
extern template void add_device_reduce_instance_threadwise<inT, \
compT, \
outT, \
Rank, \
NumReduceDim, \
ReduceOpId, \
PropagateNan, \
UseIndex>( \
std::vector<DeviceReducePtr< \
typename reduce_unary_operator<compT, ReduceOpId, true, true>::InElementwiseOperation, \
typename reduce_unary_operator<compT, ReduceOpId, true, true>:: \
AccElementwiseOperation>> & \
device_op_instances)
#define ADD_THREADWISE_INST_REF_BY_TYPE( \
inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \
extern template void add_device_reduce_instance_threadwise<inT, \
compT, \
outT, \
Rank, \
NumReduceDim, \
ReduceOpId, \
PropagateNan, \
UseIndex>( \
std::vector<deviceReduceThreadWisePtrType<ReduceOpId>> & device_op_instances)
#define ADD_THREADWISE_INST_REF_BY_ID( \
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
...
...
@@ -154,10 +151,7 @@ void add_device_reduce_instance_threadwise(
Rank, \
NumReduceDim)
}
// namespace
device_reduce_
instance
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp
View file @
b79df771
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_B16_F32_B16_HPP
#define DEVICE_REDUCE_INSTANCE_THREADWISE_B16_F32_B16_HPP
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "data_type.hpp"
#include "device_reduce_instance_threadwise.hpp"
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_
instance
{
namespace
instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
...
...
@@ -50,10 +53,7 @@ ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 1);
ADD_THREADWISE_INST_REF_BY_ID
(
bhalf_t
,
float
,
bhalf_t
,
4
,
0
,
1
,
2
,
1
);
// clang-format on
}
// namespace
device_reduce_
instance
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp
View file @
b79df771
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F16_F16_HPP
#define DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F16_F16_HPP
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "data_type.hpp"
#include "device_reduce_instance_threadwise.hpp"
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_
instance
{
namespace
instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
...
...
@@ -37,10 +40,7 @@ ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1);
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
2
,
1
);
// clang-format on
}
// namespace
device_reduce_
instance
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp
View file @
b79df771
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F32_F16_HPP
#define DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F32_F16_HPP
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "data_type.hpp"
#include "device_reduce_instance_threadwise.hpp"
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_
instance
{
namespace
instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
...
...
@@ -25,10 +28,7 @@ ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1);
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
7
,
0
,
0
,
2
,
1
);
// clang-format on
}
// namespace
device_reduce_
instance
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp
View file @
b79df771
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F32_F32_HPP
#define DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F32_F32_HPP
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "device_reduce_instance_threadwise.hpp"
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_
instance
{
namespace
instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
...
...
@@ -48,10 +52,7 @@ ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 1);
ADD_THREADWISE_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
1
,
2
,
1
);
// clang-format on
}
// namespace
device_reduce_
instance
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp
View file @
b79df771
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F64_F32_HPP
#define DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F64_F32_HPP
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "device_reduce_instance_threadwise.hpp"
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_
instance
{
namespace
instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
...
...
@@ -24,10 +28,7 @@ ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 1);
ADD_THREADWISE_INST_REF_BY_ID
(
float
,
double
,
float
,
7
,
0
,
0
,
2
,
1
);
// clang-format on
}
// namespace
device_reduce_
instance
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp
View file @
b79df771
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F64_F64_F64_HPP
#define DEVICE_REDUCE_INSTANCE_THREADWISE_F64_F64_F64_HPP
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "device_reduce_instance_threadwise.hpp"
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_
instance
{
namespace
instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
...
...
@@ -48,10 +52,7 @@ ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 1);
ADD_THREADWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
2
,
1
);
// clang-format on
}
// namespace
device_reduce_
instance
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp
View file @
b79df771
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I32_I8_HPP
#define DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I32_I8_HPP
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "device_reduce_instance_threadwise.hpp"
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_
instance
{
namespace
instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
...
...
@@ -20,10 +24,7 @@ ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1);
ADD_THREADWISE_INST_REF_BY_ID
(
int8_t
,
int32_t
,
int8_t
,
5
,
0
,
0
,
2
,
1
);
// clang-format on
}
// namespace
device_reduce_
instance
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp
View file @
b79df771
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I8_I8_HPP
#define DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I8_I8_HPP
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "device_reduce_instance_threadwise.hpp"
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_
instance
{
namespace
instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
...
...
@@ -36,10 +40,7 @@ ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1);
ADD_THREADWISE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
4
,
0
,
1
,
2
,
1
);
// clang-format on
}
// namespace
device_reduce_
instance
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
#endif
library/include/ck/library/utility/check_err.hpp
View file @
b79df771
#ifndef CHECK_ERR_HPP
#define CHECK_ERR_HPP
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <half.hpp>
#include <iostream>
#include <iomanip>
#include <iterator>
#include <limits>
#include <type_traits>
#include <vector>
#include "data_type.hpp"
namespace
ck
{
namespace
utils
{
// Element-wise comparison of two floating-point vectors (any std floating-point
// type that is not half_t).
//
// An element is counted as a mismatch when |out[i] - ref[i]| exceeds
// atol + rtol * |ref[i]|, or when either value is not finite.
// The first four mismatches are printed to std::cout followed by `msg`; when
// at least one mismatch exists the largest observed error is printed as well.
//
// Returns true when the sizes match and no element mismatches, false otherwise
// (a size mismatch is reported and returns immediately).
template <typename T>
typename std::enable_if<std::is_floating_point<T>::value && !std::is_same<T, half_t>::value,
                        bool>::type
check_err(const std::vector<T>& out,
          const std::vector<T>& ref,
          const std::string& msg = "Error: Incorrect results!",
          double rtol            = 1e-5,
          double atol            = 3e-6)
{
    if(out.size() != ref.size())
    {
        std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
                  << std::endl
                  << msg << std::endl;
        return false;
    }

    bool res{true};
    int err_count = 0;
    double err    = 0;
    // Seed the running maximum with 0: numeric_limits<double>::min() is the
    // smallest positive normal value, not a neutral element for max(), and
    // would be reported as the "max err" for sub-normal or NaN-only failures.
    double max_err = 0;
    for(std::size_t i = 0; i < ref.size(); ++i)
    {
        err = std::abs(out[i] - ref[i]);
        if(err > atol + rtol * std::abs(ref[i]) || !std::isfinite(out[i]) ||
           !std::isfinite(ref[i]))
        {
            max_err = err > max_err ? err : max_err;
            err_count++;
            // Only the first few mismatches are printed to keep the log readable.
            if(err_count < 5)
            {
                std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
                          << i << "]: " << out[i] << " != " << ref[i] << std::endl
                          << msg << std::endl;
            }
            res = false;
        }
    }
    if(!res)
    {
        std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
    }
    return res;
}
// Element-wise comparison of two bhalf_t vectors.
//
// Each element is converted with type_convert<float> and compared in double
// precision. An element is a mismatch when |o - r| exceeds atol + rtol * |r|,
// or when either converted value is not finite.
// The first four mismatches are printed to std::cout followed by `msg`; when
// at least one mismatch exists the largest observed error is printed as well.
//
// Returns true when the sizes match and no element mismatches, false otherwise.
template <typename T>
typename std::enable_if<std::is_same<T, bhalf_t>::value, bool>::type
check_err(const std::vector<T>& out,
          const std::vector<T>& ref,
          const std::string& msg = "Error: Incorrect results!",
          double rtol            = 1e-3,
          double atol            = 1e-3)
{
    if(out.size() != ref.size())
    {
        std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
                  << std::endl
                  << msg << std::endl;
        return false;
    }

    bool res{true};
    int err_count = 0;
    double err    = 0;
    // TODO: This is a hack. We should have proper specialization for bhalf_t data type.
    // The running maximum starts at 0: numeric_limits<float>::min() is the
    // smallest positive normal float and is not a neutral element for max().
    double max_err = 0;
    for(std::size_t i = 0; i < ref.size(); ++i)
    {
        double o = type_convert<float>(out[i]);
        double r = type_convert<float>(ref[i]);
        err      = std::abs(o - r);
        if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
        {
            max_err = err > max_err ? err : max_err;
            err_count++;
            // Only the first few mismatches are printed to keep the log readable.
            if(err_count < 5)
            {
                std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
                          << i << "]: " << o << " != " << r << std::endl
                          << msg << std::endl;
            }
            res = false;
        }
    }
    if(!res)
    {
        std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
    }
    return res;
}
// Element-wise comparison of two half-precision vectors (half_t or
// half_float::half).
//
// Each element is converted with type_convert<float> and compared in double
// precision. An element is a mismatch when |o - r| exceeds atol + rtol * |r|,
// or when either converted value is not finite.
// The first four mismatches are printed to std::cout followed by `msg`; when
// at least one mismatch exists the largest observed error is printed as well.
//
// Returns true when the sizes match and no element mismatches, false otherwise.
template <typename T>
typename std::enable_if<std::is_same<T, half_t>::value ||
                            std::is_same<T, half_float::half>::value,
                        bool>::type
check_err(const std::vector<T>& out,
          const std::vector<T>& ref,
          const std::string& msg = "Error: Incorrect results!",
          double rtol            = 1e-3,
          double atol            = 1e-3)
{
    if(out.size() != ref.size())
    {
        std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
                  << std::endl
                  << msg << std::endl;
        return false;
    }

    bool res{true};
    int err_count = 0;
    double err    = 0;
    // Seed the running maximum with 0 instead of numeric_limits<T>::min():
    // for a floating-point type min() is the smallest positive normal value,
    // which can be larger than a genuine error and is no neutral element for max().
    double max_err = 0;
    for(std::size_t i = 0; i < ref.size(); ++i)
    {
        double o = type_convert<float>(out[i]);
        double r = type_convert<float>(ref[i]);
        err      = std::abs(o - r);
        if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
        {
            max_err = err > max_err ? err : max_err;
            err_count++;
            // Only the first few mismatches are printed to keep the log readable.
            if(err_count < 5)
            {
                std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
                          << i << "]: " << o << " != " << r << std::endl
                          << msg << std::endl;
            }
            res = false;
        }
    }
    if(!res)
    {
        std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
    }
    return res;
}
// Exact element-wise comparison of two integral vectors (bhalf_t excluded).
//
// The two trailing double parameters exist only so the signature lines up with
// the floating-point overloads; they are unnamed and ignored.
// On the first differing element the pair is printed to std::cout followed by
// `msg` and the function returns false. Returns true when sizes match and all
// elements are equal.
template <typename T>
typename std::enable_if<std::is_integral<T>::value && !std::is_same<T, bhalf_t>::value,
                        bool>::type
check_err(const std::vector<T>& out,
          const std::vector<T>& ref,
          const std::string& msg = "Error: Incorrect results!",
          double                 = 0,
          double                 = 0)
{
    if(out.size() != ref.size())
    {
        std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
                  << std::endl
                  << msg << std::endl;
        return false;
    }

    for(std::size_t i = 0; i < ref.size(); ++i)
    {
        if(out[i] != ref[i])
        {
            // Unary plus applies integral promotion: 8/16-bit values print as
            // numbers (not characters) while 64-bit and unsigned 32-bit values
            // are no longer truncated as they were by static_cast<int>.
            std::cout << "out[" << i << "] != ref[" << i << "]: " << +out[i] << " != " << +ref[i]
                      << std::endl
                      << msg << std::endl;
            return false;
        }
    }
    return true;
}
}
// namespace utils
}
// namespace ck
// Streams every element of `v` to `os`, each one followed by a single space
// (so a non-empty vector produces a trailing space). Returns `os` for chaining.
template <typename T>
std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
{
    std::ostream_iterator<T> sink(os, " ");
    for(const T& element : v)
    {
        *sink = element;
        ++sink;
    }
    return os;
}
#endif
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <iomanip>
#include <iterator>
#include <limits>
#include <type_traits>
#include <vector>
#include "ck/utility/data_type.hpp"
namespace
ck
{
namespace
utils
{
// Element-wise comparison of two floating-point vectors (any std floating-point
// type that is not half_t).
//
// An element is counted as a mismatch when |out[i] - ref[i]| exceeds
// atol + rtol * |ref[i]|, or when either value is not finite.
// The first four mismatches are printed to std::cout followed by `msg`; when
// at least one mismatch exists the largest observed error is printed as well.
//
// Returns true when the sizes match and no element mismatches, false otherwise
// (a size mismatch is reported and returns immediately).
template <typename T>
typename std::enable_if<std::is_floating_point<T>::value && !std::is_same<T, half_t>::value,
                        bool>::type
check_err(const std::vector<T>& out,
          const std::vector<T>& ref,
          const std::string& msg = "Error: Incorrect results!",
          double rtol            = 1e-5,
          double atol            = 3e-6)
{
    if(out.size() != ref.size())
    {
        std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
                  << std::endl
                  << msg << std::endl;
        return false;
    }

    bool res{true};
    int err_count = 0;
    double err    = 0;
    // Seed the running maximum with 0: numeric_limits<double>::min() is the
    // smallest positive normal value, not a neutral element for max(), and
    // would be reported as the "max err" for sub-normal or NaN-only failures.
    double max_err = 0;
    for(std::size_t i = 0; i < ref.size(); ++i)
    {
        err = std::abs(out[i] - ref[i]);
        if(err > atol + rtol * std::abs(ref[i]) || !std::isfinite(out[i]) ||
           !std::isfinite(ref[i]))
        {
            max_err = err > max_err ? err : max_err;
            err_count++;
            // Only the first few mismatches are printed to keep the log readable.
            if(err_count < 5)
            {
                std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
                          << i << "]: " << out[i] << " != " << ref[i] << std::endl
                          << msg << std::endl;
            }
            res = false;
        }
    }
    if(!res)
    {
        std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
    }
    return res;
}
// Element-wise comparison of two bhalf_t vectors.
//
// Each element is converted with type_convert<float> and compared in double
// precision. An element is a mismatch when |o - r| exceeds atol + rtol * |r|,
// or when either converted value is not finite.
// The first four mismatches are printed to std::cout followed by `msg`; when
// at least one mismatch exists the largest observed error is printed as well.
//
// Returns true when the sizes match and no element mismatches, false otherwise.
template <typename T>
typename std::enable_if<std::is_same<T, bhalf_t>::value, bool>::type
check_err(const std::vector<T>& out,
          const std::vector<T>& ref,
          const std::string& msg = "Error: Incorrect results!",
          double rtol            = 1e-3,
          double atol            = 1e-3)
{
    if(out.size() != ref.size())
    {
        std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
                  << std::endl
                  << msg << std::endl;
        return false;
    }

    bool res{true};
    int err_count = 0;
    double err    = 0;
    // TODO: This is a hack. We should have proper specialization for bhalf_t data type.
    // The running maximum starts at 0: numeric_limits<float>::min() is the
    // smallest positive normal float and is not a neutral element for max().
    double max_err = 0;
    for(std::size_t i = 0; i < ref.size(); ++i)
    {
        double o = type_convert<float>(out[i]);
        double r = type_convert<float>(ref[i]);
        err      = std::abs(o - r);
        if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
        {
            max_err = err > max_err ? err : max_err;
            err_count++;
            // Only the first few mismatches are printed to keep the log readable.
            if(err_count < 5)
            {
                std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
                          << i << "]: " << o << " != " << r << std::endl
                          << msg << std::endl;
            }
            res = false;
        }
    }
    if(!res)
    {
        std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
    }
    return res;
}
// Element-wise comparison of two half_t vectors.
//
// Each element is converted with type_convert<float> and compared in double
// precision. An element is a mismatch when |o - r| exceeds atol + rtol * |r|,
// or when either converted value is not finite.
// The first four mismatches are printed to std::cout followed by `msg`; when
// at least one mismatch exists the largest observed error is printed as well.
//
// Returns true when the sizes match and no element mismatches, false otherwise.
template <typename T>
typename std::enable_if<std::is_same<T, half_t>::value, bool>::type
check_err(const std::vector<T>& out,
          const std::vector<T>& ref,
          const std::string& msg = "Error: Incorrect results!",
          double rtol            = 1e-3,
          double atol            = 1e-3)
{
    if(out.size() != ref.size())
    {
        std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
                  << std::endl
                  << msg << std::endl;
        return false;
    }

    bool res{true};
    int err_count = 0;
    double err    = 0;
    // Seed the running maximum with 0 instead of numeric_limits<T>::min():
    // for a floating-point type min() is the smallest positive normal value,
    // which can be larger than a genuine error and is no neutral element for max().
    double max_err = 0;
    for(std::size_t i = 0; i < ref.size(); ++i)
    {
        double o = type_convert<float>(out[i]);
        double r = type_convert<float>(ref[i]);
        err      = std::abs(o - r);
        if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
        {
            max_err = err > max_err ? err : max_err;
            err_count++;
            // Only the first few mismatches are printed to keep the log readable.
            if(err_count < 5)
            {
                std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
                          << i << "]: " << o << " != " << r << std::endl
                          << msg << std::endl;
            }
            res = false;
        }
    }
    if(!res)
    {
        std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
    }
    return res;
}
// Element-wise comparison of two integral vectors (bhalf_t excluded) with an
// absolute tolerance.
//
// Values are widened to int64_t and an element is a mismatch when
// |out[i] - ref[i]| > atol. The unnamed fourth parameter occupies the position
// of the relative tolerance in the floating-point overloads and is ignored.
// The first four mismatches are printed to std::cout followed by `msg`; when
// at least one mismatch exists the largest observed error is printed as well.
//
// Returns true when the sizes match and no element mismatches, false otherwise.
template <typename T>
typename std::enable_if<std::is_integral<T>::value && !std::is_same<T, bhalf_t>::value,
                        bool>::type
check_err(const std::vector<T>& out,
          const std::vector<T>& ref,
          const std::string& msg = "Error: Incorrect results!",
          double                 = 0,
          double atol            = 0)
{
    if(out.size() != ref.size())
    {
        std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
                  << std::endl
                  << msg << std::endl;
        return false;
    }

    bool res{true};
    int err_count = 0;
    int64_t err   = 0;
    // Absolute errors are never negative, so 0 is the natural seed for the maximum.
    int64_t max_err = 0;
    for(std::size_t i = 0; i < ref.size(); ++i)
    {
        int64_t o = out[i];
        int64_t r = ref[i];
        err       = std::abs(o - r);

        if(err > atol)
        {
            max_err = err > max_err ? err : max_err;
            err_count++;
            // Only the first few mismatches are printed to keep the log readable.
            if(err_count < 5)
            {
                // Unary plus applies integral promotion: 8/16-bit values print
                // as numbers (not characters) while 64-bit and unsigned 32-bit
                // values are no longer truncated as they were by static_cast<int>.
                std::cout << "out[" << i << "] != ref[" << i << "]: " << +out[i]
                          << " != " << +ref[i] << std::endl
                          << msg << std::endl;
            }
            res = false;
        }
    }
    if(!res)
    {
        std::cout << "max err: " << max_err << std::endl;
    }
    return res;
}
}
// namespace utils
}
// namespace ck
// Streams every element of `v` to `os`, each one followed by a single space
// (so a non-empty vector produces a trailing space). Returns `os` for chaining.
template <typename T>
std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
{
    std::ostream_iterator<T> sink(os, " ");
    for(const T& element : v)
    {
        *sink = element;
        ++sink;
    }
    return os;
}
library/include/ck/library/utility/conv_util.hpp
View file @
b79df771
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstdlib>
...
...
@@ -9,17 +12,17 @@
#include <type_traits>
#include <vector>
#include "c
heck_err
.hpp"
#include "c
onfig
.hpp"
#include "
device
.hpp"
#include "
device_conv_fwd
.hpp"
#include "device_tensor.hpp"
#include "
element_wise_operation
.hpp"
#include "fill.hpp"
#include "
host_tensor
.hpp"
#include "
op_instance_engine
.hpp"
#include "
reference_conv_fwd
.hpp"
#include "
tensor_layout
.hpp"
#include "c
k/ck
.hpp"
#include "c
k/tensor_operation/gpu/device/tensor_layout
.hpp"
#include "
ck/tensor_operation/gpu/device/device_conv_fwd
.hpp"
#include "
ck/tensor_operation/gpu/element/element_wise_operation
.hpp"
#include "
ck/library/utility/check_err
.hpp"
#include "
ck/library/utility/
fill.hpp"
#include "
ck/library/utility/op_instance_engine
.hpp"
#include "
ck/library/host_tensor/device_memory
.hpp"
#include "
ck/library/host_tensor/host_tensor
.hpp"
#include "
ck/library/reference_tensor_operation/cpu/reference_conv_fwd
.hpp"
namespace
ck
{
namespace
tensor_operation
{
...
...
@@ -28,15 +31,15 @@ namespace device {
using
DeviceConvFwdNoOpPtr
=
DeviceConvFwdPtr
<
element_wise
::
PassThrough
,
element_wise
::
PassThrough
,
element_wise
::
PassThrough
>
;
namespace
device_conv1d_fwd_
instance
{
namespace
instance
{
void
add_device_conv1d_fwd_xdl_nwc_kxc_nwk_bf16_instances
(
std
::
vector
<
DeviceConvFwdNoOpPtr
>&
);
void
add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f16_instances
(
std
::
vector
<
DeviceConvFwdNoOpPtr
>&
);
void
add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f32_instances
(
std
::
vector
<
DeviceConvFwdNoOpPtr
>&
);
void
add_device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instances
(
std
::
vector
<
DeviceConvFwdNoOpPtr
>&
);
}
// namespace
device_conv1d_fwd_
instance
namespace
device_conv2d_fwd_
instance
{
}
// namespace instance
namespace
instance
{
void
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances
(
std
::
vector
<
DeviceConvFwdNoOpPtr
>&
);
void
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances
(
std
::
vector
<
DeviceConvFwdNoOpPtr
>&
);
...
...
@@ -45,15 +48,15 @@ void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(
void
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances
(
std
::
vector
<
DeviceConvFwdNoOpPtr
>&
);
void
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances
(
std
::
vector
<
DeviceConvFwdNoOpPtr
>&
);
}
// namespace
device_conv2d_fwd_
instance
namespace
device_conv3d_fwd_
instance
{
}
// namespace instance
namespace
instance
{
void
add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instances
(
std
::
vector
<
DeviceConvFwdNoOpPtr
>&
);
void
add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f16_instances
(
std
::
vector
<
DeviceConvFwdNoOpPtr
>&
);
void
add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f32_instances
(
std
::
vector
<
DeviceConvFwdNoOpPtr
>&
);
void
add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instances
(
std
::
vector
<
DeviceConvFwdNoOpPtr
>&
);
}
// namespace
device_conv3d_fwd_
instance
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
...
...
@@ -292,17 +295,17 @@ struct ConvolutionFwdInstances<float, float, float>
std
::
vector
<
DeviceConvFwdNoOpPtr
>
conv_ptrs
;
if
constexpr
(
NumDimSpatial
==
1
)
{
ck
::
tensor_operation
::
device
::
device_conv1d_fwd_
instance
::
ck
::
tensor_operation
::
device
::
instance
::
add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f32_instances
(
conv_ptrs
);
}
else
if
constexpr
(
NumDimSpatial
==
2
)
{
ck
::
tensor_operation
::
device
::
device_conv2d_fwd_
instance
::
ck
::
tensor_operation
::
device
::
instance
::
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances
(
conv_ptrs
);
}
else
if
constexpr
(
NumDimSpatial
==
3
)
{
ck
::
tensor_operation
::
device
::
device_conv3d_fwd_
instance
::
ck
::
tensor_operation
::
device
::
instance
::
add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f32_instances
(
conv_ptrs
);
}
return
conv_ptrs
;
...
...
@@ -319,20 +322,20 @@ struct ConvolutionFwdInstances<half_t, half_t, half_t>
std
::
vector
<
DeviceConvFwdNoOpPtr
>
conv_ptrs
;
if
constexpr
(
NumDimSpatial
==
1
)
{
ck
::
tensor_operation
::
device
::
device_conv1d_fwd_
instance
::
ck
::
tensor_operation
::
device
::
instance
::
add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f16_instances
(
conv_ptrs
);
return
conv_ptrs
;
}
else
if
constexpr
(
NumDimSpatial
==
2
)
{
ck
::
tensor_operation
::
device
::
device_conv2d_fwd_
instance
::
ck
::
tensor_operation
::
device
::
instance
::
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances
(
conv_ptrs
);
ck
::
tensor_operation
::
device
::
device_conv2d_fwd_
instance
::
ck
::
tensor_operation
::
device
::
instance
::
add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances
(
conv_ptrs
);
}
else
if
constexpr
(
NumDimSpatial
==
3
)
{
ck
::
tensor_operation
::
device
::
device_conv3d_fwd_
instance
::
ck
::
tensor_operation
::
device
::
instance
::
add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f16_instances
(
conv_ptrs
);
}
return
conv_ptrs
;
...
...
@@ -349,17 +352,17 @@ struct ConvolutionFwdInstances<bhalf_t, bhalf_t, bhalf_t>
std
::
vector
<
DeviceConvFwdNoOpPtr
>
conv_ptrs
;
if
constexpr
(
NumDimSpatial
==
1
)
{
ck
::
tensor_operation
::
device
::
device_conv1d_fwd_
instance
::
ck
::
tensor_operation
::
device
::
instance
::
add_device_conv1d_fwd_xdl_nwc_kxc_nwk_bf16_instances
(
conv_ptrs
);
}
else
if
constexpr
(
NumDimSpatial
==
2
)
{
ck
::
tensor_operation
::
device
::
device_conv2d_fwd_
instance
::
ck
::
tensor_operation
::
device
::
instance
::
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances
(
conv_ptrs
);
}
else
if
constexpr
(
NumDimSpatial
==
3
)
{
ck
::
tensor_operation
::
device
::
device_conv3d_fwd_
instance
::
ck
::
tensor_operation
::
device
::
instance
::
add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instances
(
conv_ptrs
);
}
return
conv_ptrs
;
...
...
@@ -376,17 +379,17 @@ struct ConvolutionFwdInstances<int8_t, int8_t, int8_t>
std
::
vector
<
DeviceConvFwdNoOpPtr
>
conv_ptrs
;
if
constexpr
(
NumDimSpatial
==
1
)
{
ck
::
tensor_operation
::
device
::
device_conv1d_fwd_
instance
::
ck
::
tensor_operation
::
device
::
instance
::
add_device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instances
(
conv_ptrs
);
}
else
if
constexpr
(
NumDimSpatial
==
2
)
{
ck
::
tensor_operation
::
device
::
device_conv2d_fwd_
instance
::
ck
::
tensor_operation
::
device
::
instance
::
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances
(
conv_ptrs
);
}
else
if
constexpr
(
NumDimSpatial
==
3
)
{
ck
::
tensor_operation
::
device
::
device_conv3d_fwd_
instance
::
ck
::
tensor_operation
::
device
::
instance
::
add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instances
(
conv_ptrs
);
}
return
conv_ptrs
;
...
...
@@ -402,8 +405,8 @@ template <typename InDataType,
typename
InElementwiseOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
typename
WeiElementwiseOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
typename
OutElementwiseOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
typename
InputInitFun
=
FillUniform
<
InDataType
>,
typename
WeightsInitFun
=
FillUniform
<
WeiDataType
>>
typename
InputInitFun
=
FillUniform
Distribution
<
InDataType
>,
typename
WeightsInitFun
=
FillUniform
Distribution
<
WeiDataType
>>
class
ConvFwdOpInstance
:
public
ck
::
utils
::
OpInstance
<
OutDataType
,
InDataType
,
WeiDataType
>
{
using
DeviceConvFwdOp
=
tensor_operation
::
device
::
...
...
@@ -422,8 +425,8 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType,
ConvFwdOpInstance
(
const
ConvParams
&
params
,
bool
do_init
=
true
,
const
InputInitFun
&
input_init_f
=
InputInitFun
{}
,
const
WeightsInitFun
&
weights_init_f
=
WeightsInitFun
{}
)
const
InputInitFun
&
input_init_f
=
InputInitFun
()
,
const
WeightsInitFun
&
weights_init_f
=
WeightsInitFun
()
)
:
BaseType
(),
params_
{
params
},
output_spatial_lengths_
{
params
.
GetOutputSpatialLengths
()},
...
...
@@ -560,8 +563,8 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType,
const
ConvParams
&
params_
;
const
std
::
vector
<
ck
::
index_t
>
output_spatial_lengths_
;
const
bool
do_init_
;
const
InputInitFun
&
input_init_f_
;
const
WeightsInitFun
&
weights_init_f_
;
InputInitFun
input_init_f_
;
WeightsInitFun
weights_init_f_
;
};
}
// namespace conv
...
...
library/include/ck/library/utility/fill.hpp
View file @
b79df771
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <algorithm>
#include <cmath>
#include <random>
#include "data_type.hpp"
#include "
ck/utility/
data_type.hpp"
namespace
ck
{
namespace
utils
{
// template <typename T, class Enable = void>
// struct FillUniform;
template
<
typename
T
>
struct
FillUniformDistribution
{
float
a_
{
-
5.
f
};
float
b_
{
5.
f
};
// TODO: what's wrong with this specialization???
// err: segmentation fault in mt19937 - infinite loop like.
// template <typename T>
// struct FillUniform<T, typename std::enable_if<std::is_integral<T>::value &&
// !std::is_same<T, bhalf_t>::value>::type>
// {
// int a_{0};
// int b_{5};
// // T a_ = T{0};
// // T b_ = T{5};
template
<
typename
ForwardIter
>
void
operator
()(
ForwardIter
first
,
ForwardIter
last
)
const
{
std
::
mt19937
gen
(
11939
);
std
::
uniform_real_distribution
<
float
>
dis
(
a_
,
b_
);
std
::
generate
(
first
,
last
,
[
&
dis
,
&
gen
]()
{
return
ck
::
type_convert
<
T
>
(
dis
(
gen
));
});
}
};
// template <typename ForwardIter>
// void operator()(ForwardIter first, ForwardIter last) const
// {
// std::mt19937 gen{11939};
// std::uniform_int_distribution<int> dis(a_, b_);
// std::generate(first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
// }
// };
// Normally FillUniformDistributionIntegerValue should use std::uniform_int_distribution as below.
// However this produces segfaults in std::mt19937 which look like an infinite loop.
// template <typename T>
// struct FillUniformDistributionIntegerValue
// {
// int a_{-5};
// int b_{5};
//
// template <typename ForwardIter>
// void operator()(ForwardIter first, ForwardIter last) const
// {
// std::mt19937 gen(11939);
// std::uniform_int_distribution<int> dis(a_, b_);
// std::generate(
// first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
// }
// };
// struct FillUniform<T, typename std::enable_if<std::is_floating_point<T>::value ||
// std::is_same<T, bhalf_t>::value>::type>
// Workaround for uniform_int_distribution not working as expected. See note above.
template
<
typename
T
>
struct
FillUniform
struct
FillUniform
DistributionIntegerValue
{
float
a_
{
0
};
float
b_
{
5
};
float
a_
{
-
5.
f
};
float
b_
{
5
.
f
};
template
<
typename
ForwardIter
>
void
operator
()(
ForwardIter
first
,
ForwardIter
last
)
const
{
std
::
mt19937
gen
{
11939
};
std
::
uniform_real_distribution
<>
dis
(
a_
,
b_
);
std
::
generate
(
first
,
last
,
[
&
dis
,
&
gen
]()
{
return
ck
::
type_convert
<
T
>
(
dis
(
gen
));
});
std
::
mt19937
gen
(
11939
);
std
::
uniform_real_distribution
<
float
>
dis
(
a_
,
b_
);
std
::
generate
(
first
,
last
,
[
&
dis
,
&
gen
]()
{
return
ck
::
type_convert
<
T
>
(
std
::
round
(
dis
(
gen
)));
});
}
};
...
...
library/include/ck/library/utility/op_instance_engine.hpp
View file @
b79df771
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstdlib>
#include <iostream>
#include <limits>
#include <memory>
#include <stdexcept>
...
...
@@ -8,9 +12,12 @@
#include <utility>
#include <vector>
#include "check_err.hpp"
#include "device_base.hpp"
#include "functional2.hpp"
#include "ck/utility/functional2.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/host_tensor/device_memory.hpp"
#include "ck/library/host_tensor/host_tensor.hpp"
namespace
ck
{
namespace
utils
{
...
...
@@ -78,7 +85,8 @@ class OpInstanceRunEngine
template
<
typename
ReferenceOp
=
std
::
function
<
void
()>
>
OpInstanceRunEngine
(
const
OpInstanceT
&
op_instance
,
const
ReferenceOp
&
reference_op
=
ReferenceOp
{})
const
ReferenceOp
&
reference_op
=
ReferenceOp
{},
bool
do_verification
=
true
)
:
op_instance_
{
op_instance
}
{
in_tensors_
=
op_instance_
.
GetInputTensors
();
...
...
@@ -88,8 +96,11 @@ class OpInstanceRunEngine
const
Tensor
<
InArgTypes
>&
...,
Tensor
<
OutDataType
>&>
)
{
ref_output_
=
op_instance_
.
GetOutputTensor
();
CallRefOpUnpackArgs
(
reference_op
,
std
::
make_index_sequence
<
kNInArgs_
>
{});
if
(
do_verification
)
{
ref_output_
=
op_instance_
.
GetOutputTensor
();
CallRefOpUnpackArgs
(
reference_op
,
std
::
make_index_sequence
<
kNInArgs_
>
{});
}
}
AllocateDeviceInputTensors
(
std
::
make_index_sequence
<
kNInArgs_
>
{});
out_device_buffer_
=
...
...
@@ -110,6 +121,7 @@ class OpInstanceRunEngine
op_ptr
.
get
(),
in_device_buffers_
,
out_device_buffer_
);
if
(
op_ptr
->
IsSupportedArgument
(
argument
.
get
()))
{
std
::
cout
<<
"Testing instance: "
<<
op_ptr
->
GetTypeString
()
<<
std
::
endl
;
invoker
->
Run
(
argument
.
get
());
out_device_buffer_
->
FromDevice
(
out_tensor_
->
mData
.
data
());
if
(
!
ref_output_
)
...
...
@@ -119,9 +131,16 @@ class OpInstanceRunEngine
" You have to provide reference function."
);
}
// TODO: enable flexible use of custom check_error functions
res
=
res
&&
check_err
(
out_tensor_
->
mData
,
ref_output_
->
mData
);
bool
inst_res
=
CheckErr
(
out_tensor_
->
mData
,
ref_output_
->
mData
);
std
::
cout
<<
(
inst_res
?
"SUCCESS"
:
"FAILURE"
)
<<
std
::
endl
;
res
=
res
&&
inst_res
;
out_device_buffer_
->
SetZero
();
}
else
{
std
::
cout
<<
"Given conv problem is not supported by instance:
\n\t
>>>>"
<<
op_ptr
->
GetTypeString
()
<<
std
::
endl
;
}
}
return
res
;
}
...
...
@@ -132,7 +151,6 @@ class OpInstanceRunEngine
bool
do_verification
=
false
,
bool
do_log
=
false
)
{
bool
res
{
true
};
ProfileBestConfig
best_config
;
for
(
auto
&
op_ptr
:
op_ptrs
)
...
...
@@ -153,7 +171,7 @@ class OpInstanceRunEngine
std
::
cout
<<
"Perf: "
<<
avg_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
op_name
<<
std
::
endl
;
if
(
tflops
<
best_config
.
best_
tflops
)
if
(
avg_time
<
best_config
.
best_
avg_time
)
{
best_config
.
best_op_name
=
op_name
;
best_config
.
best_tflops
=
tflops
;
...
...
@@ -171,7 +189,7 @@ class OpInstanceRunEngine
" You have to provide reference function."
);
}
// TODO: enable flexible use of custom check_error functions
res
=
res
&&
CheckErr
(
out_tensor_
->
mData
,
ref_output_
->
mData
);
CheckErr
(
out_tensor_
->
mData
,
ref_output_
->
mData
);
if
(
do_log
)
{}
}
...
...
@@ -223,7 +241,7 @@ class OpInstanceRunEngine
template
<
typename
T
>
bool
CheckErr
(
const
std
::
vector
<
T
>&
dev_out
,
const
std
::
vector
<
T
>&
ref_out
)
const
{
return
ck
::
utils
::
check_err
(
dev_out
,
ref_out
,
"Error: incorrect results!"
,
a
tol_
,
r
tol_
);
return
ck
::
utils
::
check_err
(
dev_out
,
ref_out
,
"Error: incorrect results!"
,
r
tol_
,
a
tol_
);
}
};
...
...
library/src/host_tensor/CMakeLists.txt
View file @
b79df771
## host_tensor
include_directories
(
BEFORE
${
PROJECT_SOURCE_DIR
}
/include/ck
${
PROJECT_SOURCE_DIR
}
/include/ck/utility
${
PROJECT_SOURCE_DIR
}
/library/include/ck/library/host_tensor
)
set
(
HOST_TENSOR_SOURCE
device.cpp
device
_memory
.cpp
host_tensor.cpp
)
...
...
@@ -17,22 +11,20 @@ target_compile_features(host_tensor PUBLIC)
set_target_properties
(
host_tensor PROPERTIES POSITION_INDEPENDENT_CODE ON
)
target_include_directories
(
host_tensor SYSTEM PUBLIC $<BUILD_INTERFACE:
${
HALF_INCLUDE_DIR
}
>
)
target_include_directories
(
host_tensor PUBLIC
target_include_directories
(
host_tensor PUBLIC
"$<INSTALL_INTERFACE:
${
CMAKE_INSTALL_INCLUDEDIR
}
/ck>"
"$<INSTALL_INTERFACE:
${
CMAKE_INSTALL_INCLUDEDIR
}
/ck/utility>"
"$<INSTALL_INTERFACE:
${
CMAKE_INSTALL_INCLUDEDIR
}
/ck/library/host_tensor>"
)
install
(
TARGETS host_tensor
EXPORT host_tensorTargets
LIBRARY DESTINATION
${
CMAKE_INSTALL_LIBDIR
}
ARCHIVE DESTINATION
${
CMAKE_INSTALL_LIBDIR
}
RUNTIME DESTINATION
${
CMAKE_INSTALL_BINDIR
}
INCLUDES DESTINATION
${
CMAKE_INSTALL_INCLUDEDIR
}
rocm_install
(
TARGETS host_tensor
EXPORT host_tensorTargets
)
install
(
EXPORT host_tensorTargets
FILE composable_kernelhost_tensorTargets.cmake
rocm_install
(
EXPORT host_tensorTargets
FILE composable_kernelhost_tensorTargets.cmake
NAMESPACE composable_kernel::
DESTINATION
${
CMAKE_INSTALL_LIBDIR
}
/cmake/composable_kernel
)
...
...
library/src/host_tensor/device.cpp
deleted
100644 → 0
View file @
05d38218
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <chrono>
#include <stdexcept>
#include "device.hpp"
#ifndef CK_NOGPU
// Records the requested size in mMemSize and asks hipMalloc for that many
// bytes, storing the resulting pointer in mpDeviceBuf. The HIP status code is
// forwarded to hip_check_error.
DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
{
    void** buffer_slot = static_cast<void**>(&mpDeviceBuf);
    hip_check_error(hipMalloc(buffer_slot, mMemSize));
}
// Returns the raw pointer stored in mpDeviceBuf; no ownership is transferred.
void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; }
// Returns the size value that was passed to the constructor (mMemSize).
std::size_t DeviceMem::GetBufferSize() { return mMemSize; }
// Copies mMemSize bytes from the host pointer `p` into mpDeviceBuf using
// hipMemcpy with hipMemcpyHostToDevice. `p` must reference at least mMemSize
// readable bytes. The HIP status code is forwarded to hip_check_error.
void DeviceMem::ToDevice(const void* p)
{
    // hipMemcpy takes its source as `const void*`, so the pointer is passed
    // through unchanged instead of casting away constness.
    hip_check_error(hipMemcpy(mpDeviceBuf, p, mMemSize, hipMemcpyHostToDevice));
}
// Copies mMemSize bytes from mpDeviceBuf into the host pointer `p` using
// hipMemcpy with hipMemcpyDeviceToHost. `p` must reference at least mMemSize
// writable bytes. The HIP status code is forwarded to hip_check_error.
void DeviceMem::FromDevice(void* p)
{
    const auto status = hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost);
    hip_check_error(status);
}
// Fills all mMemSize bytes of mpDeviceBuf with zero via hipMemset and forwards
// the HIP status code to hip_check_error.
void DeviceMem::SetZero()
{
    const auto status = hipMemset(mpDeviceBuf, 0, mMemSize);
    hip_check_error(status);
}
// Releases mpDeviceBuf with hipFree and forwards the status to hip_check_error.
// NOTE(review): if hip_check_error can throw, this would throw from a
// destructor - confirm against its definition.
DeviceMem::~DeviceMem()
{
    const auto status = hipFree(mpDeviceBuf);
    hip_check_error(status);
}
// Pair of HIP events used to time work between Start() and End().
// NOTE(review): the struct destroys its events in the destructor but does not
// disable copying; it appears to be used only through a pointer - confirm.
struct KernelTimerImpl
{
    // Creates the start and end events.
    KernelTimerImpl()
    {
        hip_check_error(hipEventCreate(&mStart));
        hip_check_error(hipEventCreate(&mEnd));
    }

    // Destroys both events.
    ~KernelTimerImpl()
    {
        hip_check_error(hipEventDestroy(mStart));
        hip_check_error(hipEventDestroy(mEnd));
    }

    // Synchronizes the device first so earlier work is not included, then
    // records the start event (nullptr is passed as the stream argument).
    void Start()
    {
        hip_check_error(hipDeviceSynchronize());
        hip_check_error(hipEventRecord(mStart, nullptr));
    }

    // Records the end event and blocks until that event has completed.
    void End()
    {
        hip_check_error(hipEventRecord(mEnd, nullptr));
        hip_check_error(hipEventSynchronize(mEnd));
    }

    // Returns the value hipEventElapsedTime reports between the two events.
    float GetElapsedTime() const
    {
        float elapsed;
        hip_check_error(hipEventElapsedTime(&elapsed, mStart, mEnd));
        return elapsed;
    }

    hipEvent_t mStart;
    hipEvent_t mEnd;
};
// Allocates the KernelTimerImpl that all other members forward to.
KernelTimer::KernelTimer()
    : impl(new KernelTimerImpl())
{
}
// Defined out of line, in the translation unit where KernelTimerImpl is a
// complete type; performs no work beyond destroying the members.
KernelTimer::~KernelTimer() = default;
// Forwards to KernelTimerImpl::Start().
void KernelTimer::Start()
{
    impl->Start();
}
// Forwards to KernelTimerImpl::End().
void KernelTimer::End()
{
    impl->End();
}
// Forwards to KernelTimerImpl::GetElapsedTime() and returns its result.
float KernelTimer::GetElapsedTime() const
{
    const float elapsed = impl->GetElapsedTime();
    return elapsed;
}
#endif
// Allocates `mem_size` bytes of host memory aligned to `alignment` bytes.
//
// A zero `mem_size` allocates nothing and leaves mpDeviceBuf null. Otherwise
// `alignment` must be a non-zero power of two (asserted in debug builds) and
// the buffer is obtained from posix_memalign.
//
// Throws std::runtime_error when posix_memalign reports failure. The result
// used to be checked only with assert(), so builds with NDEBUG ignored a
// failed allocation and continued with an unreliable pointer.
DeviceAlignedMemCPU::DeviceAlignedMemCPU(std::size_t mem_size, std::size_t alignment)
    : mMemSize(mem_size), mAlignment(alignment)
{
    // Start from a well-defined value on every path.
    mpDeviceBuf = nullptr;

    if(mem_size == 0)
    {
        return;
    }

    assert(!(alignment == 0 || (alignment & (alignment - 1)))); // check pow of 2

    // TODO: posix only
    const int rtn = posix_memalign(&mpDeviceBuf, alignment, mem_size);
    if(rtn != 0)
    {
        // The pointer content is not reliable after a failed call.
        mpDeviceBuf = nullptr;
        throw std::runtime_error("DeviceAlignedMemCPU: posix_memalign failed");
    }
}
// Returns the raw pointer stored in mpDeviceBuf (null when the object was
// constructed with a size of zero); no ownership is transferred.
void* DeviceAlignedMemCPU::GetDeviceBuffer() { return mpDeviceBuf; }
// Returns the size value that was passed to the constructor (mMemSize).
std::size_t DeviceAlignedMemCPU::GetBufferSize() { return mMemSize; }
// Copies mMemSize bytes from `p` into the owned buffer with memcpy.
// `p` must reference at least mMemSize readable bytes.
void DeviceAlignedMemCPU::ToDevice(const void* p)
{
    // A zero-sized object holds a null buffer, and memcpy must not be given a
    // null pointer even when the byte count is zero.
    if(mMemSize == 0)
    {
        return;
    }
    memcpy(mpDeviceBuf, p, mMemSize);
}
// Copies mMemSize bytes from the owned buffer into `p` with memcpy.
// `p` must reference at least mMemSize writable bytes.
void DeviceAlignedMemCPU::FromDevice(void* p)
{
    // A zero-sized object holds a null buffer, and memcpy must not be given a
    // null pointer even when the byte count is zero.
    if(mMemSize == 0)
    {
        return;
    }
    memcpy(p, mpDeviceBuf, mMemSize);
}
// Overwrites all mMemSize bytes of the owned buffer with zero.
void DeviceAlignedMemCPU::SetZero()
{
    // A zero-sized object holds a null buffer, and memset must not be given a
    // null pointer even when the byte count is zero.
    if(mMemSize == 0)
    {
        return;
    }
    memset(mpDeviceBuf, 0, mMemSize);
}
// Returns the owned buffer to the C allocator. free() accepts a null pointer
// and does nothing in that case, so the zero-size object needs no special path.
DeviceAlignedMemCPU::~DeviceAlignedMemCPU()
{
    free(mpDeviceBuf);
}
// Host-side stopwatch based on std::chrono::high_resolution_clock.
struct WallTimerImpl
{
    // Stores the current clock reading as the start point.
    void Start()
    {
        mStart = std::chrono::high_resolution_clock::now();
    }

    // Stores the current clock reading as the stop point.
    void End()
    {
        mStop = std::chrono::high_resolution_clock::now();
    }

    // Returns (mStop - mStart), truncated to whole microseconds and then
    // scaled by 1e-3, i.e. expressed in milliseconds.
    float GetElapsedTime() const
    {
        const auto whole_us =
            std::chrono::duration_cast<std::chrono::microseconds>(mStop - mStart).count();
        return static_cast<float>(whole_us) * 1e-3;
    }

    std::chrono::time_point<std::chrono::high_resolution_clock> mStart;
    std::chrono::time_point<std::chrono::high_resolution_clock> mStop;
};
// Allocates the WallTimerImpl that all other members forward to.
WallTimer::WallTimer()
    : impl(new WallTimerImpl())
{
}
// Defined out of line, in the translation unit where WallTimerImpl is a
// complete type; performs no work beyond destroying the members.
WallTimer::~WallTimer() = default;
// Forwards to WallTimerImpl::Start().
void WallTimer::Start()
{
    impl->Start();
}
// Forwards to WallTimerImpl::End().
void WallTimer::End()
{
    impl->End();
}
// Forwards to WallTimerImpl::GetElapsedTime() and returns its result.
float WallTimer::GetElapsedTime() const
{
    const float elapsed = impl->GetElapsedTime();
    return elapsed;
}
library/src/host_tensor/device_memory.cpp
0 → 100644
View file @
b79df771
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/device_utility/hip_check_error.hpp"
#include "ck/library/host_tensor/device_memory.hpp"
// Records the requested size in mMemSize and asks hipMalloc for that many
// bytes, storing the resulting pointer in mpDeviceBuf. The HIP status code is
// forwarded to hip_check_error.
DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
{
    void** buffer_slot = static_cast<void**>(&mpDeviceBuf);
    hip_check_error(hipMalloc(buffer_slot, mMemSize));
}
// Returns the raw pointer stored in mpDeviceBuf; no ownership is transferred.
void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; }
// Returns the size value that was passed to the constructor (mMemSize).
std::size_t DeviceMem::GetBufferSize() { return mMemSize; }
// Copies mMemSize bytes from the host pointer `p` into mpDeviceBuf using
// hipMemcpy with hipMemcpyHostToDevice. `p` must reference at least mMemSize
// readable bytes. The HIP status code is forwarded to hip_check_error.
void DeviceMem::ToDevice(const void* p)
{
    // hipMemcpy takes its source as `const void*`, so the pointer is passed
    // through unchanged instead of casting away constness.
    hip_check_error(hipMemcpy(mpDeviceBuf, p, mMemSize, hipMemcpyHostToDevice));
}
// Copies mMemSize bytes from mpDeviceBuf into the host pointer `p` using
// hipMemcpy with hipMemcpyDeviceToHost. `p` must reference at least mMemSize
// writable bytes. The HIP status code is forwarded to hip_check_error.
void DeviceMem::FromDevice(void* p)
{
    const auto status = hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost);
    hip_check_error(status);
}
// Fills all mMemSize bytes of mpDeviceBuf with zero via hipMemset and forwards
// the HIP status code to hip_check_error.
void DeviceMem::SetZero()
{
    const auto status = hipMemset(mpDeviceBuf, 0, mMemSize);
    hip_check_error(status);
}
// Releases mpDeviceBuf with hipFree and forwards the status to hip_check_error.
// NOTE(review): if hip_check_error can throw, this would throw from a
// destructor - confirm against its definition.
DeviceMem::~DeviceMem()
{
    const auto status = hipFree(mpDeviceBuf);
    hip_check_error(status);
}
library/src/host_tensor/host_tensor.cpp
View file @
b79df771
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cassert>
#include "host_tensor.hpp"
#include "ck/library/host_tensor/host_tensor.hpp"
void
HostTensorDescriptor
::
CalculateStrides
()
{
...
...
@@ -50,25 +54,3 @@ std::ostream& operator<<(std::ostream& os, const HostTensorDescriptor& desc)
return
os
;
}
// Writes a one-line summary of `desc` to `os`: the number of dimensions, then
// the lengths and the strides, each list emitted by LogRange with ", " as the
// separator and wrapped in braces. The line is terminated with std::endl.
void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream& os)
{
    os << "dim " << desc.GetNumOfDimension() << ", "
       << "lengths {";
    LogRange(os, desc.GetLengths(), ", ");

    os << "}, "
       << "strides {";
    LogRange(os, desc.GetStrides(), ", ");

    os << "}" << std::endl;
}
#if 1
// FIXME: remove
// Converts every element of src.mData to float with ck::type_convert and
// stores it at the same index of dst.mData. dst.mData is not resized, so it
// must already hold at least src.mData.size() elements.
void bf16_to_f32_(const Tensor<ck::bhalf_t>& src, Tensor<float>& dst)
{
    const std::size_t count = src.mData.size();
    for(std::size_t idx = 0; idx != count; ++idx)
    {
        dst.mData[idx] = ck::type_convert<float>(src.mData[idx]);
    }
}
#endif
Prev
1
…
15
16
17
18
19
20
21
22
23
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment