Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
a3b4c5cb
Commit
a3b4c5cb
authored
Jun 03, 2022
by
wangshaojie6
Browse files
merge develop branch and add gridwise pipeline v3
parents
48918ab9
1677cf70
Changes
361
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
123 additions
and
373 deletions
+123
-373
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.hpp
...reduce_instance_multiblock_partial_reduce_f16_f32_f16.hpp
+0
-35
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.hpp
...reduce_instance_multiblock_partial_reduce_f32_f32_f32.hpp
+0
-52
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.hpp
...reduce_instance_multiblock_partial_reduce_f32_f64_f32.hpp
+0
-27
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.hpp
...reduce_instance_multiblock_partial_reduce_f64_f64_f64.hpp
+0
-62
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.hpp
...e_reduce_instance_multiblock_partial_reduce_i8_i32_i8.hpp
+0
-31
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.hpp
...ce_reduce_instance_multiblock_partial_reduce_i8_i8_i8.hpp
+0
-47
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp
...instance/gpu/reduce/device_reduce_instance_threadwise.hpp
+36
-39
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp
.../reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp
+1
-2
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp
.../reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp
+1
-2
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp
.../reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp
+1
-2
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp
.../reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp
+0
-2
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp
.../reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp
+0
-2
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp
.../reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp
+0
-2
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp
...pu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp
+0
-2
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp
...gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp
+0
-2
library/include/ck/library/utility/check_err.hpp
library/include/ck/library/utility/check_err.hpp
+3
-3
library/include/ck/library/utility/conv_util.hpp
library/include/ck/library/utility/conv_util.hpp
+42
-42
library/include/ck/library/utility/op_instance_engine.hpp
library/include/ck/library/utility/op_instance_engine.hpp
+2
-2
library/src/host_tensor/CMakeLists.txt
library/src/host_tensor/CMakeLists.txt
+23
-2
library/src/host_tensor/device.cpp
library/src/host_tensor/device.cpp
+14
-15
No files found.
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.hpp
deleted
100644 → 0
View file @
48918ab9
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F16_F32_F16_HPP
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F16_F32_F16_HPP
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
0
,
0
,
0
,
4
,
3
);
// for ADD
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
0
,
0
,
0
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
0
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
0
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
5
,
0
,
0
,
4
,
3
);
// for AVG
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
5
,
0
,
0
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
5
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
5
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
7
,
0
,
0
,
4
,
3
);
// for NORM2
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
7
,
0
,
0
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
7
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
7
,
0
,
0
,
2
,
1
);
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.hpp
deleted
100644 → 0
View file @
48918ab9
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F32_F32_F32_HPP
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F32_F32_F32_HPP
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
0
,
4
,
3
);
// for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
0
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
0
,
4
,
3
);
// for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
0
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
0
,
4
,
3
);
// for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
0
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
1
,
4
,
3
);
// for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
1
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
1
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
1
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
1
,
4
,
3
);
// for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
1
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
1
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
1
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
1
,
4
,
3
);
// for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
1
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
1
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
1
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
7
,
0
,
0
,
4
,
3
);
// for NORM2
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
7
,
0
,
0
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
7
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
7
,
0
,
0
,
2
,
1
);
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.hpp
deleted
100644 → 0
View file @
48918ab9
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F32_F64_F32_HPP
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F32_F64_F32_HPP
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
double
,
float
,
7
,
0
,
0
,
4
,
3
);
// for NORM2
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
double
,
float
,
7
,
0
,
0
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
double
,
float
,
7
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
double
,
float
,
7
,
0
,
0
,
2
,
1
);
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.hpp
deleted
100644 → 0
View file @
48918ab9
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F64_F64_F64_HPP
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F64_F64_F64_HPP
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
4
,
3
);
// for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
4
,
3
);
// for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
4
,
3
);
// for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
4
,
3
);
// for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
4
,
3
);
// for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
4
,
3
);
// for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
4
,
3
);
// for NORM2
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
2
,
1
);
// Will be moved to use MultiBlockAtomicAdd
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
4
,
3
);
// for ADD
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
4
,
3
);
// for AVG
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
2
,
1
);
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.hpp
deleted
100644 → 0
View file @
48918ab9
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_I8_I32_I8_HPP
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_I8_I32_I8_HPP
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
int8_t
,
int32_t
,
int8_t
,
0
,
0
,
0
,
4
,
3
);
// for ADD
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
int8_t
,
int32_t
,
int8_t
,
0
,
0
,
0
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
int8_t
,
int32_t
,
int8_t
,
0
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
int8_t
,
int32_t
,
int8_t
,
0
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
int8_t
,
int32_t
,
int8_t
,
5
,
0
,
0
,
4
,
3
);
// for AVG
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
int8_t
,
int32_t
,
int8_t
,
5
,
0
,
0
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
int8_t
,
int32_t
,
int8_t
,
5
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
int8_t
,
int32_t
,
int8_t
,
5
,
0
,
0
,
2
,
1
);
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.hpp
deleted
100644 → 0
View file @
48918ab9
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_I8_I8_I8_HPP
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_I8_I8_I8_HPP
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
2
,
0
,
0
,
4
,
3
);
// for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
2
,
0
,
0
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
2
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
2
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
3
,
0
,
0
,
4
,
3
);
// for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
3
,
0
,
0
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
3
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
3
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
4
,
0
,
0
,
4
,
3
);
// for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
4
,
0
,
0
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
4
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
4
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
2
,
0
,
1
,
4
,
3
);
// for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
2
,
0
,
1
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
2
,
0
,
1
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
2
,
0
,
1
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
3
,
0
,
1
,
4
,
3
);
// for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
3
,
0
,
1
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
3
,
0
,
1
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
3
,
0
,
1
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
4
,
0
,
1
,
4
,
3
);
// for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
4
,
0
,
1
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
4
,
0
,
1
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
4
,
0
,
1
,
2
,
1
);
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
#endif
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp
View file @
a3b4c5cb
...
@@ -58,8 +58,8 @@ template <typename InDataType,
...
@@ -58,8 +58,8 @@ template <typename InDataType,
int
Rank
,
int
Rank
,
int
NumReduceDim
,
int
NumReduceDim
,
ReduceTensorOp
ReduceOpId
,
ReduceTensorOp
ReduceOpId
,
Nan
Propagat
ion
NanOpt
,
bool
Propagat
eNan
,
ReduceTensorIndices
IndicesOpt
>
bool
UseIndex
>
void
add_device_reduce_instance_threadwise
(
void
add_device_reduce_instance_threadwise
(
std
::
vector
<
deviceReduceThreadWisePtrType
<
AccDataType
,
ReduceOpId
>>&
device_op_instances
)
std
::
vector
<
deviceReduceThreadWisePtrType
<
AccDataType
,
ReduceOpId
>>&
device_op_instances
)
{
{
...
@@ -73,9 +73,7 @@ void add_device_reduce_instance_threadwise(
...
@@ -73,9 +73,7 @@ void add_device_reduce_instance_threadwise(
constexpr
bool
Indexable
=
constexpr
bool
Indexable
=
(
ReduceOpId
==
ReduceTensorOp
::
MIN
||
ReduceOpId
==
ReduceTensorOp
::
MAX
||
(
ReduceOpId
==
ReduceTensorOp
::
MIN
||
ReduceOpId
==
ReduceTensorOp
::
MAX
||
ReduceOpId
==
ReduceTensorOp
::
AMAX
);
ReduceOpId
==
ReduceTensorOp
::
AMAX
);
constexpr
bool
NeedIndices
=
Indexable
&&
(
IndicesOpt
!=
ReduceTensorIndices
::
NO_INDICES
);
constexpr
bool
OutputIndex
=
Indexable
&&
UseIndex
;
constexpr
bool
PropagateNan
=
(
NanOpt
==
NanPropagation
::
NOT_PROPAGATE_NAN
)
?
false
:
true
;
using
cfg1
=
ReductionConfiguration_1
<
256
,
256
,
1
>
;
using
cfg1
=
ReductionConfiguration_1
<
256
,
256
,
1
>
;
...
@@ -93,10 +91,9 @@ void add_device_reduce_instance_threadwise(
...
@@ -93,10 +91,9 @@ void add_device_reduce_instance_threadwise(
InElementwiseOperation
,
InElementwiseOperation
,
AccElementwiseOperation
,
AccElementwiseOperation
,
PropagateNan
,
PropagateNan
,
NeedIndices
,
OutputIndex
,
false
,
// HaveIndexInputIfOutputIndex
cfg1
::
BlockSize_
,
cfg1
::
BlockSize_
,
cfg1
::
MThreadClusterSize_
,
cfg1
::
KThreadClusterSize_
,
cfg2
::
MThreadSliceSize_
,
cfg2
::
MThreadSliceSize_
,
cfg2
::
KThreadSliceSize_
,
cfg2
::
KThreadSliceSize_
,
cfg2
::
InSrcVectorDim_
,
cfg2
::
InSrcVectorDim_
,
...
@@ -107,54 +104,54 @@ void add_device_reduce_instance_threadwise(
...
@@ -107,54 +104,54 @@ void add_device_reduce_instance_threadwise(
});
});
};
};
#define ADD_THREADWISE_INST_BY_TYPE( \
#define ADD_THREADWISE_INST_BY_TYPE(
\
inT, compT, outT, ReduceOpId,
NanOpt, IndicesOpt
, Rank, NumReduceDim) \
inT, compT, outT, ReduceOpId,
PropagateNan, UseIndex
, Rank, NumReduceDim) \
template void add_device_reduce_instance_threadwise<inT, \
template void add_device_reduce_instance_threadwise<inT,
\
compT, \
compT,
\
outT, \
outT,
\
Rank, \
Rank,
\
NumReduceDim, \
NumReduceDim,
\
ReduceOpId, \
ReduceOpId,
\
NanOpt,
\
PropagateNan,
\
IndicesOpt>(
\
UseIndex>(
\
std::vector<deviceReduceThreadWisePtrType<compT, ReduceOpId>> & device_op_instances)
std::vector<deviceReduceThreadWisePtrType<compT, ReduceOpId>> & device_op_instances)
#define ADD_THREADWISE_INST_BY_ID(
\
#define ADD_THREADWISE_INST_BY_ID( \
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)
\
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
ADD_THREADWISE_INST_BY_TYPE(inT,
\
ADD_THREADWISE_INST_BY_TYPE(inT, \
compT,
\
compT, \
outT,
\
outT, \
static_cast<ReduceTensorOp>(ReduceOpId),
\
static_cast<ReduceTensorOp>(ReduceOpId), \
static_cast<
NanPropagation
>(NanOpt), \
static_cast<
bool
>(NanOpt),
\
static_cast<
ReduceTensorIndices
>(IndicesOpt), \
static_cast<
bool
>(IndicesOpt),
\
Rank,
\
Rank, \
NumReduceDim)
NumReduceDim)
#define ADD_THREADWISE_INST_REF_BY_TYPE( \
#define ADD_THREADWISE_INST_REF_BY_TYPE( \
inT, compT, outT, ReduceOpId,
NanOpt, IndicesOpt
, Rank, NumReduceDim)
\
inT, compT, outT, ReduceOpId,
PropagateNan, UseIndex
, Rank, NumReduceDim) \
extern template void add_device_reduce_instance_threadwise<inT, \
extern template void add_device_reduce_instance_threadwise<inT, \
compT, \
compT, \
outT, \
outT, \
Rank, \
Rank, \
NumReduceDim, \
NumReduceDim, \
ReduceOpId, \
ReduceOpId, \
NanOpt,
\
PropagateNan,
\
IndicesOpt>(
\
UseIndex>(
\
std::vector<DeviceReducePtr< \
std::vector<DeviceReducePtr< \
typename reduce_unary_operator<compT, ReduceOpId, true, true>::InElementwiseOperation, \
typename reduce_unary_operator<compT, ReduceOpId, true, true>::InElementwiseOperation, \
typename reduce_unary_operator<compT, ReduceOpId, true, true>:: \
typename reduce_unary_operator<compT, ReduceOpId, true, true>:: \
AccElementwiseOperation>> & \
AccElementwiseOperation>> & \
device_op_instances)
device_op_instances)
#define ADD_THREADWISE_INST_REF_BY_ID(
\
#define ADD_THREADWISE_INST_REF_BY_ID( \
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)
\
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
ADD_THREADWISE_INST_REF_BY_TYPE(inT,
\
ADD_THREADWISE_INST_REF_BY_TYPE(inT, \
compT,
\
compT, \
outT,
\
outT, \
static_cast<ReduceTensorOp>(ReduceOpId),
\
static_cast<ReduceTensorOp>(ReduceOpId), \
static_cast<
NanPropagation
>(NanOpt), \
static_cast<
bool
>(NanOpt),
\
static_cast<
ReduceTensorIndices
>(IndicesOpt), \
static_cast<
bool
>(IndicesOpt),
\
Rank,
\
Rank, \
NumReduceDim)
NumReduceDim)
}
// namespace device_reduce_instance
}
// namespace device_reduce_instance
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp
View file @
a3b4c5cb
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_B16_F32_B16_HPP
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_B16_F32_B16_HPP
#define DEVICE_REDUCE_INSTANCE_THREADWISE_B16_F32_B16_HPP
#define DEVICE_REDUCE_INSTANCE_THREADWISE_B16_F32_B16_HPP
#include "reduction_enums.hpp"
#include "data_type.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_threadwise.hpp"
#include "device_reduce_instance_threadwise.hpp"
namespace
ck
{
namespace
ck
{
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp
View file @
a3b4c5cb
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F16_F16_HPP
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F16_F16_HPP
#define DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F16_F16_HPP
#define DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F16_F16_HPP
#include "reduction_enums.hpp"
#include "data_type.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_threadwise.hpp"
#include "device_reduce_instance_threadwise.hpp"
namespace
ck
{
namespace
ck
{
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp
View file @
a3b4c5cb
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F32_F16_HPP
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F32_F16_HPP
#define DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F32_F16_HPP
#define DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F32_F16_HPP
#include "reduction_enums.hpp"
#include "data_type.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_threadwise.hpp"
#include "device_reduce_instance_threadwise.hpp"
namespace
ck
{
namespace
ck
{
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp
View file @
a3b4c5cb
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F32_F32_HPP
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F32_F32_HPP
#define DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F32_F32_HPP
#define DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F32_F32_HPP
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_threadwise.hpp"
#include "device_reduce_instance_threadwise.hpp"
namespace
ck
{
namespace
ck
{
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp
View file @
a3b4c5cb
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F64_F32_HPP
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F64_F32_HPP
#define DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F64_F32_HPP
#define DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F64_F32_HPP
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_threadwise.hpp"
#include "device_reduce_instance_threadwise.hpp"
namespace
ck
{
namespace
ck
{
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp
View file @
a3b4c5cb
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F64_F64_F64_HPP
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F64_F64_F64_HPP
#define DEVICE_REDUCE_INSTANCE_THREADWISE_F64_F64_F64_HPP
#define DEVICE_REDUCE_INSTANCE_THREADWISE_F64_F64_F64_HPP
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_threadwise.hpp"
#include "device_reduce_instance_threadwise.hpp"
namespace
ck
{
namespace
ck
{
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp
View file @
a3b4c5cb
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I32_I8_HPP
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I32_I8_HPP
#define DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I32_I8_HPP
#define DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I32_I8_HPP
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_threadwise.hpp"
#include "device_reduce_instance_threadwise.hpp"
namespace
ck
{
namespace
ck
{
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp
View file @
a3b4c5cb
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I8_I8_HPP
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I8_I8_HPP
#define DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I8_I8_HPP
#define DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I8_I8_HPP
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_threadwise.hpp"
#include "device_reduce_instance_threadwise.hpp"
namespace
ck
{
namespace
ck
{
...
...
library/include/ck/library/utility/check_err.hpp
View file @
a3b4c5cb
...
@@ -24,7 +24,7 @@ check_err(const std::vector<T>& out,
...
@@ -24,7 +24,7 @@ check_err(const std::vector<T>& out,
const
std
::
vector
<
T
>&
ref
,
const
std
::
vector
<
T
>&
ref
,
const
std
::
string
&
msg
=
"Error: Incorrect results!"
,
const
std
::
string
&
msg
=
"Error: Incorrect results!"
,
double
rtol
=
1e-5
,
double
rtol
=
1e-5
,
double
atol
=
1
e-
8
)
double
atol
=
3
e-
6
)
{
{
if
(
out
.
size
()
!=
ref
.
size
())
if
(
out
.
size
()
!=
ref
.
size
())
{
{
...
@@ -173,8 +173,8 @@ check_err(const std::vector<T>& out,
...
@@ -173,8 +173,8 @@ check_err(const std::vector<T>& out,
{
{
if
(
out
[
i
]
!=
ref
[
i
])
if
(
out
[
i
]
!=
ref
[
i
])
{
{
std
::
cout
<<
"out["
<<
i
<<
"] != ref["
<<
i
<<
"]: "
<<
out
[
i
]
<<
" != "
<<
ref
[
i
]
std
::
cout
<<
"out["
<<
i
<<
"] != ref["
<<
i
<<
"]: "
<<
static_cast
<
int
>
(
out
[
i
]
)
<<
std
::
endl
<<
" != "
<<
static_cast
<
int
>
(
ref
[
i
])
<<
std
::
endl
<<
msg
<<
std
::
endl
;
<<
msg
<<
std
::
endl
;
return
false
;
return
false
;
}
}
...
...
library/include/ck/library/utility/conv_
fwd_
util.hpp
→
library/include/ck/library/utility/conv_util.hpp
View file @
a3b4c5cb
...
@@ -146,19 +146,19 @@ struct ConvParams
...
@@ -146,19 +146,19 @@ struct ConvParams
const
std
::
vector
<
ck
::
index_t
>&
left_pads
,
const
std
::
vector
<
ck
::
index_t
>&
left_pads
,
const
std
::
vector
<
ck
::
index_t
>&
right_pads
);
const
std
::
vector
<
ck
::
index_t
>&
right_pads
);
ck
::
index_t
num_dim_spatial
;
ck
::
index_t
num_dim_spatial
_
;
ck
::
index_t
N
;
ck
::
index_t
N
_
;
ck
::
index_t
K
;
ck
::
index_t
K
_
;
ck
::
index_t
C
;
ck
::
index_t
C
_
;
std
::
vector
<
ck
::
index_t
>
filter_spatial_lengths
;
std
::
vector
<
ck
::
index_t
>
filter_spatial_lengths
_
;
std
::
vector
<
ck
::
index_t
>
input_spatial_lengths
;
std
::
vector
<
ck
::
index_t
>
input_spatial_lengths
_
;
std
::
vector
<
ck
::
index_t
>
conv_filter_strides
;
std
::
vector
<
ck
::
index_t
>
conv_filter_strides
_
;
std
::
vector
<
ck
::
index_t
>
conv_filter_dilations
;
std
::
vector
<
ck
::
index_t
>
conv_filter_dilations
_
;
std
::
vector
<
ck
::
index_t
>
input_left_pads
;
std
::
vector
<
ck
::
index_t
>
input_left_pads
_
;
std
::
vector
<
ck
::
index_t
>
input_right_pads
;
std
::
vector
<
ck
::
index_t
>
input_right_pads
_
;
std
::
vector
<
ck
::
index_t
>
GetOutputSpatialLengths
()
const
;
std
::
vector
<
ck
::
index_t
>
GetOutputSpatialLengths
()
const
;
};
};
...
@@ -268,10 +268,10 @@ void run_reference_convolution_forward(const ConvParams& params,
...
@@ -268,10 +268,10 @@ void run_reference_convolution_forward(const ConvParams& params,
auto
ref_argument
=
ref_conv
.
MakeArgument
(
input
,
auto
ref_argument
=
ref_conv
.
MakeArgument
(
input
,
weights
,
weights
,
output
,
output
,
params
.
conv_filter_strides
,
params
.
conv_filter_strides
_
,
params
.
conv_filter_dilations
,
params
.
conv_filter_dilations
_
,
params
.
input_left_pads
,
params
.
input_left_pads
_
,
params
.
input_right_pads
,
params
.
input_right_pads
_
,
PassThrough
{},
PassThrough
{},
PassThrough
{},
PassThrough
{},
PassThrough
{});
PassThrough
{});
...
@@ -437,17 +437,17 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType,
...
@@ -437,17 +437,17 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType,
virtual
InTensorsTuple
GetInputTensors
()
const
override
virtual
InTensorsTuple
GetInputTensors
()
const
override
{
{
std
::
vector
<
std
::
size_t
>
input_dims
{
static_cast
<
std
::
size_t
>
(
params_
.
N
),
std
::
vector
<
std
::
size_t
>
input_dims
{
static_cast
<
std
::
size_t
>
(
params_
.
N
_
),
static_cast
<
std
::
size_t
>
(
params_
.
C
)};
static_cast
<
std
::
size_t
>
(
params_
.
C
_
)};
input_dims
.
insert
(
std
::
end
(
input_dims
),
input_dims
.
insert
(
std
::
end
(
input_dims
),
std
::
begin
(
params_
.
input_spatial_lengths
),
std
::
begin
(
params_
.
input_spatial_lengths
_
),
std
::
end
(
params_
.
input_spatial_lengths
));
std
::
end
(
params_
.
input_spatial_lengths
_
));
std
::
vector
<
std
::
size_t
>
filter_dims
{
static_cast
<
std
::
size_t
>
(
params_
.
K
),
std
::
vector
<
std
::
size_t
>
filter_dims
{
static_cast
<
std
::
size_t
>
(
params_
.
K
_
),
static_cast
<
std
::
size_t
>
(
params_
.
C
)};
static_cast
<
std
::
size_t
>
(
params_
.
C
_
)};
filter_dims
.
insert
(
std
::
end
(
filter_dims
),
filter_dims
.
insert
(
std
::
end
(
filter_dims
),
std
::
begin
(
params_
.
filter_spatial_lengths
),
std
::
begin
(
params_
.
filter_spatial_lengths
_
),
std
::
end
(
params_
.
filter_spatial_lengths
));
std
::
end
(
params_
.
filter_spatial_lengths
_
));
auto
input
=
std
::
make_unique
<
Tensor
<
InDataType
>>
(
auto
input
=
std
::
make_unique
<
Tensor
<
InDataType
>>
(
get_host_tensor_descriptor
(
input_dims
,
InLayout
{}));
get_host_tensor_descriptor
(
input_dims
,
InLayout
{}));
...
@@ -465,8 +465,8 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType,
...
@@ -465,8 +465,8 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType,
virtual
TensorPtr
<
OutDataType
>
GetOutputTensor
()
const
override
virtual
TensorPtr
<
OutDataType
>
GetOutputTensor
()
const
override
{
{
std
::
vector
<
std
::
size_t
>
output_dims
{
static_cast
<
std
::
size_t
>
(
params_
.
N
),
std
::
vector
<
std
::
size_t
>
output_dims
{
static_cast
<
std
::
size_t
>
(
params_
.
N
_
),
static_cast
<
std
::
size_t
>
(
params_
.
K
)};
static_cast
<
std
::
size_t
>
(
params_
.
K
_
)};
output_dims
.
insert
(
std
::
end
(
output_dims
),
output_dims
.
insert
(
std
::
end
(
output_dims
),
std
::
begin
(
output_spatial_lengths_
),
std
::
begin
(
output_spatial_lengths_
),
std
::
end
(
output_spatial_lengths_
));
std
::
end
(
output_spatial_lengths_
));
...
@@ -522,16 +522,16 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType,
...
@@ -522,16 +522,16 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType,
static_cast
<
InDataType
*>
(
in_device_buffers
[
0
]
->
GetDeviceBuffer
()),
static_cast
<
InDataType
*>
(
in_device_buffers
[
0
]
->
GetDeviceBuffer
()),
static_cast
<
WeiDataType
*>
(
in_device_buffers
[
1
]
->
GetDeviceBuffer
()),
static_cast
<
WeiDataType
*>
(
in_device_buffers
[
1
]
->
GetDeviceBuffer
()),
static_cast
<
OutDataType
*>
(
out_device_buffer
->
GetDeviceBuffer
()),
static_cast
<
OutDataType
*>
(
out_device_buffer
->
GetDeviceBuffer
()),
params_
.
N
,
params_
.
N
_
,
params_
.
K
,
params_
.
K
_
,
params_
.
C
,
params_
.
C
_
,
params_
.
input_spatial_lengths
,
params_
.
input_spatial_lengths
_
,
params_
.
filter_spatial_lengths
,
params_
.
filter_spatial_lengths
_
,
output_spatial_lengths_
,
output_spatial_lengths_
,
params_
.
conv_filter_strides
,
params_
.
conv_filter_strides
_
,
params_
.
conv_filter_dilations
,
params_
.
conv_filter_dilations
_
,
params_
.
input_left_pads
,
params_
.
input_left_pads
_
,
params_
.
input_right_pads
,
params_
.
input_right_pads
_
,
InElementwiseOp
{},
InElementwiseOp
{},
WeiElementwiseOp
{},
WeiElementwiseOp
{},
OutElementwiseOp
{});
OutElementwiseOp
{});
...
@@ -539,20 +539,20 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType,
...
@@ -539,20 +539,20 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType,
virtual
std
::
size_t
GetFlops
()
const
override
virtual
std
::
size_t
GetFlops
()
const
override
{
{
return
get_flops
(
params_
.
N
,
return
get_flops
(
params_
.
N
_
,
params_
.
C
,
params_
.
C
_
,
params_
.
K
,
params_
.
K
_
,
params_
.
filter_spatial_lengths
,
params_
.
filter_spatial_lengths
_
,
output_spatial_lengths_
);
output_spatial_lengths_
);
}
}
virtual
std
::
size_t
GetBtype
()
const
override
virtual
std
::
size_t
GetBtype
()
const
override
{
{
return
get_btype
<
InDataType
,
WeiDataType
,
OutDataType
>
(
params_
.
N
,
return
get_btype
<
InDataType
,
WeiDataType
,
OutDataType
>
(
params_
.
N
_
,
params_
.
C
,
params_
.
C
_
,
params_
.
K
,
params_
.
K
_
,
params_
.
input_spatial_lengths
,
params_
.
input_spatial_lengths
_
,
params_
.
filter_spatial_lengths
,
params_
.
filter_spatial_lengths
_
,
output_spatial_lengths_
);
output_spatial_lengths_
);
}
}
...
...
library/include/ck/library/utility/op_instance_engine.hpp
View file @
a3b4c5cb
...
@@ -128,7 +128,7 @@ class OpInstanceRunEngine
...
@@ -128,7 +128,7 @@ class OpInstanceRunEngine
template
<
typename
OpInstancePtr
>
template
<
typename
OpInstancePtr
>
ProfileBestConfig
Profile
(
const
std
::
vector
<
OpInstancePtr
>&
op_ptrs
,
ProfileBestConfig
Profile
(
const
std
::
vector
<
OpInstancePtr
>&
op_ptrs
,
int
nrepeat
=
100
,
bool
time_kernel
=
false
,
bool
do_verification
=
false
,
bool
do_verification
=
false
,
bool
do_log
=
false
)
bool
do_log
=
false
)
{
{
...
@@ -143,7 +143,7 @@ class OpInstanceRunEngine
...
@@ -143,7 +143,7 @@ class OpInstanceRunEngine
if
(
op_ptr
->
IsSupportedArgument
(
argument
.
get
()))
if
(
op_ptr
->
IsSupportedArgument
(
argument
.
get
()))
{
{
std
::
string
op_name
=
op_ptr
->
GetTypeString
();
std
::
string
op_name
=
op_ptr
->
GetTypeString
();
float
avg_time
=
invoker
->
Run
(
argument
.
get
(),
nrepeat
);
float
avg_time
=
invoker
->
Run
(
argument
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
}
);
std
::
size_t
flops
=
op_instance_
.
GetFlops
();
std
::
size_t
flops
=
op_instance_
.
GetFlops
();
std
::
size_t
num_btype
=
op_instance_
.
GetBtype
();
std
::
size_t
num_btype
=
op_instance_
.
GetBtype
();
...
...
library/src/host_tensor/CMakeLists.txt
View file @
a3b4c5cb
...
@@ -10,10 +10,31 @@ set(HOST_TENSOR_SOURCE
...
@@ -10,10 +10,31 @@ set(HOST_TENSOR_SOURCE
host_tensor.cpp
host_tensor.cpp
)
)
add_library
(
host_tensor SHARED
${
HOST_TENSOR_SOURCE
}
)
add_library
(
host_tensor STATIC
${
HOST_TENSOR_SOURCE
}
)
add_library
(
composable_kernel::host_tensor ALIAS host_tensor
)
target_compile_features
(
host_tensor PUBLIC
)
target_compile_features
(
host_tensor PUBLIC
)
set_target_properties
(
host_tensor PROPERTIES POSITION_INDEPENDENT_CODE ON
)
set_target_properties
(
host_tensor PROPERTIES POSITION_INDEPENDENT_CODE ON
)
target_include_directories
(
host_tensor SYSTEM PUBLIC $<BUILD_INTERFACE:
${
HALF_INCLUDE_DIR
}
>
)
target_include_directories
(
host_tensor SYSTEM PUBLIC $<BUILD_INTERFACE:
${
HALF_INCLUDE_DIR
}
>
)
install
(
TARGETS host_tensor LIBRARY DESTINATION lib
)
target_include_directories
(
host_tensor PUBLIC
"$<INSTALL_INTERFACE:
${
CMAKE_INSTALL_INCLUDEDIR
}
/ck>"
"$<INSTALL_INTERFACE:
${
CMAKE_INSTALL_INCLUDEDIR
}
/ck/utility>"
"$<INSTALL_INTERFACE:
${
CMAKE_INSTALL_INCLUDEDIR
}
/ck/library/host_tensor>"
)
install
(
TARGETS host_tensor
EXPORT host_tensorTargets
LIBRARY DESTINATION
${
CMAKE_INSTALL_LIBDIR
}
ARCHIVE DESTINATION
${
CMAKE_INSTALL_LIBDIR
}
RUNTIME DESTINATION
${
CMAKE_INSTALL_BINDIR
}
INCLUDES DESTINATION
${
CMAKE_INSTALL_INCLUDEDIR
}
)
install
(
EXPORT host_tensorTargets
FILE composable_kernelhost_tensorTargets.cmake
NAMESPACE composable_kernel::
DESTINATION
${
CMAKE_INSTALL_LIBDIR
}
/cmake/composable_kernel
)
clang_tidy_check
(
host_tensor
)
clang_tidy_check
(
host_tensor
)
library/src/host_tensor/device.cpp
View file @
a3b4c5cb
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
DeviceMem
::
DeviceMem
(
std
::
size_t
mem_size
)
:
mMemSize
(
mem_size
)
DeviceMem
::
DeviceMem
(
std
::
size_t
mem_size
)
:
mMemSize
(
mem_size
)
{
{
hip
GetErrorString
(
hipMalloc
(
static_cast
<
void
**>
(
&
mpDeviceBuf
),
mMemSize
));
hip
_check_error
(
hipMalloc
(
static_cast
<
void
**>
(
&
mpDeviceBuf
),
mMemSize
));
}
}
void
*
DeviceMem
::
GetDeviceBuffer
()
{
return
mpDeviceBuf
;
}
void
*
DeviceMem
::
GetDeviceBuffer
()
{
return
mpDeviceBuf
;
}
...
@@ -11,49 +11,48 @@ std::size_t DeviceMem::GetBufferSize() { return mMemSize; }
...
@@ -11,49 +11,48 @@ std::size_t DeviceMem::GetBufferSize() { return mMemSize; }
void
DeviceMem
::
ToDevice
(
const
void
*
p
)
void
DeviceMem
::
ToDevice
(
const
void
*
p
)
{
{
hipGetErrorString
(
hip_check_error
(
hipMemcpy
(
mpDeviceBuf
,
const_cast
<
void
*>
(
p
),
mMemSize
,
hipMemcpyHostToDevice
));
hipMemcpy
(
mpDeviceBuf
,
const_cast
<
void
*>
(
p
),
mMemSize
,
hipMemcpyHostToDevice
));
}
}
void
DeviceMem
::
FromDevice
(
void
*
p
)
void
DeviceMem
::
FromDevice
(
void
*
p
)
{
{
hip
GetErrorString
(
hipMemcpy
(
p
,
mpDeviceBuf
,
mMemSize
,
hipMemcpyDeviceToHost
));
hip
_check_error
(
hipMemcpy
(
p
,
mpDeviceBuf
,
mMemSize
,
hipMemcpyDeviceToHost
));
}
}
void
DeviceMem
::
SetZero
()
{
hip
GetErrorString
(
hipMemset
(
mpDeviceBuf
,
0
,
mMemSize
));
}
void
DeviceMem
::
SetZero
()
{
hip
_check_error
(
hipMemset
(
mpDeviceBuf
,
0
,
mMemSize
));
}
DeviceMem
::~
DeviceMem
()
{
hip
GetErrorString
(
hipFree
(
mpDeviceBuf
));
}
DeviceMem
::~
DeviceMem
()
{
hip
_check_error
(
hipFree
(
mpDeviceBuf
));
}
struct
KernelTimerImpl
struct
KernelTimerImpl
{
{
KernelTimerImpl
()
KernelTimerImpl
()
{
{
hip
GetErrorString
(
hipEventCreate
(
&
mStart
));
hip
_check_error
(
hipEventCreate
(
&
mStart
));
hip
GetErrorString
(
hipEventCreate
(
&
mEnd
));
hip
_check_error
(
hipEventCreate
(
&
mEnd
));
}
}
~
KernelTimerImpl
()
~
KernelTimerImpl
()
{
{
hip
GetErrorString
(
hipEventDestroy
(
mStart
));
hip
_check_error
(
hipEventDestroy
(
mStart
));
hip
GetErrorString
(
hipEventDestroy
(
mEnd
));
hip
_check_error
(
hipEventDestroy
(
mEnd
));
}
}
void
Start
()
void
Start
()
{
{
hip
GetErrorString
(
hipDeviceSynchronize
());
hip
_check_error
(
hipDeviceSynchronize
());
hip
GetErrorString
(
hipEventRecord
(
mStart
,
nullptr
));
hip
_check_error
(
hipEventRecord
(
mStart
,
nullptr
));
}
}
void
End
()
void
End
()
{
{
hip
GetErrorString
(
hipEventRecord
(
mEnd
,
nullptr
));
hip
_check_error
(
hipEventRecord
(
mEnd
,
nullptr
));
hip
GetErrorString
(
hipEventSynchronize
(
mEnd
));
hip
_check_error
(
hipEventSynchronize
(
mEnd
));
}
}
float
GetElapsedTime
()
const
float
GetElapsedTime
()
const
{
{
float
time
;
float
time
;
hip
GetErrorString
(
hipEventElapsedTime
(
&
time
,
mStart
,
mEnd
));
hip
_check_error
(
hipEventElapsedTime
(
&
time
,
mStart
,
mEnd
));
return
time
;
return
time
;
}
}
...
...
Prev
1
…
7
8
9
10
11
12
13
14
15
…
19
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment