Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
6ef4e211
Commit
6ef4e211
authored
Jul 05, 2022
by
Chao Liu
Browse files
Merge remote-tracking branch 'origin/develop' into contraction
parents
b0a2afb9
9e4429f9
Changes
367
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
146 additions
and
38 deletions
+146
-38
library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt
...tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt
+2
-2
library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp
...device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp
+3
-3
library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp
...device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp
+3
-3
library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp
...device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp
+3
-3
library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp
...device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp
+3
-3
library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt
...ensor_operation_instance/gpu/normalization/CMakeLists.txt
+10
-0
library/src/tensor_operation_instance/gpu/normalization/device_softmax_f16_f16_instance.cpp
...nce/gpu/normalization/device_softmax_f16_f16_instance.cpp
+50
-0
library/src/tensor_operation_instance/gpu/normalization/device_softmax_f32_f32_instance.cpp
...nce/gpu/normalization/device_softmax_f32_f32_instance.cpp
+48
-0
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.cpp
...u/reduce/device_reduce_instance_blockwise_b16_f32_b16.cpp
+2
-2
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.cpp
...u/reduce/device_reduce_instance_blockwise_f16_f16_f16.cpp
+2
-2
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.cpp
...u/reduce/device_reduce_instance_blockwise_f16_f32_f16.cpp
+2
-2
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.cpp
...u/reduce/device_reduce_instance_blockwise_f32_f32_f32.cpp
+2
-2
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.cpp
...u/reduce/device_reduce_instance_blockwise_f32_f64_f32.cpp
+2
-2
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.cpp
...u/reduce/device_reduce_instance_blockwise_f64_f64_f64.cpp
+2
-2
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.cpp
...gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.cpp
+2
-2
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.cpp
.../gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.cpp
+2
-2
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.cpp
...ice_reduce_instance_multiblock_atomic_add_b16_f32_f32.cpp
+2
-2
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp
...ice_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp
+2
-2
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.cpp
...ice_reduce_instance_multiblock_atomic_add_f32_f32_f32.cpp
+2
-2
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.cpp
...ice_reduce_instance_multiblock_atomic_add_f32_f64_f32.cpp
+2
-2
No files found.
library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt
View file @
6ef4e211
...
@@ -6,10 +6,10 @@ set(DEVICE_GROUPED_GEMM_INSTANCE_SOURCE
...
@@ -6,10 +6,10 @@ set(DEVICE_GROUPED_GEMM_INSTANCE_SOURCE
device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp;
device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp;
)
)
add_library
(
device_grouped_gemm_instance OBJECT
${
DEVICE_GROUPED_GEMM_INSTANCE_SOURCE
}
)
add_library
(
device_grouped_gemm_instance OBJECT
${
DEVICE_GROUPED_GEMM_INSTANCE_SOURCE
}
)
target_compile_features
(
device_grouped_gemm_instance PUBLIC
)
target_compile_features
(
device_grouped_gemm_instance PUBLIC
)
set_target_properties
(
device_grouped_gemm_instance PROPERTIES POSITION_INDEPENDENT_CODE ON
)
set_target_properties
(
device_grouped_gemm_instance PROPERTIES POSITION_INDEPENDENT_CODE ON
)
install
(
TARGETS device_grouped_gemm_instance
LIBRARY DESTINATION lib
)
rocm_
install
(
TARGETS device_grouped_gemm_instance
)
clang_tidy_check
(
device_grouped_gemm_instance
)
clang_tidy_check
(
device_grouped_gemm_instance
)
library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp
View file @
6ef4e211
...
@@ -7,12 +7,12 @@
...
@@ -7,12 +7,12 @@
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/
add_
device_operation_instance.hpp"
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
namespace
device
{
namespace
device
{
namespace
device_grouped_gemm_
instance
{
namespace
instance
{
using
F16
=
ck
::
half_t
;
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
F32
=
float
;
...
@@ -52,7 +52,7 @@ void add_device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instances(
...
@@ -52,7 +52,7 @@ void add_device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instances(
device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instances
{});
device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instances
{});
}
}
}
// namespace
device_grouped_gemm_
instance
}
// namespace instance
}
// namespace device
}
// namespace device
}
// namespace tensor_operation
}
// namespace tensor_operation
}
// namespace ck
}
// namespace ck
library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp
View file @
6ef4e211
...
@@ -7,12 +7,12 @@
...
@@ -7,12 +7,12 @@
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/
add_
device_operation_instance.hpp"
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
namespace
device
{
namespace
device
{
namespace
device_grouped_gemm_
instance
{
namespace
instance
{
using
F16
=
ck
::
half_t
;
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
F32
=
float
;
...
@@ -52,7 +52,7 @@ void add_device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances(
...
@@ -52,7 +52,7 @@ void add_device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances(
device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances
{});
device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances
{});
}
}
}
// namespace
device_grouped_gemm_
instance
}
// namespace instance
}
// namespace device
}
// namespace device
}
// namespace tensor_operation
}
// namespace tensor_operation
}
// namespace ck
}
// namespace ck
library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp
View file @
6ef4e211
...
@@ -7,12 +7,12 @@
...
@@ -7,12 +7,12 @@
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/
add_
device_operation_instance.hpp"
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
namespace
device
{
namespace
device
{
namespace
device_grouped_gemm_
instance
{
namespace
instance
{
using
F16
=
ck
::
half_t
;
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
F32
=
float
;
...
@@ -61,7 +61,7 @@ void add_device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instances(
...
@@ -61,7 +61,7 @@ void add_device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instances(
device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instances
{});
device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instances
{});
}
}
}
// namespace
device_grouped_gemm_
instance
}
// namespace instance
}
// namespace device
}
// namespace device
}
// namespace tensor_operation
}
// namespace tensor_operation
}
// namespace ck
}
// namespace ck
library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp
View file @
6ef4e211
...
@@ -7,12 +7,12 @@
...
@@ -7,12 +7,12 @@
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/
add_
device_operation_instance.hpp"
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
namespace
device
{
namespace
device
{
namespace
device_grouped_gemm_
instance
{
namespace
instance
{
using
F16
=
ck
::
half_t
;
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
F32
=
float
;
...
@@ -72,7 +72,7 @@ void add_device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(
...
@@ -72,7 +72,7 @@ void add_device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(
instances
,
device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_irregular_tile_instances
{});
instances
,
device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_irregular_tile_instances
{});
}
}
}
// namespace
device_grouped_gemm_
instance
}
// namespace instance
}
// namespace device
}
// namespace device
}
// namespace tensor_operation
}
// namespace tensor_operation
}
// namespace ck
}
// namespace ck
library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt
0 → 100644
View file @
6ef4e211
# device_normalization_instance
set
(
DEVICE_NORMALIZATION_INSTANCE_SOURCE
device_softmax_f32_f32_instance.cpp
device_softmax_f16_f16_instance.cpp
)
add_library
(
device_normalization_instance OBJECT
${
DEVICE_NORMALIZATION_INSTANCE_SOURCE
}
)
set_target_properties
(
device_normalization_instance PROPERTIES POSITION_INDEPENDENT_CODE ON
)
clang_tidy_check
(
device_normalization_instance
)
library/src/tensor_operation_instance/gpu/normalization/device_softmax_f16_f16_instance.cpp
0 → 100644
View file @
6ef4e211
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
template
<
index_t
Rank
,
index_t
Reduce
>
using
device_softmax_f16_f16_instances
=
std
::
tuple
<
// clang-format off
// InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
DeviceSoftmax
<
F16
,
F32
,
F16
,
Rank
,
Reduce
,
256
,
8
,
32
,
1
,
8
,
1
,
1
,
1
>
,
// fallback kernel
DeviceSoftmax
<
F16
,
F32
,
F16
,
Rank
,
Reduce
,
256
,
8
,
32
,
1
,
8
,
1
,
8
,
8
>
,
DeviceSoftmax
<
F16
,
F32
,
F16
,
Rank
,
Reduce
,
256
,
4
,
64
,
1
,
8
,
1
,
8
,
8
>
,
DeviceSoftmax
<
F16
,
F32
,
F16
,
Rank
,
Reduce
,
256
,
2
,
128
,
1
,
8
,
1
,
8
,
8
>
,
DeviceSoftmax
<
F16
,
F32
,
F16
,
Rank
,
Reduce
,
256
,
2
,
128
,
1
,
16
,
1
,
8
,
8
>
,
DeviceSoftmax
<
F16
,
F32
,
F16
,
Rank
,
Reduce
,
256
,
2
,
128
,
1
,
32
,
1
,
8
,
8
>
,
DeviceSoftmax
<
F16
,
F32
,
F16
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
8
,
1
,
8
,
8
>
,
DeviceSoftmax
<
F16
,
F32
,
F16
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
16
,
1
,
8
,
8
>
,
DeviceSoftmax
<
F16
,
F32
,
F16
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
32
,
1
,
8
,
8
>
// clang-format on
>
;
void
add_device_softmax_f16_f16_rank3_instances
(
std
::
vector
<
DeviceNormalizationPtr
>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
3
,
1
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
3
,
2
>
{});
}
void
add_device_softmax_f16_f16_rank4_instances
(
std
::
vector
<
DeviceNormalizationPtr
>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
4
,
1
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
4
,
2
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
4
,
3
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/normalization/device_softmax_f32_f32_instance.cpp
0 → 100644
View file @
6ef4e211
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
#include "ck/utility/data_type.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
F32
=
float
;
template
<
index_t
Rank
,
index_t
Reduce
>
using
device_softmax_f32_f32_instances
=
std
::
tuple
<
// clang-format off
// InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
DeviceSoftmax
<
F32
,
F32
,
F32
,
Rank
,
Reduce
,
256
,
8
,
32
,
1
,
8
,
1
,
1
,
1
>
,
// fallback kernel
DeviceSoftmax
<
F32
,
F32
,
F32
,
Rank
,
Reduce
,
256
,
8
,
32
,
1
,
8
,
1
,
4
,
4
>
,
DeviceSoftmax
<
F32
,
F32
,
F32
,
Rank
,
Reduce
,
256
,
4
,
64
,
1
,
8
,
1
,
4
,
4
>
,
DeviceSoftmax
<
F32
,
F32
,
F32
,
Rank
,
Reduce
,
256
,
2
,
128
,
1
,
8
,
1
,
4
,
4
>
,
DeviceSoftmax
<
F32
,
F32
,
F32
,
Rank
,
Reduce
,
256
,
2
,
128
,
1
,
16
,
1
,
4
,
4
>
,
DeviceSoftmax
<
F32
,
F32
,
F32
,
Rank
,
Reduce
,
256
,
2
,
128
,
1
,
32
,
1
,
4
,
4
>
,
DeviceSoftmax
<
F32
,
F32
,
F32
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
8
,
1
,
4
,
4
>
,
DeviceSoftmax
<
F32
,
F32
,
F32
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
16
,
1
,
4
,
4
>
,
DeviceSoftmax
<
F32
,
F32
,
F32
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
32
,
1
,
4
,
4
>
// clang-format on
>
;
void
add_device_softmax_f32_f32_rank3_instances
(
std
::
vector
<
DeviceNormalizationPtr
>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
3
,
1
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
3
,
2
>
{});
}
void
add_device_softmax_f32_f32_rank4_instances
(
std
::
vector
<
DeviceNormalizationPtr
>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
4
,
1
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
4
,
2
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
4
,
3
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.cpp
View file @
6ef4e211
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
namespace
device
{
namespace
device
{
namespace
device_reduce_
instance
{
namespace
instance
{
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
...
@@ -49,7 +49,7 @@ ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 1);
...
@@ -49,7 +49,7 @@ ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_BY_ID
(
bhalf_t
,
float
,
bhalf_t
,
4
,
0
,
1
,
2
,
1
);
ADD_BLOCKWISE_INST_BY_ID
(
bhalf_t
,
float
,
bhalf_t
,
4
,
0
,
1
,
2
,
1
);
// clang-format on
// clang-format on
}
// namespace
device_reduce_
instance
}
// namespace instance
}
// namespace device
}
// namespace device
}
// namespace tensor_operation
}
// namespace tensor_operation
...
...
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.cpp
View file @
6ef4e211
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
namespace
device
{
namespace
device
{
namespace
device_reduce_
instance
{
namespace
instance
{
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
...
@@ -36,7 +36,7 @@ ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1);
...
@@ -36,7 +36,7 @@ ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
2
,
1
);
ADD_BLOCKWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
2
,
1
);
// clang-format on
// clang-format on
}
// namespace
device_reduce_
instance
}
// namespace instance
}
// namespace device
}
// namespace device
}
// namespace tensor_operation
}
// namespace tensor_operation
...
...
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.cpp
View file @
6ef4e211
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
namespace
device
{
namespace
device
{
namespace
device_reduce_
instance
{
namespace
instance
{
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
...
@@ -24,7 +24,7 @@ ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1);
...
@@ -24,7 +24,7 @@ ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_BY_ID
(
half_t
,
float
,
half_t
,
7
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_INST_BY_ID
(
half_t
,
float
,
half_t
,
7
,
0
,
0
,
2
,
1
);
// clang-format on
// clang-format on
}
// namespace
device_reduce_
instance
}
// namespace instance
}
// namespace device
}
// namespace device
}
// namespace tensor_operation
}
// namespace tensor_operation
...
...
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.cpp
View file @
6ef4e211
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
namespace
device
{
namespace
device
{
namespace
device_reduce_
instance
{
namespace
instance
{
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
...
@@ -48,7 +48,7 @@ ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 1);
...
@@ -48,7 +48,7 @@ ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_BY_ID
(
float
,
float
,
float
,
4
,
0
,
1
,
2
,
1
);
ADD_BLOCKWISE_INST_BY_ID
(
float
,
float
,
float
,
4
,
0
,
1
,
2
,
1
);
// clang-format on
// clang-format on
}
// namespace
device_reduce_
instance
}
// namespace instance
}
// namespace device
}
// namespace device
}
// namespace tensor_operation
}
// namespace tensor_operation
...
...
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.cpp
View file @
6ef4e211
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
namespace
device
{
namespace
device
{
namespace
device_reduce_
instance
{
namespace
instance
{
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
...
@@ -24,7 +24,7 @@ ADD_BLOCKWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 1);
...
@@ -24,7 +24,7 @@ ADD_BLOCKWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_BY_ID
(
float
,
double
,
float
,
7
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_INST_BY_ID
(
float
,
double
,
float
,
7
,
0
,
0
,
2
,
1
);
// clang-format on
// clang-format on
}
// namespace
device_reduce_
instance
}
// namespace instance
}
// namespace device
}
// namespace device
}
// namespace tensor_operation
}
// namespace tensor_operation
}
// namespace ck
}
// namespace ck
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.cpp
View file @
6ef4e211
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
namespace
device
{
namespace
device
{
namespace
device_reduce_
instance
{
namespace
instance
{
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
...
@@ -48,7 +48,7 @@ ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 1);
...
@@ -48,7 +48,7 @@ ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
2
,
1
);
ADD_BLOCKWISE_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
2
,
1
);
// clang-format on
// clang-format on
}
// namespace
device_reduce_
instance
}
// namespace instance
}
// namespace device
}
// namespace device
}
// namespace tensor_operation
}
// namespace tensor_operation
...
...
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.cpp
View file @
6ef4e211
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
namespace
device
{
namespace
device
{
namespace
device_reduce_
instance
{
namespace
instance
{
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
...
@@ -20,7 +20,7 @@ ADD_BLOCKWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1);
...
@@ -20,7 +20,7 @@ ADD_BLOCKWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_BY_ID
(
int8_t
,
int32_t
,
int8_t
,
5
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_INST_BY_ID
(
int8_t
,
int32_t
,
int8_t
,
5
,
0
,
0
,
2
,
1
);
// clang-format on
// clang-format on
}
// namespace
device_reduce_
instance
}
// namespace instance
}
// namespace device
}
// namespace device
}
// namespace tensor_operation
}
// namespace tensor_operation
...
...
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.cpp
View file @
6ef4e211
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
namespace
device
{
namespace
device
{
namespace
device_reduce_
instance
{
namespace
instance
{
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
...
@@ -36,7 +36,7 @@ ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1);
...
@@ -36,7 +36,7 @@ ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
4
,
0
,
1
,
2
,
1
);
ADD_BLOCKWISE_INST_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
4
,
0
,
1
,
2
,
1
);
// clang-format on
// clang-format on
}
// namespace
device_reduce_
instance
}
// namespace instance
}
// namespace device
}
// namespace device
}
// namespace tensor_operation
}
// namespace tensor_operation
...
...
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.cpp
View file @
6ef4e211
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
namespace
device
{
namespace
device
{
namespace
device_reduce_
instance
{
namespace
instance
{
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
...
@@ -20,7 +20,7 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 5, 0, 0, 4, 1);
...
@@ -20,7 +20,7 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 5, 0, 0, 4, 1);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID
(
bhalf_t
,
float
,
float
,
5
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID
(
bhalf_t
,
float
,
float
,
5
,
0
,
0
,
2
,
1
);
// clang-format on
// clang-format on
}
// namespace
device_reduce_
instance
}
// namespace instance
}
// namespace device
}
// namespace device
}
// namespace tensor_operation
}
// namespace tensor_operation
}
// namespace ck
}
// namespace ck
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp
View file @
6ef4e211
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
namespace
device
{
namespace
device
{
namespace
device_reduce_
instance
{
namespace
instance
{
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
...
@@ -20,7 +20,7 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 5, 0, 0, 4, 1);
...
@@ -20,7 +20,7 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 5, 0, 0, 4, 1);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID
(
half_t
,
float
,
float
,
5
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID
(
half_t
,
float
,
float
,
5
,
0
,
0
,
2
,
1
);
// clang-format on
// clang-format on
}
// namespace
device_reduce_
instance
}
// namespace instance
}
// namespace device
}
// namespace device
}
// namespace tensor_operation
}
// namespace tensor_operation
...
...
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.cpp
View file @
6ef4e211
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
namespace
device
{
namespace
device
{
namespace
device_reduce_
instance
{
namespace
instance
{
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
...
@@ -20,7 +20,7 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 5, 0, 0, 4, 1);
...
@@ -20,7 +20,7 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 5, 0, 0, 4, 1);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID
(
float
,
float
,
float
,
5
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID
(
float
,
float
,
float
,
5
,
0
,
0
,
2
,
1
);
// clang-format on
// clang-format on
}
// namespace
device_reduce_
instance
}
// namespace instance
}
// namespace device
}
// namespace device
}
// namespace tensor_operation
}
// namespace tensor_operation
}
// namespace ck
}
// namespace ck
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.cpp
View file @
6ef4e211
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
namespace
device
{
namespace
device
{
namespace
device_reduce_
instance
{
namespace
instance
{
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
...
@@ -20,7 +20,7 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 5, 0, 0, 4, 1);
...
@@ -20,7 +20,7 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 5, 0, 0, 4, 1);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID
(
float
,
double
,
float
,
5
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID
(
float
,
double
,
float
,
5
,
0
,
0
,
2
,
1
);
// clang-format on
// clang-format on
}
// namespace
device_reduce_
instance
}
// namespace instance
}
// namespace device
}
// namespace device
}
// namespace tensor_operation
}
// namespace tensor_operation
}
// namespace ck
}
// namespace ck
Prev
1
…
11
12
13
14
15
16
17
18
19
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment