Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
24af0144
Unverified
Commit
24af0144
authored
Nov 12, 2022
by
Po Yen Chen
Committed by
GitHub
Nov 12, 2022
Browse files
Merge branch 'develop' into gemm_layernorm_welford
parents
961f5e9e
b79bbbc2
Changes
813
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
483 additions
and
53 deletions
+483
-53
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.hpp
...ce/device_reduce_instance_threadwise_f64_f64_f64_amax.hpp
+31
-0
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.hpp
...uce/device_reduce_instance_threadwise_f64_f64_f64_avg.hpp
+27
-0
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.hpp
...uce/device_reduce_instance_threadwise_f64_f64_f64_max.hpp
+31
-0
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.hpp
...uce/device_reduce_instance_threadwise_f64_f64_f64_min.hpp
+31
-0
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.hpp
...e/device_reduce_instance_threadwise_f64_f64_f64_norm2.hpp
+27
-0
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.hpp
...educe/device_reduce_instance_threadwise_i8_i32_i8_add.hpp
+27
-0
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.hpp
...educe/device_reduce_instance_threadwise_i8_i32_i8_avg.hpp
+27
-0
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp
...gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp
+0
-46
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.hpp
...educe/device_reduce_instance_threadwise_i8_i8_i8_amax.hpp
+31
-0
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.hpp
...reduce/device_reduce_instance_threadwise_i8_i8_i8_max.hpp
+31
-0
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.hpp
...reduce/device_reduce_instance_threadwise_i8_i8_i8_min.hpp
+31
-0
library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp
...lude/ck/library/tensor_operation_instance/gpu/softmax.hpp
+13
-7
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp
..._instance/gpu/softmax/device_softmax_f16_f16_instance.hpp
+22
-0
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp
...softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp
+22
-0
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp
...softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp
+22
-0
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp
...softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp
+22
-0
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp
...softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp
+22
-0
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp
...softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp
+22
-0
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp
...softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp
+22
-0
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp
...softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp
+22
-0
No files found.
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.hpp
0 → 100644
View file @
24af0144
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
extern
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
3
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
UnaryAbs
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
4
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
UnaryAbs
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
UnaryAbs
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
2
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
UnaryAbs
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
3
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
UnaryAbs
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
4
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
UnaryAbs
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
UnaryAbs
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
2
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
UnaryAbs
,
PassThrough
>>&
);
// clang-format on
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_
multiblock_atomic_add
_f64_f64_f64.hpp
→
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_
threadwise
_f64_f64_f64
_avg
.hpp
View file @
24af0144
...
...
@@ -4,8 +4,9 @@
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_
multiblock_atomic_add
.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_
threadwise
.hpp"
namespace
ck
{
namespace
tensor_operation
{
...
...
@@ -13,15 +14,11 @@ namespace device {
namespace
instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
4
,
3
);
// for ADD
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
4
,
4
);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
4
,
3
);
// for AVG
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
4
,
4
);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
2
,
1
);
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
extern
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
3
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
PassThrough
,
UnaryDivide
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
4
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
PassThrough
,
UnaryDivide
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
1
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
PassThrough
,
UnaryDivide
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
2
,
1
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
PassThrough
,
UnaryDivide
>>&
);
// clang-format on
}
// namespace instance
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.hpp
0 → 100644
View file @
24af0144
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
extern
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
3
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
PassThrough
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
4
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
PassThrough
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
PassThrough
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
2
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
PassThrough
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
3
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
PassThrough
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
4
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
PassThrough
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
PassThrough
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
2
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
PassThrough
,
PassThrough
>>&
);
// clang-format on
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.hpp
0 → 100644
View file @
24af0144
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
extern
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
3
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
PassThrough
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
4
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
PassThrough
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
PassThrough
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
2
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
PassThrough
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
3
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
PassThrough
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
4
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
PassThrough
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
PassThrough
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
2
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
PassThrough
,
PassThrough
>>&
);
// clang-format on
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.hpp
0 → 100644
View file @
24af0144
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
extern
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
3
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
UnarySquare
,
UnarySqrt
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
4
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
UnarySquare
,
UnarySqrt
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
1
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
UnarySquare
,
UnarySqrt
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
2
,
1
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
UnarySquare
,
UnarySqrt
>>&
);
// clang-format on
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.hpp
0 → 100644
View file @
24af0144
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
extern
template
void
add_device_reduce_instance_threadwise
<
I8
,
I32
,
I8
,
4
,
3
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
PassThrough
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
I8
,
I32
,
I8
,
4
,
4
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
PassThrough
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
I8
,
I32
,
I8
,
4
,
1
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
PassThrough
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
I8
,
I32
,
I8
,
2
,
1
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
PassThrough
,
PassThrough
>>&
);
// clang-format on
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.hpp
0 → 100644
View file @
24af0144
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
extern
template
void
add_device_reduce_instance_threadwise
<
I8
,
I32
,
I8
,
4
,
3
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
PassThrough
,
UnaryDivide
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
I8
,
I32
,
I8
,
4
,
4
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
PassThrough
,
UnaryDivide
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
I8
,
I32
,
I8
,
4
,
1
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
PassThrough
,
UnaryDivide
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
I8
,
I32
,
I8
,
2
,
1
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
PassThrough
,
UnaryDivide
>>&
);
// clang-format on
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp
deleted
100644 → 0
View file @
961f5e9e
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_THREADWISE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
2
,
0
,
0
,
4
,
3
);
// for MIN
ADD_THREADWISE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
2
,
0
,
0
,
4
,
4
);
ADD_THREADWISE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
2
,
0
,
0
,
4
,
1
);
ADD_THREADWISE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
2
,
0
,
0
,
2
,
1
);
ADD_THREADWISE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
3
,
0
,
0
,
4
,
3
);
// for MAX
ADD_THREADWISE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
3
,
0
,
0
,
4
,
4
);
ADD_THREADWISE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
3
,
0
,
0
,
4
,
1
);
ADD_THREADWISE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
3
,
0
,
0
,
2
,
1
);
ADD_THREADWISE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
4
,
0
,
0
,
4
,
3
);
// for AMAX
ADD_THREADWISE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
4
,
0
,
0
,
4
,
4
);
ADD_THREADWISE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
4
,
0
,
0
,
4
,
1
);
ADD_THREADWISE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
4
,
0
,
0
,
2
,
1
);
ADD_THREADWISE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
2
,
0
,
1
,
4
,
3
);
// for MIN
ADD_THREADWISE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
2
,
0
,
1
,
4
,
4
);
ADD_THREADWISE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
2
,
0
,
1
,
4
,
1
);
ADD_THREADWISE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
2
,
0
,
1
,
2
,
1
);
ADD_THREADWISE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
3
,
0
,
1
,
4
,
3
);
// for MAX
ADD_THREADWISE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
3
,
0
,
1
,
4
,
4
);
ADD_THREADWISE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
3
,
0
,
1
,
4
,
1
);
ADD_THREADWISE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
3
,
0
,
1
,
2
,
1
);
ADD_THREADWISE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
4
,
0
,
1
,
4
,
3
);
// for AMAX
ADD_THREADWISE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
4
,
0
,
1
,
4
,
4
);
ADD_THREADWISE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
4
,
0
,
1
,
4
,
1
);
ADD_THREADWISE_INST_REF_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
4
,
0
,
1
,
2
,
1
);
// clang-format on
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.hpp
0 → 100644
View file @
24af0144
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
extern
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
3
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
UnaryAbs
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
4
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
UnaryAbs
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
UnaryAbs
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
2
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
UnaryAbs
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
3
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
UnaryAbs
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
4
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
UnaryAbs
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
UnaryAbs
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
2
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
UnaryAbs
,
PassThrough
>>&
);
// clang-format on
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.hpp
0 → 100644
View file @
24af0144
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
extern
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
3
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
PassThrough
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
4
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
PassThrough
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
PassThrough
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
2
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
PassThrough
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
3
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
PassThrough
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
4
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
PassThrough
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
PassThrough
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
2
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
PassThrough
,
PassThrough
>>&
);
// clang-format on
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.hpp
0 → 100644
View file @
24af0144
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
extern
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
3
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
PassThrough
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
4
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
PassThrough
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
PassThrough
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
2
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
PassThrough
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
3
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
PassThrough
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
4
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
PassThrough
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
PassThrough
,
PassThrough
>>&
);
extern
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
2
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
PassThrough
,
PassThrough
>>&
);
// clang-format on
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp
View file @
24af0144
...
...
@@ -8,20 +8,13 @@
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/utility/data_type.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
void
add_device_softmax_f16_f16_rank3_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
3
>>&
);
void
add_device_softmax_f16_f16_rank4_instances
(
...
...
@@ -32,6 +25,11 @@ void add_device_softmax_f32_f32_rank3_instances(
void
add_device_softmax_f32_f32_rank4_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
4
>>&
);
void
add_device_softmax_i8_i8_rank3_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
I8
,
F32
,
I8
,
PassThrough
,
PassThrough
,
3
>>&
);
void
add_device_softmax_i8_i8_rank4_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
I8
,
F32
,
I8
,
PassThrough
,
PassThrough
,
4
>>&
);
template
<
typename
InDataType
,
typename
AccDataType
,
typename
OutDataType
,
index_t
Rank
>
struct
DeviceOperationInstanceFactory
<
ck
::
tensor_operation
::
device
::
...
...
@@ -60,6 +58,14 @@ struct DeviceOperationInstanceFactory<
else
if
constexpr
(
Rank
==
4
)
add_device_softmax_f32_f32_rank4_instances
(
op_ptrs
);
}
else
if
constexpr
(
std
::
is_same_v
<
InDataType
,
I8
>
&&
std
::
is_same_v
<
AccDataType
,
F32
>
&&
std
::
is_same_v
<
OutDataType
,
I8
>
)
{
if
constexpr
(
Rank
==
3
)
add_device_softmax_i8_i8_rank3_instances
(
op_ptrs
);
else
if
constexpr
(
Rank
==
4
)
add_device_softmax_i8_i8_rank4_instances
(
op_ptrs
);
}
return
op_ptrs
;
}
...
...
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp
0 → 100644
View file @
24af0144
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
void
add_device_softmax_f16_f16_rank3_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
3
>>&
instances
);
void
add_device_softmax_f16_f16_rank4_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
4
>>&
instances
);
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp
0 → 100644
View file @
24af0144
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
void
add_device_softmax_f16_f16_rank3_reduce1_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
3
>>&
instances
);
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp
0 → 100644
View file @
24af0144
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
void
add_device_softmax_f16_f16_rank3_reduce2_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
3
>>&
instances
);
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp
0 → 100644
View file @
24af0144
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
void
add_device_softmax_f16_f16_rank3_reduce3_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
3
>>&
instances
);
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp
0 → 100644
View file @
24af0144
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
void
add_device_softmax_f16_f16_rank4_reduce1_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
4
>>&
instances
);
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp
0 → 100644
View file @
24af0144
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
void
add_device_softmax_f16_f16_rank4_reduce2_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
4
>>&
instances
);
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp
0 → 100644
View file @
24af0144
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
void
add_device_softmax_f16_f16_rank4_reduce3_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
4
>>&
instances
);
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp
0 → 100644
View file @
24af0144
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
void
add_device_softmax_f16_f16_rank4_reduce4_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
4
>>&
instances
);
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
Prev
1
…
16
17
18
19
20
21
22
23
24
…
41
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment