Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
95a83c6e
Commit
95a83c6e
authored
Nov 18, 2022
by
Adam Osewski
Browse files
Merge remote-tracking branch 'origin/develop' into wavelet_model
parents
5b7c2432
892a8d76
Changes
618
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
530 additions
and
68 deletions
+530
-68
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.cpp
...reduce/device_reduce_instance_threadwise_i8_i8_i8_max.cpp
+28
-0
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.cpp
...reduce/device_reduce_instance_threadwise_i8_i8_i8_min.cpp
+28
-0
library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt
.../src/tensor_operation_instance/gpu/softmax/CMakeLists.txt
+22
-0
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp
..._instance/gpu/softmax/device_softmax_f16_f16_instance.cpp
+17
-35
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp
...softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp
+27
-0
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp
...softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp
+27
-0
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp
...softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp
+27
-0
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp
...softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp
+27
-0
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp
...softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp
+27
-0
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp
...softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp
+27
-0
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp
...softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp
+27
-0
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp
..._instance/gpu/softmax/device_softmax_f32_f32_instance.cpp
+17
-33
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp
...softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp
+27
-0
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp
...softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp
+27
-0
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp
...softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp
+27
-0
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp
...softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp
+27
-0
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp
...softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp
+27
-0
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp
...softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp
+27
-0
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp
...softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp
+27
-0
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp
...on_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp
+40
-0
No files found.
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.cpp
0 → 100644
View file @
95a83c6e
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
3
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
4
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
2
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
3
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
4
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
2
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
PassThrough
,
PassThrough
>>&
);
// clang-format on
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.cpp
0 → 100644
View file @
95a83c6e
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
3
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
4
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
2
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
3
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
4
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
2
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
PassThrough
,
PassThrough
>>&
);
// clang-format on
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt
View file @
95a83c6e
add_instance_library
(
device_softmax_instance
add_instance_library
(
device_softmax_instance
device_softmax_i8_i8_instance.cpp
device_softmax_i8_i8_instance_rank3_reduce1.cpp
device_softmax_i8_i8_instance_rank3_reduce2.cpp
device_softmax_i8_i8_instance_rank3_reduce3.cpp
device_softmax_i8_i8_instance_rank4_reduce1.cpp
device_softmax_i8_i8_instance_rank4_reduce2.cpp
device_softmax_i8_i8_instance_rank4_reduce3.cpp
device_softmax_i8_i8_instance_rank4_reduce4.cpp
device_softmax_f16_f16_instance.cpp
device_softmax_f16_f16_instance.cpp
device_softmax_f16_f16_instance_rank3_reduce1.cpp
device_softmax_f16_f16_instance_rank3_reduce2.cpp
device_softmax_f16_f16_instance_rank3_reduce3.cpp
device_softmax_f16_f16_instance_rank4_reduce1.cpp
device_softmax_f16_f16_instance_rank4_reduce2.cpp
device_softmax_f16_f16_instance_rank4_reduce3.cpp
device_softmax_f16_f16_instance_rank4_reduce4.cpp
device_softmax_f32_f32_instance.cpp
device_softmax_f32_f32_instance.cpp
device_softmax_f32_f32_instance_rank3_reduce1.cpp
device_softmax_f32_f32_instance_rank3_reduce2.cpp
device_softmax_f32_f32_instance_rank3_reduce3.cpp
device_softmax_f32_f32_instance_rank4_reduce1.cpp
device_softmax_f32_f32_instance_rank4_reduce2.cpp
device_softmax_f32_f32_instance_rank4_reduce3.cpp
device_softmax_f32_f32_instance_rank4_reduce4.cpp
)
)
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp
View file @
95a83c6e
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <tuple>
#include <vector>
#include <vector>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp"
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp"
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
namespace
device
{
namespace
device
{
namespace
instance
{
namespace
instance
{
namespace
{
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
Pass
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
}
// namespace
template
<
index_t
Rank
,
index_t
Reduce
>
using
device_softmax_f16_f16_instances
=
std
::
tuple
<
// clang-format off
// InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
DeviceSoftmaxImpl
<
F16
,
F32
,
F16
,
Pass
,
Pass
,
Rank
,
Reduce
,
256
,
8
,
32
,
1
,
8
,
1
,
1
,
1
>
,
// fallback kernel
DeviceSoftmaxImpl
<
F16
,
F32
,
F16
,
Pass
,
Pass
,
Rank
,
Reduce
,
256
,
8
,
32
,
1
,
8
,
1
,
8
,
8
>
,
DeviceSoftmaxImpl
<
F16
,
F32
,
F16
,
Pass
,
Pass
,
Rank
,
Reduce
,
256
,
4
,
64
,
1
,
8
,
1
,
8
,
8
>
,
DeviceSoftmaxImpl
<
F16
,
F32
,
F16
,
Pass
,
Pass
,
Rank
,
Reduce
,
256
,
2
,
128
,
1
,
8
,
1
,
8
,
8
>
,
DeviceSoftmaxImpl
<
F16
,
F32
,
F16
,
Pass
,
Pass
,
Rank
,
Reduce
,
256
,
2
,
128
,
1
,
16
,
1
,
8
,
8
>
,
DeviceSoftmaxImpl
<
F16
,
F32
,
F16
,
Pass
,
Pass
,
Rank
,
Reduce
,
256
,
2
,
128
,
1
,
32
,
1
,
8
,
8
>
,
DeviceSoftmaxImpl
<
F16
,
F32
,
F16
,
Pass
,
Pass
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
8
,
1
,
8
,
8
>
,
DeviceSoftmaxImpl
<
F16
,
F32
,
F16
,
Pass
,
Pass
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
16
,
1
,
8
,
8
>
,
DeviceSoftmaxImpl
<
F16
,
F32
,
F16
,
Pass
,
Pass
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
32
,
1
,
8
,
8
>
// clang-format on
>
;
void
add_device_softmax_f16_f16_rank3_instances
(
void
add_device_softmax_f16_f16_rank3_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
Pass
,
Pass
,
3
>>&
instances
)
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
Pass
Through
,
PassThrough
,
3
>>&
instances
)
{
{
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
3
,
1
>
{});
add_device_softmax_f16_f16_rank3_reduce1_instances
(
instances
);
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
3
,
2
>
{});
add_device_softmax_f16_f16_rank3_reduce2_instances
(
instances
);
add_device_softmax_f16_f16_rank3_reduce3_instances
(
instances
);
}
}
void
add_device_softmax_f16_f16_rank4_instances
(
void
add_device_softmax_f16_f16_rank4_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
Pass
,
Pass
,
4
>>&
instances
)
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
Pass
Through
,
PassThrough
,
4
>>&
instances
)
{
{
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
4
,
1
>
{});
add_device_softmax_f16_f16_rank4_reduce1_instances
(
instances
);
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
4
,
2
>
{});
add_device_softmax_f16_f16_rank4_reduce2_instances
(
instances
);
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
4
,
3
>
{});
add_device_softmax_f16_f16_rank4_reduce3_instances
(
instances
);
add_device_softmax_f16_f16_rank4_reduce4_instances
(
instances
);
}
}
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp
0 → 100644
View file @
95a83c6e
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
3
;
void
add_device_softmax_f16_f16_rank3_reduce1_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
RANK
,
1
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp
0 → 100644
View file @
95a83c6e
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
3
;
void
add_device_softmax_f16_f16_rank3_reduce2_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
RANK
,
2
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp
0 → 100644
View file @
95a83c6e
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
3
;
void
add_device_softmax_f16_f16_rank3_reduce3_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
RANK
,
3
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp
0 → 100644
View file @
95a83c6e
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
4
;
void
add_device_softmax_f16_f16_rank4_reduce1_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
RANK
,
1
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp
0 → 100644
View file @
95a83c6e
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
4
;
void
add_device_softmax_f16_f16_rank4_reduce2_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
RANK
,
2
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp
0 → 100644
View file @
95a83c6e
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
4
;
void
add_device_softmax_f16_f16_rank4_reduce3_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
RANK
,
3
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp
0 → 100644
View file @
95a83c6e
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
4
;
void
add_device_softmax_f16_f16_rank4_reduce4_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
RANK
,
4
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp
View file @
95a83c6e
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <tuple>
#include <vector>
#include <vector>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp"
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp"
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
namespace
device
{
namespace
device
{
namespace
instance
{
namespace
instance
{
namespace
{
using
F32
=
float
;
using
Pass
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
}
// namespace
template
<
index_t
Rank
,
index_t
Reduce
>
using
device_softmax_f32_f32_instances
=
std
::
tuple
<
// clang-format off
// InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
DeviceSoftmaxImpl
<
F32
,
F32
,
F32
,
Pass
,
Pass
,
Rank
,
Reduce
,
256
,
8
,
32
,
1
,
8
,
1
,
1
,
1
>
,
// fallback kernel
DeviceSoftmaxImpl
<
F32
,
F32
,
F32
,
Pass
,
Pass
,
Rank
,
Reduce
,
256
,
8
,
32
,
1
,
8
,
1
,
4
,
4
>
,
DeviceSoftmaxImpl
<
F32
,
F32
,
F32
,
Pass
,
Pass
,
Rank
,
Reduce
,
256
,
4
,
64
,
1
,
8
,
1
,
4
,
4
>
,
DeviceSoftmaxImpl
<
F32
,
F32
,
F32
,
Pass
,
Pass
,
Rank
,
Reduce
,
256
,
2
,
128
,
1
,
8
,
1
,
4
,
4
>
,
DeviceSoftmaxImpl
<
F32
,
F32
,
F32
,
Pass
,
Pass
,
Rank
,
Reduce
,
256
,
2
,
128
,
1
,
16
,
1
,
4
,
4
>
,
DeviceSoftmaxImpl
<
F32
,
F32
,
F32
,
Pass
,
Pass
,
Rank
,
Reduce
,
256
,
2
,
128
,
1
,
32
,
1
,
4
,
4
>
,
DeviceSoftmaxImpl
<
F32
,
F32
,
F32
,
Pass
,
Pass
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
8
,
1
,
4
,
4
>
,
DeviceSoftmaxImpl
<
F32
,
F32
,
F32
,
Pass
,
Pass
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
16
,
1
,
4
,
4
>
,
DeviceSoftmaxImpl
<
F32
,
F32
,
F32
,
Pass
,
Pass
,
Rank
,
Reduce
,
256
,
1
,
256
,
1
,
32
,
1
,
4
,
4
>
// clang-format on
>
;
void
add_device_softmax_f32_f32_rank3_instances
(
void
add_device_softmax_f32_f32_rank3_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
Pass
,
Pass
,
3
>>&
instances
)
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
Pass
Through
,
PassThrough
,
3
>>&
instances
)
{
{
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
3
,
1
>
{});
add_device_softmax_f32_f32_rank3_reduce1_instances
(
instances
);
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
3
,
2
>
{});
add_device_softmax_f32_f32_rank3_reduce2_instances
(
instances
);
add_device_softmax_f32_f32_rank3_reduce3_instances
(
instances
);
}
}
void
add_device_softmax_f32_f32_rank4_instances
(
void
add_device_softmax_f32_f32_rank4_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
Pass
,
Pass
,
4
>>&
instances
)
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
Pass
Through
,
PassThrough
,
4
>>&
instances
)
{
{
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
4
,
1
>
{});
add_device_softmax_f32_f32_rank4_reduce1_instances
(
instances
);
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
4
,
2
>
{});
add_device_softmax_f32_f32_rank4_reduce2_instances
(
instances
);
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
4
,
3
>
{});
add_device_softmax_f32_f32_rank4_reduce3_instances
(
instances
);
add_device_softmax_f32_f32_rank4_reduce4_instances
(
instances
);
}
}
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp
0 → 100644
View file @
95a83c6e
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
3
;
void
add_device_softmax_f32_f32_rank3_reduce1_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
RANK
,
1
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp
0 → 100644
View file @
95a83c6e
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
3
;
void
add_device_softmax_f32_f32_rank3_reduce2_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
RANK
,
2
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp
0 → 100644
View file @
95a83c6e
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
3
;
void
add_device_softmax_f32_f32_rank3_reduce3_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
RANK
,
3
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp
0 → 100644
View file @
95a83c6e
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
4
;
void
add_device_softmax_f32_f32_rank4_reduce1_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
RANK
,
1
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp
0 → 100644
View file @
95a83c6e
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
4
;
void
add_device_softmax_f32_f32_rank4_reduce2_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
RANK
,
2
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp
0 → 100644
View file @
95a83c6e
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
4
;
void
add_device_softmax_f32_f32_rank4_reduce3_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
RANK
,
3
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp
0 → 100644
View file @
95a83c6e
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
static
constexpr
index_t
RANK
=
4
;
void
add_device_softmax_f32_f32_rank4_reduce4_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_softmax_f32_f32_instances
<
RANK
,
4
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp
0 → 100644
View file @
95a83c6e
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
void
add_device_softmax_i8_i8_rank3_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
I8
,
F32
,
I8
,
PassThrough
,
PassThrough
,
3
>>&
instances
)
{
add_device_softmax_i8_i8_rank3_reduce1_instances
(
instances
);
add_device_softmax_i8_i8_rank3_reduce2_instances
(
instances
);
add_device_softmax_i8_i8_rank3_reduce3_instances
(
instances
);
}
void
add_device_softmax_i8_i8_rank4_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
I8
,
F32
,
I8
,
PassThrough
,
PassThrough
,
4
>>&
instances
)
{
add_device_softmax_i8_i8_rank4_reduce1_instances
(
instances
);
add_device_softmax_i8_i8_rank4_reduce2_instances
(
instances
);
add_device_softmax_i8_i8_rank4_reduce3_instances
(
instances
);
add_device_softmax_i8_i8_rank4_reduce4_instances
(
instances
);
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
Prev
1
…
23
24
25
26
27
28
29
30
31
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment