Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
72e0c1c5
Commit
72e0c1c5
authored
Jun 19, 2023
by
Rostyslav Geyyer
Browse files
Merge branch 'develop' into lwpck-739
parents
898866e0
f0c620c4
Changes
103
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
69 additions
and
79 deletions
+69
-79
library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp
...on_instance/gpu/elementwise/device_normalize_instance.cpp
+9
-1
library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp
...tance/gpu/normalization/device_groupnorm_f16_instance.cpp
+2
-0
library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp
...tance/gpu/normalization/device_groupnorm_f32_instance.cpp
+2
-0
library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp
...ation/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp
+2
-0
library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp
...gpu/normalization/device_groupnorm_swish_f16_instance.cpp
+2
-0
library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp
...gpu/normalization/device_groupnorm_swish_f32_instance.cpp
+2
-0
library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp
...nce/gpu/normalization/device_layernorm2d_f16_instance.cpp
+2
-0
library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp
...nce/gpu/normalization/device_layernorm2d_f32_instance.cpp
+2
-0
library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp
...nce/gpu/normalization/device_layernorm4d_f16_instance.cpp
+2
-0
library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp
...nce/gpu/normalization/device_layernorm4d_f32_instance.cpp
+2
-0
library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp
...tance/gpu/normalization/normalization_instance_common.hpp
+21
-0
library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt
.../src/tensor_operation_instance/gpu/softmax/CMakeLists.txt
+0
-10
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp
..._instance/gpu/softmax/device_softmax_f16_f16_instance.cpp
+0
-40
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp
...softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp
+3
-4
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp
...softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp
+3
-4
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp
...softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp
+3
-4
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp
...softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp
+3
-4
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp
...softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp
+3
-4
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp
...softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp
+3
-4
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp
...softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp
+3
-4
No files found.
library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp
View file @
72e0c1c5
...
@@ -30,7 +30,12 @@ using device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances = std:
...
@@ -30,7 +30,12 @@ using device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances = std:
//###################|<in, mean, square_mean, gamma, beta>| <out>| functor| NDim| MPerThread| <in, mean, square_mean, gamma, beta ScalarPerVector>| <out ScalarPerVector>|
//###################|<in, mean, square_mean, gamma, beta>| <out>| functor| NDim| MPerThread| <in, mean, square_mean, gamma, beta ScalarPerVector>| <out ScalarPerVector>|
DeviceElementwiseImpl
<
Tuple
<
F16
,
F32
,
F32
,
F16
,
F16
>
,
Tuple
<
F16
>
,
Normalize
,
2
,
8
,
Sequence
<
8
,
1
,
1
,
8
,
8
>
,
Sequence
<
8
>
>
,
DeviceElementwiseImpl
<
Tuple
<
F16
,
F32
,
F32
,
F16
,
F16
>
,
Tuple
<
F16
>
,
Normalize
,
2
,
8
,
Sequence
<
8
,
1
,
1
,
8
,
8
>
,
Sequence
<
8
>
>
,
DeviceElementwiseImpl
<
Tuple
<
F16
,
F32
,
F32
,
F16
,
F16
>
,
Tuple
<
F16
>
,
Normalize
,
2
,
4
,
Sequence
<
4
,
1
,
1
,
4
,
4
>
,
Sequence
<
4
>
>
,
DeviceElementwiseImpl
<
Tuple
<
F16
,
F32
,
F32
,
F16
,
F16
>
,
Tuple
<
F16
>
,
Normalize
,
2
,
4
,
Sequence
<
4
,
1
,
1
,
4
,
4
>
,
Sequence
<
4
>
>
,
DeviceElementwiseImpl
<
Tuple
<
F16
,
F32
,
F32
,
F16
,
F16
>
,
Tuple
<
F16
>
,
Normalize
,
2
,
2
,
Sequence
<
2
,
1
,
1
,
2
,
2
>
,
Sequence
<
2
>
>
,
DeviceElementwiseImpl
<
Tuple
<
F16
,
F32
,
F32
,
F16
,
F16
>
,
Tuple
<
F16
>
,
Normalize
,
2
,
2
,
Sequence
<
2
,
1
,
1
,
2
,
2
>
,
Sequence
<
2
>
>
// clang-format on
>
;
using
device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_generic_instance
=
std
::
tuple
<
// clang-format off
DeviceElementwiseImpl
<
Tuple
<
F16
,
F32
,
F32
,
F16
,
F16
>
,
Tuple
<
F16
>
,
Normalize
,
2
,
1
,
Sequence
<
1
,
1
,
1
,
1
,
1
>
,
Sequence
<
1
>
>
DeviceElementwiseImpl
<
Tuple
<
F16
,
F32
,
F32
,
F16
,
F16
>
,
Tuple
<
F16
>
,
Normalize
,
2
,
1
,
Sequence
<
1
,
1
,
1
,
1
,
1
>
,
Sequence
<
1
>
>
// clang-format on
// clang-format on
>
;
>
;
...
@@ -39,6 +44,9 @@ void add_device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances(
...
@@ -39,6 +44,9 @@ void add_device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances(
std
::
vector
<
DeviceElementwisePtr
<
Tuple
<
F16
,
F32
,
F32
,
F16
,
F16
>
,
Tuple
<
F16
>
,
Normalize
,
2
>>&
std
::
vector
<
DeviceElementwisePtr
<
Tuple
<
F16
,
F32
,
F32
,
F16
,
F16
>
,
Tuple
<
F16
>
,
Normalize
,
2
>>&
instances
)
instances
)
{
{
add_device_operation_instances
(
instances
,
device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_generic_instance
{});
add_device_operation_instances
(
add_device_operation_instances
(
instances
,
device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances
{});
instances
,
device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances
{});
}
}
...
...
library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp
View file @
72e0c1c5
...
@@ -14,6 +14,8 @@ void add_device_normalization_rank_5_3_f16_instances(
...
@@ -14,6 +14,8 @@ void add_device_normalization_rank_5_3_f16_instances(
std
::
vector
<
std
::
unique_ptr
<
DeviceNormalization
<
F16
,
F16
,
F16
,
F32
,
F16
,
Pass
,
5
,
3
>>>&
std
::
vector
<
std
::
unique_ptr
<
DeviceNormalization
<
F16
,
F16
,
F16
,
F32
,
F16
,
Pass
,
5
,
3
>>>&
instances
)
instances
)
{
{
add_device_operation_instances
(
instances
,
device_normalization_f16_generic_instance
<
Pass
,
5
,
3
>
{});
add_device_operation_instances
(
instances
,
device_normalization_f16_instances
<
Pass
,
5
,
3
>
{});
add_device_operation_instances
(
instances
,
device_normalization_f16_instances
<
Pass
,
5
,
3
>
{});
}
}
...
...
library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp
View file @
72e0c1c5
...
@@ -14,6 +14,8 @@ void add_device_normalization_rank_5_3_f32_instances(
...
@@ -14,6 +14,8 @@ void add_device_normalization_rank_5_3_f32_instances(
std
::
vector
<
std
::
unique_ptr
<
DeviceNormalization
<
F32
,
F32
,
F32
,
F32
,
F32
,
Pass
,
5
,
3
>>>&
std
::
vector
<
std
::
unique_ptr
<
DeviceNormalization
<
F32
,
F32
,
F32
,
F32
,
F32
,
Pass
,
5
,
3
>>>&
instances
)
instances
)
{
{
add_device_operation_instances
(
instances
,
device_normalization_f32_generic_instance
<
Pass
,
5
,
3
>
{});
add_device_operation_instances
(
instances
,
device_normalization_f32_instances
<
Pass
,
5
,
3
>
{});
add_device_operation_instances
(
instances
,
device_normalization_f32_instances
<
Pass
,
5
,
3
>
{});
}
}
...
...
library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp
View file @
72e0c1c5
...
@@ -14,6 +14,8 @@ void add_device_normalization_rank_5_3_swish_f16_f32_f32_f16_instances(
...
@@ -14,6 +14,8 @@ void add_device_normalization_rank_5_3_swish_f16_f32_f32_f16_instances(
std
::
vector
<
std
::
unique_ptr
<
DeviceNormalization
<
F16
,
F32
,
F32
,
F32
,
F16
,
Swish
,
5
,
3
>>>&
std
::
vector
<
std
::
unique_ptr
<
DeviceNormalization
<
F16
,
F32
,
F32
,
F32
,
F16
,
Swish
,
5
,
3
>>>&
instances
)
instances
)
{
{
add_device_operation_instances
(
instances
,
device_normalization_f16_f32_f32_f16_generic_instance
<
Swish
,
5
,
3
>
{});
add_device_operation_instances
(
instances
,
add_device_operation_instances
(
instances
,
device_normalization_f16_f32_f32_f16_instances
<
Swish
,
5
,
3
>
{});
device_normalization_f16_f32_f32_f16_instances
<
Swish
,
5
,
3
>
{});
}
}
...
...
library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp
View file @
72e0c1c5
...
@@ -14,6 +14,8 @@ void add_device_normalization_rank_5_3_swish_f16_instances(
...
@@ -14,6 +14,8 @@ void add_device_normalization_rank_5_3_swish_f16_instances(
std
::
vector
<
std
::
unique_ptr
<
DeviceNormalization
<
F16
,
F16
,
F16
,
F32
,
F16
,
Swish
,
5
,
3
>>>&
std
::
vector
<
std
::
unique_ptr
<
DeviceNormalization
<
F16
,
F16
,
F16
,
F32
,
F16
,
Swish
,
5
,
3
>>>&
instances
)
instances
)
{
{
add_device_operation_instances
(
instances
,
device_normalization_f16_generic_instance
<
Swish
,
5
,
3
>
{});
add_device_operation_instances
(
instances
,
device_normalization_f16_instances
<
Swish
,
5
,
3
>
{});
add_device_operation_instances
(
instances
,
device_normalization_f16_instances
<
Swish
,
5
,
3
>
{});
}
}
...
...
library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp
View file @
72e0c1c5
...
@@ -14,6 +14,8 @@ void add_device_normalization_rank_5_3_swish_f32_instances(
...
@@ -14,6 +14,8 @@ void add_device_normalization_rank_5_3_swish_f32_instances(
std
::
vector
<
std
::
unique_ptr
<
DeviceNormalization
<
F32
,
F32
,
F32
,
F32
,
F32
,
Swish
,
5
,
3
>>>&
std
::
vector
<
std
::
unique_ptr
<
DeviceNormalization
<
F32
,
F32
,
F32
,
F32
,
F32
,
Swish
,
5
,
3
>>>&
instances
)
instances
)
{
{
add_device_operation_instances
(
instances
,
device_normalization_f32_generic_instance
<
Swish
,
5
,
3
>
{});
add_device_operation_instances
(
instances
,
device_normalization_f32_instances
<
Swish
,
5
,
3
>
{});
add_device_operation_instances
(
instances
,
device_normalization_f32_instances
<
Swish
,
5
,
3
>
{});
}
}
...
...
library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp
View file @
72e0c1c5
...
@@ -14,6 +14,8 @@ void add_device_normalization_rank_2_1_f16_instances(
...
@@ -14,6 +14,8 @@ void add_device_normalization_rank_2_1_f16_instances(
std
::
vector
<
std
::
unique_ptr
<
DeviceNormalization
<
F16
,
F16
,
F16
,
F32
,
F16
,
Pass
,
2
,
1
>>>&
std
::
vector
<
std
::
unique_ptr
<
DeviceNormalization
<
F16
,
F16
,
F16
,
F32
,
F16
,
Pass
,
2
,
1
>>>&
instances
)
instances
)
{
{
add_device_operation_instances
(
instances
,
device_normalization_f16_generic_instance
<
Pass
,
2
,
1
>
{});
add_device_operation_instances
(
instances
,
device_normalization_f16_instances
<
Pass
,
2
,
1
>
{});
add_device_operation_instances
(
instances
,
device_normalization_f16_instances
<
Pass
,
2
,
1
>
{});
}
}
...
...
library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp
View file @
72e0c1c5
...
@@ -14,6 +14,8 @@ void add_device_normalization_rank_2_1_f32_instances(
...
@@ -14,6 +14,8 @@ void add_device_normalization_rank_2_1_f32_instances(
std
::
vector
<
std
::
unique_ptr
<
DeviceNormalization
<
F32
,
F32
,
F32
,
F32
,
F32
,
Pass
,
2
,
1
>>>&
std
::
vector
<
std
::
unique_ptr
<
DeviceNormalization
<
F32
,
F32
,
F32
,
F32
,
F32
,
Pass
,
2
,
1
>>>&
instances
)
instances
)
{
{
add_device_operation_instances
(
instances
,
device_normalization_f32_generic_instance
<
Pass
,
2
,
1
>
{});
add_device_operation_instances
(
instances
,
device_normalization_f32_instances
<
Pass
,
2
,
1
>
{});
add_device_operation_instances
(
instances
,
device_normalization_f32_instances
<
Pass
,
2
,
1
>
{});
}
}
...
...
library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp
View file @
72e0c1c5
...
@@ -14,6 +14,8 @@ void add_device_normalization_rank_4_3_f16_instances(
...
@@ -14,6 +14,8 @@ void add_device_normalization_rank_4_3_f16_instances(
std
::
vector
<
std
::
unique_ptr
<
DeviceNormalization
<
F16
,
F16
,
F16
,
F32
,
F16
,
Pass
,
4
,
3
>>>&
std
::
vector
<
std
::
unique_ptr
<
DeviceNormalization
<
F16
,
F16
,
F16
,
F32
,
F16
,
Pass
,
4
,
3
>>>&
instances
)
instances
)
{
{
add_device_operation_instances
(
instances
,
device_normalization_f16_generic_instance
<
Pass
,
4
,
3
>
{});
add_device_operation_instances
(
instances
,
device_normalization_f16_instances
<
Pass
,
4
,
3
>
{});
add_device_operation_instances
(
instances
,
device_normalization_f16_instances
<
Pass
,
4
,
3
>
{});
}
}
...
...
library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp
View file @
72e0c1c5
...
@@ -14,6 +14,8 @@ void add_device_normalization_rank_4_3_f32_instances(
...
@@ -14,6 +14,8 @@ void add_device_normalization_rank_4_3_f32_instances(
std
::
vector
<
std
::
unique_ptr
<
DeviceNormalization
<
F32
,
F32
,
F32
,
F32
,
F32
,
Pass
,
4
,
3
>>>&
std
::
vector
<
std
::
unique_ptr
<
DeviceNormalization
<
F32
,
F32
,
F32
,
F32
,
F32
,
Pass
,
4
,
3
>>>&
instances
)
instances
)
{
{
add_device_operation_instances
(
instances
,
device_normalization_f32_generic_instance
<
Pass
,
4
,
3
>
{});
add_device_operation_instances
(
instances
,
device_normalization_f32_instances
<
Pass
,
4
,
3
>
{});
add_device_operation_instances
(
instances
,
device_normalization_f32_instances
<
Pass
,
4
,
3
>
{});
}
}
...
...
library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp
View file @
72e0c1c5
...
@@ -43,6 +43,13 @@ using device_normalization_f16_instances =
...
@@ -43,6 +43,13 @@ using device_normalization_f16_instances =
// clang-format on
// clang-format on
>
;
>
;
// Single-entry instance list for the F16 normalization configuration: it holds
// exactly one DeviceNormalizationImpl specialization, parameterized by the
// caller through OutElementwise, Rank and Reduce.
// NOTE(review): apart from the two 64s, every trailing tuning argument is 1,
// which presumably makes this the fallback that fits any problem shape --
// confirm against DeviceNormalizationImpl's parameter list.
template <typename OutElementwise, index_t Rank, index_t Reduce>
using device_normalization_f16_generic_instance = std::tuple<
    // clang-format off
    DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 64, 1, 64, 1, 1, 1, 1, 1, 1, 1, 1, 1>
    // clang-format on
    >;
template
<
typename
OutElementwise
,
index_t
Rank
,
index_t
Reduce
>
template
<
typename
OutElementwise
,
index_t
Rank
,
index_t
Reduce
>
using
device_normalization_f32_instances
=
std
::
tuple
<
using
device_normalization_f32_instances
=
std
::
tuple
<
// clang-format off
// clang-format off
...
@@ -69,6 +76,13 @@ using device_normalization_f32_instances = std::tuple<
...
@@ -69,6 +76,13 @@ using device_normalization_f32_instances = std::tuple<
// clang-format on
// clang-format on
>
;
>
;
// Single-entry instance list for the all-F32 normalization configuration: it
// holds exactly one DeviceNormalizationImpl specialization, parameterized by
// the caller through OutElementwise, Rank and Reduce.
// NOTE(review): apart from the two 64s, every trailing tuning argument is 1,
// which presumably makes this the fallback that fits any problem shape --
// confirm against DeviceNormalizationImpl's parameter list.
template <typename OutElementwise, index_t Rank, index_t Reduce>
using device_normalization_f32_generic_instance = std::tuple<
    // clang-format off
    DeviceNormalizationImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 64, 1, 64, 1, 1, 1, 1, 1, 1, 1, 1, 1>
    // clang-format on
    >;
template
<
typename
OutElementwise
,
index_t
Rank
,
index_t
Reduce
>
template
<
typename
OutElementwise
,
index_t
Rank
,
index_t
Reduce
>
using
device_normalization_f16_f32_f32_f16_instances
=
std
::
tuple
<
using
device_normalization_f16_f32_f32_f16_instances
=
std
::
tuple
<
// clang-format off
// clang-format off
...
@@ -95,6 +109,13 @@ using device_normalization_f16_f32_f32_f16_instances = std::tuple<
...
@@ -95,6 +109,13 @@ using device_normalization_f16_f32_f32_f16_instances = std::tuple<
// clang-format on
// clang-format on
>
;
>
;
// Single-entry instance list for the mixed F16/F32/F32/F32/F16 normalization
// configuration: it holds exactly one DeviceNormalizationImpl specialization,
// parameterized by the caller through OutElementwise, Rank and Reduce.
// NOTE(review): apart from the two 64s, every trailing tuning argument is 1,
// which presumably makes this the fallback that fits any problem shape --
// confirm against DeviceNormalizationImpl's parameter list.
template <typename OutElementwise, index_t Rank, index_t Reduce>
using device_normalization_f16_f32_f32_f16_generic_instance = std::tuple<
    // clang-format off
    DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce, 64, 1, 64, 1, 1, 1, 1, 1, 1, 1, 1, 1>
    // clang-format on
    >;
}
// namespace instance
}
// namespace instance
}
// namespace device
}
// namespace device
}
// namespace tensor_operation
}
// namespace tensor_operation
...
...
library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt
View file @
72e0c1c5
add_instance_library
(
device_softmax_instance
add_instance_library
(
device_softmax_instance
device_softmax_i8_i8_instance.cpp
device_softmax_i8_i8_instance_rank3_reduce1.cpp
device_softmax_i8_i8_instance_rank3_reduce2.cpp
device_softmax_i8_i8_instance_rank3_reduce3.cpp
device_softmax_i8_i8_instance_rank4_reduce1.cpp
device_softmax_i8_i8_instance_rank4_reduce2.cpp
device_softmax_i8_i8_instance_rank4_reduce3.cpp
device_softmax_i8_i8_instance_rank4_reduce4.cpp
device_softmax_f16_f16_instance.cpp
device_softmax_f16_f16_instance_rank3_reduce1.cpp
device_softmax_f16_f16_instance_rank3_reduce1.cpp
device_softmax_f16_f16_instance_rank3_reduce2.cpp
device_softmax_f16_f16_instance_rank3_reduce2.cpp
device_softmax_f16_f16_instance_rank3_reduce3.cpp
device_softmax_f16_f16_instance_rank3_reduce3.cpp
...
@@ -15,7 +6,6 @@ add_instance_library(device_softmax_instance
...
@@ -15,7 +6,6 @@ add_instance_library(device_softmax_instance
device_softmax_f16_f16_instance_rank4_reduce2.cpp
device_softmax_f16_f16_instance_rank4_reduce2.cpp
device_softmax_f16_f16_instance_rank4_reduce3.cpp
device_softmax_f16_f16_instance_rank4_reduce3.cpp
device_softmax_f16_f16_instance_rank4_reduce4.cpp
device_softmax_f16_f16_instance_rank4_reduce4.cpp
device_softmax_f32_f32_instance.cpp
device_softmax_f32_f32_instance_rank3_reduce1.cpp
device_softmax_f32_f32_instance_rank3_reduce1.cpp
device_softmax_f32_f32_instance_rank3_reduce2.cpp
device_softmax_f32_f32_instance_rank3_reduce2.cpp
device_softmax_f32_f32_instance_rank3_reduce3.cpp
device_softmax_f32_f32_instance_rank3_reduce3.cpp
...
...
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp
deleted
100644 → 0
View file @
898866e0
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Rank-3 entry point: hands `instances` to the reduce1, reduce2 and reduce3
// helpers, in that order.
void add_device_softmax_f16_f16_rank3_instances(
    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances)
{
    add_device_softmax_f16_f16_rank3_reduce1_instances(instances);
    add_device_softmax_f16_f16_rank3_reduce2_instances(instances);
    add_device_softmax_f16_f16_rank3_reduce3_instances(instances);
}

// Rank-4 entry point: hands `instances` to the reduce1 through reduce4
// helpers, in that order.
void add_device_softmax_f16_f16_rank4_instances(
    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances)
{
    add_device_softmax_f16_f16_rank4_reduce1_instances(instances);
    add_device_softmax_f16_f16_rank4_reduce2_instances(instances);
    add_device_softmax_f16_f16_rank4_reduce3_instances(instances);
    add_device_softmax_f16_f16_rank4_reduce4_instances(instances);
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp
View file @
72e0c1c5
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace
device
{
namespace
device
{
namespace
instance
{
namespace
instance
{
static
constexpr
index_t
RANK
=
3
;
void
add_device_softmax_f16_f16_rank3_reduce1_instances
(
void
add_device_softmax_f16_f16_rank3_reduce1_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
3
,
1
>>&
instances
)
{
{
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
RANK
,
1
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_generic_instance
<
3
,
1
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
3
,
1
>
{});
}
}
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp
View file @
72e0c1c5
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace
device
{
namespace
device
{
namespace
instance
{
namespace
instance
{
static
constexpr
index_t
RANK
=
3
;
void
add_device_softmax_f16_f16_rank3_reduce2_instances
(
void
add_device_softmax_f16_f16_rank3_reduce2_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
3
,
2
>>&
instances
)
{
{
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
RANK
,
2
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_generic_instance
<
3
,
2
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
3
,
2
>
{});
}
}
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp
View file @
72e0c1c5
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace
device
{
namespace
device
{
namespace
instance
{
namespace
instance
{
static
constexpr
index_t
RANK
=
3
;
void
add_device_softmax_f16_f16_rank3_reduce3_instances
(
void
add_device_softmax_f16_f16_rank3_reduce3_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
3
,
3
>>&
instances
)
{
{
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
RANK
,
3
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_generic_instance
<
3
,
3
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
3
,
3
>
{});
}
}
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp
View file @
72e0c1c5
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace
device
{
namespace
device
{
namespace
instance
{
namespace
instance
{
static
constexpr
index_t
RANK
=
4
;
void
add_device_softmax_f16_f16_rank4_reduce1_instances
(
void
add_device_softmax_f16_f16_rank4_reduce1_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
4
,
1
>>&
instances
)
{
{
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
RANK
,
1
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_generic_instance
<
4
,
1
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
4
,
1
>
{});
}
}
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp
View file @
72e0c1c5
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace
device
{
namespace
device
{
namespace
instance
{
namespace
instance
{
static
constexpr
index_t
RANK
=
4
;
void
add_device_softmax_f16_f16_rank4_reduce2_instances
(
void
add_device_softmax_f16_f16_rank4_reduce2_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
4
,
2
>>&
instances
)
{
{
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
RANK
,
2
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_generic_instance
<
4
,
2
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
4
,
2
>
{});
}
}
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp
View file @
72e0c1c5
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace
device
{
namespace
device
{
namespace
instance
{
namespace
instance
{
static
constexpr
index_t
RANK
=
4
;
void
add_device_softmax_f16_f16_rank4_reduce3_instances
(
void
add_device_softmax_f16_f16_rank4_reduce3_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
4
,
3
>>&
instances
)
{
{
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
RANK
,
3
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_generic_instance
<
4
,
3
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
4
,
3
>
{});
}
}
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp
View file @
72e0c1c5
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace
device
{
namespace
device
{
namespace
instance
{
namespace
instance
{
static
constexpr
index_t
RANK
=
4
;
void
add_device_softmax_f16_f16_rank4_reduce4_instances
(
void
add_device_softmax_f16_f16_rank4_reduce4_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
4
,
4
>>&
instances
)
{
{
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
RANK
,
4
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_generic_instance
<
4
,
4
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
4
,
4
>
{});
}
}
}
// namespace instance
}
// namespace instance
...
...
Prev
1
2
3
4
5
6
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment