Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
6b9a4bd5
Commit
6b9a4bd5
authored
Apr 23, 2024
by
Jun Liu
Browse files
Merge branch 'amd-develop-staging-0423' into amd-master
parents
56de337f
c5f1cdf7
Changes
364
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
268 additions
and
36 deletions
+268
-36
library/include/ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp
...ance/gpu/permute_scale/device_permute_scale_instances.hpp
+164
-15
library/include/ck/library/utility/host_tensor.hpp
library/include/ck/library/utility/host_tensor.hpp
+32
-1
library/src/tensor_operation_instance/gpu/CMakeLists.txt
library/src/tensor_operation_instance/gpu/CMakeLists.txt
+35
-0
library/src/tensor_operation_instance/gpu/batched_gemm/CMakeLists.txt
...tensor_operation_instance/gpu/batched_gemm/CMakeLists.txt
+1
-0
library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/CMakeLists.txt
...nstance/gpu/batched_gemm_add_relu_gemm_add/CMakeLists.txt
+1
-0
library/src/tensor_operation_instance/gpu/batched_gemm_bias_permute/CMakeLists.txt
...ion_instance/gpu/batched_gemm_bias_permute/CMakeLists.txt
+1
-0
library/src/tensor_operation_instance/gpu/batched_gemm_gemm/CMakeLists.txt
...r_operation_instance/gpu/batched_gemm_gemm/CMakeLists.txt
+1
-0
library/src/tensor_operation_instance/gpu/batched_gemm_reduce/CMakeLists.txt
...operation_instance/gpu/batched_gemm_reduce/CMakeLists.txt
+1
-0
library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm/CMakeLists.txt
...ion_instance/gpu/batched_gemm_softmax_gemm/CMakeLists.txt
+1
-0
library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/CMakeLists.txt
...ance/gpu/batched_gemm_softmax_gemm_permute/CMakeLists.txt
+1
-0
library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp
...shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp
+3
-2
library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp
...shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp
+3
-2
library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp
...shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp
+3
-2
library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp
...shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp
+3
-2
library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp
...l_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp
+3
-2
library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp
...l_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp
+3
-2
library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp
...l_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp
+3
-2
library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp
...l_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp
+3
-2
library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp
..._c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp
+3
-2
library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp
..._c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp
+3
-2
No files found.
library/include/ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp
View file @
6b9a4bd5
...
...
@@ -2,7 +2,7 @@
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_
scale
_impl.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_
dynamic_vector_dims
_impl.hpp"
#include "ck/utility/data_type.hpp"
namespace
ck
{
...
...
@@ -13,26 +13,175 @@ namespace instance {
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
Pass
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
UnaryOp
=
ck
::
tensor_operation
::
element_wise
::
UnarySquare
;
using
Scale
=
ck
::
tensor_operation
::
element_wise
::
Scale
;
// clang-format off
template
<
index_t
NDims
>
template
<
index_t
NDims
,
typename
ElementwiseOp
>
using
device_permute_scale_f16_instances
=
std
::
tuple
<
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
Pass
,
UnaryOp
,
Scale
,
NDims
,
1
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
Pass
,
UnaryOp
,
Scale
,
NDims
,
8
,
ck
::
Sequence
<
8
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
Pass
,
UnaryOp
,
Scale
,
NDims
,
4
,
ck
::
Sequence
<
4
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
Pass
,
UnaryOp
,
Scale
,
NDims
,
2
,
ck
::
Sequence
<
2
>
,
ck
::
Sequence
<
1
>>
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
ElementwiseOp
,
NDims
,
256
,
64
,
64
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
4
>
,
ck
::
Sequence
<
4
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
ElementwiseOp
,
NDims
,
256
,
128
,
32
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
4
>
,
ck
::
Sequence
<
4
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
ElementwiseOp
,
NDims
,
256
,
32
,
128
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
4
>
,
ck
::
Sequence
<
4
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
ElementwiseOp
,
NDims
,
128
,
64
,
32
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
4
>
,
ck
::
Sequence
<
4
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
ElementwiseOp
,
NDims
,
128
,
32
,
64
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
4
>
,
ck
::
Sequence
<
4
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
ElementwiseOp
,
NDims
,
128
,
16
,
128
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
4
>
,
ck
::
Sequence
<
4
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
ElementwiseOp
,
NDims
,
128
,
128
,
16
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
4
>
,
ck
::
Sequence
<
4
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
ElementwiseOp
,
NDims
,
64
,
32
,
32
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
4
>
,
ck
::
Sequence
<
4
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
ElementwiseOp
,
NDims
,
64
,
16
,
64
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
4
>
,
ck
::
Sequence
<
4
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
ElementwiseOp
,
NDims
,
64
,
64
,
16
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
4
>
,
ck
::
Sequence
<
4
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
ElementwiseOp
,
NDims
,
32
,
32
,
16
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
4
>
,
ck
::
Sequence
<
4
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
ElementwiseOp
,
NDims
,
32
,
16
,
32
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
4
>
,
ck
::
Sequence
<
4
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
ElementwiseOp
,
NDims
,
256
,
128
,
128
,
8
,
8
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
8
>
,
ck
::
Sequence
<
8
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
ElementwiseOp
,
NDims
,
256
,
256
,
64
,
8
,
8
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
8
>
,
ck
::
Sequence
<
8
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
ElementwiseOp
,
NDims
,
256
,
64
,
256
,
8
,
8
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
8
>
,
ck
::
Sequence
<
8
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
ElementwiseOp
,
NDims
,
128
,
128
,
64
,
8
,
8
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
8
>
,
ck
::
Sequence
<
8
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
ElementwiseOp
,
NDims
,
128
,
64
,
128
,
8
,
8
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
8
>
,
ck
::
Sequence
<
8
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
ElementwiseOp
,
NDims
,
128
,
32
,
256
,
8
,
8
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
8
>
,
ck
::
Sequence
<
8
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
ElementwiseOp
,
NDims
,
128
,
256
,
32
,
8
,
8
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
8
>
,
ck
::
Sequence
<
8
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
ElementwiseOp
,
NDims
,
64
,
64
,
64
,
8
,
8
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
8
>
,
ck
::
Sequence
<
8
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
ElementwiseOp
,
NDims
,
64
,
32
,
128
,
8
,
8
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
8
>
,
ck
::
Sequence
<
8
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
ElementwiseOp
,
NDims
,
64
,
128
,
32
,
8
,
8
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
8
>
,
ck
::
Sequence
<
8
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
ElementwiseOp
,
NDims
,
32
,
64
,
32
,
8
,
8
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
8
>
,
ck
::
Sequence
<
8
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
ElementwiseOp
,
NDims
,
32
,
32
,
64
,
8
,
8
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
8
>
,
ck
::
Sequence
<
8
>>
,
#if 0
// Disabled instances to improve compilation time
// They listed here to show other possible combinations of parameters
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 256, 256, 256, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 128, 256, 128, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 128, 128, 256, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 128, 32, 512, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 128, 512, 64, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 64, 64, 256, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 64, 256, 64, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 64, 128, 128, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 32, 128, 64, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 32, 64, 128, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 256, 64, 128, 4, 8, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 256, 128, 64, 4, 8, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 128, 64, 64, 4, 8, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 128, 32, 128, 4, 8, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 256, 32, 256, 4, 8, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 128, 16, 256, 4, 8, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 128, 128, 32, 4, 8, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 64, 32, 64, 4, 8, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 64, 16, 128, 4, 8, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 64, 64, 32, 4, 8, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 32, 32, 32, 4, 8, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 32, 16, 64, 4, 8, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 256, 128, 64, 8, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 256, 256, 32, 8, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 256, 64, 128, 8, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 128, 128, 32, 8, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 128, 64, 64, 8, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 128, 32, 128, 8, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 128, 256, 16, 8, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 64, 64, 32, 8, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 64, 32, 64, 8, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 64, 128, 16, 8, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 32, 64, 16, 8, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 32, 32, 32, 8, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
#endif
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
ElementwiseOp
,
NDims
,
256
,
64
,
64
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
ElementwiseOp
,
NDims
,
256
,
128
,
32
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
ElementwiseOp
,
NDims
,
256
,
32
,
128
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
ElementwiseOp
,
NDims
,
128
,
64
,
32
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
ElementwiseOp
,
NDims
,
128
,
32
,
64
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
ElementwiseOp
,
NDims
,
128
,
16
,
128
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
ElementwiseOp
,
NDims
,
128
,
128
,
16
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
ElementwiseOp
,
NDims
,
64
,
32
,
32
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
ElementwiseOp
,
NDims
,
64
,
16
,
64
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
ElementwiseOp
,
NDims
,
64
,
64
,
16
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
ElementwiseOp
,
NDims
,
32
,
32
,
16
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
ElementwiseOp
,
NDims
,
32
,
16
,
32
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
1
>>
>
;
template
<
index_t
NDims
>
template
<
index_t
NDims
,
typename
ElementwiseOp
>
using
device_permute_scale_f32_instances
=
std
::
tuple
<
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
Pass
,
UnaryOp
,
Scale
,
NDims
,
1
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
Pass
,
UnaryOp
,
Scale
,
NDims
,
8
,
ck
::
Sequence
<
8
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
Pass
,
UnaryOp
,
Scale
,
NDims
,
4
,
ck
::
Sequence
<
4
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
Pass
,
UnaryOp
,
Scale
,
NDims
,
2
,
ck
::
Sequence
<
2
>
,
ck
::
Sequence
<
1
>>
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
ElementwiseOp
,
NDims
,
256
,
64
,
64
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
4
>
,
ck
::
Sequence
<
4
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
ElementwiseOp
,
NDims
,
256
,
128
,
32
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
4
>
,
ck
::
Sequence
<
4
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
ElementwiseOp
,
NDims
,
256
,
32
,
128
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
4
>
,
ck
::
Sequence
<
4
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
ElementwiseOp
,
NDims
,
128
,
64
,
32
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
4
>
,
ck
::
Sequence
<
4
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
ElementwiseOp
,
NDims
,
128
,
32
,
64
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
4
>
,
ck
::
Sequence
<
4
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
ElementwiseOp
,
NDims
,
128
,
16
,
128
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
4
>
,
ck
::
Sequence
<
4
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
ElementwiseOp
,
NDims
,
128
,
128
,
16
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
4
>
,
ck
::
Sequence
<
4
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
ElementwiseOp
,
NDims
,
64
,
32
,
32
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
4
>
,
ck
::
Sequence
<
4
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
ElementwiseOp
,
NDims
,
64
,
16
,
64
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
4
>
,
ck
::
Sequence
<
4
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
ElementwiseOp
,
NDims
,
64
,
64
,
16
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
4
>
,
ck
::
Sequence
<
4
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
ElementwiseOp
,
NDims
,
32
,
32
,
16
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
4
>
,
ck
::
Sequence
<
4
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
ElementwiseOp
,
NDims
,
32
,
16
,
32
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
4
>
,
ck
::
Sequence
<
4
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
ElementwiseOp
,
NDims
,
256
,
128
,
128
,
8
,
8
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
8
>
,
ck
::
Sequence
<
8
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
ElementwiseOp
,
NDims
,
256
,
256
,
64
,
8
,
8
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
8
>
,
ck
::
Sequence
<
8
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
ElementwiseOp
,
NDims
,
256
,
64
,
256
,
8
,
8
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
8
>
,
ck
::
Sequence
<
8
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
ElementwiseOp
,
NDims
,
128
,
128
,
64
,
8
,
8
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
8
>
,
ck
::
Sequence
<
8
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
ElementwiseOp
,
NDims
,
128
,
64
,
128
,
8
,
8
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
8
>
,
ck
::
Sequence
<
8
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
ElementwiseOp
,
NDims
,
128
,
32
,
256
,
8
,
8
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
8
>
,
ck
::
Sequence
<
8
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
ElementwiseOp
,
NDims
,
128
,
256
,
32
,
8
,
8
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
8
>
,
ck
::
Sequence
<
8
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
ElementwiseOp
,
NDims
,
64
,
64
,
64
,
8
,
8
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
8
>
,
ck
::
Sequence
<
8
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
ElementwiseOp
,
NDims
,
64
,
32
,
128
,
8
,
8
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
8
>
,
ck
::
Sequence
<
8
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
ElementwiseOp
,
NDims
,
64
,
128
,
32
,
8
,
8
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
8
>
,
ck
::
Sequence
<
8
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
ElementwiseOp
,
NDims
,
32
,
64
,
32
,
8
,
8
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
8
>
,
ck
::
Sequence
<
8
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
ElementwiseOp
,
NDims
,
32
,
32
,
64
,
8
,
8
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
8
>
,
ck
::
Sequence
<
8
>>
,
#if 0
// Disabled instances to improve compilation time
// They listed here to show other possible combinations of parameters
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 256, 256, 256, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>,
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 128, 256, 128, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>,
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 128, 128, 256, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>,
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 128, 32, 512, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>,
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 128, 512, 64, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>,
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 64, 256, 64, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>,
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 64, 64, 256, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>,
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 64, 128, 128, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>,
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 32, 128, 64, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>,
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 32, 64, 128, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>,
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 256, 64, 128, 4, 8, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 256, 128, 64, 4, 8, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 128, 64, 64, 4, 8, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 128, 32, 128, 4, 8, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 256, 32, 256, 4, 8, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 128, 16, 256, 4, 8, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 128, 128, 32, 4, 8, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 64, 32, 64, 4, 8, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 64, 16, 128, 4, 8, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 64, 64, 32, 4, 8, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 32, 32, 32, 4, 8, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 32, 16, 64, 4, 8, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 256, 128, 64, 8, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 256, 256, 32, 8, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 128, 128, 32, 8, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 128, 64, 64, 8, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 256, 64, 128, 8, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 128, 32, 128, 8, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 128, 256, 16, 8, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 64, 64, 32, 8, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 64, 32, 64, 8, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 64, 128, 16, 8, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 32, 64, 16, 8, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 32, 32, 32, 8, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
#endif
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
ElementwiseOp
,
NDims
,
256
,
64
,
64
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
ElementwiseOp
,
NDims
,
256
,
128
,
32
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
ElementwiseOp
,
NDims
,
256
,
32
,
128
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
ElementwiseOp
,
NDims
,
128
,
64
,
32
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
ElementwiseOp
,
NDims
,
128
,
32
,
64
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
ElementwiseOp
,
NDims
,
128
,
16
,
128
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
ElementwiseOp
,
NDims
,
128
,
128
,
16
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
ElementwiseOp
,
NDims
,
64
,
32
,
32
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
ElementwiseOp
,
NDims
,
64
,
16
,
64
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
ElementwiseOp
,
NDims
,
64
,
64
,
16
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
ElementwiseOp
,
NDims
,
32
,
32
,
16
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
ElementwiseOp
,
NDims
,
32
,
16
,
32
,
4
,
4
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
1
>>
>
;
// clang-format on
...
...
library/include/ck/library/utility/host_tensor.hpp
View file @
6b9a4bd5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
...
...
@@ -408,6 +408,37 @@ struct Tensor
mDesc
.
GetLengths
()[
5
])(
num_thread
);
break
;
}
case
12
:
{
auto
f
=
[
&
](
auto
i0
,
auto
i1
,
auto
i2
,
auto
i3
,
auto
i4
,
auto
i5
,
auto
i6
,
auto
i7
,
auto
i8
,
auto
i9
,
auto
i10
,
auto
i11
)
{
(
*
this
)(
i0
,
i1
,
i2
,
i3
,
i4
,
i5
,
i6
,
i7
,
i8
,
i9
,
i10
,
i11
)
=
g
(
i0
,
i1
,
i2
,
i3
,
i4
,
i5
,
i6
,
i7
,
i8
,
i9
,
i10
,
i11
);
};
make_ParallelTensorFunctor
(
f
,
mDesc
.
GetLengths
()[
0
],
mDesc
.
GetLengths
()[
1
],
mDesc
.
GetLengths
()[
2
],
mDesc
.
GetLengths
()[
3
],
mDesc
.
GetLengths
()[
4
],
mDesc
.
GetLengths
()[
5
],
mDesc
.
GetLengths
()[
6
],
mDesc
.
GetLengths
()[
7
],
mDesc
.
GetLengths
()[
8
],
mDesc
.
GetLengths
()[
9
],
mDesc
.
GetLengths
()[
10
],
mDesc
.
GetLengths
()[
11
])(
num_thread
);
break
;
}
default:
throw
std
::
runtime_error
(
"unspported dimension"
);
}
}
...
...
library/src/tensor_operation_instance/gpu/CMakeLists.txt
View file @
6b9a4bd5
...
...
@@ -36,12 +36,27 @@ function(add_instance_library INSTANCE_NAME)
endif
()
endforeach
()
endif
()
# Do not build DL instances if DL_KERNELS macro is not set
foreach
(
source IN LISTS ARGN
)
if
(
NOT DEFINED DL_KERNELS AND source MATCHES
"_dl"
)
message
(
"removing dl instance
${
source
}
"
)
list
(
REMOVE_ITEM ARGN
"
${
source
}
"
)
endif
()
endforeach
()
# Do not build XDL instances if gfx9 targets are not on the target list
foreach
(
source IN LISTS ARGN
)
if
(
NOT GPU_TARGETS MATCHES
"gfx9"
AND source MATCHES
"_xdl"
)
message
(
"removing xdl instance
${
source
}
"
)
list
(
REMOVE_ITEM ARGN
"
${
source
}
"
)
endif
()
endforeach
()
# Do not build WMMA instances if gfx11 targets are not on the target list
foreach
(
source IN LISTS ARGN
)
if
(
NOT GPU_TARGETS MATCHES
"gfx11"
AND source MATCHES
"_wmma"
)
message
(
"removing wmma instance
${
source
}
"
)
list
(
REMOVE_ITEM ARGN
"
${
source
}
"
)
endif
()
endforeach
()
#only continue if there are some source files left on the list
if
(
ARGN
)
add_library
(
${
INSTANCE_NAME
}
OBJECT
${
ARGN
}
)
...
...
@@ -124,6 +139,26 @@ FOREACH(subdir_path ${dir_list})
message
(
"Found only dl instances, but DL_KERNELS is not set. Skipping."
)
set
(
add_inst 0
)
endif
()
if
((
"
${
cmake_instance
}
"
MATCHES
"ONLY XDL_KERNELS"
)
AND
(
NOT GPU_TARGETS MATCHES
"gfx9"
))
message
(
"Found only xdl instances, but gfx9 is not on the targets list. Skipping."
)
set
(
add_inst 0
)
endif
()
if
((
"
${
cmake_instance
}
"
MATCHES
"ONLY WMMA_KERNELS"
)
AND
(
NOT GPU_TARGETS MATCHES
"gfx11"
))
message
(
"Found only wmma instances, but gfx11 is not on the targets list. Skipping."
)
set
(
add_inst 0
)
endif
()
if
((
"
${
cmake_instance
}
"
MATCHES
"ONLY XDL_AND_DL_KERNELS"
)
AND
(
NOT DEFINED DL_KERNELS
)
AND
(
NOT GPU_TARGETS MATCHES
"gfx9"
))
message
(
"Found only xdl and dl instances, but gfx9 is not on the targets listand DL_KERNELS is not set. Skipping."
)
set
(
add_inst 0
)
endif
()
if
((
"
${
cmake_instance
}
"
MATCHES
"ONLY XDL_AND_WMMA_KERNELS"
)
AND
(
NOT GPU_TARGETS MATCHES
"gfx11"
)
AND
(
NOT GPU_TARGETS MATCHES
"gfx9"
))
message
(
"Found only xdl and wmma instances, but gfx11 and gfx9 are not on the targets list. Skipping."
)
set
(
add_inst 0
)
endif
()
if
((
"
${
cmake_instance
}
"
MATCHES
"XDL_DL_WMMA_KERNELS"
)
AND
(
NOT GPU_TARGETS MATCHES
"gfx11"
)
AND
(
NOT GPU_TARGETS MATCHES
"gfx9"
)
AND
(
NOT DEFINED DL_KERNELS
))
message
(
"Found xdl, dl, and wmma instances, but none of those meet the target list. Skipping."
)
set
(
add_inst 0
)
endif
()
if
((
add_inst EQUAL 1
))
get_filename_component
(
target_dir
${
subdir_path
}
NAME
)
add_subdirectory
(
${
target_dir
}
)
...
...
library/src/tensor_operation_instance/gpu/batched_gemm/CMakeLists.txt
View file @
6b9a4bd5
# ONLY XDL_KERNELS
set
(
BATCHED_GEMM_INSTANCES
)
list
(
APPEND BATCHED_GEMM_INSTANCES device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp
device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp
...
...
library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/CMakeLists.txt
View file @
6b9a4bd5
# ONLY XDL_KERNELS
add_instance_library
(
device_batched_gemm_add_relu_gemm_add_instance
device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp
...
...
library/src/tensor_operation_instance/gpu/batched_gemm_bias_permute/CMakeLists.txt
View file @
6b9a4bd5
# ONLY XDL_KERNELS
add_instance_library
(
device_batched_gemm_bias_permute_instance
device_batched_gemm_bias_permute_m2_n3_k1_xdl_c_shuffle_f16_f16_f16_f16_instance.cpp
)
...
...
library/src/tensor_operation_instance/gpu/batched_gemm_gemm/CMakeLists.txt
View file @
6b9a4bd5
# ONLY XDL_KERNELS
add_instance_library
(
device_batched_gemm_gemm_instance
device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp
...
...
library/src/tensor_operation_instance/gpu/batched_gemm_reduce/CMakeLists.txt
View file @
6b9a4bd5
# ONLY XDL_KERNELS
add_instance_library
(
device_batched_gemm_reduce_instance
device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp
device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp
...
...
library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm/CMakeLists.txt
View file @
6b9a4bd5
# ONLY XDL_KERNELS
add_instance_library
(
device_batched_gemm_softmax_gemm_instance
device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
)
library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/CMakeLists.txt
View file @
6b9a4bd5
# ONLY XDL_KERNELS
set
(
DEVICE_BATCHED_GEMM_SOFTMAX_GEMM_PERMUTE_INSTANCES
)
list
(
APPEND DEVICE_BATCHED_GEMM_SOFTMAX_GEMM_PERMUTE_INSTANCES
device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
...
...
library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp
→
library/src/tensor_operation_instance/gpu/contraction_bilinear/
2D/
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp
View file @
6b9a4bd5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
...
...
@@ -31,7 +31,8 @@ using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_com
F32
,
PassThrough
,
PassThrough
,
Bilinear
>
;
Bilinear
,
2
>
;
void
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
...
...
library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp
→
library/src/tensor_operation_instance/gpu/contraction_bilinear/
2D/
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp
View file @
6b9a4bd5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
...
...
@@ -31,7 +31,8 @@ using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_com
F32
,
PassThrough
,
PassThrough
,
Bilinear
>
;
Bilinear
,
2
>
;
void
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
...
...
library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp
→
library/src/tensor_operation_instance/gpu/contraction_bilinear/
2D/
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp
View file @
6b9a4bd5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
...
...
@@ -31,7 +31,8 @@ using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_com
F32
,
PassThrough
,
PassThrough
,
Bilinear
>
;
Bilinear
,
2
>
;
void
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
...
...
library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp
→
library/src/tensor_operation_instance/gpu/contraction_bilinear/
2D/
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp
View file @
6b9a4bd5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
...
...
@@ -31,7 +31,8 @@ using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_com
F32
,
PassThrough
,
PassThrough
,
Bilinear
>
;
Bilinear
,
2
>
;
void
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
...
...
library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp
→
library/src/tensor_operation_instance/gpu/contraction_bilinear/
2D/
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp
View file @
6b9a4bd5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
...
...
@@ -31,7 +31,8 @@ using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute
F32
,
PassThrough
,
PassThrough
,
Bilinear
>
;
Bilinear
,
2
>
;
void
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
...
...
library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp
→
library/src/tensor_operation_instance/gpu/contraction_bilinear/
2D/
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp
View file @
6b9a4bd5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
...
...
@@ -31,7 +31,8 @@ using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute
F32
,
PassThrough
,
PassThrough
,
Bilinear
>
;
Bilinear
,
2
>
;
void
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
...
...
library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp
→
library/src/tensor_operation_instance/gpu/contraction_bilinear/
2D/
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp
View file @
6b9a4bd5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
...
...
@@ -31,7 +31,8 @@ using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute
F32
,
PassThrough
,
PassThrough
,
Bilinear
>
;
Bilinear
,
2
>
;
void
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
...
...
library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp
→
library/src/tensor_operation_instance/gpu/contraction_bilinear/
2D/
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp
View file @
6b9a4bd5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
...
...
@@ -31,7 +31,8 @@ using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute
F32
,
PassThrough
,
PassThrough
,
Bilinear
>
;
Bilinear
,
2
>
;
void
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
...
...
library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp
→
library/src/tensor_operation_instance/gpu/contraction_bilinear/
2D/
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp
View file @
6b9a4bd5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
...
...
@@ -31,7 +31,8 @@ using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute
BF16
,
PassThrough
,
PassThrough
,
Bilinear
>
;
Bilinear
,
2
>
;
void
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
...
...
library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp
→
library/src/tensor_operation_instance/gpu/contraction_bilinear/
2D/
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp
View file @
6b9a4bd5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
...
...
@@ -31,7 +31,8 @@ using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute
BF16
,
PassThrough
,
PassThrough
,
Bilinear
>
;
Bilinear
,
2
>
;
void
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
...
...
Prev
1
…
3
4
5
6
7
8
9
10
11
…
19
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment