gaoqiong / composable_kernel / Commits / 2797227f

Unverified commit 2797227f, authored Apr 19, 2023 by Rostyslav Geyyer, committed by GitHub on Apr 19, 2023
Merge branch 'develop' into lwpck-674
Parents: da8679b3, bb0b772d
Changes: 56
Showing 16 changed files with 546 additions and 75 deletions (+546 -75)
library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp (+75 -0)
library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp (+75 -0)
library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp (+75 -0)
library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt (+9 -2)
library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp (+23 -0)
library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp (+23 -0)
library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp (+24 -0)
library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp (+23 -0)
library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp (+23 -0)
library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp (+23 -0)
library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp (+23 -0)
library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp (+23 -0)
library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp (+23 -0)
library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp (+0 -70)
library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp (+101 -0)
profiler/include/profiler/profile_groupnorm_impl.hpp (+3 -3)
library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting. Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1

#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using F64         = double;
using Empty_Tuple = ck::Tuple<>;

template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Scale       = ck::tensor_operation::element_wise::Scale;

static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;

// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// k/n/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance = std::tuple<
// clang-format off
//#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
        DeviceContractionMultipleD_Xdl_CShuffle<2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 1, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>,
        DeviceContractionMultipleD_Xdl_CShuffle<2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>,
        DeviceContractionMultipleD_Xdl_CShuffle<2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 1, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 8>, 1>,
        DeviceContractionMultipleD_Xdl_CShuffle<2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>,
        DeviceContractionMultipleD_Xdl_CShuffle<2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 1, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 8, 1, 16>, 1>,
        DeviceContractionMultipleD_Xdl_CShuffle<2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>,
        DeviceContractionMultipleD_Xdl_CShuffle<2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 1, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>,
        DeviceContractionMultipleD_Xdl_CShuffle<2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>,
        DeviceContractionMultipleD_Xdl_CShuffle<2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 1, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>,
        DeviceContractionMultipleD_Xdl_CShuffle<2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>
    // clang-format on
    >;

void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2, 2, 2, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale>>>& instances)
{
    add_device_operation_instances(
        instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
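As an aside (not part of this diff), the registration pattern these instance files rely on can be illustrated with a small self-contained sketch: a std::tuple of concrete kernel types is expanded, and one heap-allocated object per element is appended to the caller's vector of the common base interface, which is the role add_device_operation_instances(instances, ..._knn_instance{}) plays for the ten XDL configurations above. All names below are illustrative stand-ins, not CK types.

```cpp
#include <iostream>
#include <memory>
#include <tuple>
#include <vector>

// Stand-in for the DeviceContractionMultipleD base interface.
struct DeviceOpBase
{
    virtual ~DeviceOpBase()          = default;
    virtual const char* Name() const = 0;
};

// Stand-ins for two concrete DeviceContractionMultipleD_Xdl_CShuffle configurations.
struct XdlConfig256x128x128 : DeviceOpBase { const char* Name() const override { return "256x128x128"; } };
struct XdlConfig128x128x64  : DeviceOpBase { const char* Name() const override { return "128x128x64"; } };

// Illustrative equivalent of add_device_operation_instances: expand the tuple of concrete
// types and push one default-constructed instance of each onto the caller's vector.
template <typename... Ops>
void add_instances(std::vector<std::unique_ptr<DeviceOpBase>>& out, std::tuple<Ops...>)
{
    (out.push_back(std::make_unique<Ops>()), ...);
}

using my_instance_list = std::tuple<XdlConfig256x128x128, XdlConfig128x128x64>;

int main()
{
    std::vector<std::unique_ptr<DeviceOpBase>> instances;
    add_instances(instances, my_instance_list{}); // mirrors add_device_contraction_scale_..._knn_instance
    for(const auto& op : instances)
        std::cout << op->Name() << '\n'; // a real caller would query each op for support and pick one
}
```

Keeping one translation unit per memory layout (knn/mkn/mnn) means clients link only the configurations they ask for, while the tuple keeps the list of tuning parameters in a single place per file.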
library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting. Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1

#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using F64         = double;
using Empty_Tuple = ck::Tuple<>;

template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Scale       = ck::tensor_operation::element_wise::Scale;

static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;

// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/k/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance = std::tuple<
// clang-format off
//#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
        DeviceContractionMultipleD_Xdl_CShuffle<2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 2, 16, 16, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>,
        DeviceContractionMultipleD_Xdl_CShuffle<2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>,
        DeviceContractionMultipleD_Xdl_CShuffle<2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>,
        DeviceContractionMultipleD_Xdl_CShuffle<2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>,
        DeviceContractionMultipleD_Xdl_CShuffle<2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 2, 16, 16, 4, 4, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>,
        DeviceContractionMultipleD_Xdl_CShuffle<2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>,
        DeviceContractionMultipleD_Xdl_CShuffle<2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 2, 16, 16, 4, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>,
        DeviceContractionMultipleD_Xdl_CShuffle<2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>,
        DeviceContractionMultipleD_Xdl_CShuffle<2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 2, 16, 16, 2, 4, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>,
        DeviceContractionMultipleD_Xdl_CShuffle<2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>
    // clang-format on
    >;

void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2, 2, 2, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale>>>& instances)
{
    add_device_operation_instances(
        instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting. Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1

#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using F64         = double;
using Empty_Tuple = ck::Tuple<>;

template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Scale       = ck::tensor_operation::element_wise::Scale;

static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;

// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/n/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance = std::tuple<
// clang-format off
//#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
        DeviceContractionMultipleD_Xdl_CShuffle<2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 1, 16, 16, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>,
        DeviceContractionMultipleD_Xdl_CShuffle<2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>,
        DeviceContractionMultipleD_Xdl_CShuffle<2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 1, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 8>, 1>,
        DeviceContractionMultipleD_Xdl_CShuffle<2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>,
        DeviceContractionMultipleD_Xdl_CShuffle<2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 1, 16, 16, 4, 4, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 8, 1, 16>, 1>,
        DeviceContractionMultipleD_Xdl_CShuffle<2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>,
        DeviceContractionMultipleD_Xdl_CShuffle<2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 1, 16, 16, 4, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>,
        DeviceContractionMultipleD_Xdl_CShuffle<2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>,
        DeviceContractionMultipleD_Xdl_CShuffle<2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 1, 16, 16, 2, 4, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>,
        DeviceContractionMultipleD_Xdl_CShuffle<2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>
    // clang-format on
    >;

void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2, 2, 2, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale>>>& instances)
{
    add_device_operation_instances(
        instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt
 add_instance_library(device_normalization_instance
-    device_normalization_f16_instance.cpp
-    device_normalization_f32_instance.cpp
+    device_layernorm2d_f16_instance.cpp
+    device_layernorm2d_f32_instance.cpp
+    device_layernorm4d_f16_instance.cpp
+    device_layernorm4d_f32_instance.cpp
+    device_groupnorm_f16_instance.cpp
+    device_groupnorm_f32_instance.cpp
+    device_groupnorm_swish_f16_instance.cpp
+    device_groupnorm_swish_f32_instance.cpp
+    device_groupnorm_swish_f16_f32_f32_f16_instance.cpp
 )
library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include "normalization_instance_common.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using Pass = ck::tensor_operation::element_wise::PassThrough;

void add_device_normalization_rank_5_3_f16_instances(
    std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Pass, 5, 3>>>& instances)
{
    add_device_operation_instances(instances, device_normalization_f16_instances<Pass, 5, 3>{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include "normalization_instance_common.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using Pass = ck::tensor_operation::element_wise::PassThrough;

void add_device_normalization_rank_5_3_f32_instances(
    std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Pass, 5, 3>>>& instances)
{
    add_device_operation_instances(instances, device_normalization_f32_instances<Pass, 5, 3>{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include "normalization_instance_common.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using Swish = ck::tensor_operation::element_wise::Swish;

void add_device_normalization_rank_5_3_swish_f16_f32_f32_f16_instances(
    std::vector<std::unique_ptr<DeviceNormalization<F16, F32, F32, F32, F16, Swish, 5, 3>>>& instances)
{
    add_device_operation_instances(instances,
                                   device_normalization_f16_f32_f32_f16_instances<Swish, 5, 3>{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include "normalization_instance_common.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using Swish = ck::tensor_operation::element_wise::Swish;

void add_device_normalization_rank_5_3_swish_f16_instances(
    std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Swish, 5, 3>>>& instances)
{
    add_device_operation_instances(instances, device_normalization_f16_instances<Swish, 5, 3>{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include "normalization_instance_common.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using Swish = ck::tensor_operation::element_wise::Swish;

void add_device_normalization_rank_5_3_swish_f32_instances(
    std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Swish, 5, 3>>>& instances)
{
    add_device_operation_instances(instances, device_normalization_f32_instances<Swish, 5, 3>{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include "normalization_instance_common.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using Pass = ck::tensor_operation::element_wise::PassThrough;

void add_device_normalization_rank_2_1_f16_instances(
    std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Pass, 2, 1>>>& instances)
{
    add_device_operation_instances(instances, device_normalization_f16_instances<Pass, 2, 1>{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include "normalization_instance_common.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using Pass = ck::tensor_operation::element_wise::PassThrough;

void add_device_normalization_rank_2_1_f32_instances(
    std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Pass, 2, 1>>>& instances)
{
    add_device_operation_instances(instances, device_normalization_f32_instances<Pass, 2, 1>{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include "normalization_instance_common.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using Pass = ck::tensor_operation::element_wise::PassThrough;

void add_device_normalization_rank_4_3_f16_instances(
    std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Pass, 4, 3>>>& instances)
{
    add_device_operation_instances(instances, device_normalization_f16_instances<Pass, 4, 3>{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include "normalization_instance_common.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using Pass = ck::tensor_operation::element_wise::PassThrough;

void add_device_normalization_rank_4_3_f32_instances(
    std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Pass, 4, 3>>>& instances)
{
    add_device_operation_instances(instances, device_normalization_f32_instances<Pass, 4, 3>{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp
deleted
100644 → 0
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using F16  = ck::half_t;
using F32  = float;
using Pass = ck::tensor_operation::element_wise::PassThrough;

template <typename OutElementwise, index_t Rank, index_t Reduce>
// clang-format off
using device_normalization_f16_instances = std::tuple<
    // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize>
    DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce,  128, 1,  128, 1,  1, 1, 1, 1, 1, 1, 1, 1>,   // irregular size
    DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce,  256, 1,  256, 1,  1, 1, 1, 1, 1, 1, 1, 1>,   // irregular size
    DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce,  512, 1,  512, 1,  1, 1, 1, 1, 1, 1, 1, 1>,   // irregular size
    DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 1024, 1, 1024, 1,  1, 1, 1, 1, 1, 1, 1, 1>,   // irregular size
    DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce,  256, 1,  256, 1,  2, 1, 2, 1, 2, 1, 2, 2>,   // irregular size
    DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce,  256, 1,  256, 1,  4, 1, 4, 1, 4, 1, 4, 4>,   // irregular size
    DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce,   64, 1,   64, 1,  8, 1, 8, 1, 8, 1, 8, 8>,
    DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce,  128, 1,  128, 1,  8, 1, 8, 1, 8, 1, 8, 8>,
    DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce,  128, 1,  128, 1, 16, 1, 8, 1, 8, 1, 8, 8>,
    DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce,  128, 1,  128, 1, 32, 1, 8, 1, 8, 1, 8, 8>,
    DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce,  256, 1,  256, 1,  8, 1, 8, 1, 8, 1, 8, 8>,
    DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce,  256, 1,  256, 1, 16, 1, 8, 1, 8, 1, 8, 8>,
    DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce,  256, 1,  256, 2, 16, 1, 8, 1, 8, 1, 8, 8>,
    DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce,  256, 1,  256, 1, 32, 1, 8, 1, 8, 1, 8, 8>,
    DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce,  512, 1,  512, 1,  8, 1, 8, 1, 8, 1, 8, 8>,
    DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce,  512, 1,  512, 1, 16, 1, 8, 1, 8, 1, 8, 8>,
    DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 1024, 1, 1024, 1,  8, 1, 8, 1, 8, 1, 8, 8>,
    DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 1024, 1, 1024, 1, 16, 1, 8, 1, 8, 1, 8, 8>
    >;
// clang-format on

void add_device_normalization_rank_2_1_f16_instances(
    std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Pass, 2, 1>>>& instances)
{
    add_device_operation_instances(instances, device_normalization_f16_instances<Pass, 2, 1>{});
}

void add_device_normalization_rank_4_3_f16_instances(
    std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Pass, 4, 3>>>& instances)
{
    add_device_operation_instances(instances, device_normalization_f16_instances<Pass, 4, 3>{});
}

void add_device_normalization_rank_5_3_f16_instances(
    std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Pass, 5, 3>>>& instances)
{
    add_device_operation_instances(instances, device_normalization_f16_instances<Pass, 5, 3>{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp → library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp"
 #include "ck/utility/data_type.hpp"
...
@@ -12,12 +14,37 @@ namespace tensor_operation {
 namespace device {
 namespace instance {

+using F16 = ck::half_t;
 using F32 = float;
-using Pass = ck::tensor_operation::element_wise::PassThrough;
+
+template <typename OutElementwise, index_t Rank, index_t Reduce>
+using device_normalization_f16_instances =
+    // clang-format off
+    std::tuple<
+        // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize>
+        // ... the same 18 DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, ...>
+        //     configurations listed in the deleted device_normalization_f16_instance.cpp above ...
+    // clang-format on
+    >;

 template <typename OutElementwise, index_t Rank, index_t Reduce>
-using device_layernorm_f32_instances = std::tuple<
+using device_normalization_f32_instances = std::tuple<
     // clang-format off
     // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize>
     DeviceNormalizationImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 128, 1, 128, 1, 1, 1, 1, 1, 1, 1, 1, 1>,   // irregular size
...
@@ -42,26 +69,31 @@ using device_layernorm_f32_instances = std::tuple<
     // clang-format on
     >;

-void add_device_normalization_rank_2_1_f32_instances(
-    std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Pass, 2, 1>>>& instances)
-{
-    add_device_operation_instances(instances, device_layernorm_f32_instances<Pass, 2, 1>{});
-}
-
-void add_device_normalization_rank_4_3_f32_instances(
-    std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Pass, 4, 3>>>& instances)
-{
-    add_device_operation_instances(instances, device_layernorm_f32_instances<Pass, 4, 3>{});
-}
-
-void add_device_normalization_rank_5_3_f32_instances(
-    std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Pass, 5, 3>>>& instances)
-{
-    add_device_operation_instances(instances, device_layernorm_f32_instances<Pass, 5, 3>{});
-}
+template <typename OutElementwise, index_t Rank, index_t Reduce>
+using device_normalization_f16_f32_f32_f16_instances = std::tuple<
+    // clang-format off
+    // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize>
+    DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce,  128, 1,  128, 1,  1, 1, 1, 1, 1, 1, 1, 1>,   // irregular size
+    DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce,  256, 1,  256, 1,  1, 1, 1, 1, 1, 1, 1, 1>,   // irregular size
+    DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce,  512, 1,  512, 1,  1, 1, 1, 1, 1, 1, 1, 1>,   // irregular size
+    DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce, 1024, 1, 1024, 1,  1, 1, 1, 1, 1, 1, 1, 1>,   // irregular size
+    DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce,  256, 1,  256, 1,  2, 1, 2, 1, 2, 1, 2, 2>,   // irregular size
+    DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce,  128, 1,  128, 1,  4, 1, 4, 1, 4, 1, 4, 4>,
+    DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce,  128, 1,  128, 1,  8, 1, 4, 1, 4, 1, 4, 4>,
+    DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce,  128, 1,  128, 1, 16, 1, 4, 1, 4, 1, 4, 4>,
+    DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce,  128, 1,  128, 1, 32, 1, 4, 1, 4, 1, 4, 4>,
+    DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce,  256, 1,  256, 1,  4, 1, 4, 1, 4, 1, 4, 4>,
+    DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce,  256, 1,  256, 1,  8, 1, 4, 1, 4, 1, 4, 4>,
+    DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce,  256, 1,  256, 1, 16, 1, 4, 1, 4, 1, 4, 4>,
+    DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce,  256, 1,  256, 2, 16, 1, 4, 1, 4, 1, 4, 4>,
+    DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce,  256, 1,  256, 1, 32, 1, 4, 1, 4, 1, 4, 4>,
+    DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce,  512, 1,  512, 1,  4, 1, 4, 1, 4, 1, 4, 4>,
+    DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce,  512, 1,  512, 1,  8, 1, 4, 1, 4, 1, 4, 4>,
+    DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce,  512, 1,  512, 2,  8, 1, 4, 1, 4, 1, 4, 4>,
+    DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce, 1024, 1, 1024, 1,  4, 1, 4, 1, 4, 1, 4, 4>,
+    DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce, 1024, 1, 1024, 1,  8, 1, 4, 1, 4, 1, 4, 4>
+    // clang-format on
+    >;

 } // namespace instance
 } // namespace device
...
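The new normalization_instance_common.hpp keeps one templated tuple of DeviceNormalizationImpl configurations per data-type combination and lets each thin translation unit stamp it out for a concrete elementwise operator, rank, and reduce-dimension count (for example <Pass, 2, 1> for layernorm2d and <Swish, 5, 3> for the groupnorm+swish files). A minimal self-contained sketch of that pattern, using made-up type names rather than the CK ones:

```cpp
#include <cstddef>
#include <tuple>

// Stand-ins for the elementwise operators selected by the per-layout .cpp files.
struct PassThrough {};
struct Swish {};

// Stand-in for DeviceNormalizationImpl<...>: a kernel config parameterized by the
// output elementwise op and the (Rank, NumReduceDim) pair chosen at instantiation time.
template <typename OutElementwise, std::size_t Rank, std::size_t Reduce, std::size_t BlockSize>
struct NormKernelConfig {};

// One shared list in the "common header"; the operator, rank, and reduce count stay open.
template <typename OutElementwise, std::size_t Rank, std::size_t Reduce>
using norm_instances = std::tuple<NormKernelConfig<OutElementwise, Rank, Reduce, 128>,
                                  NormKernelConfig<OutElementwise, Rank, Reduce, 256>,
                                  NormKernelConfig<OutElementwise, Rank, Reduce, 1024>>;

// Each translation unit then closes over one combination, as the new .cpp files do.
using layernorm2d_list     = norm_instances<PassThrough, 2, 1>; // cf. device_layernorm2d_*_instance.cpp
using groupnorm_swish_list = norm_instances<Swish, 5, 3>;       // cf. device_groupnorm_swish_*_instance.cpp

int main()
{
    static_assert(std::tuple_size<layernorm2d_list>::value == 3, "all configs instantiated");
    static_assert(std::tuple_size<groupnorm_swish_list>::value == 3, "all configs instantiated");
}
```

The payoff of this layout is that adding a new data-type combination or a new rank/reduce pairing only touches one header plus a one-function .cpp file, as the groupnorm and layernorm additions in this commit illustrate.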
profiler/include/profiler/profile_groupnorm_impl.hpp
...
@@ -190,9 +190,9 @@ bool profile_groupnorm_impl(int do_verification,
     if(time_kernel)
     {
-        LogRange(std::cout << "length = ", length, ",") << ", ";
-        std::cout << " num_kernel = " << num_kernel << ", best perf = " << best_avg_time << " ms, "
+        LogRange(std::cout << "length = ", length, ",") << std::endl;
+        std::cout << "best perf = " << best_avg_time << " ms, "
                   << best_gb_per_sec << " GB/s, " << best_instance_name << std::endl;
     }
     if(num_kernel == 0)
...