Merge branch 'amd-develop' into amd-master

e70a4d19 · Jun Liu · ce72f286 · 0dacd895 · e70a4d19 · e70a4d19
Commit e70a4d19 authored Dec 13, 2023 by Jun Liu
20 changed files
--- a/example/26_contraction/contraction_bilinear_xdl_fp16_compute_fp32.cpp
+++ b/example/26_contraction/contraction_bilinear_xdl_fp16_compute_fp32.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "common_instances.hpp"
+using ADataType        = F16; // Example configuration starts here. A data type.
+using BDataType        = F16; // B data type.
+using AccDataType      = F32; // Accumulator data type.
+using CShuffleDataType = F16; // Data type passed in the CShuffle slot of the instance templates.
+using DDataType        = F16; // D data type.
+using DsDataType       = ck::Tuple<DDataType>; // Tuple of D types; here it holds the single DDataType.
+using EDataType        = F16; // E data type.
+using ComputeDataType  = F32; // Compute type is F32 although A and B are F16.
+static constexpr ck::index_t NumDimM = 2; // Passed as the first template argument of each instance alias.
+static constexpr ck::index_t NumDimN = 2; // Passed as the second template argument.
+static constexpr ck::index_t NumDimK = 2; // Passed as the third template argument.
+using AElementOp   = ck::tensor_operation::element_wise::PassThrough; // Element-wise op for A.
+using BElementOp   = ck::tensor_operation::element_wise::PassThrough; // Element-wise op for B.
+using CDEElementOp = ck::tensor_operation::element_wise::Bilinear; // Element-wise op for CDE.
+using DeviceOpInstanceKKNN = DeviceOpInstanceKK_Generic<NumDimM, // KK_Generic template fed the 13 configuration values declared earlier in this file.
+                                                        NumDimN,
+                                                        NumDimK,
+                                                        ADataType,
+                                                        BDataType,
+                                                        AccDataType,
+                                                        CShuffleDataType,
+                                                        DsDataType,
+                                                        EDataType,
+                                                        ComputeDataType,
+                                                        AElementOp,
+                                                        BElementOp,
+                                                        CDEElementOp>;
+using DeviceOpInstanceKNNN = DeviceOpInstanceKN_Generic<NumDimM, // Same argument list, KN_Generic template.
+                                                        NumDimN,
+                                                        NumDimK,
+                                                        ADataType,
+                                                        BDataType,
+                                                        AccDataType,
+                                                        CShuffleDataType,
+                                                        DsDataType,
+                                                        EDataType,
+                                                        ComputeDataType,
+                                                        AElementOp,
+                                                        BElementOp,
+                                                        CDEElementOp>;
+using DeviceOpInstanceMKNN = DeviceOpInstanceMK_Generic<NumDimM, // Same argument list, MK_Generic template.
+                                                        NumDimN,
+                                                        NumDimK,
+                                                        ADataType,
+                                                        BDataType,
+                                                        AccDataType,
+                                                        CShuffleDataType,
+                                                        DsDataType,
+                                                        EDataType,
+                                                        ComputeDataType,
+                                                        AElementOp,
+                                                        BElementOp,
+                                                        CDEElementOp>;
+using DeviceOpInstanceMNNN = DeviceOpInstanceMN_Generic<NumDimM, // Same argument list, MN_Generic template.
+                                                        NumDimN,
+                                                        NumDimK,
+                                                        ADataType,
+                                                        BDataType,
+                                                        AccDataType,
+                                                        CShuffleDataType,
+                                                        DsDataType,
+                                                        EDataType,
+                                                        ComputeDataType,
+                                                        AElementOp,
+                                                        BElementOp,
+                                                        CDEElementOp>;
+using DeviceOpInstance = DeviceOpInstanceKKNN; // Selects the KK variant. NOTE(review): presumably read by the .inc included next; what KK/KN/MK/MN denote is not visible here - confirm in common_instances.hpp.
+#include "run_contraction_bilinear_example.inc"
+// Program entry point: forwards the command line unchanged to the bilinear
+// contraction example driver and returns whatever that driver returns.
+// The driver is not defined in the visible lines; presumably it comes from
+// run_contraction_bilinear_example.inc, included directly before main.
+int main(int argc, char* argv[])
+{
+    const auto status = run_contraction_bilinear_example(argc, argv);
+    return status;
+}
--- a/example/26_contraction/contraction_bilinear_xdl_fp32.cpp
+++ b/example/26_contraction/contraction_bilinear_xdl_fp32.cpp
--- a/example/26_contraction/contraction_bilinear_xdl_fp32_compute_bf16.cpp
+++ b/example/26_contraction/contraction_bilinear_xdl_fp32_compute_bf16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "common_instances.hpp"
+using ADataType        = F32; // Example configuration starts here. A data type.
+using BDataType        = F32; // B data type.
+using AccDataType      = F32; // Accumulator data type.
+using CShuffleDataType = F32; // Data type passed in the CShuffle slot of the instance templates.
+using DDataType        = F32; // D data type.
+using DsDataType       = ck::Tuple<DDataType>; // Tuple of D types; here it holds the single DDataType.
+using EDataType        = F32; // E data type.
+using ComputeDataType  = BF16; // Compute type is BF16 although every other type here is F32.
+static constexpr ck::index_t NumDimM = 2; // Passed as the first template argument of each instance alias.
+static constexpr ck::index_t NumDimN = 2; // Passed as the second template argument.
+static constexpr ck::index_t NumDimK = 2; // Passed as the third template argument.
+using AElementOp   = ck::tensor_operation::element_wise::PassThrough; // Element-wise op for A.
+using BElementOp   = ck::tensor_operation::element_wise::PassThrough; // Element-wise op for B.
+using CDEElementOp = ck::tensor_operation::element_wise::Bilinear; // Element-wise op for CDE.
+using DeviceOpInstanceKKNN = DeviceOpInstanceKK_Generic<NumDimM, // KK_Generic template fed the 13 configuration values declared earlier in this file.
+                                                        NumDimN,
+                                                        NumDimK,
+                                                        ADataType,
+                                                        BDataType,
+                                                        AccDataType,
+                                                        CShuffleDataType,
+                                                        DsDataType,
+                                                        EDataType,
+                                                        ComputeDataType,
+                                                        AElementOp,
+                                                        BElementOp,
+                                                        CDEElementOp>;
+using DeviceOpInstanceKNNN = DeviceOpInstanceKN_Generic<NumDimM, // Same argument list, KN_Generic template.
+                                                        NumDimN,
+                                                        NumDimK,
+                                                        ADataType,
+                                                        BDataType,
+                                                        AccDataType,
+                                                        CShuffleDataType,
+                                                        DsDataType,
+                                                        EDataType,
+                                                        ComputeDataType,
+                                                        AElementOp,
+                                                        BElementOp,
+                                                        CDEElementOp>;
+using DeviceOpInstanceMKNN = DeviceOpInstanceMK_Generic<NumDimM, // Same argument list, MK_Generic template.
+                                                        NumDimN,
+                                                        NumDimK,
+                                                        ADataType,
+                                                        BDataType,
+                                                        AccDataType,
+                                                        CShuffleDataType,
+                                                        DsDataType,
+                                                        EDataType,
+                                                        ComputeDataType,
+                                                        AElementOp,
+                                                        BElementOp,
+                                                        CDEElementOp>;
+using DeviceOpInstanceMNNN = DeviceOpInstanceMN_Generic<NumDimM, // Same argument list, MN_Generic template.
+                                                        NumDimN,
+                                                        NumDimK,
+                                                        ADataType,
+                                                        BDataType,
+                                                        AccDataType,
+                                                        CShuffleDataType,
+                                                        DsDataType,
+                                                        EDataType,
+                                                        ComputeDataType,
+                                                        AElementOp,
+                                                        BElementOp,
+                                                        CDEElementOp>;
+using DeviceOpInstance = DeviceOpInstanceKKNN; // Selects the KK variant. NOTE(review): presumably read by the .inc included next; what KK/KN/MK/MN denote is not visible here - confirm in common_instances.hpp.
+#include "run_contraction_bilinear_example.inc"
+// Program entry point: forwards the command line unchanged to the bilinear
+// contraction example driver and returns whatever that driver returns.
+// The driver is not defined in the visible lines; presumably it comes from
+// run_contraction_bilinear_example.inc, included directly before main.
+int main(int argc, char* argv[])
+{
+    const auto status = run_contraction_bilinear_example(argc, argv);
+    return status;
+}
--- a/example/26_contraction/contraction_bilinear_xdl_fp32_compute_fp16.cpp
+++ b/example/26_contraction/contraction_bilinear_xdl_fp32_compute_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "common_instances.hpp"
+using ADataType        = F32; // Example configuration starts here. A data type.
+using BDataType        = F32; // B data type.
+using AccDataType      = F32; // Accumulator data type.
+using CShuffleDataType = F32; // Data type passed in the CShuffle slot of the instance templates.
+using DDataType        = F32; // D data type.
+using DsDataType       = ck::Tuple<DDataType>; // Tuple of D types; here it holds the single DDataType.
+using EDataType        = F32; // E data type.
+using ComputeDataType  = F16; // Compute type is F16 although every other type here is F32.
+static constexpr ck::index_t NumDimM = 2; // Passed as the first template argument of each instance alias.
+static constexpr ck::index_t NumDimN = 2; // Passed as the second template argument.
+static constexpr ck::index_t NumDimK = 2; // Passed as the third template argument.
+using AElementOp   = ck::tensor_operation::element_wise::PassThrough; // Element-wise op for A.
+using BElementOp   = ck::tensor_operation::element_wise::PassThrough; // Element-wise op for B.
+using CDEElementOp = ck::tensor_operation::element_wise::Bilinear; // Element-wise op for CDE.
+using DeviceOpInstanceKKNN = DeviceOpInstanceKK_Generic<NumDimM, // KK_Generic template fed the 13 configuration values declared earlier in this file.
+                                                        NumDimN,
+                                                        NumDimK,
+                                                        ADataType,
+                                                        BDataType,
+                                                        AccDataType,
+                                                        CShuffleDataType,
+                                                        DsDataType,
+                                                        EDataType,
+                                                        ComputeDataType,
+                                                        AElementOp,
+                                                        BElementOp,
+                                                        CDEElementOp>;
+using DeviceOpInstanceKNNN = DeviceOpInstanceKN_Generic<NumDimM, // Same argument list, KN_Generic template.
+                                                        NumDimN,
+                                                        NumDimK,
+                                                        ADataType,
+                                                        BDataType,
+                                                        AccDataType,
+                                                        CShuffleDataType,
+                                                        DsDataType,
+                                                        EDataType,
+                                                        ComputeDataType,
+                                                        AElementOp,
+                                                        BElementOp,
+                                                        CDEElementOp>;
+using DeviceOpInstanceMKNN = DeviceOpInstanceMK_Generic<NumDimM, // Same argument list, MK_Generic template.
+                                                        NumDimN,
+                                                        NumDimK,
+                                                        ADataType,
+                                                        BDataType,
+                                                        AccDataType,
+                                                        CShuffleDataType,
+                                                        DsDataType,
+                                                        EDataType,
+                                                        ComputeDataType,
+                                                        AElementOp,
+                                                        BElementOp,
+                                                        CDEElementOp>;
+using DeviceOpInstanceMNNN = DeviceOpInstanceMN_Generic<NumDimM, // Same argument list, MN_Generic template.
+                                                        NumDimN,
+                                                        NumDimK,
+                                                        ADataType,
+                                                        BDataType,
+                                                        AccDataType,
+                                                        CShuffleDataType,
+                                                        DsDataType,
+                                                        EDataType,
+                                                        ComputeDataType,
+                                                        AElementOp,
+                                                        BElementOp,
+                                                        CDEElementOp>;
+using DeviceOpInstance = DeviceOpInstanceKKNN; // Selects the KK variant. NOTE(review): presumably read by the .inc included next; what KK/KN/MK/MN denote is not visible here - confirm in common_instances.hpp.
+#include "run_contraction_bilinear_example.inc"
+// Program entry point: forwards the command line unchanged to the bilinear
+// contraction example driver and returns whatever that driver returns.
+// The driver is not defined in the visible lines; presumably it comes from
+// run_contraction_bilinear_example.inc, included directly before main.
+int main(int argc, char* argv[])
+{
+    const auto status = run_contraction_bilinear_example(argc, argv);
+    return status;
+}
--- a/example/26_contraction/contraction_bilinear_xdl_fp64.cpp
+++ b/example/26_contraction/contraction_bilinear_xdl_fp64.cpp
--- a/example/26_contraction/contraction_bilinear_xdl_fp64_compute_fp32.cpp
+++ b/example/26_contraction/contraction_bilinear_xdl_fp64_compute_fp32.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "common_instances.hpp"
+using ADataType        = F64; // Example configuration starts here. A data type.
+using BDataType        = F64; // B data type.
+using AccDataType      = F32; // Accumulator data type. NOTE(review): F32 while A/B/D/E are F64 - presumably intended for this compute_fp32 variant; confirm.
+using CShuffleDataType = F64; // Data type passed in the CShuffle slot of the instance templates.
+using DDataType        = F64; // D data type.
+using DsDataType       = ck::Tuple<DDataType>; // Tuple of D types; here it holds the single DDataType.
+using EDataType        = F64; // E data type.
+using ComputeDataType  = F32; // Compute type is F32 although A and B are F64.
+static constexpr ck::index_t NumDimM = 2; // Passed as the first template argument of each instance alias.
+static constexpr ck::index_t NumDimN = 2; // Passed as the second template argument.
+static constexpr ck::index_t NumDimK = 2; // Passed as the third template argument.
+using AElementOp   = ck::tensor_operation::element_wise::PassThrough; // Element-wise op for A.
+using BElementOp   = ck::tensor_operation::element_wise::PassThrough; // Element-wise op for B.
+using CDEElementOp = ck::tensor_operation::element_wise::Bilinear; // Element-wise op for CDE.
+using DeviceOpInstanceKKNN = DeviceOpInstanceKK_FP64<NumDimM, // KK_FP64 template fed the 13 configuration values declared earlier in this file.
+                                                     NumDimN,
+                                                     NumDimK,
+                                                     ADataType,
+                                                     BDataType,
+                                                     AccDataType,
+                                                     CShuffleDataType,
+                                                     DsDataType,
+                                                     EDataType,
+                                                     ComputeDataType,
+                                                     AElementOp,
+                                                     BElementOp,
+                                                     CDEElementOp>;
+using DeviceOpInstanceKNNN = DeviceOpInstanceKN_FP64<NumDimM, // Same argument list, KN_FP64 template.
+                                                     NumDimN,
+                                                     NumDimK,
+                                                     ADataType,
+                                                     BDataType,
+                                                     AccDataType,
+                                                     CShuffleDataType,
+                                                     DsDataType,
+                                                     EDataType,
+                                                     ComputeDataType,
+                                                     AElementOp,
+                                                     BElementOp,
+                                                     CDEElementOp>;
+using DeviceOpInstanceMKNN = DeviceOpInstanceMK_FP64<NumDimM, // Same argument list, MK_FP64 template.
+                                                     NumDimN,
+                                                     NumDimK,
+                                                     ADataType,
+                                                     BDataType,
+                                                     AccDataType,
+                                                     CShuffleDataType,
+                                                     DsDataType,
+                                                     EDataType,
+                                                     ComputeDataType,
+                                                     AElementOp,
+                                                     BElementOp,
+                                                     CDEElementOp>;
+using DeviceOpInstanceMNNN = DeviceOpInstanceMN_FP64<NumDimM, // Same argument list, MN_FP64 template.
+                                                     NumDimN,
+                                                     NumDimK,
+                                                     ADataType,
+                                                     BDataType,
+                                                     AccDataType,
+                                                     CShuffleDataType,
+                                                     DsDataType,
+                                                     EDataType,
+                                                     ComputeDataType,
+                                                     AElementOp,
+                                                     BElementOp,
+                                                     CDEElementOp>;
+using DeviceOpInstance = DeviceOpInstanceKKNN; // Selects the KK variant. NOTE(review): presumably read by the .inc included next; what KK/KN/MK/MN denote is not visible here - confirm in common_instances.hpp.
+#include "run_contraction_bilinear_example.inc"
+// Program entry point: forwards the command line unchanged to the bilinear
+// contraction example driver and returns whatever that driver returns.
+// The driver is not defined in the visible lines; presumably it comes from
+// run_contraction_bilinear_example.inc, included directly before main.
+int main(int argc, char* argv[])
+{
+    const auto status = run_contraction_bilinear_example(argc, argv);
+    return status;
+}
--- a/example/26_contraction/contraction_scale_xdl_bf16_compute_fp32.cpp
+++ b/example/26_contraction/contraction_scale_xdl_bf16_compute_fp32.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "common_instances.hpp"
+using ADataType        = BF16; // Example configuration starts here. A data type.
+using BDataType        = BF16; // B data type.
+using AccDataType      = F32; // Accumulator data type.
+using CShuffleDataType = BF16; // Data type passed in the CShuffle slot of the instance templates.
+using DsDataType       = ck::Tuple<>; // Tuple of D types; empty in this example.
+using EDataType        = BF16; // E data type.
+using ComputeDataType  = F32; // Compute type is F32 although A and B are BF16.
+static constexpr ck::index_t NumDimM = 2; // Passed as the first template argument of each instance alias.
+static constexpr ck::index_t NumDimN = 2; // Passed as the second template argument.
+static constexpr ck::index_t NumDimK = 2; // Passed as the third template argument.
+using AElementOp   = ck::tensor_operation::element_wise::PassThrough; // Element-wise op for A.
+using BElementOp   = ck::tensor_operation::element_wise::PassThrough; // Element-wise op for B.
+using CDEElementOp = ck::tensor_operation::element_wise::Scale; // Element-wise op for CDE.
+using DeviceOpInstanceKKN = DeviceOpInstanceKK_Generic<NumDimM, // KK_Generic template fed the 13 configuration values declared earlier in this file.
+                                                       NumDimN,
+                                                       NumDimK,
+                                                       ADataType,
+                                                       BDataType,
+                                                       AccDataType,
+                                                       CShuffleDataType,
+                                                       DsDataType,
+                                                       EDataType,
+                                                       ComputeDataType,
+                                                       AElementOp,
+                                                       BElementOp,
+                                                       CDEElementOp>;
+using DeviceOpInstanceKNN = DeviceOpInstanceKN_Generic<NumDimM, // Same argument list, KN_Generic template.
+                                                       NumDimN,
+                                                       NumDimK,
+                                                       ADataType,
+                                                       BDataType,
+                                                       AccDataType,
+                                                       CShuffleDataType,
+                                                       DsDataType,
+                                                       EDataType,
+                                                       ComputeDataType,
+                                                       AElementOp,
+                                                       BElementOp,
+                                                       CDEElementOp>;
+using DeviceOpInstanceMKN = DeviceOpInstanceMK_Generic<NumDimM, // Same argument list, MK_Generic template.
+                                                       NumDimN,
+                                                       NumDimK,
+                                                       ADataType,
+                                                       BDataType,
+                                                       AccDataType,
+                                                       CShuffleDataType,
+                                                       DsDataType,
+                                                       EDataType,
+                                                       ComputeDataType,
+                                                       AElementOp,
+                                                       BElementOp,
+                                                       CDEElementOp>;
+using DeviceOpInstanceMNN = DeviceOpInstanceMN_Generic<NumDimM, // Same argument list, MN_Generic template.
+                                                       NumDimN,
+                                                       NumDimK,
+                                                       ADataType,
+                                                       BDataType,
+                                                       AccDataType,
+                                                       CShuffleDataType,
+                                                       DsDataType,
+                                                       EDataType,
+                                                       ComputeDataType,
+                                                       AElementOp,
+                                                       BElementOp,
+                                                       CDEElementOp>;
+using DeviceOpInstance = DeviceOpInstanceKKN; // Selects the KK variant. NOTE(review): presumably read by the .inc included next; what KK/KN/MK/MN denote is not visible here - confirm in common_instances.hpp.
+#include "run_contraction_scale_example.inc"
+// Program entry point: forwards the command line unchanged to the scale
+// contraction example driver and returns whatever that driver returns.
+// The driver is not defined in the visible lines; presumably it comes from
+// run_contraction_scale_example.inc, included directly before main.
+int main(int argc, char* argv[])
+{
+    const auto status = run_contraction_scale_example(argc, argv);
+    return status;
+}
--- a/example/26_contraction/contraction_scale_xdl_fp16_compute_fp32.cpp
+++ b/example/26_contraction/contraction_scale_xdl_fp16_compute_fp32.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "common_instances.hpp"
+using ADataType        = F16; // Example configuration starts here. A data type.
+using BDataType        = F16; // B data type.
+using AccDataType      = F32; // Accumulator data type.
+using CShuffleDataType = F16; // Data type passed in the CShuffle slot of the instance templates.
+using DsDataType       = ck::Tuple<>; // Tuple of D types; empty in this example.
+using EDataType        = F16; // E data type.
+using ComputeDataType  = F32; // Compute type is F32 although A and B are F16.
+static constexpr ck::index_t NumDimM = 2; // Passed as the first template argument of each instance alias.
+static constexpr ck::index_t NumDimN = 2; // Passed as the second template argument.
+static constexpr ck::index_t NumDimK = 2; // Passed as the third template argument.
+using AElementOp   = ck::tensor_operation::element_wise::PassThrough; // Element-wise op for A.
+using BElementOp   = ck::tensor_operation::element_wise::PassThrough; // Element-wise op for B.
+using CDEElementOp = ck::tensor_operation::element_wise::Scale; // Element-wise op for CDE.
+using DeviceOpInstanceKKN = DeviceOpInstanceKK_Generic<NumDimM, // KK_Generic template fed the 13 configuration values declared earlier in this file.
+                                                       NumDimN,
+                                                       NumDimK,
+                                                       ADataType,
+                                                       BDataType,
+                                                       AccDataType,
+                                                       CShuffleDataType,
+                                                       DsDataType,
+                                                       EDataType,
+                                                       ComputeDataType,
+                                                       AElementOp,
+                                                       BElementOp,
+                                                       CDEElementOp>;
+using DeviceOpInstanceKNN = DeviceOpInstanceKN_Generic<NumDimM, // Same argument list, KN_Generic template.
+                                                       NumDimN,
+                                                       NumDimK,
+                                                       ADataType,
+                                                       BDataType,
+                                                       AccDataType,
+                                                       CShuffleDataType,
+                                                       DsDataType,
+                                                       EDataType,
+                                                       ComputeDataType,
+                                                       AElementOp,
+                                                       BElementOp,
+                                                       CDEElementOp>;
+using DeviceOpInstanceMKN = DeviceOpInstanceMK_Generic<NumDimM, // Same argument list, MK_Generic template.
+                                                       NumDimN,
+                                                       NumDimK,
+                                                       ADataType,
+                                                       BDataType,
+                                                       AccDataType,
+                                                       CShuffleDataType,
+                                                       DsDataType,
+                                                       EDataType,
+                                                       ComputeDataType,
+                                                       AElementOp,
+                                                       BElementOp,
+                                                       CDEElementOp>;
+using DeviceOpInstanceMNN = DeviceOpInstanceMN_Generic<NumDimM, // Same argument list, MN_Generic template.
+                                                       NumDimN,
+                                                       NumDimK,
+                                                       ADataType,
+                                                       BDataType,
+                                                       AccDataType,
+                                                       CShuffleDataType,
+                                                       DsDataType,
+                                                       EDataType,
+                                                       ComputeDataType,
+                                                       AElementOp,
+                                                       BElementOp,
+                                                       CDEElementOp>;
+using DeviceOpInstance = DeviceOpInstanceKKN; // Selects the KK variant. NOTE(review): presumably read by the .inc included next; what KK/KN/MK/MN denote is not visible here - confirm in common_instances.hpp.
+#include "run_contraction_scale_example.inc"
+// Program entry point: forwards the command line unchanged to the scale
+// contraction example driver and returns whatever that driver returns.
+// The driver is not defined in the visible lines; presumably it comes from
+// run_contraction_scale_example.inc, included directly before main.
+int main(int argc, char* argv[])
+{
+    const auto status = run_contraction_scale_example(argc, argv);
+    return status;
+}
--- a/example/26_contraction/contraction_scale_xdl_fp32.cpp
+++ b/example/26_contraction/contraction_scale_xdl_fp32.cpp
--- a/example/26_contraction/contraction_scale_xdl_fp32_compute_bf16.cpp
+++ b/example/26_contraction/contraction_scale_xdl_fp32_compute_bf16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "common_instances.hpp"
+using ADataType        = F32; // Example configuration starts here. A data type.
+using BDataType        = F32; // B data type.
+using AccDataType      = F32; // Accumulator data type.
+using CShuffleDataType = F32; // Data type passed in the CShuffle slot of the instance templates.
+using DsDataType       = ck::Tuple<>; // Tuple of D types; empty in this example.
+using EDataType        = F32; // E data type.
+using ComputeDataType  = BF16; // Compute type is BF16 although every other type here is F32.
+static constexpr ck::index_t NumDimM = 2; // Passed as the first template argument of each instance alias.
+static constexpr ck::index_t NumDimN = 2; // Passed as the second template argument.
+static constexpr ck::index_t NumDimK = 2; // Passed as the third template argument.
+using AElementOp   = ck::tensor_operation::element_wise::PassThrough; // Element-wise op for A.
+using BElementOp   = ck::tensor_operation::element_wise::PassThrough; // Element-wise op for B.
+using CDEElementOp = ck::tensor_operation::element_wise::Scale; // Element-wise op for CDE.
+using DeviceOpInstanceKKN = DeviceOpInstanceKK_Generic<NumDimM,
+                                                       NumDimN,
+                                                       NumDimK,
+                                                       ADataType,
+                                                       BDataType,
+                                                       AccDataType,
+                                                       CShuffleDataType,
+                                                       DsDataType,
+                                                       EDataType,
+                                                       ComputeDataType,
+                                                       AElementOp,
+                                                       BElementOp,
+                                                       CDEElementOp>;
+using DeviceOpInstanceKNN = DeviceOpInstanceKN_Generic<NumDimM,
+                                                       NumDimN,
+                                                       NumDimK,
+                                                       ADataType,
+                                                       BDataType,
+                                                       AccDataType,
+                                                       CShuffleDataType,
+                                                       DsDataType,
+                                                       EDataType,
+                                                       ComputeDataType,
+                                                       AElementOp,
+                                                       BElementOp,
+                                                       CDEElementOp>;
+using DeviceOpInstanceMKN = DeviceOpInstanceMK_Generic<NumDimM, // MK_Generic instantiation; defined here but not the alias picked as DeviceOpInstance
+                                                       NumDimN, // NOTE(review): "MK" presumably encodes an A/B layout -- confirm in common_instances.hpp
+                                                       NumDimK,
+                                                       ADataType, // the *DataType / *ElementOp names are declared earlier in this file (outside this view)
+                                                       BDataType,
+                                                       AccDataType,
+                                                       CShuffleDataType,
+                                                       DsDataType,
+                                                       EDataType,
+                                                       ComputeDataType,
+                                                       AElementOp,
+                                                       BElementOp,
+                                                       CDEElementOp>; // same 13 arguments, in the same order, as the KNN/MNN aliases
+using DeviceOpInstanceMNN = DeviceOpInstanceMN_Generic<NumDimM, // MN_Generic instantiation; defined here but not the alias picked as DeviceOpInstance
+                                                       NumDimN, // NOTE(review): "MN" presumably encodes an A/B layout -- confirm in common_instances.hpp
+                                                       NumDimK,
+                                                       ADataType, // the *DataType / *ElementOp names are declared earlier in this file (outside this view)
+                                                       BDataType,
+                                                       AccDataType,
+                                                       CShuffleDataType,
+                                                       DsDataType,
+                                                       EDataType,
+                                                       ComputeDataType,
+                                                       AElementOp,
+                                                       BElementOp,
+                                                       CDEElementOp>; // same 13 arguments, in the same order, as the KNN/MKN aliases
+using DeviceOpInstance = DeviceOpInstanceKKN; // picks the KKN alias as the active instance; KNN/MKN/MNN are defined but unused here
+#include "run_contraction_scale_example.inc" // textual include placed after the aliases; presumably consumes them and defines run_contraction_scale_example -- confirm in the .inc
+int main(int argc, char* argv[]) { return run_contraction_scale_example(argc, argv); } // forwards the command line unchanged; the callee's return value becomes the exit status
--- a/example/26_contraction/contraction_scale_xdl_fp32_compute_fp16.cpp
+++ b/example/26_contraction/contraction_scale_xdl_fp32_compute_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "common_instances.hpp"
+using ADataType        = F32; // forwarded as the ADataType argument of every DeviceOpInstance* alias below
+using BDataType        = F32; // forwarded as the BDataType argument
+using AccDataType      = F32; // forwarded as the AccDataType argument
+using CShuffleDataType = F32; // forwarded as the CShuffleDataType argument; its exact role is defined by the instance templates, not shown here
+using DsDataType       = ck::Tuple<>; // empty tuple: no D element types are listed for this example
+using EDataType        = F32; // forwarded as the EDataType argument
+using ComputeDataType  = F16; // F16 while the A/B/E types are F32, matching the "_fp32_compute_fp16" file name
+static constexpr ck::index_t NumDimM = 2; // NumDimM/N/K are the first three arguments of every instance alias below
+static constexpr ck::index_t NumDimN = 2;
+static constexpr ck::index_t NumDimK = 2;
+using AElementOp   = ck::tensor_operation::element_wise::PassThrough; // PassThrough fills the A element-op slot
+using BElementOp   = ck::tensor_operation::element_wise::PassThrough; // PassThrough fills the B element-op slot
+using CDEElementOp = ck::tensor_operation::element_wise::Scale; // Scale fills the CDE element-op slot, in line with the "contraction_scale" file name
+using DeviceOpInstanceKKN = DeviceOpInstanceKK_Generic<NumDimM, // KK_Generic instantiation; this is the alias later chosen as DeviceOpInstance
+                                                       NumDimN, // NOTE(review): "KK" presumably encodes an A/B layout -- confirm in common_instances.hpp
+                                                       NumDimK,
+                                                       ADataType, // data-type aliases declared at the top of this file
+                                                       BDataType,
+                                                       AccDataType,
+                                                       CShuffleDataType,
+                                                       DsDataType,
+                                                       EDataType,
+                                                       ComputeDataType,
+                                                       AElementOp, // element-op aliases declared at the top of this file
+                                                       BElementOp,
+                                                       CDEElementOp>;
+using DeviceOpInstanceKNN = DeviceOpInstanceKN_Generic<NumDimM, // KN_Generic instantiation; defined but not selected as DeviceOpInstance in this file
+                                                       NumDimN, // NOTE(review): "KN" presumably encodes an A/B layout -- confirm in common_instances.hpp
+                                                       NumDimK,
+                                                       ADataType, // data-type aliases declared at the top of this file
+                                                       BDataType,
+                                                       AccDataType,
+                                                       CShuffleDataType,
+                                                       DsDataType,
+                                                       EDataType,
+                                                       ComputeDataType,
+                                                       AElementOp, // element-op aliases declared at the top of this file
+                                                       BElementOp,
+                                                       CDEElementOp>;
+using DeviceOpInstanceMKN = DeviceOpInstanceMK_Generic<NumDimM, // MK_Generic instantiation; defined but not selected as DeviceOpInstance in this file
+                                                       NumDimN, // NOTE(review): "MK" presumably encodes an A/B layout -- confirm in common_instances.hpp
+                                                       NumDimK,
+                                                       ADataType, // data-type aliases declared at the top of this file
+                                                       BDataType,
+                                                       AccDataType,
+                                                       CShuffleDataType,
+                                                       DsDataType,
+                                                       EDataType,
+                                                       ComputeDataType,
+                                                       AElementOp, // element-op aliases declared at the top of this file
+                                                       BElementOp,
+                                                       CDEElementOp>;
+using DeviceOpInstanceMNN = DeviceOpInstanceMN_Generic<NumDimM, // MN_Generic instantiation; defined but not selected as DeviceOpInstance in this file
+                                                       NumDimN, // NOTE(review): "MN" presumably encodes an A/B layout -- confirm in common_instances.hpp
+                                                       NumDimK,
+                                                       ADataType, // data-type aliases declared at the top of this file
+                                                       BDataType,
+                                                       AccDataType,
+                                                       CShuffleDataType,
+                                                       DsDataType,
+                                                       EDataType,
+                                                       ComputeDataType,
+                                                       AElementOp, // element-op aliases declared at the top of this file
+                                                       BElementOp,
+                                                       CDEElementOp>;
+using DeviceOpInstance = DeviceOpInstanceKKN; // picks the KKN alias as the active instance; KNN/MKN/MNN are defined but unused here
+#include "run_contraction_scale_example.inc" // textual include placed after the aliases; presumably consumes them and defines run_contraction_scale_example -- confirm in the .inc
+int main(int argc, char* argv[]) { return run_contraction_scale_example(argc, argv); } // forwards the command line unchanged; the callee's return value becomes the exit status
--- a/example/26_contraction/contraction_scale_xdl_fp64.cpp
+++ b/example/26_contraction/contraction_scale_xdl_fp64.cpp
--- a/example/26_contraction/contraction_scale_xdl_fp64_compute_fp32.cpp
+++ b/example/26_contraction/contraction_scale_xdl_fp64_compute_fp32.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "common_instances.hpp"
+using ADataType        = F64; // forwarded as the ADataType argument of every DeviceOpInstance* alias below
+using BDataType        = F64; // forwarded as the BDataType argument
+using AccDataType      = F32; // NOTE(review): F32 although A/B/E are F64 -- presumably intentional for the compute_fp32 variant; confirm
+using CShuffleDataType = F64; // forwarded as the CShuffleDataType argument; its exact role is defined by the instance templates, not shown here
+using DsDataType       = ck::Tuple<>; // empty tuple: no D element types are listed for this example
+using EDataType        = F64; // forwarded as the EDataType argument
+using ComputeDataType  = F32; // F32 while the A/B/E types are F64, matching the "_fp64_compute_fp32" file name
+static constexpr ck::index_t NumDimM = 2; // NumDimM/N/K are the first three arguments of every instance alias below
+static constexpr ck::index_t NumDimN = 2;
+static constexpr ck::index_t NumDimK = 2;
+using AElementOp   = ck::tensor_operation::element_wise::PassThrough; // PassThrough fills the A element-op slot
+using BElementOp   = ck::tensor_operation::element_wise::PassThrough; // PassThrough fills the B element-op slot
+using CDEElementOp = ck::tensor_operation::element_wise::Scale; // Scale fills the CDE element-op slot, in line with the "contraction_scale" file name
+using DeviceOpInstanceKKN = DeviceOpInstanceKK_FP64<NumDimM, // KK_FP64 instantiation (the _FP64 template family, not _Generic); later chosen as DeviceOpInstance
+                                                    NumDimN, // NOTE(review): "KK" presumably encodes an A/B layout -- confirm in common_instances.hpp
+                                                    NumDimK,
+                                                    ADataType, // data-type aliases declared at the top of this file
+                                                    BDataType,
+                                                    AccDataType,
+                                                    CShuffleDataType,
+                                                    DsDataType,
+                                                    EDataType,
+                                                    ComputeDataType,
+                                                    AElementOp, // element-op aliases declared at the top of this file
+                                                    BElementOp,
+                                                    CDEElementOp>;
+using DeviceOpInstanceKNN = DeviceOpInstanceKN_FP64<NumDimM, // KN_FP64 instantiation; defined but not selected as DeviceOpInstance in this file
+                                                    NumDimN, // NOTE(review): "KN" presumably encodes an A/B layout -- confirm in common_instances.hpp
+                                                    NumDimK,
+                                                    ADataType, // data-type aliases declared at the top of this file
+                                                    BDataType,
+                                                    AccDataType,
+                                                    CShuffleDataType,
+                                                    DsDataType,
+                                                    EDataType,
+                                                    ComputeDataType,
+                                                    AElementOp, // element-op aliases declared at the top of this file
+                                                    BElementOp,
+                                                    CDEElementOp>;
+using DeviceOpInstanceMKN = DeviceOpInstanceMK_FP64<NumDimM, // MK_FP64 instantiation; defined but not selected as DeviceOpInstance in this file
+                                                    NumDimN, // NOTE(review): "MK" presumably encodes an A/B layout -- confirm in common_instances.hpp
+                                                    NumDimK,
+                                                    ADataType, // data-type aliases declared at the top of this file
+                                                    BDataType,
+                                                    AccDataType,
+                                                    CShuffleDataType,
+                                                    DsDataType,
+                                                    EDataType,
+                                                    ComputeDataType,
+                                                    AElementOp, // element-op aliases declared at the top of this file
+                                                    BElementOp,
+                                                    CDEElementOp>;
+using DeviceOpInstanceMNN = DeviceOpInstanceMN_FP64<NumDimM, // MN_FP64 instantiation; defined but not selected as DeviceOpInstance in this file
+                                                    NumDimN, // NOTE(review): "MN" presumably encodes an A/B layout -- confirm in common_instances.hpp
+                                                    NumDimK,
+                                                    ADataType, // data-type aliases declared at the top of this file
+                                                    BDataType,
+                                                    AccDataType,
+                                                    CShuffleDataType,
+                                                    DsDataType,
+                                                    EDataType,
+                                                    ComputeDataType,
+                                                    AElementOp, // element-op aliases declared at the top of this file
+                                                    BElementOp,
+                                                    CDEElementOp>;
+using DeviceOpInstance = DeviceOpInstanceKKN; // picks the KKN alias as the active instance; KNN/MKN/MNN are defined but unused here
+#include "run_contraction_scale_example.inc" // textual include placed after the aliases; presumably consumes them and defines run_contraction_scale_example -- confirm in the .inc
+int main(int argc, char* argv[]) { return run_contraction_scale_example(argc, argv); } // forwards the command line unchanged; the callee's return value becomes the exit status
--- a/example/26_contraction/run_contraction_bilinear_example.inc
+++ b/example/26_contraction/run_contraction_bilinear_example.inc
--- a/example/26_contraction/run_contraction_scale_example.inc
+++ b/example/26_contraction/run_contraction_scale_example.inc
--- a/example/27_layernorm/CMakeLists.txt
+++ b/example/27_layernorm/CMakeLists.txt
-add_example_executable(example_layernorm_fp16 layernorm_fp16.cpp)
-add_example_executable(example_layernorm_splitk_fp16 layernorm_splitk_fp16.cpp)
--- a/example/27_layernorm/layernorm_fp16.cpp
+++ b/example/27_layernorm/layernorm_fp16.cpp
--- a/example/27_layernorm2d_fwd/CMakeLists.txt
+++ b/example/27_layernorm2d_fwd/CMakeLists.txt
+add_example_executable(example_layernorm2d_fwd_fp16 layernorm2d_fwd_fp16.cpp)
+add_example_executable(example_layernorm2d_fwd_splitk_fp16 layernorm2d_fwd_splitk_fp16.cpp)
--- a/example/27_layernorm/common.hpp
+++ b/example/27_layernorm/common.hpp
@@ -10,8 +10,8 @@
 #include <getopt.h>
 #include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_normalization_fwd_impl.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_normalization_splitk_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_normalization_fwd_splitk_impl.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"

--- a/example/27_layernorm2d_fwd/layernorm2d_fwd_fp16.cpp
+++ b/example/27_layernorm2d_fwd/layernorm2d_fwd_fp16.cpp