Merge remote-tracking branch 'origin/develop' into amax_gemm_op

7e734a03 · Jing Zhang · beff3933 · 6b5f6473 · 7e734a03 · 7e734a03
Commit 7e734a03 authored Oct 02, 2023 by Jing Zhang
20 changed files
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
-# Change Log for Composable Kernel
+# Changelog for Composable Kernel

 Full documentation for Composable Kernel is not yet available.

 ## (Unreleased) CK for ROCm 6.0.0

-### Fixed
+### Fixes
+ - Fixed a hazard associated with inline v_dot (#808)
+ - Fixed two bugs in grouped convolution backward data without K padding (#848 #876)

 ### Optimizations
+None

-### Added
- Added image to column (#867) and column to image kernels (#930).
+### Additions
+- Added an image-to-column kernel (#867)
+- Added a column-to-image kernel (#930)
+- Support for 3D grouped convolution forward on RDNA 3 GPUs (#935)
+- Grouped convolution support for small K and C (#822 #879 #897)
+- Support for NHWGC (2D and 3D) grouped convolution backward weight (#769 #804)
+- Support for bf16/f32/f16 and NHWGC (2D and 3D) grouped convolution backward data (#757 #799)
+- Support for Batched Gemm DL (#732)

-### Changed
+### Changes
+ - Changed the grouped convolution API to maintain consistency with other convolution kernels (#817)

+## CK 0.2.0 for ROCm 5.7.0

-## CK 0.2.0 for ROCm 5.5.0
-
-### Fixed
- Fixed a bug in 6-dimensional kernels (#555).
- Fixed grouped ConvBwdWeight test case failure (#524).
+### Fixes
+- Fixed a bug in 6-dimensional kernels (#555)
+- Fixed a test case failure with grouped convolution backward weight (#524)

 ### Optimizations
- Improve proformance of normalization kernel
-
-### Added
- Added new cmake flag "DL_KERNELS" must be set to "ON" in order to build the gemm_dl and batched_gemm_multi_d_dl instances.
- Added new cmake flag "DTYPES" which could be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build instance of select data types.
- Added new cmake flag "INSTANCES_ONLY" which will only build CK library and instances without the tests, examples, or profiler.
- Added new feature: if GPU_TARGETS is not set on cmake command line, CK will be built for all targets supported by compiler.
- Added support on MI300A/MI300X.
- Added support on NAVI3x.
- Added user tutorial (#563).
- Added more instances for irregular GEMM sizes (#560).
- Added inter-wave consumer-producer programming model for GEMM kernels (#310).
- Added multi-D GEMM client APIs (#534).
- Added multi-embeddings support (#542).
- Added Navi3x blockwise GEMM and real GEMM support (#541).
- Added Navi grouped ConvBwdWeight support (#505).
- Added MaxPool, AvgPool forward (#815).
- Added MaxPool backward (#750).
-
-### Changed
+- Improved the performance of the normalization kernel
+
+### Additions
+- New CMake flags:
+  - "DL_KERNELS" -- Must be set to "ON" in order to build the gemm_dl and batched_gemm_multi_d_dl instances
+  - "DTYPES" -- Can be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build an instance of the specified data types
+  - "INSTANCES_ONLY" -- Only builds CK library and instances without tests, examples, or profiler
+- New feature: if GPU_TARGETS is not set on the CMake command line, CK will be built for all targets supported by the compiler
+- Support for MI300A/MI300X
+- Support for AMD RDNA 3
+- New user tutorial (#563)
+- Additional instances for irregular GEMM sizes (#560)
+- New inter-wave consumer-producer programming model for GEMM kernels (#310)
+- GEMM with support for multiple elementwise fusions (multi-D) (#534)
+- Multi-embeddings support (#542)
+- AMD RDNA 3 blockwise GEMM and real GEMM support (#541)
+- AMD RDNA grouped convolution backward weight support (#505)
+- MaxPool and AvgPool forward (#815); MaxPool backward (#750)
+
+### Changes
+None
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -713,8 +713,8 @@ pipeline {
                    }
                    agent{ label rocmnode("gfx908 || gfx90a") }
                    environment{
-                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941" """
-                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ 
+                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" """
+                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ 
                    }
                    steps{
                        Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')

--- a/example/26_contraction/CMakeLists.txt
+++ b/example/26_contraction/CMakeLists.txt
+add_custom_target(example_contraction)
+add_custom_target(example_contraction_scale)
+add_custom_target(example_contraction_bilinear)
+
+# FP32
 add_example_executable(example_contraction_bilinear_xdl_fp32 contraction_bilinear_xdl_fp32.cpp)
+add_dependencies(example_contraction_bilinear example_contraction_bilinear_xdl_fp32)
+
 add_example_executable(example_contraction_scale_xdl_fp32 contraction_scale_xdl_fp32.cpp)
+add_dependencies(example_contraction_scale example_contraction_scale_xdl_fp32)
+
+add_example_executable(example_contraction_bilinear_xdl_fp32_compute_bf16 contraction_bilinear_xdl_fp32_compute_bf16.cpp)
+add_dependencies(example_contraction_bilinear example_contraction_bilinear_xdl_fp32_compute_bf16)
+
+add_example_executable(example_contraction_scale_xdl_fp32_compute_bf16 contraction_scale_xdl_fp32_compute_bf16.cpp)
+add_dependencies(example_contraction_scale example_contraction_scale_xdl_fp32_compute_bf16)
+
+add_example_executable(example_contraction_bilinear_xdl_fp32_compute_fp16 contraction_bilinear_xdl_fp32_compute_fp16.cpp)
+add_dependencies(example_contraction_bilinear example_contraction_bilinear_xdl_fp32_compute_fp16)
+
+add_example_executable(example_contraction_scale_xdl_fp32_compute_fp16 contraction_scale_xdl_fp32_compute_fp16.cpp)
+add_dependencies(example_contraction_scale example_contraction_scale_xdl_fp32_compute_fp16)
+
+# FP64
 add_example_executable(example_contraction_bilinear_xdl_fp64 contraction_bilinear_xdl_fp64.cpp)
+add_dependencies(example_contraction_bilinear example_contraction_bilinear_xdl_fp64)
+
 add_example_executable(example_contraction_scale_xdl_fp64 contraction_scale_xdl_fp64.cpp)
+add_dependencies(example_contraction_scale example_contraction_scale_xdl_fp64)
+
+add_example_executable(example_contraction_bilinear_xdl_fp64_compute_fp32 contraction_bilinear_xdl_fp64_compute_fp32.cpp)
+add_dependencies(example_contraction_bilinear example_contraction_bilinear_xdl_fp64_compute_fp32)
+
+add_example_executable(example_contraction_scale_xdl_fp64_compute_fp32 contraction_scale_xdl_fp64_compute_fp32.cpp)
+add_dependencies(example_contraction_scale example_contraction_scale_xdl_fp64_compute_fp32)
+
+# FP16
+add_example_executable(example_contraction_bilinear_xdl_fp16_compute_fp32 contraction_bilinear_xdl_fp16_compute_fp32.cpp)
+add_dependencies(example_contraction_bilinear example_contraction_bilinear_xdl_fp16_compute_fp32)
+
+add_example_executable(example_contraction_scale_xdl_fp16_compute_fp32 contraction_scale_xdl_fp16_compute_fp32.cpp)
+add_dependencies(example_contraction_scale example_contraction_scale_xdl_fp16_compute_fp32)
+
+# BF16
+add_example_executable(example_contraction_bilinear_xdl_bf16_compute_fp32 contraction_bilinear_xdl_bf16_compute_fp32.cpp)
+add_dependencies(example_contraction_bilinear example_contraction_bilinear_xdl_bf16_compute_fp32)
+
+add_example_executable(example_contraction_scale_xdl_bf16_compute_fp32 contraction_scale_xdl_bf16_compute_fp32.cpp)
+add_dependencies(example_contraction_scale example_contraction_scale_xdl_bf16_compute_fp32)
+
+add_dependencies(example_contraction example_contraction_scale)
+add_dependencies(example_contraction example_contraction_bilinear)
--- a/example/26_contraction/common_instances.hpp
+++ b/example/26_contraction/common_instances.hpp
--- a/example/26_contraction/contraction_bilinear_xdl_bf16_compute_fp32.cpp
+++ b/example/26_contraction/contraction_bilinear_xdl_bf16_compute_fp32.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "common_instances.hpp"
+
+using ADataType        = BF16;
+using BDataType        = BF16;
+using AccDataType      = F32;
+using CShuffleDataType = BF16;
+using DDataType        = BF16;
+using DsDataType       = ck::Tuple<DDataType>;
+using EDataType        = BF16;
+using ComputeDataType  = F32;
+
+static constexpr ck::index_t NumDimM = 2;
+static constexpr ck::index_t NumDimN = 2;
+static constexpr ck::index_t NumDimK = 2;
+
+using AElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using BElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using CDEElementOp = ck::tensor_operation::element_wise::Bilinear;
+
+using DeviceOpInstanceKKNN = DeviceOpInstanceKK_Generic<NumDimM,
+                                                        NumDimN,
+                                                        NumDimK,
+                                                        ADataType,
+                                                        BDataType,
+                                                        AccDataType,
+                                                        CShuffleDataType,
+                                                        DsDataType,
+                                                        EDataType,
+                                                        ComputeDataType,
+                                                        AElementOp,
+                                                        BElementOp,
+                                                        CDEElementOp>;
+
+using DeviceOpInstanceKNNN = DeviceOpInstanceKN_Generic<NumDimM,
+                                                        NumDimN,
+                                                        NumDimK,
+                                                        ADataType,
+                                                        BDataType,
+                                                        AccDataType,
+                                                        CShuffleDataType,
+                                                        DsDataType,
+                                                        EDataType,
+                                                        ComputeDataType,
+                                                        AElementOp,
+                                                        BElementOp,
+                                                        CDEElementOp>;
+
+using DeviceOpInstanceMKNN = DeviceOpInstanceMK_Generic<NumDimM,
+                                                        NumDimN,
+                                                        NumDimK,
+                                                        ADataType,
+                                                        BDataType,
+                                                        AccDataType,
+                                                        CShuffleDataType,
+                                                        DsDataType,
+                                                        EDataType,
+                                                        ComputeDataType,
+                                                        AElementOp,
+                                                        BElementOp,
+                                                        CDEElementOp>;
+
+using DeviceOpInstanceMNNN = DeviceOpInstanceMN_Generic<NumDimM,
+                                                        NumDimN,
+                                                        NumDimK,
+                                                        ADataType,
+                                                        BDataType,
+                                                        AccDataType,
+                                                        CShuffleDataType,
+                                                        DsDataType,
+                                                        EDataType,
+                                                        ComputeDataType,
+                                                        AElementOp,
+                                                        BElementOp,
+                                                        CDEElementOp>;
+
+using DeviceOpInstance = DeviceOpInstanceKKNN;
+
+#include "run_contraction_bilinear_example.inc"
+
+int main(int argc, char* argv[]) { return run_contraction_bilinear_example(argc, argv); }
--- a/example/26_contraction/contraction_bilinear_xdl_fp16_compute_fp32.cpp
+++ b/example/26_contraction/contraction_bilinear_xdl_fp16_compute_fp32.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "common_instances.hpp"
+
+using ADataType        = F16;
+using BDataType        = F16;
+using AccDataType      = F32;
+using CShuffleDataType = F16;
+using DDataType        = F16;
+using DsDataType       = ck::Tuple<DDataType>;
+using EDataType        = F16;
+using ComputeDataType  = F32;
+
+static constexpr ck::index_t NumDimM = 2;
+static constexpr ck::index_t NumDimN = 2;
+static constexpr ck::index_t NumDimK = 2;
+
+using AElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using BElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using CDEElementOp = ck::tensor_operation::element_wise::Bilinear;
+
+using DeviceOpInstanceKKNN = DeviceOpInstanceKK_Generic<NumDimM,
+                                                        NumDimN,
+                                                        NumDimK,
+                                                        ADataType,
+                                                        BDataType,
+                                                        AccDataType,
+                                                        CShuffleDataType,
+                                                        DsDataType,
+                                                        EDataType,
+                                                        ComputeDataType,
+                                                        AElementOp,
+                                                        BElementOp,
+                                                        CDEElementOp>;
+
+using DeviceOpInstanceKNNN = DeviceOpInstanceKN_Generic<NumDimM,
+                                                        NumDimN,
+                                                        NumDimK,
+                                                        ADataType,
+                                                        BDataType,
+                                                        AccDataType,
+                                                        CShuffleDataType,
+                                                        DsDataType,
+                                                        EDataType,
+                                                        ComputeDataType,
+                                                        AElementOp,
+                                                        BElementOp,
+                                                        CDEElementOp>;
+
+using DeviceOpInstanceMKNN = DeviceOpInstanceMK_Generic<NumDimM,
+                                                        NumDimN,
+                                                        NumDimK,
+                                                        ADataType,
+                                                        BDataType,
+                                                        AccDataType,
+                                                        CShuffleDataType,
+                                                        DsDataType,
+                                                        EDataType,
+                                                        ComputeDataType,
+                                                        AElementOp,
+                                                        BElementOp,
+                                                        CDEElementOp>;
+
+using DeviceOpInstanceMNNN = DeviceOpInstanceMN_Generic<NumDimM,
+                                                        NumDimN,
+                                                        NumDimK,
+                                                        ADataType,
+                                                        BDataType,
+                                                        AccDataType,
+                                                        CShuffleDataType,
+                                                        DsDataType,
+                                                        EDataType,
+                                                        ComputeDataType,
+                                                        AElementOp,
+                                                        BElementOp,
+                                                        CDEElementOp>;
+
+using DeviceOpInstance = DeviceOpInstanceKKNN;
+
+#include "run_contraction_bilinear_example.inc"
+
+int main(int argc, char* argv[]) { return run_contraction_bilinear_example(argc, argv); }
--- a/example/26_contraction/contraction_bilinear_xdl_fp32.cpp
+++ b/example/26_contraction/contraction_bilinear_xdl_fp32.cpp
--- a/example/26_contraction/contraction_bilinear_xdl_fp32_compute_bf16.cpp
+++ b/example/26_contraction/contraction_bilinear_xdl_fp32_compute_bf16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "common_instances.hpp"
+
+using ADataType        = F32;
+using BDataType        = F32;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using DDataType        = F32;
+using DsDataType       = ck::Tuple<DDataType>;
+using EDataType        = F32;
+using ComputeDataType  = BF16;
+
+static constexpr ck::index_t NumDimM = 2;
+static constexpr ck::index_t NumDimN = 2;
+static constexpr ck::index_t NumDimK = 2;
+
+using AElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using BElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using CDEElementOp = ck::tensor_operation::element_wise::Bilinear;
+
+using DeviceOpInstanceKKNN = DeviceOpInstanceKK_Generic<NumDimM,
+                                                        NumDimN,
+                                                        NumDimK,
+                                                        ADataType,
+                                                        BDataType,
+                                                        AccDataType,
+                                                        CShuffleDataType,
+                                                        DsDataType,
+                                                        EDataType,
+                                                        ComputeDataType,
+                                                        AElementOp,
+                                                        BElementOp,
+                                                        CDEElementOp>;
+
+using DeviceOpInstanceKNNN = DeviceOpInstanceKN_Generic<NumDimM,
+                                                        NumDimN,
+                                                        NumDimK,
+                                                        ADataType,
+                                                        BDataType,
+                                                        AccDataType,
+                                                        CShuffleDataType,
+                                                        DsDataType,
+                                                        EDataType,
+                                                        ComputeDataType,
+                                                        AElementOp,
+                                                        BElementOp,
+                                                        CDEElementOp>;
+
+using DeviceOpInstanceMKNN = DeviceOpInstanceMK_Generic<NumDimM,
+                                                        NumDimN,
+                                                        NumDimK,
+                                                        ADataType,
+                                                        BDataType,
+                                                        AccDataType,
+                                                        CShuffleDataType,
+                                                        DsDataType,
+                                                        EDataType,
+                                                        ComputeDataType,
+                                                        AElementOp,
+                                                        BElementOp,
+                                                        CDEElementOp>;
+
+using DeviceOpInstanceMNNN = DeviceOpInstanceMN_Generic<NumDimM,
+                                                        NumDimN,
+                                                        NumDimK,
+                                                        ADataType,
+                                                        BDataType,
+                                                        AccDataType,
+                                                        CShuffleDataType,
+                                                        DsDataType,
+                                                        EDataType,
+                                                        ComputeDataType,
+                                                        AElementOp,
+                                                        BElementOp,
+                                                        CDEElementOp>;
+
+using DeviceOpInstance = DeviceOpInstanceKKNN;
+
+#include "run_contraction_bilinear_example.inc"
+
+int main(int argc, char* argv[]) { return run_contraction_bilinear_example(argc, argv); }
--- a/example/26_contraction/contraction_bilinear_xdl_fp32_compute_fp16.cpp
+++ b/example/26_contraction/contraction_bilinear_xdl_fp32_compute_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "common_instances.hpp"
+
+using ADataType        = F32;
+using BDataType        = F32;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using DDataType        = F32;
+using DsDataType       = ck::Tuple<DDataType>;
+using EDataType        = F32;
+using ComputeDataType  = F16;
+
+static constexpr ck::index_t NumDimM = 2;
+static constexpr ck::index_t NumDimN = 2;
+static constexpr ck::index_t NumDimK = 2;
+
+using AElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using BElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using CDEElementOp = ck::tensor_operation::element_wise::Bilinear;
+
+using DeviceOpInstanceKKNN = DeviceOpInstanceKK_Generic<NumDimM,
+                                                        NumDimN,
+                                                        NumDimK,
+                                                        ADataType,
+                                                        BDataType,
+                                                        AccDataType,
+                                                        CShuffleDataType,
+                                                        DsDataType,
+                                                        EDataType,
+                                                        ComputeDataType,
+                                                        AElementOp,
+                                                        BElementOp,
+                                                        CDEElementOp>;
+
+using DeviceOpInstanceKNNN = DeviceOpInstanceKN_Generic<NumDimM,
+                                                        NumDimN,
+                                                        NumDimK,
+                                                        ADataType,
+                                                        BDataType,
+                                                        AccDataType,
+                                                        CShuffleDataType,
+                                                        DsDataType,
+                                                        EDataType,
+                                                        ComputeDataType,
+                                                        AElementOp,
+                                                        BElementOp,
+                                                        CDEElementOp>;
+
+using DeviceOpInstanceMKNN = DeviceOpInstanceMK_Generic<NumDimM,
+                                                        NumDimN,
+                                                        NumDimK,
+                                                        ADataType,
+                                                        BDataType,
+                                                        AccDataType,
+                                                        CShuffleDataType,
+                                                        DsDataType,
+                                                        EDataType,
+                                                        ComputeDataType,
+                                                        AElementOp,
+                                                        BElementOp,
+                                                        CDEElementOp>;
+
+using DeviceOpInstanceMNNN = DeviceOpInstanceMN_Generic<NumDimM,
+                                                        NumDimN,
+                                                        NumDimK,
+                                                        ADataType,
+                                                        BDataType,
+                                                        AccDataType,
+                                                        CShuffleDataType,
+                                                        DsDataType,
+                                                        EDataType,
+                                                        ComputeDataType,
+                                                        AElementOp,
+                                                        BElementOp,
+                                                        CDEElementOp>;
+
+using DeviceOpInstance = DeviceOpInstanceKKNN;
+
+#include "run_contraction_bilinear_example.inc"
+
+int main(int argc, char* argv[]) { return run_contraction_bilinear_example(argc, argv); }
--- a/example/26_contraction/contraction_bilinear_xdl_fp64.cpp
+++ b/example/26_contraction/contraction_bilinear_xdl_fp64.cpp
--- a/example/26_contraction/contraction_bilinear_xdl_fp64_compute_fp32.cpp
+++ b/example/26_contraction/contraction_bilinear_xdl_fp64_compute_fp32.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "common_instances.hpp"
+
+using ADataType        = F64;
+using BDataType        = F64;
+using AccDataType      = F32;
+using CShuffleDataType = F64;
+using DDataType        = F64;
+using DsDataType       = ck::Tuple<DDataType>;
+using EDataType        = F64;
+using ComputeDataType  = F32;
+
+static constexpr ck::index_t NumDimM = 2;
+static constexpr ck::index_t NumDimN = 2;
+static constexpr ck::index_t NumDimK = 2;
+
+using AElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using BElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using CDEElementOp = ck::tensor_operation::element_wise::Bilinear;
+
+using DeviceOpInstanceKKNN = DeviceOpInstanceKK_FP64<NumDimM,
+                                                     NumDimN,
+                                                     NumDimK,
+                                                     ADataType,
+                                                     BDataType,
+                                                     AccDataType,
+                                                     CShuffleDataType,
+                                                     DsDataType,
+                                                     EDataType,
+                                                     ComputeDataType,
+                                                     AElementOp,
+                                                     BElementOp,
+                                                     CDEElementOp>;
+
+using DeviceOpInstanceKNNN = DeviceOpInstanceKN_FP64<NumDimM,
+                                                     NumDimN,
+                                                     NumDimK,
+                                                     ADataType,
+                                                     BDataType,
+                                                     AccDataType,
+                                                     CShuffleDataType,
+                                                     DsDataType,
+                                                     EDataType,
+                                                     ComputeDataType,
+                                                     AElementOp,
+                                                     BElementOp,
+                                                     CDEElementOp>;
+
+using DeviceOpInstanceMKNN = DeviceOpInstanceMK_FP64<NumDimM, // "MK" FP64 instance bound to this file's dims, types and element-wise ops
+                                                     NumDimN,
+                                                     NumDimK,
+                                                     ADataType,
+                                                     BDataType,
+                                                     AccDataType,
+                                                     CShuffleDataType,
+                                                     DsDataType,
+                                                     EDataType,
+                                                     ComputeDataType,
+                                                     AElementOp,
+                                                     BElementOp,
+                                                     CDEElementOp>;
+
+using DeviceOpInstanceMNNN = DeviceOpInstanceMN_FP64<NumDimM, // "MN" counterpart, same argument list
+                                                     NumDimN,
+                                                     NumDimK,
+                                                     ADataType,
+                                                     BDataType,
+                                                     AccDataType,
+                                                     CShuffleDataType,
+                                                     DsDataType,
+                                                     EDataType,
+                                                     ComputeDataType,
+                                                     AElementOp,
+                                                     BElementOp,
+                                                     CDEElementOp>;
+
+using DeviceOpInstance = DeviceOpInstanceKKNN; // the KKNN variant is exposed as DeviceOpInstance, the name the bilinear driver instantiates; NOTE(review): the KK/KN/MK/MN prefixes presumably encode A/B memory layouts - confirm in common_instances.hpp
+
+#include "run_contraction_bilinear_example.inc"
+
+// Program entry point: hands the untouched command line to the shared bilinear driver and
+// uses the driver's return value as the process exit code.
+int main(int argc, char* argv[])
+{
+    const int exit_code = run_contraction_bilinear_example(argc, argv);
+    return exit_code;
+}
--- a/example/26_contraction/contraction_scale_xdl_bf16_compute_fp32.cpp
+++ b/example/26_contraction/contraction_scale_xdl_bf16_compute_fp32.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "common_instances.hpp"
+
+using ADataType        = BF16; // element type argument for tensor A; NOTE(review): BF16/F32 are presumably declared via common_instances.hpp - confirm
+using BDataType        = BF16; // element type argument for tensor B
+using AccDataType      = F32; // accumulator type argument of the instance templates below
+using CShuffleDataType = BF16; // CShuffle type argument of the instance templates below
+using DsDataType       = ck::Tuple<>; // empty tuple: no D tensor types are supplied in this example
+using EDataType        = BF16; // element type argument for the output tensor E
+using ComputeDataType  = F32; // compute type argument; differs from the BF16 type used for A/B/E
+
+static constexpr ck::index_t NumDimM = 2; // number of M dimensions passed to the instance templates
+static constexpr ck::index_t NumDimN = 2; // number of N dimensions passed to the instance templates
+static constexpr ck::index_t NumDimK = 2; // number of K dimensions passed to the instance templates
+
+using AElementOp   = ck::tensor_operation::element_wise::PassThrough; // element-wise op type used for A
+using BElementOp   = ck::tensor_operation::element_wise::PassThrough; // element-wise op type used for B
+using CDEElementOp = ck::tensor_operation::element_wise::Scale; // element-wise op type used for the C/D/E stage
+
+using DeviceOpInstanceKKN = DeviceOpInstanceKK_Generic<NumDimM, // "KK" Generic instance bound to this file's dims, types and element-wise ops
+                                                       NumDimN,
+                                                       NumDimK,
+                                                       ADataType,
+                                                       BDataType,
+                                                       AccDataType,
+                                                       CShuffleDataType,
+                                                       DsDataType,
+                                                       EDataType,
+                                                       ComputeDataType,
+                                                       AElementOp,
+                                                       BElementOp,
+                                                       CDEElementOp>;
+
+using DeviceOpInstanceKNN = DeviceOpInstanceKN_Generic<NumDimM, // "KN" counterpart, same argument list
+                                                       NumDimN,
+                                                       NumDimK,
+                                                       ADataType,
+                                                       BDataType,
+                                                       AccDataType,
+                                                       CShuffleDataType,
+                                                       DsDataType,
+                                                       EDataType,
+                                                       ComputeDataType,
+                                                       AElementOp,
+                                                       BElementOp,
+                                                       CDEElementOp>;
+
+using DeviceOpInstanceMKN = DeviceOpInstanceMK_Generic<NumDimM, // "MK" counterpart, same argument list
+                                                       NumDimN,
+                                                       NumDimK,
+                                                       ADataType,
+                                                       BDataType,
+                                                       AccDataType,
+                                                       CShuffleDataType,
+                                                       DsDataType,
+                                                       EDataType,
+                                                       ComputeDataType,
+                                                       AElementOp,
+                                                       BElementOp,
+                                                       CDEElementOp>;
+
+using DeviceOpInstanceMNN = DeviceOpInstanceMN_Generic<NumDimM, // "MN" counterpart, same argument list
+                                                       NumDimN,
+                                                       NumDimK,
+                                                       ADataType,
+                                                       BDataType,
+                                                       AccDataType,
+                                                       CShuffleDataType,
+                                                       DsDataType,
+                                                       EDataType,
+                                                       ComputeDataType,
+                                                       AElementOp,
+                                                       BElementOp,
+                                                       CDEElementOp>;
+
+using DeviceOpInstance = DeviceOpInstanceKKN; // the KK variant is the one exposed as DeviceOpInstance; NOTE(review): presumably consumed by the .inc driver included below, and KK/KN/MK/MN presumably encode A/B memory layouts - confirm in common_instances.hpp
+
+#include "run_contraction_scale_example.inc"
+
+// Program entry point: hands the untouched command line to the shared scale driver and
+// uses the driver's return value as the process exit code.
+int main(int argc, char* argv[])
+{
+    const int exit_code = run_contraction_scale_example(argc, argv);
+    return exit_code;
+}
--- a/example/26_contraction/contraction_scale_xdl_fp16_compute_fp32.cpp
+++ b/example/26_contraction/contraction_scale_xdl_fp16_compute_fp32.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "common_instances.hpp"
+
+using ADataType        = F16; // element type argument for tensor A; NOTE(review): F16/F32 are presumably declared via common_instances.hpp - confirm
+using BDataType        = F16; // element type argument for tensor B
+using AccDataType      = F32; // accumulator type argument of the instance templates below
+using CShuffleDataType = F16; // CShuffle type argument of the instance templates below
+using DsDataType       = ck::Tuple<>; // empty tuple: no D tensor types are supplied in this example
+using EDataType        = F16; // element type argument for the output tensor E
+using ComputeDataType  = F32; // compute type argument; differs from the F16 type used for A/B/E
+
+static constexpr ck::index_t NumDimM = 2; // number of M dimensions passed to the instance templates
+static constexpr ck::index_t NumDimN = 2; // number of N dimensions passed to the instance templates
+static constexpr ck::index_t NumDimK = 2; // number of K dimensions passed to the instance templates
+
+using AElementOp   = ck::tensor_operation::element_wise::PassThrough; // element-wise op type used for A
+using BElementOp   = ck::tensor_operation::element_wise::PassThrough; // element-wise op type used for B
+using CDEElementOp = ck::tensor_operation::element_wise::Scale; // element-wise op type used for the C/D/E stage
+
+using DeviceOpInstanceKKN = DeviceOpInstanceKK_Generic<NumDimM, // "KK" Generic instance bound to this file's dims, types and element-wise ops
+                                                       NumDimN,
+                                                       NumDimK,
+                                                       ADataType,
+                                                       BDataType,
+                                                       AccDataType,
+                                                       CShuffleDataType,
+                                                       DsDataType,
+                                                       EDataType,
+                                                       ComputeDataType,
+                                                       AElementOp,
+                                                       BElementOp,
+                                                       CDEElementOp>;
+
+using DeviceOpInstanceKNN = DeviceOpInstanceKN_Generic<NumDimM, // "KN" counterpart, same argument list
+                                                       NumDimN,
+                                                       NumDimK,
+                                                       ADataType,
+                                                       BDataType,
+                                                       AccDataType,
+                                                       CShuffleDataType,
+                                                       DsDataType,
+                                                       EDataType,
+                                                       ComputeDataType,
+                                                       AElementOp,
+                                                       BElementOp,
+                                                       CDEElementOp>;
+
+using DeviceOpInstanceMKN = DeviceOpInstanceMK_Generic<NumDimM, // "MK" counterpart, same argument list
+                                                       NumDimN,
+                                                       NumDimK,
+                                                       ADataType,
+                                                       BDataType,
+                                                       AccDataType,
+                                                       CShuffleDataType,
+                                                       DsDataType,
+                                                       EDataType,
+                                                       ComputeDataType,
+                                                       AElementOp,
+                                                       BElementOp,
+                                                       CDEElementOp>;
+
+using DeviceOpInstanceMNN = DeviceOpInstanceMN_Generic<NumDimM, // "MN" counterpart, same argument list
+                                                       NumDimN,
+                                                       NumDimK,
+                                                       ADataType,
+                                                       BDataType,
+                                                       AccDataType,
+                                                       CShuffleDataType,
+                                                       DsDataType,
+                                                       EDataType,
+                                                       ComputeDataType,
+                                                       AElementOp,
+                                                       BElementOp,
+                                                       CDEElementOp>;
+
+using DeviceOpInstance = DeviceOpInstanceKKN; // the KK variant is the one exposed as DeviceOpInstance; NOTE(review): presumably consumed by the .inc driver included below, and KK/KN/MK/MN presumably encode A/B memory layouts - confirm in common_instances.hpp
+
+#include "run_contraction_scale_example.inc"
+
+// Program entry point: hands the untouched command line to the shared scale driver and
+// uses the driver's return value as the process exit code.
+int main(int argc, char* argv[])
+{
+    const int exit_code = run_contraction_scale_example(argc, argv);
+    return exit_code;
+}
--- a/example/26_contraction/contraction_scale_xdl_fp32.cpp
+++ b/example/26_contraction/contraction_scale_xdl_fp32.cpp
--- a/example/26_contraction/contraction_scale_xdl_fp32_compute_bf16.cpp
+++ b/example/26_contraction/contraction_scale_xdl_fp32_compute_bf16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "common_instances.hpp"
+
+using ADataType        = F32; // element type argument for tensor A; NOTE(review): F32/BF16 are presumably declared via common_instances.hpp - confirm
+using BDataType        = F32; // element type argument for tensor B
+using AccDataType      = F32; // accumulator type argument of the instance templates below
+using CShuffleDataType = F32; // CShuffle type argument of the instance templates below
+using DsDataType       = ck::Tuple<>; // empty tuple: no D tensor types are supplied in this example
+using EDataType        = F32; // element type argument for the output tensor E
+using ComputeDataType  = BF16; // compute type argument; differs from the F32 type used for A/B/E
+
+static constexpr ck::index_t NumDimM = 2; // number of M dimensions passed to the instance templates
+static constexpr ck::index_t NumDimN = 2; // number of N dimensions passed to the instance templates
+static constexpr ck::index_t NumDimK = 2; // number of K dimensions passed to the instance templates
+
+using AElementOp   = ck::tensor_operation::element_wise::PassThrough; // element-wise op type used for A
+using BElementOp   = ck::tensor_operation::element_wise::PassThrough; // element-wise op type used for B
+using CDEElementOp = ck::tensor_operation::element_wise::Scale; // element-wise op type used for the C/D/E stage
+
+using DeviceOpInstanceKKN = DeviceOpInstanceKK_Generic<NumDimM, // "KK" Generic instance bound to this file's dims, types and element-wise ops
+                                                       NumDimN,
+                                                       NumDimK,
+                                                       ADataType,
+                                                       BDataType,
+                                                       AccDataType,
+                                                       CShuffleDataType,
+                                                       DsDataType,
+                                                       EDataType,
+                                                       ComputeDataType,
+                                                       AElementOp,
+                                                       BElementOp,
+                                                       CDEElementOp>;
+
+using DeviceOpInstanceKNN = DeviceOpInstanceKN_Generic<NumDimM, // "KN" counterpart, same argument list
+                                                       NumDimN,
+                                                       NumDimK,
+                                                       ADataType,
+                                                       BDataType,
+                                                       AccDataType,
+                                                       CShuffleDataType,
+                                                       DsDataType,
+                                                       EDataType,
+                                                       ComputeDataType,
+                                                       AElementOp,
+                                                       BElementOp,
+                                                       CDEElementOp>;
+
+using DeviceOpInstanceMKN = DeviceOpInstanceMK_Generic<NumDimM, // "MK" counterpart, same argument list
+                                                       NumDimN,
+                                                       NumDimK,
+                                                       ADataType,
+                                                       BDataType,
+                                                       AccDataType,
+                                                       CShuffleDataType,
+                                                       DsDataType,
+                                                       EDataType,
+                                                       ComputeDataType,
+                                                       AElementOp,
+                                                       BElementOp,
+                                                       CDEElementOp>;
+
+using DeviceOpInstanceMNN = DeviceOpInstanceMN_Generic<NumDimM, // "MN" counterpart, same argument list
+                                                       NumDimN,
+                                                       NumDimK,
+                                                       ADataType,
+                                                       BDataType,
+                                                       AccDataType,
+                                                       CShuffleDataType,
+                                                       DsDataType,
+                                                       EDataType,
+                                                       ComputeDataType,
+                                                       AElementOp,
+                                                       BElementOp,
+                                                       CDEElementOp>;
+
+using DeviceOpInstance = DeviceOpInstanceKKN; // the KK variant is the one exposed as DeviceOpInstance; NOTE(review): presumably consumed by the .inc driver included below, and KK/KN/MK/MN presumably encode A/B memory layouts - confirm in common_instances.hpp
+
+#include "run_contraction_scale_example.inc"
+
+// Program entry point: hands the untouched command line to the shared scale driver and
+// uses the driver's return value as the process exit code.
+int main(int argc, char* argv[])
+{
+    const int exit_code = run_contraction_scale_example(argc, argv);
+    return exit_code;
+}
--- a/example/26_contraction/contraction_scale_xdl_fp32_compute_fp16.cpp
+++ b/example/26_contraction/contraction_scale_xdl_fp32_compute_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "common_instances.hpp"
+
+using ADataType        = F32; // element type argument for tensor A; NOTE(review): F32/F16 are presumably declared via common_instances.hpp - confirm
+using BDataType        = F32; // element type argument for tensor B
+using AccDataType      = F32; // accumulator type argument of the instance templates below
+using CShuffleDataType = F32; // CShuffle type argument of the instance templates below
+using DsDataType       = ck::Tuple<>; // empty tuple: no D tensor types are supplied in this example
+using EDataType        = F32; // element type argument for the output tensor E
+using ComputeDataType  = F16; // compute type argument; differs from the F32 type used for A/B/E
+
+static constexpr ck::index_t NumDimM = 2; // number of M dimensions passed to the instance templates
+static constexpr ck::index_t NumDimN = 2; // number of N dimensions passed to the instance templates
+static constexpr ck::index_t NumDimK = 2; // number of K dimensions passed to the instance templates
+
+using AElementOp   = ck::tensor_operation::element_wise::PassThrough; // element-wise op type used for A
+using BElementOp   = ck::tensor_operation::element_wise::PassThrough; // element-wise op type used for B
+using CDEElementOp = ck::tensor_operation::element_wise::Scale; // element-wise op type used for the C/D/E stage
+
+using DeviceOpInstanceKKN = DeviceOpInstanceKK_Generic<NumDimM, // "KK" Generic instance bound to this file's dims, types and element-wise ops
+                                                       NumDimN,
+                                                       NumDimK,
+                                                       ADataType,
+                                                       BDataType,
+                                                       AccDataType,
+                                                       CShuffleDataType,
+                                                       DsDataType,
+                                                       EDataType,
+                                                       ComputeDataType,
+                                                       AElementOp,
+                                                       BElementOp,
+                                                       CDEElementOp>;
+
+using DeviceOpInstanceKNN = DeviceOpInstanceKN_Generic<NumDimM, // "KN" counterpart, same argument list
+                                                       NumDimN,
+                                                       NumDimK,
+                                                       ADataType,
+                                                       BDataType,
+                                                       AccDataType,
+                                                       CShuffleDataType,
+                                                       DsDataType,
+                                                       EDataType,
+                                                       ComputeDataType,
+                                                       AElementOp,
+                                                       BElementOp,
+                                                       CDEElementOp>;
+
+using DeviceOpInstanceMKN = DeviceOpInstanceMK_Generic<NumDimM, // "MK" counterpart, same argument list
+                                                       NumDimN,
+                                                       NumDimK,
+                                                       ADataType,
+                                                       BDataType,
+                                                       AccDataType,
+                                                       CShuffleDataType,
+                                                       DsDataType,
+                                                       EDataType,
+                                                       ComputeDataType,
+                                                       AElementOp,
+                                                       BElementOp,
+                                                       CDEElementOp>;
+
+using DeviceOpInstanceMNN = DeviceOpInstanceMN_Generic<NumDimM, // "MN" counterpart, same argument list
+                                                       NumDimN,
+                                                       NumDimK,
+                                                       ADataType,
+                                                       BDataType,
+                                                       AccDataType,
+                                                       CShuffleDataType,
+                                                       DsDataType,
+                                                       EDataType,
+                                                       ComputeDataType,
+                                                       AElementOp,
+                                                       BElementOp,
+                                                       CDEElementOp>;
+
+using DeviceOpInstance = DeviceOpInstanceKKN; // the KK variant is the one exposed as DeviceOpInstance; NOTE(review): presumably consumed by the .inc driver included below, and KK/KN/MK/MN presumably encode A/B memory layouts - confirm in common_instances.hpp
+
+#include "run_contraction_scale_example.inc"
+
+// Program entry point: hands the untouched command line to the shared scale driver and
+// uses the driver's return value as the process exit code.
+int main(int argc, char* argv[])
+{
+    const int exit_code = run_contraction_scale_example(argc, argv);
+    return exit_code;
+}
--- a/example/26_contraction/contraction_scale_xdl_fp64.cpp
+++ b/example/26_contraction/contraction_scale_xdl_fp64.cpp
--- a/example/26_contraction/contraction_scale_xdl_fp64_compute_fp32.cpp
+++ b/example/26_contraction/contraction_scale_xdl_fp64_compute_fp32.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "common_instances.hpp"
+
+using ADataType        = F64; // element type argument for tensor A; NOTE(review): F64/F32 are presumably declared via common_instances.hpp - confirm
+using BDataType        = F64; // element type argument for tensor B
+using AccDataType      = F32; // accumulator type argument; note it is F32 although A/B/E are F64
+using CShuffleDataType = F64; // CShuffle type argument of the instance templates below
+using DsDataType       = ck::Tuple<>; // empty tuple: no D tensor types are supplied in this example
+using EDataType        = F64; // element type argument for the output tensor E
+using ComputeDataType  = F32; // compute type argument; differs from the F64 type used for A/B/E
+
+static constexpr ck::index_t NumDimM = 2; // number of M dimensions passed to the instance templates
+static constexpr ck::index_t NumDimN = 2; // number of N dimensions passed to the instance templates
+static constexpr ck::index_t NumDimK = 2; // number of K dimensions passed to the instance templates
+
+using AElementOp   = ck::tensor_operation::element_wise::PassThrough; // element-wise op type used for A
+using BElementOp   = ck::tensor_operation::element_wise::PassThrough; // element-wise op type used for B
+using CDEElementOp = ck::tensor_operation::element_wise::Scale; // element-wise op type used for the C/D/E stage
+
+using DeviceOpInstanceKKN = DeviceOpInstanceKK_FP64<NumDimM, // "KK" FP64 instance bound to this file's dims, types and element-wise ops
+                                                    NumDimN,
+                                                    NumDimK,
+                                                    ADataType,
+                                                    BDataType,
+                                                    AccDataType,
+                                                    CShuffleDataType,
+                                                    DsDataType,
+                                                    EDataType,
+                                                    ComputeDataType,
+                                                    AElementOp,
+                                                    BElementOp,
+                                                    CDEElementOp>;
+
+using DeviceOpInstanceKNN = DeviceOpInstanceKN_FP64<NumDimM, // "KN" counterpart, same argument list
+                                                    NumDimN,
+                                                    NumDimK,
+                                                    ADataType,
+                                                    BDataType,
+                                                    AccDataType,
+                                                    CShuffleDataType,
+                                                    DsDataType,
+                                                    EDataType,
+                                                    ComputeDataType,
+                                                    AElementOp,
+                                                    BElementOp,
+                                                    CDEElementOp>;
+
+using DeviceOpInstanceMKN = DeviceOpInstanceMK_FP64<NumDimM, // "MK" counterpart, same argument list
+                                                    NumDimN,
+                                                    NumDimK,
+                                                    ADataType,
+                                                    BDataType,
+                                                    AccDataType,
+                                                    CShuffleDataType,
+                                                    DsDataType,
+                                                    EDataType,
+                                                    ComputeDataType,
+                                                    AElementOp,
+                                                    BElementOp,
+                                                    CDEElementOp>;
+
+using DeviceOpInstanceMNN = DeviceOpInstanceMN_FP64<NumDimM, // "MN" counterpart, same argument list
+                                                    NumDimN,
+                                                    NumDimK,
+                                                    ADataType,
+                                                    BDataType,
+                                                    AccDataType,
+                                                    CShuffleDataType,
+                                                    DsDataType,
+                                                    EDataType,
+                                                    ComputeDataType,
+                                                    AElementOp,
+                                                    BElementOp,
+                                                    CDEElementOp>;
+
+using DeviceOpInstance = DeviceOpInstanceKKN; // the KK variant is the one exposed as DeviceOpInstance; NOTE(review): presumably consumed by the .inc driver included below, and KK/KN/MK/MN presumably encode A/B memory layouts - confirm in common_instances.hpp
+
+#include "run_contraction_scale_example.inc"
+
+// Program entry point: hands the untouched command line to the shared scale driver and
+// uses the driver's return value as the process exit code.
+int main(int argc, char* argv[])
+{
+    const int exit_code = run_contraction_scale_example(argc, argv);
+    return exit_code;
+}
--- a/example/26_contraction/run_contraction_bilinear_example.inc
+++ b/example/26_contraction/run_contraction_bilinear_example.inc
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <cstdlib>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/numeric.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_contraction.hpp"
+
+// Drives the bilinear contraction example end to end: parses the command line, builds the
+// host tensors A, B, D and E, uploads them, runs DeviceOpInstance, prints timing figures and
+// optionally compares the device result with a host reference.
+//
+// Accepted command lines (argc counts the program name):
+//   argc == 1  : built-in problem, verification on, integer initialization, no timing
+//   argc == 4  : verification flag, initialization method, time-kernel flag
+//   argc == 28 : the three flags, M0 M1 N0 N1 K0 K1, four strides each for A, B, D and E,
+//                then alpha and beta
+//   otherwise  : prints the usage text and terminates the process with exit(0)
+//
+// Returns 0 when verification passes, when verification is disabled, or when the selected
+// instance reports the problem as unsupported; returns 1 when check_err reports a mismatch.
+// std::stoi / std::stof throw on arguments that are not numbers; nothing here catches that.
+int run_contraction_bilinear_example(int argc, char* argv[])
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+
+    // A[M0, M1, K0, K1]
+    std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
+    // B[N0, N1, K0, K1]
+    std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
+    std::vector<ck::index_t> b_ns_ks_strides{524288, 4096, 128, 1};
+    // D[M0, M1, N0, N1]
+    std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
+    // E[M0, M1, N0, N1]
+    std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
+
+    // Both coefficients are handed to CDEElementOp further down.
+    float alpha = 1.f;
+    float beta  = 1.f;
+
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 28)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+
+        const ck::index_t M0 = std::stoi(argv[4]);
+        const ck::index_t M1 = std::stoi(argv[5]);
+
+        const ck::index_t N0 = std::stoi(argv[6]);
+        const ck::index_t N1 = std::stoi(argv[7]);
+
+        const ck::index_t K0 = std::stoi(argv[8]);
+        const ck::index_t K1 = std::stoi(argv[9]);
+
+        // Lengths are derived from the six extents; strides are taken verbatim from argv.
+        a_ms_ks_lengths = {M0, M1, K0, K1};
+        a_ms_ks_strides = {
+            std::stoi(argv[10]), std::stoi(argv[11]), std::stoi(argv[12]), std::stoi(argv[13])};
+
+        b_ns_ks_lengths = {N0, N1, K0, K1};
+        b_ns_ks_strides = {
+            std::stoi(argv[14]), std::stoi(argv[15]), std::stoi(argv[16]), std::stoi(argv[17])};
+
+        d_ms_ns_lengths = {M0, M1, N0, N1};
+        d_ms_ns_strides = {
+            std::stoi(argv[18]), std::stoi(argv[19]), std::stoi(argv[20]), std::stoi(argv[21])};
+
+        e_ms_ns_lengths = {M0, M1, N0, N1};
+        e_ms_ns_strides = {
+            std::stoi(argv[22]), std::stoi(argv[23]), std::stoi(argv[24]), std::stoi(argv[25])};
+
+        alpha = std::stof(argv[26]);
+        beta  = std::stof(argv[27]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
+        printf("arg4 to 9: M0, M1, N0, N1, K0, K1\n");
+        printf("arg10 to 13: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1\n");
+        printf("arg14 to 17: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1\n");
+        printf("arg18 to 21: Stride_D_M0, Stride_D_M1, Stride_D_N0, Stride_D_N1\n");
+        printf("arg22 to 25: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1\n");
+        printf("arg26 to 27: alpha, beta\n");
+        exit(0);
+    }
+
+    Tensor<ADataType> a_ms_ks(a_ms_ks_lengths, a_ms_ks_strides);
+    Tensor<BDataType> b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides);
+    // D is held on the host as DDataType so its element size matches d_device_buf below,
+    // which is sized with sizeof(DDataType) and filled from this tensor's raw storage.
+    Tensor<DDataType> d_ms_ns(d_ms_ns_lengths, d_ms_ns_strides);
+    Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
+    Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides);
+
+    std::cout << "a_ms_ks: " << a_ms_ks.mDesc << std::endl;
+    std::cout << "b_ns_ks: " << b_ns_ks.mDesc << std::endl;
+    std::cout << "d_ms_ns: " << d_ms_ns.mDesc << std::endl;
+    std::cout << "e_ms_ns: " << e_ms_ns_host_result.mDesc << std::endl;
+
+    // init_method 0 leaves the inputs as constructed; 1 uses GeneratorTensor_2 with the
+    // bounds -5 and 5; every other value uses GeneratorTensor_3 with fractional bounds.
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a_ms_ks.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        b_ns_ks.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        d_ms_ns.GenerateTensorValue(GeneratorTensor_2<DDataType>{-5, 5});
+        break;
+    default:
+        a_ms_ks.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_ns_ks.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        d_ms_ns.GenerateTensorValue(GeneratorTensor_3<DDataType>{-0.5, 0.5});
+        break;
+    }
+
+    DeviceMem a_device_buf(sizeof(ADataType) * a_ms_ks.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_ns_ks.mDesc.GetElementSpaceSize());
+    DeviceMem d_device_buf(sizeof(DDataType) * d_ms_ns.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_ms_ns_device_result.mDesc.GetElementSpaceSize());
+
+    // ToDevice receives only a host pointer, so each host tensor's element type has to be
+    // the one used to size its device buffer above.
+    a_device_buf.ToDevice(a_ms_ks.mData.data());
+    b_device_buf.ToDevice(b_ns_ks.mData.data());
+    d_device_buf.ToDevice(d_ms_ns.mData.data());
+
+    // set zero
+    e_device_buf.SetZero();
+
+    auto a_element_op   = AElementOp{};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{alpha, beta};
+
+    // device operation
+    auto op       = DeviceOpInstance{};
+    auto invoker  = op.MakeInvoker();
+    auto argument = op.MakeArgument(a_device_buf.GetDeviceBuffer(),
+                                    b_device_buf.GetDeviceBuffer(),
+                                    std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
+                                    e_device_buf.GetDeviceBuffer(),
+                                    a_ms_ks_lengths,
+                                    a_ms_ks_strides,
+                                    b_ns_ks_lengths,
+                                    b_ns_ks_strides,
+                                    std::array<std::vector<ck::index_t>, 1>{d_ms_ns_lengths},
+                                    std::array<std::vector<ck::index_t>, 1>{d_ms_ns_strides},
+                                    e_ms_ns_lengths,
+                                    e_ms_ns_strides,
+                                    a_element_op,
+                                    b_element_op,
+                                    cde_element_op);
+
+    // An unsupported problem is reported on stdout and treated as a successful run.
+    if(!op.IsSupportedArgument(argument))
+    {
+        std::cout << op.GetTypeString() << " does not support this problem" << std::endl;
+
+        return 0;
+    }
+
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+    // Collapse the multi-dimensional extents into flat M, N and K sizes for the perf figures.
+    ck::index_t M =
+        ck::accumulate_n<ck::index_t>(e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});
+
+    ck::index_t N = ck::accumulate_n<ck::index_t>(
+        e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{});
+
+    ck::index_t K = ck::accumulate_n<ck::index_t>(
+        a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{});
+
+    std::size_t flop      = std::size_t(2) * M * N * K;
+    std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
+                            sizeof(DDataType) * M * N + sizeof(EDataType) * M * N;
+
+    // ave_time is printed as milliseconds below, hence the 1e9 and 1e6 divisors for
+    // TFlops and GB/s respectively.
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << op.GetTypeString() << std::endl;
+
+    e_device_buf.FromDevice(e_ms_ns_device_result.mData.data());
+
+    if(do_verification)
+    {
+        // The host reference produces the plain contraction result C; the C/D/E element-wise
+        // op is then applied per element to form the expected E.
+        Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
+
+        using ReferenceOpInstance =
+            ck::tensor_operation::host::ReferenceContraction_M2_N2_K2<NumDimM,
+                                                                      NumDimN,
+                                                                      NumDimK,
+                                                                      ADataType,
+                                                                      BDataType,
+                                                                      CShuffleDataType,
+                                                                      AccDataType,
+                                                                      ComputeDataType,
+                                                                      AElementOp,
+                                                                      BElementOp>;
+
+        auto ref_op      = ReferenceOpInstance{};
+        auto ref_invoker = ref_op.MakeInvoker();
+
+        auto ref_argument =
+            ref_op.MakeArgument(a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op);
+
+        ref_invoker.Run(ref_argument);
+
+        for(size_t m0 = 0; m0 < e_ms_ns_host_result.mDesc.GetLengths()[0]; ++m0)
+        {
+            for(size_t m1 = 0; m1 < e_ms_ns_host_result.mDesc.GetLengths()[1]; ++m1)
+            {
+                for(size_t n0 = 0; n0 < e_ms_ns_host_result.mDesc.GetLengths()[2]; ++n0)
+                {
+                    for(size_t n1 = 0; n1 < e_ms_ns_host_result.mDesc.GetLengths()[3]; ++n1)
+                    {
+                        cde_element_op(e_ms_ns_host_result(m0, m1, n0, n1),
+                                       c_ms_ns_host_result(m0, m1, n0, n1),
+                                       d_ms_ns(m0, m1, n0, n1));
+                    }
+                }
+            }
+        }
+
+        // Exit code 0 on match, 1 on mismatch.
+        return ck::utils::check_err(e_ms_ns_device_result, e_ms_ns_host_result) ? 0 : 1;
+    }
+
+    return 0;
+}
--- a/example/26_contraction/run_contraction_scale_example.inc
+++ b/example/26_contraction/run_contraction_scale_example.inc
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <cstdlib>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/numeric.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_contraction.hpp"
+
+// Example driver: runs DeviceOpInstance on A[M0, M1, K0, K1] and B[N0, N1, K0, K1] to produce
+// E[M0, M1, N0, N1], with CDEElementOp constructed from `scale`, and optionally checks the
+// device result against a host reference.
+//
+// The data types (ADataType, BDataType, EDataType, ...), element ops, DeviceOpInstance and
+// NumDimM/NumDimN/NumDimK are not declared in this file; they are expected to be in scope at
+// the point where this .inc file is included.
+//
+// Command line (argc must be 1, 4 or 23; anything else prints the usage text and exits 0):
+//   arg1      verification (0=no, 1=yes)
+//   arg2      initialization (0=no init, 1=integer value, anything else=decimal value)
+//   arg3      time kernel (0=no, 1=yes)
+//   arg4-9    M0, M1, N0, N1, K0, K1
+//   arg10-13  strides of A, arg14-17 strides of B, arg18-21 strides of E
+//   arg22     scale
+//
+// Returns 0 when the device op rejects the problem, when verification is disabled, or when
+// the device result matches the host reference; returns 1 when the comparison fails.
+int run_contraction_scale_example(int argc, char* argv[])
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+
+    // A[M0, M1, K0, K1]
+    std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
+    // B[N0, N1, K0, K1]
+    std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
+    std::vector<ck::index_t> b_ns_ks_strides{524288, 4096, 128, 1};
+    // E[M0, M1, N0, N1]
+    std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
+
+    float scale = 1.f;
+
+    // Only three argument counts are accepted: defaults (1), flags only (4), full problem (23).
+    if(argc != 1 && argc != 4 && argc != 23)
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
+        printf("arg4 to 9: M0, M1, N0, N1, K0, K1\n");
+        printf("arg10 to 13: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1\n");
+        printf("arg14 to 17: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1\n");
+        printf("arg18 to 21: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1\n");
+        printf("arg22: scale\n");
+        exit(0);
+    }
+
+    // The three flags are shared by the 4- and 23-argument forms.
+    if(argc != 1)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+    }
+
+    // Full problem description: lengths, per-tensor strides and the scale factor.
+    if(argc == 23)
+    {
+        const ck::index_t M0 = std::stoi(argv[4]);
+        const ck::index_t M1 = std::stoi(argv[5]);
+
+        const ck::index_t N0 = std::stoi(argv[6]);
+        const ck::index_t N1 = std::stoi(argv[7]);
+
+        const ck::index_t K0 = std::stoi(argv[8]);
+        const ck::index_t K1 = std::stoi(argv[9]);
+
+        a_ms_ks_lengths = {M0, M1, K0, K1};
+        a_ms_ks_strides = {
+            std::stoi(argv[10]), std::stoi(argv[11]), std::stoi(argv[12]), std::stoi(argv[13])};
+
+        b_ns_ks_lengths = {N0, N1, K0, K1};
+        b_ns_ks_strides = {
+            std::stoi(argv[14]), std::stoi(argv[15]), std::stoi(argv[16]), std::stoi(argv[17])};
+
+        e_ms_ns_lengths = {M0, M1, N0, N1};
+        e_ms_ns_strides = {
+            std::stoi(argv[18]), std::stoi(argv[19]), std::stoi(argv[20]), std::stoi(argv[21])};
+
+        scale = std::stof(argv[22]);
+    }
+
+    // Host-side tensors; E is kept twice so the device output and the host reference can be
+    // compared at the end.
+    Tensor<ADataType> a_ms_ks(a_ms_ks_lengths, a_ms_ks_strides);
+    Tensor<BDataType> b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides);
+    Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
+    Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides);
+
+    std::cout << "a_ms_ks: " << a_ms_ks.mDesc << std::endl;
+    std::cout << "b_ns_ks: " << b_ns_ks.mDesc << std::endl;
+    std::cout << "e_ms_ns: " << e_ms_ns_host_result.mDesc << std::endl;
+
+    // Fill the inputs: 0 leaves them untouched, 1 uses GeneratorTensor_2{-5, 5}, any other
+    // value uses GeneratorTensor_3 with the ranges below.
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a_ms_ks.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        b_ns_ks.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        break;
+    default:
+        a_ms_ks.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_ns_ks.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        break;
+    }
+
+    // Device buffers are sized from the element *space* size of each descriptor.
+    DeviceMem a_device_buf(sizeof(ADataType) * a_ms_ks.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_ns_ks.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_ms_ns_device_result.mDesc.GetElementSpaceSize());
+
+    a_device_buf.ToDevice(a_ms_ks.mData.data());
+    b_device_buf.ToDevice(b_ns_ks.mData.data());
+
+    // set zero
+    e_device_buf.SetZero();
+
+    auto a_element_op   = AElementOp{};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{scale};
+
+    // device operation; the zero-sized arrays mean no D tensors are passed in this variant
+    auto op       = DeviceOpInstance{};
+    auto invoker  = op.MakeInvoker();
+    auto argument = op.MakeArgument(a_device_buf.GetDeviceBuffer(),
+                                    b_device_buf.GetDeviceBuffer(),
+                                    std::array<const void*, 0>{},
+                                    e_device_buf.GetDeviceBuffer(),
+                                    a_ms_ks_lengths,
+                                    a_ms_ks_strides,
+                                    b_ns_ks_lengths,
+                                    b_ns_ks_strides,
+                                    std::array<std::vector<ck::index_t>, 0>{},
+                                    std::array<std::vector<ck::index_t>, 0>{},
+                                    e_ms_ns_lengths,
+                                    e_ms_ns_strides,
+                                    a_element_op,
+                                    b_element_op,
+                                    cde_element_op);
+
+    // An unsupported problem is reported but deliberately not treated as a failure.
+    if(!op.IsSupportedArgument(argument))
+    {
+        std::cout << op.GetTypeString() << " does not support this problem" << std::endl;
+
+        return 0;
+    }
+
+    const float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+    // Collapse the multi-dimensional lengths into flat M, N and K extents.
+    const ck::index_t M =
+        ck::accumulate_n<ck::index_t>(e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});
+
+    const ck::index_t N = ck::accumulate_n<ck::index_t>(
+        e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{});
+
+    const ck::index_t K = ck::accumulate_n<ck::index_t>(
+        a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{});
+
+    // Leading std::size_t operands keep the products out of ck::index_t arithmetic.
+    const std::size_t flop = std::size_t(2) * M * N * K;
+    const std::size_t num_btype =
+        sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
+
+    // ave_time is printed as milliseconds, hence 1e9 (not 1e12) for TFlops and 1e6 for GB/s.
+    // NOTE(review): if Run() reports 0 when time_kernel is false, both values print as inf;
+    // confirm against the invoker before relying on this line in scripts.
+    const float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+    const float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << op.GetTypeString() << std::endl;
+
+    e_device_buf.FromDevice(e_ms_ns_device_result.mData.data());
+
+    if(do_verification)
+    {
+        // Host reference: contraction into C first, then the CDE element op applied per element.
+        Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
+
+        using ReferenceOpInstance =
+            ck::tensor_operation::host::ReferenceContraction_M2_N2_K2<NumDimM,
+                                                                      NumDimN,
+                                                                      NumDimK,
+                                                                      ADataType,
+                                                                      BDataType,
+                                                                      CShuffleDataType,
+                                                                      AccDataType,
+                                                                      ComputeDataType,
+                                                                      AElementOp,
+                                                                      BElementOp>;
+
+        auto ref_op      = ReferenceOpInstance{};
+        auto ref_invoker = ref_op.MakeInvoker();
+
+        auto ref_argument =
+            ref_op.MakeArgument(a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op);
+
+        ref_invoker.Run(ref_argument);
+
+        // The extents do not change while iterating, so look them up once.
+        const auto& e_lengths = e_ms_ns_host_result.mDesc.GetLengths();
+
+        for(size_t m0 = 0; m0 < e_lengths[0]; ++m0)
+        {
+            for(size_t m1 = 0; m1 < e_lengths[1]; ++m1)
+            {
+                for(size_t n0 = 0; n0 < e_lengths[2]; ++n0)
+                {
+                    for(size_t n1 = 0; n1 < e_lengths[3]; ++n1)
+                    {
+                        cde_element_op(e_ms_ns_host_result(m0, m1, n0, n1),
+                                       c_ms_ns_host_result(m0, m1, n0, n1));
+                    }
+                }
+            }
+        }
+
+        return ck::utils::check_err(e_ms_ns_device_result, e_ms_ns_host_result) ? 0 : 1;
+    }
+
+    return 0;
+}