Merge branch 'develop' into aosewski/gemm_tile_loop

6b1490c9 · zjing14 · GitHub · 271269a5 · a3c80265 · 6b1490c9
Unverified Commit 6b1490c9 authored Oct 12, 2023 by zjing14 Committed by GitHub Oct 12, 2023
20 changed files
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -42,7 +42,7 @@ fastjsonschema==2.18.0
    # via rocm-docs-core
 gitdb==4.0.10
    # via gitpython
-gitpython==3.1.31
+gitpython==3.1.35
    # via rocm-docs-core
 idna==3.4
    # via requests
@@ -103,7 +103,7 @@ requests==2.28.2
    # via
    #   pygithub
    #   sphinx
-rocm-docs-core>=0.20.0
+rocm-docs-core==0.24.0
    # via -r requirements.in
 six==1.16.0
    # via

--- a/example/01_gemm/CMakeLists.txt
+++ b/example/01_gemm/CMakeLists.txt
@@ -66,18 +66,14 @@ endif()
 add_example_executable(example_gemm_xdl_streamk gemm_xdl_streamk.cpp)
-if(GPU_TARGETS MATCHES "gfx940" OR GPU_TARGETS MATCHES "gfx941" OR GPU_TARGETS MATCHES "gfx942")
+add_example_executable(example_gemm_xdl_fp8 gemm_xdl_fp8.cpp)
-  add_example_executable(example_gemm_xdl_fp8 gemm_xdl_fp8.cpp)
+if(result EQUAL 0)
-  if(result EQUAL 0)
    add_dependencies(example_gemm_xdl example_gemm_xdl_fp8)
-  endif()
 endif()
-if(GPU_TARGETS MATCHES "gfx940" OR GPU_TARGETS MATCHES "gfx941" OR GPU_TARGETS MATCHES "gfx942")
+add_example_executable(example_gemm_xdl_fp8_bf8 gemm_xdl_fp8_bf8.cpp)
-  add_example_executable(example_gemm_xdl_fp8_bf8 gemm_xdl_fp8_bf8.cpp)
+if(result EQUAL 0)
-  if(result EQUAL 0)
    add_dependencies(example_gemm_xdl example_gemm_xdl_fp8_bf8)
-  endif()
 endif()
 add_example_executable(example_gemm_xdl_fp16_fp8 gemm_xdl_fp16_fp8.cpp)

--- a/example/01_gemm/gemm_xdl_fp8.cpp
+++ b/example/01_gemm/gemm_xdl_fp8.cpp
@@ -7,9 +7,9 @@
 using ADataType        = ck::f8_t;
 using BDataType        = ck::f8_t;
-using CDataType        = ck::f8_t;
+using CDataType        = ck::half_t;
 using AccDataType      = float;
-using CShuffleDataType = ck::f8_t;
+using CShuffleDataType = float;
 using ALayout = Row;
 using BLayout = Col;
@@ -27,7 +27,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
 // ######|        |        |        |      Type|      Type|      Type|        Type|         DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 // ######|        |        |        |          |          |          |            |                 |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 // ######|        |        |        |          |          |          |            |                 |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   256,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 64, 1, 4>,              16>;
+         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   256,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 64, 1, 4>,               8>;
 // clang-format on
 using ReferenceGemmInstance = ck::tensor_operation::host::

--- a/example/01_gemm/gemm_xdl_fp8_bf8.cpp
+++ b/example/01_gemm/gemm_xdl_fp8_bf8.cpp
@@ -7,9 +7,9 @@
 using ADataType        = ck::f8_t;
 using BDataType        = ck::bf8_t;
-using CDataType        = ck::f8_t;
+using CDataType        = ck::half_t;
 using AccDataType      = float;
-using CShuffleDataType = ck::f8_t;
+using CShuffleDataType = float;
 using ALayout = Row;
 using BLayout = Col;
@@ -31,7 +31,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
 // ######|        |        |        |      Type|      Type|      Type|        Type|         DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 // ######|        |        |        |          |          |          |            |                 |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 // ######|        |        |        |          |          |          |            |                 |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   256,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 64, 1, 4>,              16,  LoopSched, PipelineVer, ComputeTypeA, ComputeTypeB>;
+         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,   256,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 64, 1, 4>,               8,  LoopSched, PipelineVer, ComputeTypeA, ComputeTypeB>;
 // clang-format on
 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,

--- a/example/26_contraction/CMakeLists.txt
+++ b/example/26_contraction/CMakeLists.txt
-add_custom_target(example_contraction)
-add_custom_target(example_contraction_scale)
-add_custom_target(example_contraction_bilinear)
-# FP32
 add_example_executable(example_contraction_bilinear_xdl_fp32 contraction_bilinear_xdl_fp32.cpp)
-add_dependencies(example_contraction_bilinear example_contraction_bilinear_xdl_fp32)
 add_example_executable(example_contraction_scale_xdl_fp32 contraction_scale_xdl_fp32.cpp)
-add_dependencies(example_contraction_scale example_contraction_scale_xdl_fp32)
-add_example_executable(example_contraction_bilinear_xdl_fp32_compute_bf16 contraction_bilinear_xdl_fp32_compute_bf16.cpp)
-add_dependencies(example_contraction_bilinear example_contraction_bilinear_xdl_fp32_compute_bf16)
-add_example_executable(example_contraction_scale_xdl_fp32_compute_bf16 contraction_scale_xdl_fp32_compute_bf16.cpp)
-add_dependencies(example_contraction_scale example_contraction_scale_xdl_fp32_compute_bf16)
-add_example_executable(example_contraction_bilinear_xdl_fp32_compute_fp16 contraction_bilinear_xdl_fp32_compute_fp16.cpp)
-add_dependencies(example_contraction_bilinear example_contraction_bilinear_xdl_fp32_compute_fp16)
-add_example_executable(example_contraction_scale_xdl_fp32_compute_fp16 contraction_scale_xdl_fp32_compute_fp16.cpp)
-add_dependencies(example_contraction_scale example_contraction_scale_xdl_fp32_compute_fp16)
-# FP64
 add_example_executable(example_contraction_bilinear_xdl_fp64 contraction_bilinear_xdl_fp64.cpp)
-add_dependencies(example_contraction_bilinear example_contraction_bilinear_xdl_fp64)
 add_example_executable(example_contraction_scale_xdl_fp64 contraction_scale_xdl_fp64.cpp)
-add_dependencies(example_contraction_scale example_contraction_scale_xdl_fp64)
-add_example_executable(example_contraction_bilinear_xdl_fp64_compute_fp32 contraction_bilinear_xdl_fp64_compute_fp32.cpp)
-add_dependencies(example_contraction_bilinear example_contraction_bilinear_xdl_fp64_compute_fp32)
-add_example_executable(example_contraction_scale_xdl_fp64_compute_fp32 contraction_scale_xdl_fp64_compute_fp32.cpp)
-add_dependencies(example_contraction_scale example_contraction_scale_xdl_fp64_compute_fp32)
-# FP16
-add_example_executable(example_contraction_bilinear_xdl_fp16_compute_fp32 contraction_bilinear_xdl_fp16_compute_fp32.cpp)
-add_dependencies(example_contraction_bilinear example_contraction_bilinear_xdl_fp16_compute_fp32)
-add_example_executable(example_contraction_scale_xdl_fp16_compute_fp32 contraction_scale_xdl_fp16_compute_fp32.cpp)
-add_dependencies(example_contraction_scale example_contraction_scale_xdl_fp16_compute_fp32)
-# BF16
-add_example_executable(example_contraction_bilinear_xdl_bf16_compute_fp32 contraction_bilinear_xdl_bf16_compute_fp32.cpp)
-add_dependencies(example_contraction_bilinear example_contraction_bilinear_xdl_bf16_compute_fp32)
-add_example_executable(example_contraction_scale_xdl_bf16_compute_fp32 contraction_scale_xdl_bf16_compute_fp32.cpp)
-add_dependencies(example_contraction_scale example_contraction_scale_xdl_bf16_compute_fp32)
-add_dependencies(example_contraction example_contraction_scale)
-add_dependencies(example_contraction example_contraction_bilinear)
--- a/example/26_contraction/common_instances.hpp
+++ b/example/26_contraction/common_instances.hpp
--- a/example/26_contraction/contraction_bilinear_xdl_bf16_compute_fp32.cpp
+++ b/example/26_contraction/contraction_bilinear_xdl_bf16_compute_fp32.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "common_instances.hpp"
-using ADataType        = BF16;
-using BDataType        = BF16;
-using AccDataType      = F32;
-using CShuffleDataType = BF16;
-using DDataType        = BF16;
-using DsDataType       = ck::Tuple<DDataType>;
-using EDataType        = BF16;
-using ComputeDataType  = F32;
-static constexpr ck::index_t NumDimM = 2;
-static constexpr ck::index_t NumDimN = 2;
-static constexpr ck::index_t NumDimK = 2;
-using AElementOp   = ck::tensor_operation::element_wise::PassThrough;
-using BElementOp   = ck::tensor_operation::element_wise::PassThrough;
-using CDEElementOp = ck::tensor_operation::element_wise::Bilinear;
-using DeviceOpInstanceKKNN = DeviceOpInstanceKK_Generic<NumDimM,
-                                                        NumDimN,
-                                                        NumDimK,
-                                                        ADataType,
-                                                        BDataType,
-                                                        AccDataType,
-                                                        CShuffleDataType,
-                                                        DsDataType,
-                                                        EDataType,
-                                                        ComputeDataType,
-                                                        AElementOp,
-                                                        BElementOp,
-                                                        CDEElementOp>;
-using DeviceOpInstanceKNNN = DeviceOpInstanceKN_Generic<NumDimM,
-                                                        NumDimN,
-                                                        NumDimK,
-                                                        ADataType,
-                                                        BDataType,
-                                                        AccDataType,
-                                                        CShuffleDataType,
-                                                        DsDataType,
-                                                        EDataType,
-                                                        ComputeDataType,
-                                                        AElementOp,
-                                                        BElementOp,
-                                                        CDEElementOp>;
-using DeviceOpInstanceMKNN = DeviceOpInstanceMK_Generic<NumDimM,
-                                                        NumDimN,
-                                                        NumDimK,
-                                                        ADataType,
-                                                        BDataType,
-                                                        AccDataType,
-                                                        CShuffleDataType,
-                                                        DsDataType,
-                                                        EDataType,
-                                                        ComputeDataType,
-                                                        AElementOp,
-                                                        BElementOp,
-                                                        CDEElementOp>;
-using DeviceOpInstanceMNNN = DeviceOpInstanceMN_Generic<NumDimM,
-                                                        NumDimN,
-                                                        NumDimK,
-                                                        ADataType,
-                                                        BDataType,
-                                                        AccDataType,
-                                                        CShuffleDataType,
-                                                        DsDataType,
-                                                        EDataType,
-                                                        ComputeDataType,
-                                                        AElementOp,
-                                                        BElementOp,
-                                                        CDEElementOp>;
-using DeviceOpInstance = DeviceOpInstanceKKNN;
-#include "run_contraction_bilinear_example.inc"
-int main(int argc, char* argv[]) { return run_contraction_bilinear_example(argc, argv); }
--- a/example/26_contraction/contraction_bilinear_xdl_fp16_compute_fp32.cpp
+++ b/example/26_contraction/contraction_bilinear_xdl_fp16_compute_fp32.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "common_instances.hpp"
-using ADataType        = F16;
-using BDataType        = F16;
-using AccDataType      = F32;
-using CShuffleDataType = F16;
-using DDataType        = F16;
-using DsDataType       = ck::Tuple<DDataType>;
-using EDataType        = F16;
-using ComputeDataType  = F32;
-static constexpr ck::index_t NumDimM = 2;
-static constexpr ck::index_t NumDimN = 2;
-static constexpr ck::index_t NumDimK = 2;
-using AElementOp   = ck::tensor_operation::element_wise::PassThrough;
-using BElementOp   = ck::tensor_operation::element_wise::PassThrough;
-using CDEElementOp = ck::tensor_operation::element_wise::Bilinear;
-using DeviceOpInstanceKKNN = DeviceOpInstanceKK_Generic<NumDimM,
-                                                        NumDimN,
-                                                        NumDimK,
-                                                        ADataType,
-                                                        BDataType,
-                                                        AccDataType,
-                                                        CShuffleDataType,
-                                                        DsDataType,
-                                                        EDataType,
-                                                        ComputeDataType,
-                                                        AElementOp,
-                                                        BElementOp,
-                                                        CDEElementOp>;
-using DeviceOpInstanceKNNN = DeviceOpInstanceKN_Generic<NumDimM,
-                                                        NumDimN,
-                                                        NumDimK,
-                                                        ADataType,
-                                                        BDataType,
-                                                        AccDataType,
-                                                        CShuffleDataType,
-                                                        DsDataType,
-                                                        EDataType,
-                                                        ComputeDataType,
-                                                        AElementOp,
-                                                        BElementOp,
-                                                        CDEElementOp>;
-using DeviceOpInstanceMKNN = DeviceOpInstanceMK_Generic<NumDimM,
-                                                        NumDimN,
-                                                        NumDimK,
-                                                        ADataType,
-                                                        BDataType,
-                                                        AccDataType,
-                                                        CShuffleDataType,
-                                                        DsDataType,
-                                                        EDataType,
-                                                        ComputeDataType,
-                                                        AElementOp,
-                                                        BElementOp,
-                                                        CDEElementOp>;
-using DeviceOpInstanceMNNN = DeviceOpInstanceMN_Generic<NumDimM,
-                                                        NumDimN,
-                                                        NumDimK,
-                                                        ADataType,
-                                                        BDataType,
-                                                        AccDataType,
-                                                        CShuffleDataType,
-                                                        DsDataType,
-                                                        EDataType,
-                                                        ComputeDataType,
-                                                        AElementOp,
-                                                        BElementOp,
-                                                        CDEElementOp>;
-using DeviceOpInstance = DeviceOpInstanceKKNN;
-#include "run_contraction_bilinear_example.inc"
-int main(int argc, char* argv[]) { return run_contraction_bilinear_example(argc, argv); }
--- a/example/26_contraction/contraction_bilinear_xdl_fp32.cpp
+++ b/example/26_contraction/contraction_bilinear_xdl_fp32.cpp
--- a/example/26_contraction/contraction_bilinear_xdl_fp32_compute_bf16.cpp
+++ b/example/26_contraction/contraction_bilinear_xdl_fp32_compute_bf16.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "common_instances.hpp"
-using ADataType        = F32;
-using BDataType        = F32;
-using AccDataType      = F32;
-using CShuffleDataType = F32;
-using DDataType        = F32;
-using DsDataType       = ck::Tuple<DDataType>;
-using EDataType        = F32;
-using ComputeDataType  = BF16;
-static constexpr ck::index_t NumDimM = 2;
-static constexpr ck::index_t NumDimN = 2;
-static constexpr ck::index_t NumDimK = 2;
-using AElementOp   = ck::tensor_operation::element_wise::PassThrough;
-using BElementOp   = ck::tensor_operation::element_wise::PassThrough;
-using CDEElementOp = ck::tensor_operation::element_wise::Bilinear;
-using DeviceOpInstanceKKNN = DeviceOpInstanceKK_Generic<NumDimM,
-                                                        NumDimN,
-                                                        NumDimK,
-                                                        ADataType,
-                                                        BDataType,
-                                                        AccDataType,
-                                                        CShuffleDataType,
-                                                        DsDataType,
-                                                        EDataType,
-                                                        ComputeDataType,
-                                                        AElementOp,
-                                                        BElementOp,
-                                                        CDEElementOp>;
-using DeviceOpInstanceKNNN = DeviceOpInstanceKN_Generic<NumDimM,
-                                                        NumDimN,
-                                                        NumDimK,
-                                                        ADataType,
-                                                        BDataType,
-                                                        AccDataType,
-                                                        CShuffleDataType,
-                                                        DsDataType,
-                                                        EDataType,
-                                                        ComputeDataType,
-                                                        AElementOp,
-                                                        BElementOp,
-                                                        CDEElementOp>;
-using DeviceOpInstanceMKNN = DeviceOpInstanceMK_Generic<NumDimM,
-                                                        NumDimN,
-                                                        NumDimK,
-                                                        ADataType,
-                                                        BDataType,
-                                                        AccDataType,
-                                                        CShuffleDataType,
-                                                        DsDataType,
-                                                        EDataType,
-                                                        ComputeDataType,
-                                                        AElementOp,
-                                                        BElementOp,
-                                                        CDEElementOp>;
-using DeviceOpInstanceMNNN = DeviceOpInstanceMN_Generic<NumDimM,
-                                                        NumDimN,
-                                                        NumDimK,
-                                                        ADataType,
-                                                        BDataType,
-                                                        AccDataType,
-                                                        CShuffleDataType,
-                                                        DsDataType,
-                                                        EDataType,
-                                                        ComputeDataType,
-                                                        AElementOp,
-                                                        BElementOp,
-                                                        CDEElementOp>;
-using DeviceOpInstance = DeviceOpInstanceKKNN;
-#include "run_contraction_bilinear_example.inc"
-int main(int argc, char* argv[]) { return run_contraction_bilinear_example(argc, argv); }
--- a/example/26_contraction/contraction_bilinear_xdl_fp32_compute_fp16.cpp
+++ b/example/26_contraction/contraction_bilinear_xdl_fp32_compute_fp16.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "common_instances.hpp"
-using ADataType        = F32;
-using BDataType        = F32;
-using AccDataType      = F32;
-using CShuffleDataType = F32;
-using DDataType        = F32;
-using DsDataType       = ck::Tuple<DDataType>;
-using EDataType        = F32;
-using ComputeDataType  = F16;
-static constexpr ck::index_t NumDimM = 2;
-static constexpr ck::index_t NumDimN = 2;
-static constexpr ck::index_t NumDimK = 2;
-using AElementOp   = ck::tensor_operation::element_wise::PassThrough;
-using BElementOp   = ck::tensor_operation::element_wise::PassThrough;
-using CDEElementOp = ck::tensor_operation::element_wise::Bilinear;
-using DeviceOpInstanceKKNN = DeviceOpInstanceKK_Generic<NumDimM,
-                                                        NumDimN,
-                                                        NumDimK,
-                                                        ADataType,
-                                                        BDataType,
-                                                        AccDataType,
-                                                        CShuffleDataType,
-                                                        DsDataType,
-                                                        EDataType,
-                                                        ComputeDataType,
-                                                        AElementOp,
-                                                        BElementOp,
-                                                        CDEElementOp>;
-using DeviceOpInstanceKNNN = DeviceOpInstanceKN_Generic<NumDimM,
-                                                        NumDimN,
-                                                        NumDimK,
-                                                        ADataType,
-                                                        BDataType,
-                                                        AccDataType,
-                                                        CShuffleDataType,
-                                                        DsDataType,
-                                                        EDataType,
-                                                        ComputeDataType,
-                                                        AElementOp,
-                                                        BElementOp,
-                                                        CDEElementOp>;
-using DeviceOpInstanceMKNN = DeviceOpInstanceMK_Generic<NumDimM,
-                                                        NumDimN,
-                                                        NumDimK,
-                                                        ADataType,
-                                                        BDataType,
-                                                        AccDataType,
-                                                        CShuffleDataType,
-                                                        DsDataType,
-                                                        EDataType,
-                                                        ComputeDataType,
-                                                        AElementOp,
-                                                        BElementOp,
-                                                        CDEElementOp>;
-using DeviceOpInstanceMNNN = DeviceOpInstanceMN_Generic<NumDimM,
-                                                        NumDimN,
-                                                        NumDimK,
-                                                        ADataType,
-                                                        BDataType,
-                                                        AccDataType,
-                                                        CShuffleDataType,
-                                                        DsDataType,
-                                                        EDataType,
-                                                        ComputeDataType,
-                                                        AElementOp,
-                                                        BElementOp,
-                                                        CDEElementOp>;
-using DeviceOpInstance = DeviceOpInstanceKKNN;
-#include "run_contraction_bilinear_example.inc"
-int main(int argc, char* argv[]) { return run_contraction_bilinear_example(argc, argv); }
--- a/example/26_contraction/contraction_bilinear_xdl_fp64.cpp
+++ b/example/26_contraction/contraction_bilinear_xdl_fp64.cpp
--- a/example/26_contraction/contraction_bilinear_xdl_fp64_compute_fp32.cpp
+++ b/example/26_contraction/contraction_bilinear_xdl_fp64_compute_fp32.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "common_instances.hpp"
-using ADataType        = F64;
-using BDataType        = F64;
-using AccDataType      = F32;
-using CShuffleDataType = F64;
-using DDataType        = F64;
-using DsDataType       = ck::Tuple<DDataType>;
-using EDataType        = F64;
-using ComputeDataType  = F32;
-static constexpr ck::index_t NumDimM = 2;
-static constexpr ck::index_t NumDimN = 2;
-static constexpr ck::index_t NumDimK = 2;
-using AElementOp   = ck::tensor_operation::element_wise::PassThrough;
-using BElementOp   = ck::tensor_operation::element_wise::PassThrough;
-using CDEElementOp = ck::tensor_operation::element_wise::Bilinear;
-using DeviceOpInstanceKKNN = DeviceOpInstanceKK_FP64<NumDimM,
-                                                     NumDimN,
-                                                     NumDimK,
-                                                     ADataType,
-                                                     BDataType,
-                                                     AccDataType,
-                                                     CShuffleDataType,
-                                                     DsDataType,
-                                                     EDataType,
-                                                     ComputeDataType,
-                                                     AElementOp,
-                                                     BElementOp,
-                                                     CDEElementOp>;
-using DeviceOpInstanceKNNN = DeviceOpInstanceKN_FP64<NumDimM,
-                                                     NumDimN,
-                                                     NumDimK,
-                                                     ADataType,
-                                                     BDataType,
-                                                     AccDataType,
-                                                     CShuffleDataType,
-                                                     DsDataType,
-                                                     EDataType,
-                                                     ComputeDataType,
-                                                     AElementOp,
-                                                     BElementOp,
-                                                     CDEElementOp>;
-using DeviceOpInstanceMKNN = DeviceOpInstanceMK_FP64<NumDimM,
-                                                     NumDimN,
-                                                     NumDimK,
-                                                     ADataType,
-                                                     BDataType,
-                                                     AccDataType,
-                                                     CShuffleDataType,
-                                                     DsDataType,
-                                                     EDataType,
-                                                     ComputeDataType,
-                                                     AElementOp,
-                                                     BElementOp,
-                                                     CDEElementOp>;
-using DeviceOpInstanceMNNN = DeviceOpInstanceMN_FP64<NumDimM,
-                                                     NumDimN,
-                                                     NumDimK,
-                                                     ADataType,
-                                                     BDataType,
-                                                     AccDataType,
-                                                     CShuffleDataType,
-                                                     DsDataType,
-                                                     EDataType,
-                                                     ComputeDataType,
-                                                     AElementOp,
-                                                     BElementOp,
-                                                     CDEElementOp>;
-using DeviceOpInstance = DeviceOpInstanceKKNN;
-#include "run_contraction_bilinear_example.inc"
-int main(int argc, char* argv[]) { return run_contraction_bilinear_example(argc, argv); }
--- a/example/26_contraction/contraction_scale_xdl_bf16_compute_fp32.cpp
+++ b/example/26_contraction/contraction_scale_xdl_bf16_compute_fp32.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "common_instances.hpp"
-using ADataType        = BF16;
-using BDataType        = BF16;
-using AccDataType      = F32;
-using CShuffleDataType = BF16;
-using DsDataType       = ck::Tuple<>;
-using EDataType        = BF16;
-using ComputeDataType  = F32;
-static constexpr ck::index_t NumDimM = 2;
-static constexpr ck::index_t NumDimN = 2;
-static constexpr ck::index_t NumDimK = 2;
-using AElementOp   = ck::tensor_operation::element_wise::PassThrough;
-using BElementOp   = ck::tensor_operation::element_wise::PassThrough;
-using CDEElementOp = ck::tensor_operation::element_wise::Scale;
-using DeviceOpInstanceKKN = DeviceOpInstanceKK_Generic<NumDimM,
-                                                       NumDimN,
-                                                       NumDimK,
-                                                       ADataType,
-                                                       BDataType,
-                                                       AccDataType,
-                                                       CShuffleDataType,
-                                                       DsDataType,
-                                                       EDataType,
-                                                       ComputeDataType,
-                                                       AElementOp,
-                                                       BElementOp,
-                                                       CDEElementOp>;
-using DeviceOpInstanceKNN = DeviceOpInstanceKN_Generic<NumDimM,
-                                                       NumDimN,
-                                                       NumDimK,
-                                                       ADataType,
-                                                       BDataType,
-                                                       AccDataType,
-                                                       CShuffleDataType,
-                                                       DsDataType,
-                                                       EDataType,
-                                                       ComputeDataType,
-                                                       AElementOp,
-                                                       BElementOp,
-                                                       CDEElementOp>;
-using DeviceOpInstanceMKN = DeviceOpInstanceMK_Generic<NumDimM,
-                                                       NumDimN,
-                                                       NumDimK,
-                                                       ADataType,
-                                                       BDataType,
-                                                       AccDataType,
-                                                       CShuffleDataType,
-                                                       DsDataType,
-                                                       EDataType,
-                                                       ComputeDataType,
-                                                       AElementOp,
-                                                       BElementOp,
-                                                       CDEElementOp>;
-using DeviceOpInstanceMNN = DeviceOpInstanceMN_Generic<NumDimM,
-                                                       NumDimN,
-                                                       NumDimK,
-                                                       ADataType,
-                                                       BDataType,
-                                                       AccDataType,
-                                                       CShuffleDataType,
-                                                       DsDataType,
-                                                       EDataType,
-                                                       ComputeDataType,
-                                                       AElementOp,
-                                                       BElementOp,
-                                                       CDEElementOp>;
-using DeviceOpInstance = DeviceOpInstanceKKN;
-#include "run_contraction_scale_example.inc"
-int main(int argc, char* argv[]) { return run_contraction_scale_example(argc, argv); }
--- a/example/26_contraction/contraction_scale_xdl_fp16_compute_fp32.cpp
+++ b/example/26_contraction/contraction_scale_xdl_fp16_compute_fp32.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "common_instances.hpp"
-using ADataType        = F16;
-using BDataType        = F16;
-using AccDataType      = F32;
-using CShuffleDataType = F16;
-using DsDataType       = ck::Tuple<>;
-using EDataType        = F16;
-using ComputeDataType  = F32;
-static constexpr ck::index_t NumDimM = 2;
-static constexpr ck::index_t NumDimN = 2;
-static constexpr ck::index_t NumDimK = 2;
-using AElementOp   = ck::tensor_operation::element_wise::PassThrough;
-using BElementOp   = ck::tensor_operation::element_wise::PassThrough;
-using CDEElementOp = ck::tensor_operation::element_wise::Scale;
-using DeviceOpInstanceKKN = DeviceOpInstanceKK_Generic<NumDimM,
-                                                       NumDimN,
-                                                       NumDimK,
-                                                       ADataType,
-                                                       BDataType,
-                                                       AccDataType,
-                                                       CShuffleDataType,
-                                                       DsDataType,
-                                                       EDataType,
-                                                       ComputeDataType,
-                                                       AElementOp,
-                                                       BElementOp,
-                                                       CDEElementOp>;
-using DeviceOpInstanceKNN = DeviceOpInstanceKN_Generic<NumDimM,
-                                                       NumDimN,
-                                                       NumDimK,
-                                                       ADataType,
-                                                       BDataType,
-                                                       AccDataType,
-                                                       CShuffleDataType,
-                                                       DsDataType,
-                                                       EDataType,
-                                                       ComputeDataType,
-                                                       AElementOp,
-                                                       BElementOp,
-                                                       CDEElementOp>;
-using DeviceOpInstanceMKN = DeviceOpInstanceMK_Generic<NumDimM,
-                                                       NumDimN,
-                                                       NumDimK,
-                                                       ADataType,
-                                                       BDataType,
-                                                       AccDataType,
-                                                       CShuffleDataType,
-                                                       DsDataType,
-                                                       EDataType,
-                                                       ComputeDataType,
-                                                       AElementOp,
-                                                       BElementOp,
-                                                       CDEElementOp>;
-using DeviceOpInstanceMNN = DeviceOpInstanceMN_Generic<NumDimM,
-                                                       NumDimN,
-                                                       NumDimK,
-                                                       ADataType,
-                                                       BDataType,
-                                                       AccDataType,
-                                                       CShuffleDataType,
-                                                       DsDataType,
-                                                       EDataType,
-                                                       ComputeDataType,
-                                                       AElementOp,
-                                                       BElementOp,
-                                                       CDEElementOp>;
-using DeviceOpInstance = DeviceOpInstanceKKN;
-#include "run_contraction_scale_example.inc"
-int main(int argc, char* argv[]) { return run_contraction_scale_example(argc, argv); }
--- a/example/26_contraction/contraction_scale_xdl_fp32.cpp
+++ b/example/26_contraction/contraction_scale_xdl_fp32.cpp
--- a/example/26_contraction/contraction_scale_xdl_fp32_compute_bf16.cpp
+++ b/example/26_contraction/contraction_scale_xdl_fp32_compute_bf16.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "common_instances.hpp"
-using ADataType        = F32;
-using BDataType        = F32;
-using AccDataType      = F32;
-using CShuffleDataType = F32;
-using DsDataType       = ck::Tuple<>;
-using EDataType        = F32;
-using ComputeDataType  = BF16;
-static constexpr ck::index_t NumDimM = 2;
-static constexpr ck::index_t NumDimN = 2;
-static constexpr ck::index_t NumDimK = 2;
-using AElementOp   = ck::tensor_operation::element_wise::PassThrough;
-using BElementOp   = ck::tensor_operation::element_wise::PassThrough;
-using CDEElementOp = ck::tensor_operation::element_wise::Scale;
-using DeviceOpInstanceKKN = DeviceOpInstanceKK_Generic<NumDimM,
-                                                       NumDimN,
-                                                       NumDimK,
-                                                       ADataType,
-                                                       BDataType,
-                                                       AccDataType,
-                                                       CShuffleDataType,
-                                                       DsDataType,
-                                                       EDataType,
-                                                       ComputeDataType,
-                                                       AElementOp,
-                                                       BElementOp,
-                                                       CDEElementOp>;
-using DeviceOpInstanceKNN = DeviceOpInstanceKN_Generic<NumDimM,
-                                                       NumDimN,
-                                                       NumDimK,
-                                                       ADataType,
-                                                       BDataType,
-                                                       AccDataType,
-                                                       CShuffleDataType,
-                                                       DsDataType,
-                                                       EDataType,
-                                                       ComputeDataType,
-                                                       AElementOp,
-                                                       BElementOp,
-                                                       CDEElementOp>;
-using DeviceOpInstanceMKN = DeviceOpInstanceMK_Generic<NumDimM,
-                                                       NumDimN,
-                                                       NumDimK,
-                                                       ADataType,
-                                                       BDataType,
-                                                       AccDataType,
-                                                       CShuffleDataType,
-                                                       DsDataType,
-                                                       EDataType,
-                                                       ComputeDataType,
-                                                       AElementOp,
-                                                       BElementOp,
-                                                       CDEElementOp>;
-using DeviceOpInstanceMNN = DeviceOpInstanceMN_Generic<NumDimM,
-                                                       NumDimN,
-                                                       NumDimK,
-                                                       ADataType,
-                                                       BDataType,
-                                                       AccDataType,
-                                                       CShuffleDataType,
-                                                       DsDataType,
-                                                       EDataType,
-                                                       ComputeDataType,
-                                                       AElementOp,
-                                                       BElementOp,
-                                                       CDEElementOp>;
-using DeviceOpInstance = DeviceOpInstanceKKN;
-#include "run_contraction_scale_example.inc"
-int main(int argc, char* argv[]) { return run_contraction_scale_example(argc, argv); }
--- a/example/26_contraction/contraction_scale_xdl_fp32_compute_fp16.cpp
+++ b/example/26_contraction/contraction_scale_xdl_fp32_compute_fp16.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "common_instances.hpp"
-using ADataType        = F32;
-using BDataType        = F32;
-using AccDataType      = F32;
-using CShuffleDataType = F32;
-using DsDataType       = ck::Tuple<>;
-using EDataType        = F32;
-using ComputeDataType  = F16;
-static constexpr ck::index_t NumDimM = 2;
-static constexpr ck::index_t NumDimN = 2;
-static constexpr ck::index_t NumDimK = 2;
-using AElementOp   = ck::tensor_operation::element_wise::PassThrough;
-using BElementOp   = ck::tensor_operation::element_wise::PassThrough;
-using CDEElementOp = ck::tensor_operation::element_wise::Scale;
-using DeviceOpInstanceKKN = DeviceOpInstanceKK_Generic<NumDimM,
-                                                       NumDimN,
-                                                       NumDimK,
-                                                       ADataType,
-                                                       BDataType,
-                                                       AccDataType,
-                                                       CShuffleDataType,
-                                                       DsDataType,
-                                                       EDataType,
-                                                       ComputeDataType,
-                                                       AElementOp,
-                                                       BElementOp,
-                                                       CDEElementOp>;
-using DeviceOpInstanceKNN = DeviceOpInstanceKN_Generic<NumDimM,
-                                                       NumDimN,
-                                                       NumDimK,
-                                                       ADataType,
-                                                       BDataType,
-                                                       AccDataType,
-                                                       CShuffleDataType,
-                                                       DsDataType,
-                                                       EDataType,
-                                                       ComputeDataType,
-                                                       AElementOp,
-                                                       BElementOp,
-                                                       CDEElementOp>;
-using DeviceOpInstanceMKN = DeviceOpInstanceMK_Generic<NumDimM,
-                                                       NumDimN,
-                                                       NumDimK,
-                                                       ADataType,
-                                                       BDataType,
-                                                       AccDataType,
-                                                       CShuffleDataType,
-                                                       DsDataType,
-                                                       EDataType,
-                                                       ComputeDataType,
-                                                       AElementOp,
-                                                       BElementOp,
-                                                       CDEElementOp>;
-using DeviceOpInstanceMNN = DeviceOpInstanceMN_Generic<NumDimM,
-                                                       NumDimN,
-                                                       NumDimK,
-                                                       ADataType,
-                                                       BDataType,
-                                                       AccDataType,
-                                                       CShuffleDataType,
-                                                       DsDataType,
-                                                       EDataType,
-                                                       ComputeDataType,
-                                                       AElementOp,
-                                                       BElementOp,
-                                                       CDEElementOp>;
-using DeviceOpInstance = DeviceOpInstanceKKN;
-#include "run_contraction_scale_example.inc"
-int main(int argc, char* argv[]) { return run_contraction_scale_example(argc, argv); }
--- a/example/26_contraction/contraction_scale_xdl_fp64.cpp
+++ b/example/26_contraction/contraction_scale_xdl_fp64.cpp
--- a/example/26_contraction/contraction_scale_xdl_fp64_compute_fp32.cpp
+++ b/example/26_contraction/contraction_scale_xdl_fp64_compute_fp32.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "common_instances.hpp"
-using ADataType        = F64;
-using BDataType        = F64;
-using AccDataType      = F32;
-using CShuffleDataType = F64;
-using DsDataType       = ck::Tuple<>;
-using EDataType        = F64;
-using ComputeDataType  = F32;
-static constexpr ck::index_t NumDimM = 2;
-static constexpr ck::index_t NumDimN = 2;
-static constexpr ck::index_t NumDimK = 2;
-using AElementOp   = ck::tensor_operation::element_wise::PassThrough;
-using BElementOp   = ck::tensor_operation::element_wise::PassThrough;
-using CDEElementOp = ck::tensor_operation::element_wise::Scale;
-using DeviceOpInstanceKKN = DeviceOpInstanceKK_FP64<NumDimM,
-                                                    NumDimN,
-                                                    NumDimK,
-                                                    ADataType,
-                                                    BDataType,
-                                                    AccDataType,
-                                                    CShuffleDataType,
-                                                    DsDataType,
-                                                    EDataType,
-                                                    ComputeDataType,
-                                                    AElementOp,
-                                                    BElementOp,
-                                                    CDEElementOp>;
-using DeviceOpInstanceKNN = DeviceOpInstanceKN_FP64<NumDimM,
-                                                    NumDimN,
-                                                    NumDimK,
-                                                    ADataType,
-                                                    BDataType,
-                                                    AccDataType,
-                                                    CShuffleDataType,
-                                                    DsDataType,
-                                                    EDataType,
-                                                    ComputeDataType,
-                                                    AElementOp,
-                                                    BElementOp,
-                                                    CDEElementOp>;
-using DeviceOpInstanceMKN = DeviceOpInstanceMK_FP64<NumDimM,
-                                                    NumDimN,
-                                                    NumDimK,
-                                                    ADataType,
-                                                    BDataType,
-                                                    AccDataType,
-                                                    CShuffleDataType,
-                                                    DsDataType,
-                                                    EDataType,
-                                                    ComputeDataType,
-                                                    AElementOp,
-                                                    BElementOp,
-                                                    CDEElementOp>;
-using DeviceOpInstanceMNN = DeviceOpInstanceMN_FP64<NumDimM,
-                                                    NumDimN,
-                                                    NumDimK,
-                                                    ADataType,
-                                                    BDataType,
-                                                    AccDataType,
-                                                    CShuffleDataType,
-                                                    DsDataType,
-                                                    EDataType,
-                                                    ComputeDataType,
-                                                    AElementOp,
-                                                    BElementOp,
-                                                    CDEElementOp>;
-using DeviceOpInstance = DeviceOpInstanceKKN;
-#include "run_contraction_scale_example.inc"
-int main(int argc, char* argv[]) { return run_contraction_scale_example(argc, argv); }