Merge remote-tracking branch 'upstream/develop' into merge_upstream_1129

also fix regression

Merge remote-tracking branch 'upstream/develop' into merge_upstream_1129
also fix regression
d27e0691 · Chao Liu · 0a7174ad · a2969aa8 · d27e0691 · d27e0691
Commit d27e0691 authored Nov 30, 2023 by Chao Liu
20 changed files
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp
@@ -3,12 +3,23 @@

 #pragma once

-#include <iostream>
+#include <cmath>
+#include <cstdlib>
+#include <numeric>
 #include <type_traits>
-#include <sstream>
+#include <vector>

+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_base.hpp"
+
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/fill.hpp"
 #include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -22,6 +33,7 @@ namespace host {
 //             Supports both GNCHW/NGCHW as well as GNHWC/NHWGC physical layout
 //             as long as dimensions in tensor descriptor is in GNCHW order
 //
+// @tparam     NDimSpatial  Number of spatial dimensions.
 // @tparam     InDataType               Input tensor data type.
 // @tparam     WeiDataType              Weights tensor data type.
 // @tparam     OutDataType              Output tensor data type.
@@ -29,7 +41,9 @@ namespace host {
 //                                      operation.
 // @tparam     WeiElementwiseOperation  Functor for weights tensor elementwise
 //                                      operation.
-// @tparam     NDimSpatial  Number of spatial dimensions.
+// @tparam     NumAElementwiseTensor  Number of A elementwise tensors.
+// @tparam     NumBElementwiseTensor  Number of B elementwise tensors.
+// @tparam     NumDElementwiseTensor  Number of D elementwise tensors.
 //
 // input descriptor in [G, N, C, Do, Ho, Wo] order
 // weight descriptor in [G, K, C, Z, Y, X] order
@@ -42,25 +56,35 @@ template <ck::index_t NDimSpatial,
          typename InElementwiseOperation,
          typename WeiElementwiseOperation,
          typename OutElementwiseOperation,
+          ck::index_t NumAElementwiseTensor                                         = 0,
+          ck::index_t NumBElementwiseTensor                                         = 0,
+          ck::index_t NumDElementwiseTensor                                         = 0,
          typename std::enable_if<NDimSpatial >= 1 && NDimSpatial <= 3, bool>::type = false>
 struct ReferenceConvFwd : public device::BaseOperator
 {
    // Argument
    struct Argument : public device::BaseArgument
    {
-        Argument(const Tensor<InDataType>& input,
-                 const Tensor<WeiDataType>& weight,
-                 Tensor<OutDataType>& output,
-                 std::vector<ck::index_t> conv_filter_strides,
-                 std::vector<ck::index_t> conv_filter_dilations,
-                 std::vector<ck::index_t> input_left_pads,
-                 std::vector<ck::index_t> input_right_pads,
-                 InElementwiseOperation in_element_op,
-                 WeiElementwiseOperation wei_element_op,
-                 OutElementwiseOperation out_element_op)
+        Argument(
+            const Tensor<InDataType>& input,
+            const Tensor<WeiDataType>& weight,
+            Tensor<OutDataType>& output,
+            std::vector<ck::index_t> conv_filter_strides,
+            std::vector<ck::index_t> conv_filter_dilations,
+            std::vector<ck::index_t> input_left_pads,
+            std::vector<ck::index_t> input_right_pads,
+            InElementwiseOperation in_element_op,
+            WeiElementwiseOperation wei_element_op,
+            OutElementwiseOperation out_element_op,
+            const std::array<Tensor<InDataType>, NumAElementwiseTensor>& elementwise_a_tensors,
+            const std::array<Tensor<WeiDataType>, NumBElementwiseTensor>& elementwise_b_tensors,
+            const std::array<Tensor<OutDataType>, NumDElementwiseTensor>& elementwise_d_tensors)
            : input_{input},
              weight_{weight},
              output_{output},
+              elementwise_a_tensors_{elementwise_a_tensors},
+              elementwise_b_tensors_{elementwise_b_tensors},
+              elementwise_d_tensors_{elementwise_d_tensors},
              conv_strides_{conv_filter_strides},
              conv_dilations_{conv_filter_dilations},
              in_left_pads_{input_left_pads},
@@ -75,6 +99,10 @@ struct ReferenceConvFwd : public device::BaseOperator
        const Tensor<WeiDataType>& weight_;
        Tensor<OutDataType>& output_;

+        const std::array<Tensor<InDataType>, NumAElementwiseTensor>& elementwise_a_tensors_;
+        const std::array<Tensor<WeiDataType>, NumBElementwiseTensor>& elementwise_b_tensors_;
+        const std::array<Tensor<OutDataType>, NumDElementwiseTensor>& elementwise_d_tensors_;
+
        std::vector<index_t> conv_strides_;
        std::vector<index_t> conv_dilations_;
        std::vector<index_t> in_left_pads_;
@@ -114,25 +142,43 @@ struct ReferenceConvFwd : public device::BaseOperator
                            if(wi >= 0 &&
                               ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[3])
                            {
-                                float v_in;
-                                float v_wei;
-
-                                arg.in_element_op_(
-                                    v_in, ck::type_convert<float>(arg.input_(g, n, c, wi)));
-
-                                arg.wei_element_op_(
-                                    v_wei, ck::type_convert<float>(arg.weight_(g, k, c, x)));
-
-                                v_acc += v_in * v_wei;
+                                InDataType v_in;
+                                WeiDataType v_wei;
+
+                                ExecuteElementwiseOp(arg.in_element_op_,
+                                                     arg.elementwise_a_tensors_,
+                                                     Number<NumAElementwiseTensor>{},
+                                                     v_in,
+                                                     arg.input_(g, n, c, wi),
+                                                     g,
+                                                     n,
+                                                     c,
+                                                     wi);
+                                ExecuteElementwiseOp(arg.wei_element_op_,
+                                                     arg.elementwise_b_tensors_,
+                                                     Number<NumBElementwiseTensor>{},
+                                                     v_wei,
+                                                     arg.weight_(g, k, c, x),
+                                                     g,
+                                                     k,
+                                                     c,
+                                                     x);
+                                v_acc +=
+                                    ck::type_convert<float>(v_in) * ck::type_convert<float>(v_wei);
                            }
                        }
                    }
-
-                    float v_out;
-
-                    arg.out_element_op_(v_out, v_acc);
-
-                    arg.output_(g, n, k, wo) = ck::type_convert<OutDataType>(v_out);
+                    OutDataType v_acc_converted = ck::type_convert<OutDataType>(v_acc);
+                    OutDataType& v_out          = arg.output_(g, n, k, wo);
+                    ExecuteElementwiseOp(arg.out_element_op_,
+                                         arg.elementwise_d_tensors_,
+                                         Number<NumDElementwiseTensor>{},
+                                         v_out,
+                                         v_acc_converted,
+                                         g,
+                                         n,
+                                         k,
+                                         wo);
                };

                make_ParallelTensorFunctor(func,
@@ -169,26 +215,47 @@ struct ReferenceConvFwd : public device::BaseOperator
                                   wi >= 0 &&
                                   ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[4])
                                {
-                                    float v_in;
-                                    float v_wei;
-
-                                    arg.in_element_op_(
-                                        v_in, ck::type_convert<float>(arg.input_(g, n, c, hi, wi)));
-
-                                    arg.wei_element_op_(
-                                        v_wei, ck::type_convert<float>(arg.weight_(g, k, c, y, x)));
-
-                                    v_acc += v_in * v_wei;
+                                    InDataType v_in;
+                                    WeiDataType v_wei;
+
+                                    ExecuteElementwiseOp(arg.in_element_op_,
+                                                         arg.elementwise_a_tensors_,
+                                                         Number<NumAElementwiseTensor>{},
+                                                         v_in,
+                                                         arg.input_(g, n, c, hi, wi),
+                                                         g,
+                                                         n,
+                                                         c,
+                                                         hi,
+                                                         wi);
+                                    ExecuteElementwiseOp(arg.wei_element_op_,
+                                                         arg.elementwise_b_tensors_,
+                                                         Number<NumBElementwiseTensor>{},
+                                                         v_wei,
+                                                         arg.weight_(g, k, c, y, x),
+                                                         g,
+                                                         k,
+                                                         c,
+                                                         y,
+                                                         x);
+                                    v_acc += ck::type_convert<float>(v_in) *
+                                             ck::type_convert<float>(v_wei);
                                }
                            }
                        }
                    }
-
-                    float v_out;
-
-                    arg.out_element_op_(v_out, v_acc);
-
-                    arg.output_(g, n, k, ho, wo) = ck::type_convert<OutDataType>(v_out);
+                    OutDataType v_acc_converted = ck::type_convert<OutDataType>(v_acc);
+                    OutDataType& v_out          = arg.output_(g, n, k, ho, wo);
+                    ExecuteElementwiseOp(arg.out_element_op_,
+                                         arg.elementwise_d_tensors_,
+                                         Number<NumDElementwiseTensor>{},
+                                         v_out,
+                                         v_acc_converted,
+                                         g,
+                                         n,
+                                         k,
+                                         ho,
+                                         wo);
                };

                make_ParallelTensorFunctor(func,
@@ -235,29 +302,51 @@ struct ReferenceConvFwd : public device::BaseOperator
                                       ck::type_convert<std::size_t>(wi) <
                                           arg.input_.GetLengths()[5])
                                    {
-                                        float v_in;
-                                        float v_wei;
-
-                                        arg.in_element_op_(v_in,
-                                                           ck::type_convert<float>(
-                                                               arg.input_(g, n, c, di, hi, wi)));
-
-                                        arg.wei_element_op_(
-                                            v_wei,
-                                            ck::type_convert<float>(arg.weight_(g, k, c, z, y, x)));
-
-                                        v_acc += v_in * v_wei;
+                                        InDataType v_in;
+                                        WeiDataType v_wei;
+
+                                        ExecuteElementwiseOp(arg.in_element_op_,
+                                                             arg.elementwise_a_tensors_,
+                                                             Number<NumAElementwiseTensor>{},
+                                                             v_in,
+                                                             arg.input_(g, n, c, di, hi, wi),
+                                                             g,
+                                                             n,
+                                                             c,
+                                                             di,
+                                                             hi,
+                                                             wi);
+                                        ExecuteElementwiseOp(arg.wei_element_op_,
+                                                             arg.elementwise_b_tensors_,
+                                                             Number<NumBElementwiseTensor>{},
+                                                             v_wei,
+                                                             arg.weight_(g, k, c, z, y, x),
+                                                             g,
+                                                             k,
+                                                             c,
+                                                             z,
+                                                             y,
+                                                             x);
+                                        v_acc += ck::type_convert<float>(v_in) *
+                                                 ck::type_convert<float>(v_wei);
                                    }
                                }
                            }
                        }
                    }
-
-                    float v_out;
-
-                    arg.out_element_op_(v_out, v_acc);
-
-                    arg.output_(g, n, k, d_o, ho, wo) = ck::type_convert<OutDataType>(v_out);
+                    OutDataType v_acc_converted = ck::type_convert<OutDataType>(v_acc);
+                    OutDataType& v_out          = arg.output_(g, n, k, d_o, ho, wo);
+                    ExecuteElementwiseOp(arg.out_element_op_,
+                                         arg.elementwise_d_tensors_,
+                                         Number<NumDElementwiseTensor>{},
+                                         v_out,
+                                         v_acc_converted,
+                                         g,
+                                         n,
+                                         k,
+                                         d_o,
+                                         ho,
+                                         wo);
                };

                make_ParallelTensorFunctor(func,
@@ -280,6 +369,36 @@ struct ReferenceConvFwd : public device::BaseOperator
        }
    };

+    template <typename... Args,
+              typename ElementwiseOp,
+              typename ElementwiseTensor,
+              typename NumTensor,
+              typename T>
+    static void ExecuteElementwiseOp(ElementwiseOp& elementwise_op,
+                                     ElementwiseTensor& elementwise_tensors,
+                                     NumTensor,
+                                     T& y,
+                                     const T& x,
+                                     Args... dims)
+    {
+        if constexpr(NumTensor::value == 0)
+        {
+            elementwise_op(y, x);
+        }
+        else if constexpr(NumTensor::value == 1)
+        {
+            elementwise_op(y, x, elementwise_tensors[0](dims...));
+        }
+        else if constexpr(NumTensor::value == 2)
+        {
+            elementwise_op(y, x, elementwise_tensors[0](dims...), elementwise_tensors[1](dims...));
+        }
+        else
+        {
+            throw std::runtime_error("ElementOp not supported in reference.");
+        }
+    }
+
    static constexpr bool IsValidCompilationParameter()
    {
        // TODO: properly implement this check
@@ -291,16 +410,20 @@ struct ReferenceConvFwd : public device::BaseOperator
        return NDimSpatial >= 1 && NDimSpatial <= 3;
    }

-    static auto MakeArgument(const Tensor<InDataType>& input,
-                             const Tensor<WeiDataType>& weight,
-                             Tensor<OutDataType>& output,
-                             std::vector<ck::index_t> conv_filter_strides,
-                             std::vector<ck::index_t> conv_filter_dilations,
-                             std::vector<ck::index_t> input_left_pads,
-                             std::vector<ck::index_t> input_right_pads,
-                             InElementwiseOperation in_element_op,
-                             WeiElementwiseOperation wei_element_op,
-                             OutElementwiseOperation out_element_op)
+    static auto MakeArgument(
+        const Tensor<InDataType>& input,
+        const Tensor<WeiDataType>& weight,
+        Tensor<OutDataType>& output,
+        std::vector<ck::index_t> conv_filter_strides,
+        std::vector<ck::index_t> conv_filter_dilations,
+        std::vector<ck::index_t> input_left_pads,
+        std::vector<ck::index_t> input_right_pads,
+        InElementwiseOperation in_element_op,
+        WeiElementwiseOperation wei_element_op,
+        OutElementwiseOperation out_element_op,
+        const std::array<Tensor<InDataType>, NumAElementwiseTensor>& elementwise_a_tensors  = {},
+        const std::array<Tensor<WeiDataType>, NumBElementwiseTensor>& elementwise_b_tensors = {},
+        const std::array<Tensor<OutDataType>, NumDElementwiseTensor>& elementwise_d_tensors = {})
    {
        return Argument{input,
                        weight,
@@ -311,7 +434,10 @@ struct ReferenceConvFwd : public device::BaseOperator
                        input_right_pads,
                        in_element_op,
                        wei_element_op,
-                        out_element_op};
+                        out_element_op,
+                        elementwise_a_tensors,
+                        elementwise_b_tensors,
+                        elementwise_d_tensors};
    }

    static auto MakeInvoker() { return Invoker{}; }

--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
@@ -20,7 +20,9 @@ template <typename ADataType,
          typename AccDataType,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
-          typename CElementwiseOperation>
+          typename CElementwiseOperation,
+          typename ComputeTypeA = ADataType,
+          typename ComputeTypeB = ComputeTypeA>
 struct ReferenceGemm : public device::BaseOperator
 {
    // Argument
@@ -64,8 +66,8 @@ struct ReferenceGemm : public device::BaseOperator

                for(int k = 0; k < K; ++k)
                {
-                    ADataType v_a;
-                    BDataType v_b;
+                    ComputeTypeA v_a;
+                    ComputeTypeB v_b;

                    // use PassThrough instead of ConvertBF16RTN for reference calculation
                    if constexpr(is_same_v<AElementwiseOperation,

--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp
@@ -20,8 +20,9 @@ template <typename XDataType,
          typename GammaDataType,
          typename BetaDataType,
          typename YDataType,
-          typename AccDataType,
-          typename AccElementwiseOperation>
+          typename SaveMeanInvStdDataType,
+          typename ComputeDataType,
+          typename YElementwiseOperation>
 struct ReferenceGroupnorm : public device::BaseOperator
 {
    // x = [N, H, W, G, C]
@@ -35,14 +36,18 @@ struct ReferenceGroupnorm : public device::BaseOperator
                 const Tensor<GammaDataType>& gamma,
                 const Tensor<BetaDataType>& beta,
                 Tensor<YDataType>& y,
-                 AccElementwiseOperation acc_elementwise_op,
+                 Tensor<SaveMeanInvStdDataType>& save_mean,
+                 Tensor<SaveMeanInvStdDataType>& save_inv_std,
+                 YElementwiseOperation y_elementwise_op,
                 const std::vector<index_t> lengths,
-                 AccDataType epsilon)
+                 ComputeDataType epsilon)
            : x_(x),
              gamma_(gamma),
              beta_(beta),
              y_(y),
-              acc_elementwise_op_(acc_elementwise_op),
+              save_mean_(save_mean),
+              save_inv_std_(save_inv_std),
+              y_elementwise_op_(y_elementwise_op),
              lengths_(lengths),
              epsilon_(epsilon)
        {
@@ -52,9 +57,11 @@ struct ReferenceGroupnorm : public device::BaseOperator
        const Tensor<XDataType> gamma_;
        const Tensor<XDataType> beta_;
        Tensor<YDataType>& y_;
-        AccElementwiseOperation acc_elementwise_op_;
+        Tensor<SaveMeanInvStdDataType>& save_mean_;
+        Tensor<SaveMeanInvStdDataType>& save_inv_std_;
+        YElementwiseOperation y_elementwise_op_;
        std::vector<index_t> lengths_;
-        AccDataType epsilon_;
+        ComputeDataType epsilon_;
    };

    // Invoker
@@ -68,8 +75,8 @@ struct ReferenceGroupnorm : public device::BaseOperator
            int G = arg.lengths_[3];
            int C = arg.lengths_[4];

-            Tensor<AccDataType> mean({N, G});
-            Tensor<AccDataType> var({N, G});
+            Tensor<ComputeDataType> mean({N, G});
+            Tensor<ComputeDataType> var({N, G});

            // Compute mean & var in [H, W, C] by Welford Algorithm
            // TODO - parallel for each HWC
@@ -78,9 +85,9 @@ struct ReferenceGroupnorm : public device::BaseOperator
            {
                for(int g = 0; g < G; ++g)
                {
-                    AccDataType mean_val = type_convert<AccDataType>(0.0f);
-                    AccDataType var_val  = type_convert<AccDataType>(0.0f);
-                    int32_t curr_count   = 0;
+                    ComputeDataType mean_val = type_convert<ComputeDataType>(0.0f);
+                    ComputeDataType var_val  = type_convert<ComputeDataType>(0.0f);
+                    int32_t curr_count       = 0;

                    for(int h = 0; h < H; ++h)
                    {
@@ -89,10 +96,11 @@ struct ReferenceGroupnorm : public device::BaseOperator
                            for(int c = 0; c < C; ++c)
                            {
                                curr_count++;
-                                AccDataType x = type_convert<AccDataType>(arg.x_(n, h, w, g, c));
-                                AccDataType delta = x - mean_val;
+                                ComputeDataType x =
+                                    type_convert<ComputeDataType>(arg.x_(n, h, w, g, c));
+                                ComputeDataType delta = x - mean_val;
                                mean_val += delta / curr_count;
-                                AccDataType delta2 = x - mean_val;
+                                ComputeDataType delta2 = x - mean_val;
                                var_val += delta * delta2;
                            }
                        }
@@ -100,6 +108,12 @@ struct ReferenceGroupnorm : public device::BaseOperator

                    mean(n, g) = mean_val;
                    var(n, g)  = var_val / curr_count;
+
+                    arg.save_mean_(n, g) = ck::type_convert<SaveMeanInvStdDataType>(mean(n, g));
+
+                    ComputeDataType divisor =
+                        static_cast<ComputeDataType>(1) / ck::math::sqrt(var(n, g) + arg.epsilon_);
+                    arg.save_inv_std_(n, g) = ck::type_convert<SaveMeanInvStdDataType>(divisor);
                }
            }

@@ -114,15 +128,19 @@ struct ReferenceGroupnorm : public device::BaseOperator
                        {
                            for(int c = 0; c < C; ++c)
                            {
-                                AccDataType x = type_convert<AccDataType>(arg.x_(n, h, w, g, c));
-                                AccDataType gamma    = type_convert<AccDataType>(arg.gamma_(g, c));
-                                AccDataType beta     = type_convert<AccDataType>(arg.beta_(g, c));
-                                AccDataType mean_val = type_convert<AccDataType>(mean(n, g));
-                                AccDataType var_val  = type_convert<AccDataType>(var(n, g));
-                                AccDataType y        = gamma * (x - mean_val) /
-                                                    ck::math::sqrt(arg.epsilon_ + var_val) +
-                                                beta;
-                                arg.acc_elementwise_op_(y, y);
+                                ComputeDataType x =
+                                    type_convert<ComputeDataType>(arg.x_(n, h, w, g, c));
+                                ComputeDataType gamma =
+                                    type_convert<ComputeDataType>(arg.gamma_(g, c));
+                                ComputeDataType beta =
+                                    type_convert<ComputeDataType>(arg.beta_(g, c));
+                                ComputeDataType mean_val =
+                                    type_convert<ComputeDataType>(mean(n, g));
+                                ComputeDataType var_val = type_convert<ComputeDataType>(var(n, g));
+                                ComputeDataType y       = gamma * (x - mean_val) /
+                                                        ck::math::sqrt(arg.epsilon_ + var_val) +
+                                                    beta;
+                                arg.y_elementwise_op_(y, y);
                                arg.y_(n, h, w, g, c) = type_convert<YDataType>(y);
                            }
                        }
@@ -159,11 +177,14 @@ struct ReferenceGroupnorm : public device::BaseOperator
                             const Tensor<GammaDataType>& gamma,
                             const Tensor<BetaDataType>& beta,
                             Tensor<YDataType>& y,
-                             AccElementwiseOperation acc_elementwise_op,
+                             Tensor<SaveMeanInvStdDataType>& save_mean,
+                             Tensor<SaveMeanInvStdDataType>& save_inv_std,
+                             YElementwiseOperation y_elementwise_op,
                             const std::vector<index_t> lengths,
-                             AccDataType epsilon)
+                             ComputeDataType epsilon)
    {
-        return Argument{x, gamma, beta, y, acc_elementwise_op, lengths, epsilon};
+        return Argument{
+            x, gamma, beta, y, save_mean, save_inv_std, y_elementwise_op, lengths, epsilon};
    }

    static auto MakeInvoker() { return Invoker{}; }

--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_groupnorm_bwd.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_groupnorm_bwd.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+#include <vector>
+#include <algorithm>
+
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace host {
+
+template <typename DYDataType,
+          typename XDataType,
+          typename GammaDataType,
+          typename MeanInvStdDataType,
+          typename DGammaDataType,
+          typename DBetaDataType,
+          typename DXDataType,
+          typename ComputeDataType>
+struct ReferenceGroupnormBwd : public device::BaseOperator
+{
+    // Argument
+    struct Argument : public device::BaseArgument
+    {
+        Argument(const Tensor<DYDataType>& dy_nhwgc,
+                 const Tensor<XDataType>& x_nhwgc,
+                 const Tensor<GammaDataType>& gamma_gc,
+                 const Tensor<MeanInvStdDataType>& mean_ng,
+                 const Tensor<MeanInvStdDataType>& inv_std_ng,
+                 Tensor<DGammaDataType>& dgamma_gc,
+                 Tensor<DBetaDataType>& dbeta_gc,
+                 Tensor<DXDataType>& dx_nhwgc,
+                 const std::vector<index_t> lengths)
+            : dy_nhwgc_(dy_nhwgc),
+              x_nhwgc_(x_nhwgc),
+              gamma_gc_(gamma_gc),
+              mean_ng_(mean_ng),
+              inv_std_ng_(inv_std_ng),
+              dgamma_gc_(dgamma_gc),
+              dbeta_gc_(dbeta_gc),
+              dx_nhwgc_(dx_nhwgc),
+              lengths_(lengths)
+        {
+        }
+
+        const Tensor<DYDataType>& dy_nhwgc_;
+        const Tensor<XDataType>& x_nhwgc_;
+        const Tensor<GammaDataType>& gamma_gc_;
+        const Tensor<MeanInvStdDataType>& mean_ng_;
+        const Tensor<MeanInvStdDataType>& inv_std_ng_;
+        Tensor<DGammaDataType>& dgamma_gc_;
+        Tensor<DBetaDataType>& dbeta_gc_;
+        Tensor<DXDataType>& dx_nhwgc_;
+        std::vector<index_t> lengths_;
+    };
+
+    // Invoker
+    struct Invoker : public device::BaseInvoker
+    {
+        float Run(const Argument& arg)
+        {
+            int N = arg.lengths_[0];
+            int H = arg.lengths_[1];
+            int W = arg.lengths_[2];
+            int G = arg.lengths_[3];
+            int C = arg.lengths_[4];
+
+            // Calculate dgamma and dbeta
+            for(int g = 0; g < G; ++g)
+                for(int c = 0; c < C; ++c)
+                {
+                    ComputeDataType dgamma = 0;
+                    ComputeDataType dbeta  = 0;
+
+                    for(int n = 0; n < N; ++n)
+                        for(int h = 0; h < H; ++h)
+                            for(int w = 0; w < W; ++w)
+                            {
+                                ComputeDataType dy =
+                                    ck::type_convert<ComputeDataType>(arg.dy_nhwgc_(n, h, w, g, c));
+                                ComputeDataType x =
+                                    ck::type_convert<ComputeDataType>(arg.x_nhwgc_(n, h, w, g, c));
+                                ComputeDataType mean =
+                                    ck::type_convert<ComputeDataType>(arg.mean_ng_(n, g));
+                                ComputeDataType rstd =
+                                    ck::type_convert<ComputeDataType>(arg.inv_std_ng_(n, g));
+                                dgamma += dy * rstd * (x - mean);
+                                dbeta += dy;
+                            }
+                    arg.dgamma_gc_(g, c) = ck::type_convert<DGammaDataType>(dgamma);
+                    arg.dbeta_gc_(g, c)  = ck::type_convert<DBetaDataType>(dbeta);
+                }
+
+            // Calculate dx
+            int reduce_size = H * W * C;
+            for(int n = 0; n < N; ++n)
+                for(int g = 0; g < G; ++g)
+                {
+                    ComputeDataType ds = 0;
+                    ComputeDataType db = 0;
+
+                    ComputeDataType mean = ck::type_convert<ComputeDataType>(arg.mean_ng_(n, g));
+                    ComputeDataType rstd = ck::type_convert<ComputeDataType>(arg.inv_std_ng_(n, g));
+
+                    for(int h = 0; h < H; ++h)
+                        for(int w = 0; w < W; ++w)
+                            for(int c = 0; c < C; ++c)
+                            {
+                                ComputeDataType dy =
+                                    ck::type_convert<ComputeDataType>(arg.dy_nhwgc_(n, h, w, g, c));
+                                ComputeDataType x =
+                                    ck::type_convert<ComputeDataType>(arg.x_nhwgc_(n, h, w, g, c));
+                                ComputeDataType gamma =
+                                    ck::type_convert<ComputeDataType>(arg.gamma_gc_(g, c));
+
+                                ds += dy * gamma * x;
+                                db += dy * gamma;
+                            }
+
+                    for(int h = 0; h < H; ++h)
+                        for(int w = 0; w < W; ++w)
+                            for(int c = 0; c < C; ++c)
+                            {
+                                ComputeDataType dy =
+                                    ck::type_convert<ComputeDataType>(arg.dy_nhwgc_(n, h, w, g, c));
+                                ComputeDataType x =
+                                    ck::type_convert<ComputeDataType>(arg.x_nhwgc_(n, h, w, g, c));
+                                ComputeDataType gamma =
+                                    ck::type_convert<ComputeDataType>(arg.gamma_gc_(g, c));
+
+                                ComputeDataType b =
+                                    (db * mean - ds) * rstd * rstd * rstd / reduce_size;
+                                ComputeDataType c1 = -b * mean - db * rstd / reduce_size;
+                                arg.dx_nhwgc_(n, h, w, g, c) =
+                                    ck::type_convert<DXDataType>(dy * gamma * rstd + b * x + c1);
+                            }
+                }
+
+            return 0;
+        }
+
+        float Run(const device::BaseArgument* p_arg,
+                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg));
+        }
+    };
+
+    static constexpr bool IsValidCompilationParameter()
+    {
+        // TODO: properly implement this check
+        return true;
+    }
+
+    bool IsSupportedArgument(const device::BaseArgument*) override { return true; }
+
+    static auto MakeArgument(const Tensor<DYDataType>& dy_nhwgc,
+                             const Tensor<XDataType>& x_nhwgc,
+                             const Tensor<GammaDataType>& gamma_gc,
+                             const Tensor<MeanInvStdDataType>& mean_ng,
+                             const Tensor<MeanInvStdDataType>& inv_std_ng,
+                             Tensor<DGammaDataType>& dgamma_gc,
+                             Tensor<DBetaDataType>& dbeta_gc,
+                             Tensor<DXDataType>& dx_nhwgc,
+                             const std::vector<index_t> lengths)
+    {
+        return Argument{dy_nhwgc,
+                        x_nhwgc,
+                        gamma_gc,
+                        mean_ng,
+                        inv_std_ng,
+                        dgamma_gc,
+                        dbeta_gc,
+                        dx_nhwgc,
+                        lengths};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        // clang-format off
+        str << "ReferenceGroupnormBwd"
+            << std::endl;
+        // clang-format on
+
+        return str.str();
+    }
+};
+
+} // namespace host
+} // namespace tensor_operation
+} // namespace ck
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp
@@ -18,16 +18,16 @@ namespace host {
 /**
 * \brief Reference implementation for image to column.
 *
- * Tensor descriptor has [G, N, C, Di, Hi, Wi] data layout.
- * G must be equal to 1. Memory layout is [G, N, Di, Hi, Wi, C].
+ * Input tensor descriptor has [G, N, C, Di, Hi, Wi] data layout.
+ * Output tensor descriptor has [G * N * Do * Ho * Wo, Z * Y * X * C] data layout.
 *
 * \tparam NDimSpatial Number of spatial dimensions.
- * \tparam InputLayout Input Layout.
+ * \tparam ImageLayout Image Layout.
 * \tparam InDataType Input Data Type.
 * \tparam OutDataType Output Data Type.
 */
 template <ck::index_t NDimSpatial,
-          typename InputLayout,
+          typename ImageLayout,
          typename InDataType,
          typename OutDataType,
          typename std::enable_if<NDimSpatial >= 1 && NDimSpatial <= 3, bool>::type = false>
@@ -93,18 +93,19 @@ struct ReferenceImageToColumn : public device::BaseOperator
        float Run(const Argument& arg)
        {
            if(!(arg.input_.GetNumOfDimension() == NDimSpatial + 3 &&
-                 arg.output_.GetNumOfDimension() == 2))
+                 arg.output_.GetNumOfDimension() == 3))
            {
                throw std::runtime_error("wrong! inconsistent dimension");
            }

+            const index_t G = arg.input_.GetLengths()[0];
            const index_t N = arg.input_.GetLengths()[1];
            const index_t C = arg.input_.GetLengths()[2];

            if constexpr(NDimSpatial == 1)
            {
                const index_t Wo = arg.output_spatial_lengths_[0];
-                auto func        = [&](auto n, auto wo) {
+                auto func        = [&](auto g, auto n, auto wo) {
                    index_t row    = n * Wo + wo;
                    index_t column = 0;

@@ -119,15 +120,15 @@ struct ReferenceImageToColumn : public device::BaseOperator
                            if(wi >= 0 &&
                               ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[3])
                            {
-                                InDataType v_in          = arg.input_(0, n, c, wi);
-                                arg.output_(row, column) = ck::type_convert<OutDataType>(v_in);
+                                InDataType v_in             = arg.input_(g, n, c, wi);
+                                arg.output_(g, row, column) = ck::type_convert<OutDataType>(v_in);
                            }
                            column++;
                        }
                    }
                };

-                make_ParallelTensorFunctor(func, N, Wo)(std::thread::hardware_concurrency());
+                make_ParallelTensorFunctor(func, G, N, Wo)(std::thread::hardware_concurrency());

                return 0;
            }
@@ -136,7 +137,7 @@ struct ReferenceImageToColumn : public device::BaseOperator
                const index_t Ho = arg.output_spatial_lengths_[0];
                const index_t Wo = arg.output_spatial_lengths_[1];

-                auto func = [&](auto n, auto ho, auto wo) {
+                auto func = [&](auto g, auto n, auto ho, auto wo) {
                    index_t row    = n * Ho * Wo + ho * Wo + wo;
                    index_t column = 0;

@@ -160,8 +161,9 @@ struct ReferenceImageToColumn : public device::BaseOperator
                                   wi >= 0 &&
                                   ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[4])
                                {
-                                    InDataType v_in          = arg.input_(0, n, c, hi, wi);
-                                    arg.output_(row, column) = ck::type_convert<OutDataType>(v_in);
+                                    InDataType v_in = arg.input_(g, n, c, hi, wi);
+                                    arg.output_(g, row, column) =
+                                        ck::type_convert<OutDataType>(v_in);
                                }
                                column++;
                            }
@@ -169,7 +171,7 @@ struct ReferenceImageToColumn : public device::BaseOperator
                    }
                };

-                make_ParallelTensorFunctor(func, N, Ho, Wo)(std::thread::hardware_concurrency());
+                make_ParallelTensorFunctor(func, G, N, Ho, Wo)(std::thread::hardware_concurrency());

                return 0;
            }
@@ -179,7 +181,7 @@ struct ReferenceImageToColumn : public device::BaseOperator
                const index_t Ho = arg.output_spatial_lengths_[1];
                const index_t Wo = arg.output_spatial_lengths_[2];

-                auto func = [&](auto n, auto d_o, auto ho, auto wo) {
+                auto func = [&](auto g, auto n, auto d_o, auto ho, auto wo) {
                    index_t row    = n * Do * Ho * Wo + d_o * Ho * Wo + ho * Wo + wo;
                    index_t column = 0;

@@ -211,8 +213,8 @@ struct ReferenceImageToColumn : public device::BaseOperator
                                       ck::type_convert<std::size_t>(wi) <
                                           arg.input_.GetLengths()[5])
                                    {
-                                        InDataType v_in = arg.input_(0, n, c, di, hi, wi);
-                                        arg.output_(row, column) =
+                                        InDataType v_in = arg.input_(g, n, c, di, hi, wi);
+                                        arg.output_(g, row, column) =
                                            ck::type_convert<OutDataType>(v_in);
                                    }
                                    column++;
@@ -222,7 +224,7 @@ struct ReferenceImageToColumn : public device::BaseOperator
                    }
                };

-                make_ParallelTensorFunctor(func, N, Do, Ho, Wo)(
+                make_ParallelTensorFunctor(func, G, N, Do, Ho, Wo)(
                    std::thread::hardware_concurrency());

                return 0;
@@ -240,8 +242,8 @@ struct ReferenceImageToColumn : public device::BaseOperator
    {
        using namespace tensor_layout::convolution;

-        if constexpr(!(std::is_same_v<InputLayout, GNWC> || std::is_same_v<InputLayout, GNHWC> ||
-                       std::is_same_v<InputLayout, GNDHWC>))
+        if constexpr(!(std::is_same_v<ImageLayout, GNWC> || std::is_same_v<ImageLayout, GNHWC> ||
+                       std::is_same_v<ImageLayout, GNDHWC>))
        {
            return false;
        }
@@ -265,8 +267,9 @@ struct ReferenceImageToColumn : public device::BaseOperator
            C * ck::accumulate_n<index_t>(
                    arg.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());

-        if(!(arg.output_.GetLengths()[0] == static_cast<std::size_t>(NDoHoWo) &&
-             arg.output_.GetLengths()[1] == static_cast<std::size_t>(CZYX)))
+        if(!(arg.output_.GetLengths()[0] == static_cast<std::size_t>(G) &&
+             arg.output_.GetLengths()[1] == static_cast<std::size_t>(NDoHoWo) &&
+             arg.output_.GetLengths()[2] == static_cast<std::size_t>(CZYX)))
        {
            return false;
        }

--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp
@@ -20,14 +20,16 @@ template <typename XDataType,
          typename GammaDataType,
          typename BetaDataType,
          typename YDataType,
-          typename AccDataType,
-          typename AccElementwiseOperation,
+          typename SaveMeanInvStdDataType,
+          typename ComputeDataType,
+          typename YElementwiseOperation,
          index_t Rank,
          index_t NumReduceDim>
 struct ReferenceLayernorm : public device::BaseOperator
 {
    // TODO - support generic layernorm
-    static_assert((Rank == 2 && NumReduceDim == 1), "Only support 2D version so far");
+    static_assert((Rank == 2 && NumReduceDim == 1) || (Rank == 4 && NumReduceDim == 3),
+                  "Only support 2D & 4D version so far");

    // Argument
    struct Argument : public device::BaseArgument
@@ -36,15 +38,19 @@ struct ReferenceLayernorm : public device::BaseOperator
                 const Tensor<GammaDataType>& gamma_n,
                 const Tensor<BetaDataType>& beta_n,
                 Tensor<YDataType>& y_m_n,
-                 AccElementwiseOperation acc_elementwise_op,
+                 Tensor<SaveMeanInvStdDataType>& save_mean_m,
+                 Tensor<SaveMeanInvStdDataType>& save_inv_std_m,
+                 YElementwiseOperation y_elementwise_op,
                 const std::vector<index_t> lengths,
                 const std::vector<index_t> reduceDims,
-                 AccDataType epsilon)
+                 ComputeDataType epsilon)
            : x_m_n_(x_m_n),
              gamma_n_(gamma_n),
              beta_n_(beta_n),
              y_m_n_(y_m_n),
-              acc_elementwise_op_(acc_elementwise_op),
+              save_mean_m_(save_mean_m),
+              save_inv_std_m_(save_inv_std_m),
+              y_elementwise_op_(y_elementwise_op),
              lengths_(lengths),
              reduceDims_(reduceDims),
              epsilon_(epsilon)
@@ -55,22 +61,24 @@ struct ReferenceLayernorm : public device::BaseOperator
        const Tensor<XDataType> gamma_n_;
        const Tensor<XDataType> beta_n_;
        Tensor<YDataType>& y_m_n_;
-        AccElementwiseOperation acc_elementwise_op_;
+        Tensor<SaveMeanInvStdDataType>& save_mean_m_;
+        Tensor<SaveMeanInvStdDataType>& save_inv_std_m_;
+        YElementwiseOperation y_elementwise_op_;
        std::vector<index_t> lengths_;
        std::vector<index_t> reduceDims_;
-        AccDataType epsilon_;
+        ComputeDataType epsilon_;
    };

    // Invoker
    struct Invoker : public device::BaseInvoker
    {
-        float Run(const Argument& arg)
+        float Run2D(const Argument& arg)
        {
            int M = arg.lengths_[0];
            int N = arg.lengths_[1];

-            Tensor<AccDataType> mean({M});
-            Tensor<AccDataType> var({M});
+            Tensor<ComputeDataType> mean({M});
+            Tensor<ComputeDataType> var({M});

            for(int m = 0; m < M; ++m)
            {
@@ -79,7 +87,7 @@ struct ReferenceLayernorm : public device::BaseOperator

                for(int n = 0; n < N; ++n)
                {
-                    auto x_val = ck::type_convert<AccDataType>(arg.x_m_n_(m, n));
+                    auto x_val = ck::type_convert<ComputeDataType>(arg.x_m_n_(m, n));
                    mean(m) += x_val;
                    var(m) += x_val * x_val;
                }
@@ -90,22 +98,91 @@ struct ReferenceLayernorm : public device::BaseOperator

            for(int m = 0; m < M; ++m)
            {
-                AccDataType divisor =
-                    static_cast<AccDataType>(1) / ck::math::sqrt(var(m) + arg.epsilon_);
+                ComputeDataType divisor =
+                    static_cast<ComputeDataType>(1) / ck::math::sqrt(var(m) + arg.epsilon_);

                for(int n = 0; n < N; ++n)
                {
-                    auto x_val = ck::type_convert<AccDataType>(arg.x_m_n_(m, n));
-                    auto y_val = (x_val - mean(m)) * divisor;
-                    y_val      = (y_val * arg.gamma_n_(n)) + arg.beta_n_(n);
-                    arg.acc_elementwise_op_(y_val, y_val);
+                    auto x_val     = ck::type_convert<ComputeDataType>(arg.x_m_n_(m, n));
+                    auto gamma_val = ck::type_convert<ComputeDataType>(arg.gamma_n_(n));
+                    auto beta_val  = ck::type_convert<ComputeDataType>(arg.beta_n_(n));
+                    auto y_val     = (x_val - mean(m)) * divisor;
+                    y_val          = (y_val * gamma_val) + beta_val;
+                    arg.y_elementwise_op_(y_val, y_val);
                    arg.y_m_n_(m, n) = ck::type_convert<YDataType>(y_val);
                }
+                arg.save_mean_m_(m)    = ck::type_convert<SaveMeanInvStdDataType>(mean(m));
+                arg.save_inv_std_m_(m) = ck::type_convert<SaveMeanInvStdDataType>(divisor);
            }

            return 0;
        }

+        float Run4D(const Argument& arg)
+        {
+            int N = arg.lengths_[0];
+            int H = arg.lengths_[1];
+            int W = arg.lengths_[2];
+            int C = arg.lengths_[3];
+
+            Tensor<ComputeDataType> mean({N});
+            Tensor<ComputeDataType> var({N});
+
+            int reduce_length = H * W * C;
+
+            for(int n = 0; n < N; ++n)
+            {
+                mean(n) = 0;
+                var(n)  = 0;
+
+                for(int h = 0; h < H; ++h)
+                    for(int w = 0; w < W; ++w)
+                        for(int c = 0; c < C; ++c)
+                        {
+                            auto x_val = ck::type_convert<ComputeDataType>(arg.x_m_n_(n, h, w, c));
+                            mean(n) += x_val;
+                            var(n) += x_val * x_val;
+                        }
+
+                mean(n) = mean(n) / reduce_length;
+                var(n)  = (var(n) / reduce_length) - (mean(n) * mean(n));
+            }
+
+            for(int n = 0; n < N; ++n)
+            {
+                ComputeDataType divisor =
+                    static_cast<ComputeDataType>(1) / ck::math::sqrt(var(n) + arg.epsilon_);
+
+                for(int h = 0; h < H; ++h)
+                    for(int w = 0; w < W; ++w)
+                        for(int c = 0; c < C; ++c)
+                        {
+                            auto x_val = ck::type_convert<ComputeDataType>(arg.x_m_n_(n, h, w, c));
+                            auto gamma_val =
+                                ck::type_convert<ComputeDataType>(arg.gamma_n_(h, w, c));
+                            auto beta_val = ck::type_convert<ComputeDataType>(arg.beta_n_(h, w, c));
+                            auto y_val    = (x_val - mean(n)) * divisor;
+                            y_val         = (y_val * gamma_val) + beta_val;
+                            arg.y_elementwise_op_(y_val, y_val);
+                            arg.y_m_n_(n, h, w, c) = ck::type_convert<YDataType>(y_val);
+                        }
+                arg.save_mean_m_(n)    = ck::type_convert<SaveMeanInvStdDataType>(mean(n));
+                arg.save_inv_std_m_(n) = ck::type_convert<SaveMeanInvStdDataType>(divisor);
+            }
+
+            return 0;
+        }
+
+        float Run(const Argument& arg)
+        {
+            if(arg.lengths_.size() == 2)
+                return Run2D(arg);
+            else if(arg.lengths_.size() == 4)
+                return Run4D(arg);
+
+            return 0;
+        }
+
        float Run(const device::BaseArgument* p_arg,
                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
        {
@@ -123,30 +200,39 @@ struct ReferenceLayernorm : public device::BaseOperator
    {
        const Argument* p_arg_ = dynamic_cast<const Argument*>(p_arg);

-        // TODO - support generic layernorm
-        if(p_arg_->lengths_.size() != 2)
-            return false;
-
-        if(p_arg_->reduceDims_.size() != 1)
-            return false;
+        if(p_arg_->lengths_.size() == 2 && p_arg_->reduceDims_.size() == 1 &&
+           p_arg_->reduceDims_[0] == 1)
+            return true;

-        if(p_arg_->reduceDims_[0] != 1)
-            return false;
+        else if(p_arg_->lengths_.size() == 4 && p_arg_->reduceDims_.size() == 3 &&
+                p_arg_->reduceDims_[0] == 1 && p_arg_->reduceDims_[1] == 2 &&
+                p_arg_->reduceDims_[2] == 3)
+            return true;

-        return true;
+        return false;
    }

    static auto MakeArgument(const Tensor<XDataType>& x_m_n,
                             const Tensor<GammaDataType>& gamma_n,
                             const Tensor<BetaDataType>& beta_n,
                             Tensor<YDataType>& y_m_n,
-                             AccElementwiseOperation acc_elementwise_op,
+                             Tensor<SaveMeanInvStdDataType>& save_mean_m,
+                             Tensor<SaveMeanInvStdDataType>& save_inv_std_m,
+                             YElementwiseOperation y_elementwise_op,
                             const std::vector<index_t> lengths,
                             const std::vector<index_t> reduceDims,
-                             AccDataType epsilon)
+                             ComputeDataType epsilon)
    {
-        return Argument{
-            x_m_n, gamma_n, beta_n, y_m_n, acc_elementwise_op, lengths, reduceDims, epsilon};
+        return Argument{x_m_n,
+                        gamma_n,
+                        beta_n,
+                        y_m_n,
+                        save_mean_m,
+                        save_inv_std_m,
+                        y_elementwise_op,
+                        lengths,
+                        reduceDims,
+                        epsilon};
    }

    static auto MakeInvoker() { return Invoker{}; }

--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_layernorm_bwd.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_layernorm_bwd.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+#include <vector>
+#include <algorithm>
+
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace host {
+
+template <typename DYDataType,
+          typename XDataType,
+          typename GammaDataType,
+          typename MeanInvStdDataType,
+          typename DGammaDataType,
+          typename DBetaDataType,
+          typename DXDataType,
+          typename ComputeDataType>
+struct ReferenceLayernormBwd : public device::BaseOperator
+{
+    // Argument
+    struct Argument : public device::BaseArgument
+    {
+        Argument(const Tensor<DYDataType>& dy_m_n,
+                 const Tensor<XDataType>& x_m_n,
+                 const Tensor<GammaDataType>& gamma_n,
+                 const Tensor<MeanInvStdDataType>& mean_m,
+                 const Tensor<MeanInvStdDataType>& inv_std_m,
+                 Tensor<DGammaDataType>& dgamma_n,
+                 Tensor<DBetaDataType>& dbeta_n,
+                 Tensor<DXDataType>& dx_m_n,
+                 const std::vector<index_t> lengths)
+            : dy_m_n_(dy_m_n),
+              x_m_n_(x_m_n),
+              gamma_n_(gamma_n),
+              mean_m_(mean_m),
+              inv_std_m_(inv_std_m),
+              dgamma_n_(dgamma_n),
+              dbeta_n_(dbeta_n),
+              dx_m_n_(dx_m_n),
+              lengths_(lengths)
+        {
+        }
+
+        const Tensor<DYDataType>& dy_m_n_;
+        const Tensor<XDataType>& x_m_n_;
+        const Tensor<GammaDataType>& gamma_n_;
+        const Tensor<MeanInvStdDataType>& mean_m_;
+        const Tensor<MeanInvStdDataType>& inv_std_m_;
+        Tensor<DGammaDataType>& dgamma_n_;
+        Tensor<DBetaDataType>& dbeta_n_;
+        Tensor<DXDataType>& dx_m_n_;
+        std::vector<index_t> lengths_;
+    };
+
+    // Invoker
+    struct Invoker : public device::BaseInvoker
+    {
+        float Run(const Argument& arg)
+        {
+            int M = arg.lengths_[0];
+            int N = arg.lengths_[1];
+
+            // Calculate dgamma and dbeta
+            for(int n = 0; n < N; ++n)
+            {
+                ComputeDataType dgamma = 0;
+                ComputeDataType dbeta  = 0;
+
+                for(int m = 0; m < M; ++m)
+                {
+                    ComputeDataType dy   = ck::type_convert<ComputeDataType>(arg.dy_m_n_(m, n));
+                    ComputeDataType x    = ck::type_convert<ComputeDataType>(arg.x_m_n_(m, n));
+                    ComputeDataType mean = ck::type_convert<ComputeDataType>(arg.mean_m_(m));
+                    ComputeDataType rstd = ck::type_convert<ComputeDataType>(arg.inv_std_m_(m));
+                    dgamma += dy * rstd * (x - mean);
+                    dbeta += dy;
+                }
+                arg.dgamma_n_(n) = ck::type_convert<DGammaDataType>(dgamma);
+                arg.dbeta_n_(n)  = ck::type_convert<DBetaDataType>(dbeta);
+            }
+
+            // Calculate dx
+            for(int m = 0; m < M; ++m)
+            {
+                ComputeDataType ds = 0;
+                ComputeDataType db = 0;
+
+                ComputeDataType mean = ck::type_convert<ComputeDataType>(arg.mean_m_(m));
+                ComputeDataType rstd = ck::type_convert<ComputeDataType>(arg.inv_std_m_(m));
+
+                for(int n = 0; n < N; ++n)
+                {
+                    ComputeDataType dy    = ck::type_convert<ComputeDataType>(arg.dy_m_n_(m, n));
+                    ComputeDataType x     = ck::type_convert<ComputeDataType>(arg.x_m_n_(m, n));
+                    ComputeDataType gamma = ck::type_convert<ComputeDataType>(arg.gamma_n_(n));
+
+                    ds += dy * gamma * x;
+                    db += dy * gamma;
+                }
+
+                for(int n = 0; n < N; ++n)
+                {
+                    ComputeDataType dy    = ck::type_convert<ComputeDataType>(arg.dy_m_n_(m, n));
+                    ComputeDataType x     = ck::type_convert<ComputeDataType>(arg.x_m_n_(m, n));
+                    ComputeDataType gamma = ck::type_convert<ComputeDataType>(arg.gamma_n_(n));
+
+                    ComputeDataType b = (db * mean - ds) * rstd * rstd * rstd / N;
+                    ComputeDataType c = -b * mean - db * rstd / N;
+
+                    arg.dx_m_n_(m, n) = ck::type_convert<DXDataType>(dy * gamma * rstd + b * x + c);
+                }
+            }
+
+            return 0;
+        }
+
+        float Run(const device::BaseArgument* p_arg,
+                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg));
+        }
+    };
+
+    static constexpr bool IsValidCompilationParameter()
+    {
+        // TODO: properly implement this check
+        return true;
+    }
+
+    bool IsSupportedArgument(const device::BaseArgument*) override { return true; }
+
+    static auto MakeArgument(const Tensor<DYDataType>& dy_m_n,
+                             const Tensor<XDataType>& x_m_n,
+                             const Tensor<GammaDataType>& gamma_n,
+                             const Tensor<MeanInvStdDataType>& mean_m,
+                             const Tensor<MeanInvStdDataType>& inv_std_m,
+                             Tensor<DGammaDataType>& dgamma_n,
+                             Tensor<DBetaDataType>& dbeta_n,
+                             Tensor<DXDataType>& dx_m_n,
+                             const std::vector<index_t> lengths)
+    {
+        return Argument{
+            dy_m_n, x_m_n, gamma_n, mean_m, inv_std_m, dgamma_n, dbeta_n, dx_m_n, lengths};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        // clang-format off
+        str << "ReferenceLayernormBwd"
+            << std::endl;
+        // clang-format on
+
+        return str.str();
+    }
+};
+
+} // namespace host
+} // namespace tensor_operation
+} // namespace ck
--- a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
+++ b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
@@ -17,13 +17,16 @@ namespace instance {
 using F64  = double;
 using F32  = float;
 using F16  = ck::half_t;
-using F8   = ck::f8_t;
 using BF16 = ck::bhalf_t;
 using I8   = int8_t;
 using I32  = int32_t;
+using F8   = ck::f8_t;
+using BF8  = ck::bf8_t;

 using Empty_Tuple = ck::Tuple<>;

+using BF16_Tuple = ck::Tuple<BF16>;
+
 using F16_Tuple     = ck::Tuple<F16>;
 using F16_F16_Tuple = ck::Tuple<F16, F16>;

@@ -31,6 +34,7 @@ using F64_Tuple     = ck::Tuple<F64>;
 using F32_Tuple     = ck::Tuple<F32>;
 using I32_Tuple     = ck::Tuple<I32>;
 using I32_F32_Tuple = ck::Tuple<I32, F32>;
+using I8_Tuple      = ck::Tuple<I8>;

 using F32_F32_Tuple = ck::Tuple<F32, F32>;


--- a/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_backward.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_backward.hpp
@@ -16,26 +16,26 @@ namespace tensor_operation {
 namespace device {
 namespace instance {

-// FP16
+#ifdef CK_ENABLE_FP16
 void add_device_batchnorm_backward_rank_4_3_f16_instances(
    std::vector<std::unique_ptr<
        DeviceBatchNormBwd<F16, F32, F32, F32, F16, F32, F32, PassThrough, 4, 3>>>&);
-
-// FP32
+#endif
+#ifdef CK_ENABLE_FP32
 void add_device_batchnorm_backward_rank_4_3_f32_instances(
    std::vector<std::unique_ptr<
        DeviceBatchNormBwd<F32, F32, F32, F32, F32, F32, F32, PassThrough, 4, 3>>>&);
-
-// BF16
+#endif
+#ifdef CK_ENABLE_BF16
 void add_device_batchnorm_backward_rank_4_3_bf16_instances(
    std::vector<std::unique_ptr<
        DeviceBatchNormBwd<BF16, F32, F32, F32, BF16, F32, F32, PassThrough, 4, 3>>>&);
-
-// FP64
+#endif
+#ifdef CK_ENABLE_FP64
 void add_device_batchnorm_backward_rank_4_3_f64_instances(
    std::vector<std::unique_ptr<
        DeviceBatchNormBwd<F64, F64, F64, F64, F64, F64, F64, PassThrough, 4, 3>>>&);
-
+#endif
 template <typename XDataType,
          typename DxDataType,
          typename DyDataType,
@@ -72,7 +72,7 @@ struct DeviceOperationInstanceFactory<
    static auto GetInstances()
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
-
+#ifdef CK_ENABLE_FP16
        if constexpr(is_same_v<XDataType, F16> && is_same_v<DxDataType, F32> &&
                     is_same_v<DyDataType, F32> && is_same_v<AccDataType, F32> &&
                     is_same_v<ScaleDataType, F16> && is_same_v<DscaleDbiasDataType, F32> &&
@@ -83,37 +83,43 @@ struct DeviceOperationInstanceFactory<
                add_device_batchnorm_backward_rank_4_3_f16_instances(op_ptrs);
            }
        }
-        else if constexpr(is_same_v<XDataType, F32> && is_same_v<DxDataType, F32> &&
-                          is_same_v<DyDataType, F32> && is_same_v<AccDataType, F32> &&
-                          is_same_v<ScaleDataType, F32> && is_same_v<DscaleDbiasDataType, F32> &&
-                          is_same_v<MeanVarDataType, F32>)
+#endif
+#ifdef CK_ENABLE_FP32
+        if constexpr(is_same_v<XDataType, F32> && is_same_v<DxDataType, F32> &&
+                     is_same_v<DyDataType, F32> && is_same_v<AccDataType, F32> &&
+                     is_same_v<ScaleDataType, F32> && is_same_v<DscaleDbiasDataType, F32> &&
+                     is_same_v<MeanVarDataType, F32>)
        {
            if constexpr(Rank == 4 && NumReduceDim == 3 && is_same_v<DyElementwiseOp, PassThrough>)
            {
                add_device_batchnorm_backward_rank_4_3_f32_instances(op_ptrs);
            }
        }
-        else if constexpr(is_same_v<XDataType, BF16> && is_same_v<DxDataType, F32> &&
-                          is_same_v<DyDataType, F32> && is_same_v<AccDataType, F32> &&
-                          is_same_v<ScaleDataType, BF16> && is_same_v<DscaleDbiasDataType, F32> &&
-                          is_same_v<MeanVarDataType, F32>)
+#endif
+#ifdef CK_ENABLE_BF16
+        if constexpr(is_same_v<XDataType, BF16> && is_same_v<DxDataType, F32> &&
+                     is_same_v<DyDataType, F32> && is_same_v<AccDataType, F32> &&
+                     is_same_v<ScaleDataType, BF16> && is_same_v<DscaleDbiasDataType, F32> &&
+                     is_same_v<MeanVarDataType, F32>)
        {
            if constexpr(Rank == 4 && NumReduceDim == 3 && is_same_v<DyElementwiseOp, PassThrough>)
            {
                add_device_batchnorm_backward_rank_4_3_bf16_instances(op_ptrs);
            }
        }
-        else if constexpr(is_same_v<XDataType, F64> && is_same_v<DxDataType, F64> &&
-                          is_same_v<DyDataType, F64> && is_same_v<AccDataType, F64> &&
-                          is_same_v<ScaleDataType, F64> && is_same_v<DscaleDbiasDataType, F64> &&
-                          is_same_v<MeanVarDataType, F64>)
+#endif
+#ifdef CK_ENABLE_FP64
+        if constexpr(is_same_v<XDataType, F64> && is_same_v<DxDataType, F64> &&
+                     is_same_v<DyDataType, F64> && is_same_v<AccDataType, F64> &&
+                     is_same_v<ScaleDataType, F64> && is_same_v<DscaleDbiasDataType, F64> &&
+                     is_same_v<MeanVarDataType, F64>)
        {
            if constexpr(Rank == 4 && NumReduceDim == 3 && is_same_v<DyElementwiseOp, PassThrough>)
            {
                add_device_batchnorm_backward_rank_4_3_f64_instances(op_ptrs);
            }
        }
-
+#endif
        return op_ptrs;
    }
 };

--- a/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp
@@ -16,26 +16,26 @@ namespace tensor_operation {
 namespace device {
 namespace instance {

-// FP16
+#ifdef CK_ENABLE_FP16
 void add_device_batchnorm_forward_rank_4_3_f16_instances(
    std::vector<
        std::unique_ptr<DeviceBatchNormFwd<F16, F16, F32, F16, F16, F32, PassThrough, 4, 3>>>&);
-
-// FP32
+#endif
+#ifdef CK_ENABLE_FP32
 void add_device_batchnorm_forward_rank_4_3_f32_instances(
    std::vector<
        std::unique_ptr<DeviceBatchNormFwd<F32, F32, F32, F32, F32, F32, PassThrough, 4, 3>>>&);
-
-// BF16
+#endif
+#ifdef CK_ENABLE_BF16
 void add_device_batchnorm_forward_rank_4_3_bf16_instances(
    std::vector<
        std::unique_ptr<DeviceBatchNormFwd<BF16, BF16, F32, BF16, BF16, F32, PassThrough, 4, 3>>>&);
-
-// FP64
+#endif
+#ifdef CK_ENABLE_FP64
 void add_device_batchnorm_forward_rank_4_3_f64_instances(
    std::vector<
        std::unique_ptr<DeviceBatchNormFwd<F64, F64, F64, F64, F64, F64, PassThrough, 4, 3>>>&);
-
+#endif
 template <typename XDataType,
          typename YDataType,
          typename AccDataType,
@@ -69,7 +69,7 @@ struct DeviceOperationInstanceFactory<
    static auto GetInstances()
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
-
+#ifdef CK_ENABLE_FP16
        if constexpr(is_same_v<XDataType, F16> && is_same_v<YDataType, F16> &&
                     is_same_v<AccDataType, F32> && is_same_v<ScaleDataType, F16> &&
                     is_same_v<BiasDataType, F16> && is_same_v<MeanVarDataType, F32>)
@@ -79,34 +79,40 @@ struct DeviceOperationInstanceFactory<
                add_device_batchnorm_forward_rank_4_3_f16_instances(op_ptrs);
            }
        }
-        else if constexpr(is_same_v<XDataType, F32> && is_same_v<YDataType, F32> &&
-                          is_same_v<AccDataType, F32> && is_same_v<ScaleDataType, F32> &&
-                          is_same_v<BiasDataType, F32> && is_same_v<MeanVarDataType, F32>)
+#endif
+#ifdef CK_ENABLE_FP32
+        if constexpr(is_same_v<XDataType, F32> && is_same_v<YDataType, F32> &&
+                     is_same_v<AccDataType, F32> && is_same_v<ScaleDataType, F32> &&
+                     is_same_v<BiasDataType, F32> && is_same_v<MeanVarDataType, F32>)
        {
            if constexpr(Rank == 4 && NumReduceDim == 3 && is_same_v<YElementwiseOp, PassThrough>)
            {
                add_device_batchnorm_forward_rank_4_3_f32_instances(op_ptrs);
            }
        }
-        else if constexpr(is_same_v<XDataType, BF16> && is_same_v<YDataType, BF16> &&
-                          is_same_v<AccDataType, F32> && is_same_v<ScaleDataType, BF16> &&
-                          is_same_v<BiasDataType, BF16> && is_same_v<MeanVarDataType, F32>)
+#endif
+#ifdef CK_ENABLE_BF16
+        if constexpr(is_same_v<XDataType, BF16> && is_same_v<YDataType, BF16> &&
+                     is_same_v<AccDataType, F32> && is_same_v<ScaleDataType, BF16> &&
+                     is_same_v<BiasDataType, BF16> && is_same_v<MeanVarDataType, F32>)
        {
            if constexpr(Rank == 4 && NumReduceDim == 3 && is_same_v<YElementwiseOp, PassThrough>)
            {
                add_device_batchnorm_forward_rank_4_3_bf16_instances(op_ptrs);
            }
        }
-        else if constexpr(is_same_v<XDataType, F64> && is_same_v<YDataType, F64> &&
-                          is_same_v<AccDataType, F64> && is_same_v<ScaleDataType, F64> &&
-                          is_same_v<BiasDataType, F64> && is_same_v<MeanVarDataType, F64>)
+#endif
+#ifdef CK_ENABLE_FP64
+        if constexpr(is_same_v<XDataType, F64> && is_same_v<YDataType, F64> &&
+                     is_same_v<AccDataType, F64> && is_same_v<ScaleDataType, F64> &&
+                     is_same_v<BiasDataType, F64> && is_same_v<MeanVarDataType, F64>)
        {
            if constexpr(Rank == 4 && NumReduceDim == 3 && is_same_v<YElementwiseOp, PassThrough>)
            {
                add_device_batchnorm_forward_rank_4_3_f64_instances(op_ptrs);
            }
        }
-
+#endif
        return op_ptrs;
    }
 };

--- a/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_infer.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_infer.hpp
@@ -16,38 +16,38 @@ namespace tensor_operation {
 namespace device {
 namespace instance {

-// FP16
+#ifdef CK_ENABLE_FP16
 void add_device_batchnorm_infer_rank_4_f16_instances(
    std::vector<std::unique_ptr<ck::tensor_operation::device::DeviceElementwise<
        ck::Tuple<F16, F32, F32, F16, F16>,
        ck::Tuple<F16>,
        ck::tensor_operation::element_wise::NormalizeInInfer,
        4>>>&);
-
-// FP32
+#endif
+#ifdef CK_ENABLE_FP32
 void add_device_batchnorm_infer_rank_4_f32_instances(
    std::vector<std::unique_ptr<ck::tensor_operation::device::DeviceElementwise<
        ck::Tuple<F32, F32, F32, F32, F32>,
        ck::Tuple<F32>,
        ck::tensor_operation::element_wise::NormalizeInInfer,
        4>>>&);
-
-// BF16
+#endif
+#ifdef CK_ENABLE_BF16
 void add_device_batchnorm_infer_rank_4_bf16_instances(
    std::vector<std::unique_ptr<ck::tensor_operation::device::DeviceElementwise<
        ck::Tuple<BF16, F32, F32, BF16, BF16>,
        ck::Tuple<BF16>,
        ck::tensor_operation::element_wise::NormalizeInInfer,
        4>>>&);
-
-// FP64
+#endif
+#ifdef CK_ENABLE_FP64
 void add_device_batchnorm_infer_rank_4_f64_instances(
    std::vector<std::unique_ptr<ck::tensor_operation::device::DeviceElementwise<
        ck::Tuple<F64, F64, F64, F64, F64>,
        ck::Tuple<F64>,
        ck::tensor_operation::element_wise::NormalizeInInfer,
        4>>>&);
-
+#endif
 template <typename XDataType,
          typename YDataType,
          typename ScaleDataType,
@@ -69,7 +69,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceElemen
    static auto GetInstances()
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
-
+#ifdef CK_ENABLE_FP16
        if constexpr(is_same_v<XDataType, F16> && is_same_v<YDataType, F16> &&
                     is_same_v<ScaleDataType, F16> && is_same_v<BiasDataType, F16> &&
                     is_same_v<MeanVarDataType, F32>)
@@ -79,34 +79,40 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceElemen
                add_device_batchnorm_infer_rank_4_f16_instances(op_ptrs);
            }
        }
-        else if constexpr(is_same_v<XDataType, F32> && is_same_v<YDataType, F32> &&
-                          is_same_v<ScaleDataType, F32> && is_same_v<BiasDataType, F32> &&
-                          is_same_v<MeanVarDataType, F32>)
+#endif
+#ifdef CK_ENABLE_FP32
+        if constexpr(is_same_v<XDataType, F32> && is_same_v<YDataType, F32> &&
+                     is_same_v<ScaleDataType, F32> && is_same_v<BiasDataType, F32> &&
+                     is_same_v<MeanVarDataType, F32>)
        {
            if constexpr(Rank == 4)
            {
                add_device_batchnorm_infer_rank_4_f32_instances(op_ptrs);
            }
        }
-        else if constexpr(is_same_v<XDataType, BF16> && is_same_v<YDataType, BF16> &&
-                          is_same_v<ScaleDataType, BF16> && is_same_v<BiasDataType, BF16> &&
-                          is_same_v<MeanVarDataType, F32>)
+#endif
+#ifdef CK_ENABLE_BF16
+        if constexpr(is_same_v<XDataType, BF16> && is_same_v<YDataType, BF16> &&
+                     is_same_v<ScaleDataType, BF16> && is_same_v<BiasDataType, BF16> &&
+                     is_same_v<MeanVarDataType, F32>)
        {
            if constexpr(Rank == 4)
            {
                add_device_batchnorm_infer_rank_4_bf16_instances(op_ptrs);
            }
        }
-        else if constexpr(is_same_v<XDataType, F64> && is_same_v<YDataType, F64> &&
-                          is_same_v<ScaleDataType, F64> && is_same_v<BiasDataType, F64> &&
-                          is_same_v<MeanVarDataType, F64>)
+#endif
+#ifdef CK_ENABLE_FP64
+        if constexpr(is_same_v<XDataType, F64> && is_same_v<YDataType, F64> &&
+                     is_same_v<ScaleDataType, F64> && is_same_v<BiasDataType, F64> &&
+                     is_same_v<MeanVarDataType, F64>)
        {
            if constexpr(Rank == 4)
            {
                add_device_batchnorm_infer_rank_4_f64_instances(op_ptrs);
            }
        }
-
+#endif
        return op_ptrs;
    }
 };

--- a/library/include/ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16  = ck::half_t;
+using BF16 = ck::bhalf_t;
+using F32  = float;
+using F64  = double;
+
+using F16_Tuple   = ck::Tuple<F16>;
+using BF16_Tuple  = ck::Tuple<BF16>;
+using F32_Tuple   = ck::Tuple<F32>;
+using F64_Tuple   = ck::Tuple<F64>;
+using Empty_Tuple = ck::Tuple<>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Bilinear    = ck::tensor_operation::element_wise::Bilinear;
+using Scale       = ck::tensor_operation::element_wise::Scale;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+template <typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CShuffleDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename ComputeDataType,
+          typename AElementwiseOp,
+          typename BElementwiseOp,
+          typename CDEElementwiseOp>
+using device_contraction_kk_instance = std::tuple<
+    // clang-format off
+        //#####################################| NumDimM| NumDimN| NumDimK|      AData|     BData|     AccData|         CShuffle|     DsData|     EData|              A|              B|              CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|         Compute|
+        //#####################################|        |        |        |       Type|      Type|        Type|         DataType|       Type|      Type|    Elementwise|    Elementwise|      Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|            Data|
+        //#####################################|        |        |        |           |          |            |                 |           |          |      Operation|      Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|            Type|
+        //#####################################|        |        |        |           |          |            |                 |           |          |               |               |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                |
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   256,    16,   4,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,    64,    64,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    32,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,    64,    64,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,    64,    32,    64,    16,   4,   4,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,               4, ComputeDataType>
+    // clang-format on
+    >;
+
+template <typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CShuffleDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename ComputeDataType,
+          typename AElementwiseOp,
+          typename BElementwiseOp,
+          typename CDEElementwiseOp>
+using device_contraction_kn_instance = std::tuple<
+    // clang-format off
+        //#####################################| NumDimM| NumDimN| NumDimK|      AData|     BData|     AccData|         CShuffle|     DsData|     EData|              A|              B|              CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|         Compute|
+        //#####################################|        |        |        |       Type|      Type|        Type|         DataType|       Type|      Type|    Elementwise|    Elementwise|      Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|            Data|
+        //#####################################|        |        |        |           |          |            |                 |           |          |      Operation|      Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|            Type|
+        //#####################################|        |        |        |           |          |            |                 |           |          |               |               |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                |
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   1,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   256,    16,   4,   1,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   256,    16,   4,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,   128,    16,   4,   1,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1,  8, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   1,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   4,   1,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1,  8>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           1,              S<1, 16, 1,  8>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   4,   1,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1,  8, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   1,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   1,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>
+    // clang-format on
+    >;
+
+template <typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CShuffleDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename ComputeDataType,
+          typename AElementwiseOp,
+          typename BElementwiseOp,
+          typename CDEElementwiseOp>
+using device_contraction_mk_instance = std::tuple<
+    // clang-format off
+        //#####################################| NumDimM| NumDimN| NumDimK|      AData|     BData|     AccData|         CShuffle|     DsData|     EData|              A|              B|              CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|         Compute|
+        //#####################################|        |        |        |       Type|      Type|        Type|         DataType|       Type|      Type|    Elementwise|    Elementwise|      Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|            Data|
+        //#####################################|        |        |        |           |          |            |                 |           |          |      Operation|      Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|            Type|
+        //#####################################|        |        |        |           |          |            |                 |           |          |               |               |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                |
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   256,   128,    16,   1,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   256,    16,   1,   4,   32,   32,    2,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   256,    16,   4,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,   128,    16,   1,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   1,   4,   32,   32,    2,    2,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   1,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   1,   4,   32,   32,    2,    2,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   1,   4,   32,   32,    2,    1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   1,   4,   32,   32,    1,    2,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>
+    // clang-format on
+    >;
+
+template <typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CShuffleDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename ComputeDataType,
+          typename AElementwiseOp,
+          typename BElementwiseOp,
+          typename CDEElementwiseOp>
+using device_contraction_mn_instance = std::tuple<
+    // clang-format off
+        //#####################################| NumDimM| NumDimN| NumDimK|      AData|     BData|     AccData|         CShuffle|     DsData|     EData|              A|              B|              CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|         Compute|
+        //#####################################|        |        |        |       Type|      Type|        Type|         DataType|       Type|      Type|    Elementwise|    Elementwise|      Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|            Data|
+        //#####################################|        |        |        |           |          |            |                 |           |          |      Operation|      Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|            Type|
+        //#####################################|        |        |        |           |          |            |                 |           |          |               |               |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                |
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   256,   128,    16,   1,   1,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   256,    16,   1,   1,   32,   32,    2,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   256,    16,   4,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,   128,    16,   1,   1,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1,  8, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   1,   1,   32,   32,    2,    2,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   1,   1,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1,  8>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           1,              S<1, 16, 1,  8>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   1,   1,   32,   32,    2,    2,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1,  8, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   1,   1,   32,   32,    2,    1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   1,   1,   32,   32,    1,    2,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>
+    // clang-format on
+    >;
+
+template <typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CShuffleDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename ComputeDataType,
+          typename AElementwiseOp,
+          typename BElementwiseOp,
+          typename CDEElementwiseOp>
+using device_contraction_f64_kk_instance = std::tuple<
+    // clang-format off
+        //#####################################| NumDimM| NumDimN| NumDimK|      AData|     BData|     AccData|         CShuffle|     DsData|     EData|              A|              B|              CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|         Compute|
+        //#####################################|        |        |        |       Type|      Type|        Type|         DataType|       Type|      Type|    Elementwise|    Elementwise|      Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|            Data|
+        //#####################################|        |        |        |           |          |            |                 |           |          |      Operation|      Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|            Type|
+        //#####################################|        |        |        |           |          |            |                 |           |          |               |               |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                |
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1, 16, 1,  8>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1,  8, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,    64,    64,    64,    16,   2,   2,   16,   16,    4,    4,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1,  8, 1,  8>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   2,   2,   16,   16,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   2,   2,   16,   16,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    32,    16,   2,   2,   16,   16,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1, 16, 1,  8>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    32,   128,    16,   2,   2,   16,   16,    2,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1,  8, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,    64,    64,    32,    16,   2,   2,   16,   16,    4,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1,  8, 1,  8>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,    64,    32,    64,    16,   2,   2,   16,   16,    2,    4,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1,  8, 1,  8>,               1, ComputeDataType>
+    // clang-format on
+    >;
+
+template <typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CShuffleDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename ComputeDataType,
+          typename AElementwiseOp,
+          typename BElementwiseOp,
+          typename CDEElementwiseOp>
+using device_contraction_f64_kn_instance = std::tuple<
+    // clang-format off
+        //#####################################| NumDimM| NumDimN| NumDimK|      AData|     BData|     AccData|         CShuffle|     DsData|     EData|              A|              B|              CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|         Compute|
+        //#####################################|        |        |        |       Type|      Type|        Type|         DataType|       Type|      Type|    Elementwise|    Elementwise|      Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|            Data|
+        //#####################################|        |        |        |           |          |            |                 |           |          |      Operation|      Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|            Type|
+        //#####################################|        |        |        |           |          |            |                 |           |          |               |               |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                |
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   2,   1,   16,   16,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   2,   1,   16,   16,    4,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1,  8>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1,  8>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   2,   1,   16,   16,    4,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1,  8, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1,  8, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   2,   1,   16,   16,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   2,   2,   16,   16,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   2,   1,   16,   16,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   2,   2,   16,   16,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1, ComputeDataType>
+    // clang-format on
+    >;
+
+template <typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CShuffleDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename ComputeDataType,
+          typename AElementwiseOp,
+          typename BElementwiseOp,
+          typename CDEElementwiseOp>
+using device_contraction_f64_mk_instance = std::tuple<
+    // clang-format off
+        //#####################################| NumDimM| NumDimN| NumDimK|      AData|     BData|     AccData|         CShuffle|     DsData|     EData|              A|              B|              CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|         Compute|
+        //#####################################|        |        |        |       Type|      Type|        Type|         DataType|       Type|      Type|    Elementwise|    Elementwise|      Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|            Data|
+        //#####################################|        |        |        |           |          |            |                 |           |          |      Operation|      Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|            Type|
+        //#####################################|        |        |        |           |          |            |                 |           |          |               |               |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                |
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   1,   2,   16,   16,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 16>,              1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 16>,              1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   1,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1,  8>,              1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1,  8>,              1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   1,   2,   16,   16,    4,    4,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1,  8, 1, 16>,              1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1,  8, 1, 16>,              1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   1,   2,   16,   16,    4,    2,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 16>,              1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   2,   2,   16,   16,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 16>,              1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   1,   2,   16,   16,    2,    4,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 16>,              1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   2,   2,   16,   16,    2,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 16>,              1, ComputeDataType>
+    // clang-format on
+    >;
+
+template <typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CShuffleDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename ComputeDataType,
+          typename AElementwiseOp,
+          typename BElementwiseOp,
+          typename CDEElementwiseOp>
+using device_contraction_f64_mn_instance = std::tuple<
+    // clang-format off
+        //#####################################| NumDimM| NumDimN| NumDimK|      AData|     BData|     AccData|         CShuffle|     DsData|     EData|              A|              B|              CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|         Compute|
+        //#####################################|        |        |        |       Type|      Type|        Type|         DataType|       Type|      Type|    Elementwise|    Elementwise|      Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|            Data|
+        //#####################################|        |        |        |           |          |            |                 |           |          |      Operation|      Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|            Type|
+        //#####################################|        |        |        |           |          |            |                 |           |          |               |               |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                |
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   1,   1,   16,   16,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   1,   1,   16,   16,    4,    4,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1,  8>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1,  8>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   1,   1,   16,   16,    4,    4,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1,  8, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1,  8, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   1,   1,   16,   16,    4,    2,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   2,   2,   16,   16,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   1,   1,   16,   16,    2,    4,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   2,   2,   16,   16,    2,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1, ComputeDataType>
+    // clang-format on
+    >;
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp
@@ -17,7 +17,6 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 #ifdef CK_ENABLE_FP32
-// float
 void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
                                                           2,
@@ -28,7 +27,8 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn
                                                           F32,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Bilinear>>>& instances);
+                                                           Bilinear,
+                                                           F32>>>& instances);

 void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -40,7 +40,8 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn
                                                           F32,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Bilinear>>>& instances);
+                                                           Bilinear,
+                                                           F32>>>& instances);

 void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -52,7 +53,8 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn
                                                           F32,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Bilinear>>>& instances);
+                                                           Bilinear,
+                                                           F32>>>& instances);

 void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -64,10 +66,115 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn
                                                           F32,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Bilinear>>>& instances);
-#endif
+                                                           Bilinear,
+                                                           F32>>>& instances);
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           F32_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F16>>>& instances);
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           F32_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F16>>>& instances);
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           F32_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F16>>>& instances);
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           F32_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F16>>>& instances);
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           F32_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           BF16>>>& instances);
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           F32_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           BF16>>>& instances);
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           F32_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           BF16>>>& instances);
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           F32_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           BF16>>>& instances);
+#endif // CK_ENABLE_FP32
+
 #ifdef CK_ENABLE_FP64
-// double
 void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
                                                           2,
@@ -78,7 +185,8 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn
                                                           F64,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Bilinear>>>& instances);
+                                                           Bilinear,
+                                                           F64>>>& instances);

 void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -90,7 +198,8 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn
                                                           F64,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Bilinear>>>& instances);
+                                                           Bilinear,
+                                                           F64>>>& instances);

 void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -102,7 +211,8 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn
                                                           F64,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Bilinear>>>& instances);
+                                                           Bilinear,
+                                                           F64>>>& instances);

 void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -114,8 +224,170 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn
                                                           F64,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Bilinear>>>& instances);
-#endif
+                                                           Bilinear,
+                                                           F64>>>& instances);
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F64,
+                                                           F64,
+                                                           F64_Tuple,
+                                                           F64,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances);
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F64,
+                                                           F64,
+                                                           F64_Tuple,
+                                                           F64,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances);
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F64,
+                                                           F64,
+                                                           F64_Tuple,
+                                                           F64,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances);
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F64,
+                                                           F64,
+                                                           F64_Tuple,
+                                                           F64,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances);
+#endif // CK_ENABLE_FP64
+
+#ifdef CK_ENABLE_FP16
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F16,
+                                                           F16,
+                                                           F16_Tuple,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances);
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F16,
+                                                           F16,
+                                                           F16_Tuple,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances);
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F16,
+                                                           F16,
+                                                           F16_Tuple,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances);
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F16,
+                                                           F16,
+                                                           F16_Tuple,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances);
+#endif // CK_ENABLE_FP16
+
+#ifdef CK_ENABLE_BF16
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16_Tuple,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances);
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16_Tuple,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances);
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16_Tuple,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances);
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16_Tuple,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances);
+#endif // CK_ENABLE_FP16
+
 // Contraction + Bilinear
 template <index_t NumDimM,
          index_t NumDimN,
@@ -123,7 +395,8 @@ template <index_t NumDimM,
          typename ADataType,
          typename BDataType,
          typename DDataType,
-          typename EDataType>
+          typename EDataType,
+          typename ComputeDataType>
 struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContractionMultipleD<
    NumDimM,
    NumDimN,
@@ -134,7 +407,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
    EDataType,
    ck::tensor_operation::element_wise::PassThrough,
    ck::tensor_operation::element_wise::PassThrough,
-    ck::tensor_operation::element_wise::Bilinear>>
+    ck::tensor_operation::element_wise::Bilinear,
+    ComputeDataType>>
 {
    using DeviceOp = DeviceContractionMultipleD<NumDimM,
                                                NumDimN,
@@ -145,45 +419,125 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
                                                EDataType,
                                                ck::tensor_operation::element_wise::PassThrough,
                                                ck::tensor_operation::element_wise::PassThrough,
-                                                ck::tensor_operation::element_wise::Bilinear>;
+                                                ck::tensor_operation::element_wise::Bilinear,
+                                                ComputeDataType>;

    static auto GetInstances()
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
 #ifdef CK_ENABLE_FP32
        if constexpr(is_same_v<ADataType, float> && is_same_v<BDataType, float> &&
-                     is_same_v<DDataType, float> && is_same_v<EDataType, float>)
+                     is_same_v<EDataType, float>)
        {
            if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2)
            {
-                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance(
-                    op_ptrs);
-                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance(
-                    op_ptrs);
-                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance(
-                    op_ptrs);
-                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance(
-                    op_ptrs);
+                if constexpr(is_same_v<ComputeDataType, float>)
+                {
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance(
+                        op_ptrs);
+                }
+                else if constexpr(is_same_v<ComputeDataType, ck::half_t>)
+                {
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance(
+                        op_ptrs);
+                }
+                else if constexpr(is_same_v<ComputeDataType, ck::bhalf_t>)
+                {
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance(
+                        op_ptrs);
+                }
            }
        }
-#endif
+#endif // CK_ENABLE_FP32
 #ifdef CK_ENABLE_FP64
        if constexpr(is_same_v<ADataType, double> && is_same_v<BDataType, double> &&
-                     is_same_v<DDataType, double> && is_same_v<EDataType, double>)
+                     is_same_v<EDataType, double>)
+        {
+            if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2)
+            {
+                if constexpr(is_same_v<ComputeDataType, double>)
+                {
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance(
+                        op_ptrs);
+                }
+                else if constexpr(is_same_v<ComputeDataType, float>)
+                {
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance(
+                        op_ptrs);
+                }
+            }
+        }
+#endif // CK_ENABLE_FP64
+#ifdef CK_ENABLE_FP16
+        if constexpr(is_same_v<ADataType, ck::half_t> && is_same_v<BDataType, ck::half_t> &&
+                     is_same_v<EDataType, ck::half_t>)
+        {
+            if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2)
+            {
+                if constexpr(is_same_v<ComputeDataType, float>)
+                {
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance(
+                        op_ptrs);
+                }
+            }
+        }
+#endif // CK_ENABLE_FP16
+#ifdef CK_ENABLE_BF16
+        if constexpr(is_same_v<ADataType, ck::bhalf_t> && is_same_v<BDataType, ck::bhalf_t> &&
+                     is_same_v<EDataType, ck::bhalf_t>)
        {
            if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2)
            {
-                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance(
-                    op_ptrs);
-                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance(
-                    op_ptrs);
-                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance(
-                    op_ptrs);
-                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance(
-                    op_ptrs);
+                if constexpr(is_same_v<ComputeDataType, float>)
+                {
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance(
+                        op_ptrs);
+                }
            }
        }
-#endif
+#endif // CK_ENABLE_BF16
        return op_ptrs;
    }
 };

--- a/library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp
@@ -17,7 +17,6 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 #ifdef CK_ENABLE_FP32
-// float
 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
                                                           2,
@@ -28,7 +27,8 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instanc
                                                           F32,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Scale>>>& instances);
+                                                           Scale,
+                                                           F32>>>& instances);

 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -40,7 +40,8 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instanc
                                                           F32,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Scale>>>& instances);
+                                                           Scale,
+                                                           F32>>>& instances);

 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -52,7 +53,8 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instanc
                                                           F32,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Scale>>>& instances);
+                                                           Scale,
+                                                           F32>>>& instances);

 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -64,10 +66,115 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instanc
                                                           F32,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Scale>>>& instances);
-#endif
+                                                           Scale,
+                                                           F32>>>& instances);
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           Empty_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F16>>>& instances);
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           Empty_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F16>>>& instances);
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           Empty_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F16>>>& instances);
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           Empty_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F16>>>& instances);
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           Empty_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           BF16>>>& instances);
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           Empty_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           BF16>>>& instances);
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           Empty_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           BF16>>>& instances);
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           Empty_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           BF16>>>& instances);
+#endif // CK_ENABLE_FP32
+
 #ifdef CK_ENABLE_FP64
-// double
 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
                                                           2,
@@ -78,7 +185,8 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instanc
                                                           F64,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Scale>>>& instances);
+                                                           Scale,
+                                                           F64>>>& instances);

 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -90,7 +198,8 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instanc
                                                           F64,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Scale>>>& instances);
+                                                           Scale,
+                                                           F64>>>& instances);

 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -102,7 +211,8 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instanc
                                                           F64,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Scale>>>& instances);
+                                                           Scale,
+                                                           F64>>>& instances);

 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -114,15 +224,178 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instanc
                                                           F64,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Scale>>>& instances);
-#endif
+                                                           Scale,
+                                                           F64>>>& instances);
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F64,
+                                                           F64,
+                                                           Empty_Tuple,
+                                                           F64,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F32>>>& instances);
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F64,
+                                                           F64,
+                                                           Empty_Tuple,
+                                                           F64,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F32>>>& instances);
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F64,
+                                                           F64,
+                                                           Empty_Tuple,
+                                                           F64,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F32>>>& instances);
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F64,
+                                                           F64,
+                                                           Empty_Tuple,
+                                                           F64,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F32>>>& instances);
+#endif // CK_ENABLE_FP64
+
+#ifdef CK_ENABLE_FP16
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F16,
+                                                           F16,
+                                                           Empty_Tuple,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F32>>>& instances);
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F16,
+                                                           F16,
+                                                           Empty_Tuple,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F32>>>& instances);
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F16,
+                                                           F16,
+                                                           Empty_Tuple,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F32>>>& instances);
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F16,
+                                                           F16,
+                                                           Empty_Tuple,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F32>>>& instances);
+#endif // CK_ENABLE_FP16
+
+#ifdef CK_ENABLE_BF16
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           BF16,
+                                                           BF16,
+                                                           Empty_Tuple,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F32>>>& instances);
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           BF16,
+                                                           BF16,
+                                                           Empty_Tuple,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F32>>>& instances);
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           BF16,
+                                                           BF16,
+                                                           Empty_Tuple,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F32>>>& instances);
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           BF16,
+                                                           BF16,
+                                                           Empty_Tuple,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F32>>>& instances);
+#endif // CK_ENABLE_FP16
+
 // Contraction + Scale
 template <index_t NumDimM,
          index_t NumDimN,
          index_t NumDimK,
          typename ADataType,
          typename BDataType,
-          typename EDataType>
+          typename EDataType,
+          typename ComputeDataType>
 struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContractionMultipleD<
    NumDimM,
    NumDimN,
@@ -133,7 +406,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
    EDataType,
    ck::tensor_operation::element_wise::PassThrough,
    ck::tensor_operation::element_wise::PassThrough,
-    ck::tensor_operation::element_wise::Scale>>
+    ck::tensor_operation::element_wise::Scale,
+    ComputeDataType>>
 {
    using DeviceOp = DeviceContractionMultipleD<NumDimM,
                                                NumDimN,
@@ -144,7 +418,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
                                                EDataType,
                                                ck::tensor_operation::element_wise::PassThrough,
                                                ck::tensor_operation::element_wise::PassThrough,
-                                                ck::tensor_operation::element_wise::Scale>;
+                                                ck::tensor_operation::element_wise::Scale,
+                                                ComputeDataType>;

    static auto GetInstances()
    {
@@ -155,34 +430,113 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
        {
            if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2)
            {
-                add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance(
-                    op_ptrs);
-                add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance(
-                    op_ptrs);
-                add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance(
-                    op_ptrs);
-                add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance(
-                    op_ptrs);
+                if constexpr(is_same_v<ComputeDataType, float>)
+                {
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance(
+                        op_ptrs);
+                }
+                else if constexpr(is_same_v<ComputeDataType, ck::half_t>)
+                {
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance(
+                        op_ptrs);
+                }
+                else if constexpr(is_same_v<ComputeDataType, ck::bhalf_t>)
+                {
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance(
+                        op_ptrs);
+                }
            }
        }
-#endif
+#endif // CK_ENABLE_FP32
 #ifdef CK_ENABLE_FP64
        if constexpr(is_same_v<ADataType, double> && is_same_v<BDataType, double> &&
                     is_same_v<EDataType, double>)
        {
            if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2)
            {
-                add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance(
-                    op_ptrs);
-                add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance(
-                    op_ptrs);
-                add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance(
-                    op_ptrs);
-                add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance(
-                    op_ptrs);
+                if constexpr(is_same_v<ComputeDataType, double>)
+                {
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance(
+                        op_ptrs);
+                }
+                else if constexpr(is_same_v<ComputeDataType, float>)
+                {
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance(
+                        op_ptrs);
+                }
+            }
+        }
+#endif // CK_ENABLE_FP64
+#ifdef CK_ENABLE_FP16
+        if constexpr(is_same_v<ADataType, ck::half_t> && is_same_v<BDataType, ck::half_t> &&
+                     is_same_v<EDataType, ck::half_t>)
+        {
+            if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2)
+            {
+                if constexpr(is_same_v<ComputeDataType, float>)
+                {
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance(
+                        op_ptrs);
+                }
+            }
+        }
+#endif // CK_ENABLE_FP16
+#ifdef CK_ENABLE_BF16
+        if constexpr(is_same_v<ADataType, ck::bhalf_t> && is_same_v<BDataType, ck::bhalf_t> &&
+                     is_same_v<EDataType, ck::bhalf_t>)
+        {
+            if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2)
+            {
+                if constexpr(is_same_v<ComputeDataType, float>)
+                {
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance(
+                        op_ptrs);
+                }
            }
        }
-#endif
+#endif // CK_ENABLE_BF16
        return op_ptrs;
    }
 };

--- a/library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <vector>
+#include <memory>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp"
+
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using namespace ck::conv_tensor_rearrange_op;
+
+// GNWC/GNHWC/GNDHWC
+// Image to Column
+// GNWC, 1d
+void add_device_image_to_column_gnwc_1d_bf16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, BF16, BF16, ImageToColumn>>>&
+        instances);
+
+void add_device_image_to_column_gnwc_1d_f16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F16, F16, ImageToColumn>>>&
+        instances);
+
+void add_device_image_to_column_gnwc_1d_f32_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F32, F32, ImageToColumn>>>&
+        instances);
+
+void add_device_image_to_column_gnwc_1d_i8_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, int8_t, int8_t, ImageToColumn>>>&
+        instances);
+// GNHWC, 2d
+void add_device_image_to_column_gnhwc_2d_bf16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, BF16, BF16, ImageToColumn>>>&
+        instances);
+
+void add_device_image_to_column_gnhwc_2d_f16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F16, F16, ImageToColumn>>>&
+        instances);
+
+void add_device_image_to_column_gnhwc_2d_f32_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F32, F32, ImageToColumn>>>&
+        instances);
+
+void add_device_image_to_column_gnhwc_2d_i8_instances(
+    std::vector<
+        std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, int8_t, int8_t, ImageToColumn>>>&
+        instances);
+// GNDHWC, 3d
+void add_device_image_to_column_gndhwc_3d_bf16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, BF16, BF16, ImageToColumn>>>&
+        instances);
+
+void add_device_image_to_column_gndhwc_3d_f16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F16, F16, ImageToColumn>>>&
+        instances);
+
+void add_device_image_to_column_gndhwc_3d_f32_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F32, F32, ImageToColumn>>>&
+        instances);
+
+void add_device_image_to_column_gndhwc_3d_i8_instances(
+    std::vector<
+        std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, int8_t, int8_t, ImageToColumn>>>&
+        instances);
+
+// Column to Image
+// GNWC, 1d
+void add_device_column_to_image_gnwc_1d_bf16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, BF16, BF16, ColumnToImage>>>&
+        instances);
+
+void add_device_column_to_image_gnwc_1d_f16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F16, F16, ColumnToImage>>>&
+        instances);
+
+void add_device_column_to_image_gnwc_1d_f32_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F32, F32, ColumnToImage>>>&
+        instances);
+
+void add_device_column_to_image_gnwc_1d_i8_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, int8_t, int8_t, ColumnToImage>>>&
+        instances);
+// GNHWC, 2d
+void add_device_column_to_image_gnhwc_2d_bf16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, BF16, BF16, ColumnToImage>>>&
+        instances);
+
+void add_device_column_to_image_gnhwc_2d_f16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F16, F16, ColumnToImage>>>&
+        instances);
+
+void add_device_column_to_image_gnhwc_2d_f32_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F32, F32, ColumnToImage>>>&
+        instances);
+
+void add_device_column_to_image_gnhwc_2d_i8_instances(
+    std::vector<
+        std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, int8_t, int8_t, ColumnToImage>>>&
+        instances);
+// GNDHWC, 3d
+void add_device_column_to_image_gndhwc_3d_bf16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, BF16, BF16, ColumnToImage>>>&
+        instances);
+
+void add_device_column_to_image_gndhwc_3d_f16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F16, F16, ColumnToImage>>>&
+        instances);
+
+void add_device_column_to_image_gndhwc_3d_f32_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F32, F32, ColumnToImage>>>&
+        instances);
+
+void add_device_column_to_image_gndhwc_3d_i8_instances(
+    std::vector<
+        std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, int8_t, int8_t, ColumnToImage>>>&
+        instances);
+// NWGC/NHWGC/NDHWGC
+// Image to Column
+// NWGC, 1d
+void add_device_image_to_column_nwgc_1d_bf16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, BF16, BF16, ImageToColumn>>>&
+        instances);
+
+void add_device_image_to_column_nwgc_1d_f16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, F16, F16, ImageToColumn>>>&
+        instances);
+
+void add_device_image_to_column_nwgc_1d_f32_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, F32, F32, ImageToColumn>>>&
+        instances);
+
+void add_device_image_to_column_nwgc_1d_i8_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, int8_t, int8_t, ImageToColumn>>>&
+        instances);
+// NHWGC, 2d
+void add_device_image_to_column_nhwgc_2d_bf16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, BF16, BF16, ImageToColumn>>>&
+        instances);
+
+void add_device_image_to_column_nhwgc_2d_f16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, F16, F16, ImageToColumn>>>&
+        instances);
+
+void add_device_image_to_column_nhwgc_2d_f32_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, F32, F32, ImageToColumn>>>&
+        instances);
+
+void add_device_image_to_column_nhwgc_2d_i8_instances(
+    std::vector<
+        std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, int8_t, int8_t, ImageToColumn>>>&
+        instances);
+// NDHWGC, 3d
+void add_device_image_to_column_ndhwgc_3d_bf16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, BF16, BF16, ImageToColumn>>>&
+        instances);
+
+void add_device_image_to_column_ndhwgc_3d_f16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, F16, F16, ImageToColumn>>>&
+        instances);
+
+void add_device_image_to_column_ndhwgc_3d_f32_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, F32, F32, ImageToColumn>>>&
+        instances);
+
+void add_device_image_to_column_ndhwgc_3d_i8_instances(
+    std::vector<
+        std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, int8_t, int8_t, ImageToColumn>>>&
+        instances);
+
+// Column to Image
+// NWGC, 1d
+void add_device_column_to_image_nwgc_1d_bf16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, BF16, BF16, ColumnToImage>>>&
+        instances);
+
+void add_device_column_to_image_nwgc_1d_f16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, F16, F16, ColumnToImage>>>&
+        instances);
+
+void add_device_column_to_image_nwgc_1d_f32_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, F32, F32, ColumnToImage>>>&
+        instances);
+
+void add_device_column_to_image_nwgc_1d_i8_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, int8_t, int8_t, ColumnToImage>>>&
+        instances);
+// NHWGC, 2d
+void add_device_column_to_image_nhwgc_2d_bf16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, BF16, BF16, ColumnToImage>>>&
+        instances);
+
+void add_device_column_to_image_nhwgc_2d_f16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, F16, F16, ColumnToImage>>>&
+        instances);
+
+void add_device_column_to_image_nhwgc_2d_f32_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, F32, F32, ColumnToImage>>>&
+        instances);
+
+void add_device_column_to_image_nhwgc_2d_i8_instances(
+    std::vector<
+        std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, int8_t, int8_t, ColumnToImage>>>&
+        instances);
+// NDHWGC, 3d
+void add_device_column_to_image_ndhwgc_3d_bf16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, BF16, BF16, ColumnToImage>>>&
+        instances);
+
+void add_device_column_to_image_ndhwgc_3d_f16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, F16, F16, ColumnToImage>>>&
+        instances);
+
+void add_device_column_to_image_ndhwgc_3d_f32_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, F32, F32, ColumnToImage>>>&
+        instances);
+
+void add_device_column_to_image_ndhwgc_3d_i8_instances(
+    std::vector<
+        std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, int8_t, int8_t, ColumnToImage>>>&
+        instances);
+
+template <ck::index_t NumDimSpatial,
+          typename ImageLayout,
+          typename InDataType,
+          typename OutDataType,
+          typename ConvTensorRearrangeOp>
+struct DeviceOperationInstanceFactory<
+    ck::tensor_operation::device::DeviceConvTensorRearrange<NumDimSpatial,
+                                                            ImageLayout,
+                                                            InDataType,
+                                                            OutDataType,
+                                                            ConvTensorRearrangeOp>>
+{
+    using DeviceOp = DeviceConvTensorRearrange<NumDimSpatial,
+                                               ImageLayout,
+                                               InDataType,
+                                               OutDataType,
+                                               ConvTensorRearrangeOp>;
+
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+
+        if constexpr(is_same_v<ConvTensorRearrangeOp, ImageToColumn>)
+        {
+            if constexpr(NumDimSpatial == 1 && is_same_v<ImageLayout, GNWC>)
+            {
+                if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
+                {
+                    add_device_image_to_column_gnwc_1d_f32_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
+                {
+                    add_device_image_to_column_gnwc_1d_f16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                                  is_same_v<OutDataType, ck::bhalf_t>)
+                {
+                    add_device_image_to_column_gnwc_1d_bf16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
+                {
+                    add_device_image_to_column_gnwc_1d_i8_instances(op_ptrs);
+                }
+            }
+            else if constexpr(NumDimSpatial == 2 && is_same_v<ImageLayout, GNHWC>)
+            {
+                if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
+                {
+                    add_device_image_to_column_gnhwc_2d_f32_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
+                {
+                    add_device_image_to_column_gnhwc_2d_f16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                                  is_same_v<OutDataType, ck::bhalf_t>)
+                {
+                    add_device_image_to_column_gnhwc_2d_bf16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
+                {
+                    add_device_image_to_column_gnhwc_2d_i8_instances(op_ptrs);
+                }
+            }
+            else if constexpr(NumDimSpatial == 3 && is_same_v<ImageLayout, GNDHWC>)
+            {
+                if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
+                {
+                    add_device_image_to_column_gndhwc_3d_f32_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
+                {
+                    add_device_image_to_column_gndhwc_3d_f16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                                  is_same_v<OutDataType, ck::bhalf_t>)
+                {
+                    add_device_image_to_column_gndhwc_3d_bf16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
+                {
+                    add_device_image_to_column_gndhwc_3d_i8_instances(op_ptrs);
+                }
+            }
+            else if constexpr(NumDimSpatial == 1 && is_same_v<ImageLayout, NWGC>)
+            {
+                if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
+                {
+                    add_device_image_to_column_nwgc_1d_f32_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
+                {
+                    add_device_image_to_column_nwgc_1d_f16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                                  is_same_v<OutDataType, ck::bhalf_t>)
+                {
+                    add_device_image_to_column_nwgc_1d_bf16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
+                {
+                    add_device_image_to_column_nwgc_1d_i8_instances(op_ptrs);
+                }
+            }
+            else if constexpr(NumDimSpatial == 2 && is_same_v<ImageLayout, NHWGC>)
+            {
+                if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
+                {
+                    add_device_image_to_column_nhwgc_2d_f32_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
+                {
+                    add_device_image_to_column_nhwgc_2d_f16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                                  is_same_v<OutDataType, ck::bhalf_t>)
+                {
+                    add_device_image_to_column_nhwgc_2d_bf16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
+                {
+                    add_device_image_to_column_nhwgc_2d_i8_instances(op_ptrs);
+                }
+            }
+            else if constexpr(NumDimSpatial == 3 && is_same_v<ImageLayout, NDHWGC>)
+            {
+                if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
+                {
+                    add_device_image_to_column_ndhwgc_3d_f32_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
+                {
+                    add_device_image_to_column_ndhwgc_3d_f16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                                  is_same_v<OutDataType, ck::bhalf_t>)
+                {
+                    add_device_image_to_column_ndhwgc_3d_bf16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
+                {
+                    add_device_image_to_column_ndhwgc_3d_i8_instances(op_ptrs);
+                }
+            }
+        }
+        else if constexpr(is_same_v<ConvTensorRearrangeOp, ColumnToImage>)
+        {
+            if constexpr(NumDimSpatial == 1 && is_same_v<ImageLayout, GNWC>)
+            {
+                if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
+                {
+                    add_device_column_to_image_gnwc_1d_f32_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
+                {
+                    add_device_column_to_image_gnwc_1d_f16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                                  is_same_v<OutDataType, ck::bhalf_t>)
+                {
+                    add_device_column_to_image_gnwc_1d_bf16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
+                {
+                    add_device_column_to_image_gnwc_1d_i8_instances(op_ptrs);
+                }
+            }
+            else if constexpr(NumDimSpatial == 2 && is_same_v<ImageLayout, GNHWC>)
+            {
+                if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
+                {
+                    add_device_column_to_image_gnhwc_2d_f32_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
+                {
+                    add_device_column_to_image_gnhwc_2d_f16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                                  is_same_v<OutDataType, ck::bhalf_t>)
+                {
+                    add_device_column_to_image_gnhwc_2d_bf16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
+                {
+                    add_device_column_to_image_gnhwc_2d_i8_instances(op_ptrs);
+                }
+            }
+            else if constexpr(NumDimSpatial == 3 && is_same_v<ImageLayout, GNDHWC>)
+            {
+                if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
+                {
+                    add_device_column_to_image_gndhwc_3d_f32_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
+                {
+                    add_device_column_to_image_gndhwc_3d_f16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                                  is_same_v<OutDataType, ck::bhalf_t>)
+                {
+                    add_device_column_to_image_gndhwc_3d_bf16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
+                {
+                    add_device_column_to_image_gndhwc_3d_i8_instances(op_ptrs);
+                }
+            }
+            else if constexpr(NumDimSpatial == 1 && is_same_v<ImageLayout, NWGC>)
+            {
+                if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
+                {
+                    add_device_column_to_image_nwgc_1d_f32_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
+                {
+                    add_device_column_to_image_nwgc_1d_f16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                                  is_same_v<OutDataType, ck::bhalf_t>)
+                {
+                    add_device_column_to_image_nwgc_1d_bf16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
+                {
+                    add_device_column_to_image_nwgc_1d_i8_instances(op_ptrs);
+                }
+            }
+            else if constexpr(NumDimSpatial == 2 && is_same_v<ImageLayout, NHWGC>)
+            {
+                if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
+                {
+                    add_device_column_to_image_nhwgc_2d_f32_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
+                {
+                    add_device_column_to_image_nhwgc_2d_f16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                                  is_same_v<OutDataType, ck::bhalf_t>)
+                {
+                    add_device_column_to_image_nhwgc_2d_bf16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
+                {
+                    add_device_column_to_image_nhwgc_2d_i8_instances(op_ptrs);
+                }
+            }
+            else if constexpr(NumDimSpatial == 3 && is_same_v<ImageLayout, NDHWGC>)
+            {
+                if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
+                {
+                    add_device_column_to_image_ndhwgc_3d_f32_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
+                {
+                    add_device_column_to_image_ndhwgc_3d_f16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                                  is_same_v<OutDataType, ck::bhalf_t>)
+                {
+                    add_device_column_to_image_ndhwgc_3d_bf16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
+                {
+                    add_device_column_to_image_ndhwgc_3d_i8_instances(op_ptrs);
+                }
+            }
+        }
+
+        return op_ptrs;
+    }
+};
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using namespace ck::tensor_layout::convolution;
+using namespace ck::conv_tensor_rearrange_op;
+
+using BF16 = ck::bhalf_t;
+using F16  = ck::half_t;
+using F32  = float;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+template <ck::index_t NDimSpatial, typename InLayout>
+using device_column_to_image_bf16_instances = std::tuple<
+    // clang-format off
+        //#####################|        Num| InLayout| InDataType| OutDataType| Block|  MPer|  KPer|    Thread| Scalar|
+        //#####################|        Dim|         |           |            |  Size| Block| Block|   Cluster|    Per|
+        //#####################|    Spatial|         |           |            |      |      |      |   Lengths| Vector|
+        //#####################|           |         |           |            |      |      |      |          |       |
+        // generic instance
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,       BF16,        BF16,    64,    16,    16,   S<8, 8>,     1>,
+
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,       BF16,        BF16,    64,    32,    32,   S<8, 8>,     4>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,       BF16,        BF16,    64,    64,    64,   S<8, 8>,     8>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,       BF16,        BF16,   128,    32,    64,  S<8, 16>,     4>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,       BF16,        BF16,   128,    64,   128,  S<8, 16>,     8>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,       BF16,        BF16,   256,    64,    64, S<16, 16>,     4>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,       BF16,        BF16,   256,   128,   128, S<16, 16>,     4>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,       BF16,        BF16,   256,   128,   128, S<16, 16>,     8>
+    // clang-format on
+    >;
+
+template <ck::index_t NDimSpatial, typename InLayout>
+using device_column_to_image_f16_instances = std::tuple<
+    // clang-format off
+        //#####################|        Num| InLayout| InDataType| OutDataType| Block|  MPer|  KPer|    Thread| Scalar|
+        //#####################|        Dim|         |           |            |  Size| Block| Block|   Cluster|    Per|
+        //#####################|    Spatial|         |           |            |      |      |      |   Lengths| Vector|
+        //#####################|           |         |           |            |      |      |      |          |       |
+        // generic instance
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,        F16,         F16,    64,    16,    16,   S<8, 8>,     1>,
+        
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,        F16,         F16,    64,    32,    32,   S<8, 8>,     4>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,        F16,         F16,    64,    64,    64,   S<8, 8>,     8>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,        F16,         F16,   128,    32,    64,  S<8, 16>,     4>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,        F16,         F16,   128,    64,   128,  S<8, 16>,     8>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,        F16,         F16,   256,    64,    64, S<16, 16>,     4>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,        F16,         F16,   256,   128,   128, S<16, 16>,     4>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,        F16,         F16,   256,   128,   128, S<16, 16>,     8>
+    // clang-format on
+    >;
+
+template <ck::index_t NDimSpatial, typename InLayout>
+using device_column_to_image_f32_instances = std::tuple<
+    // clang-format off
+        //#####################|        Num| InLayout| InDataType| OutDataType| Block|  MPer|  KPer|    Thread| Scalar|
+        //#####################|        Dim|         |           |            |  Size| Block| Block|   Cluster|    Per|
+        //#####################|    Spatial|         |           |            |      |      |      |   Lengths| Vector|
+        //#####################|           |         |           |            |      |      |      |          |       |
+        // generic instance
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,        F32,         F32,    64,    16,    16,   S<8, 8>,     1>,
+        
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,        F32,         F32,    64,    32,    32,   S<8, 8>,     4>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,        F32,         F32,   128,    32,    64,  S<8, 16>,     4>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,        F32,         F32,   256,    64,    64, S<16, 16>,     4>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,        F32,         F32,   256,   128,   128, S<16, 16>,     4>
+    // clang-format on
+    >;
+
+template <ck::index_t NDimSpatial, typename InLayout>
+using device_column_to_image_i8_instances = std::tuple<
+    // clang-format off
+        //#####################|        Num| InLayout| InDataType| OutDataType| Block|  MPer|  KPer|    Thread| Scalar|
+        //#####################|        Dim|         |           |            |  Size| Block| Block|   Cluster|    Per|
+        //#####################|    Spatial|         |           |            |      |      |      |   Lengths| Vector|
+        //#####################|           |         |           |            |      |      |      |          |       |
+        // generic instance
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,     int8_t,      int8_t,    64,    16,    16,   S<8, 8>,     1>,
+
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,     int8_t,      int8_t,    64,    32,    32,   S<8, 8>,     4>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,     int8_t,      int8_t,    64,    64,    64,   S<8, 8>,     8>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   128,    32,    64,  S<8, 16>,     4>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   128,    64,   128,  S<8, 16>,     8>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   256,    64,    64, S<16, 16>,     4>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   256,   128,   128, S<16, 16>,     4>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   256,   128,   128, S<16, 16>,     8>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   256,   256,   256, S<16, 16>,     16>
+    // clang-format on
+    >;
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/include/ck/library/tensor_operation_instance/gpu/image_to_column/device_image_to_column_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/image_to_column/device_image_to_column_instance.hpp
@@ -13,6 +13,7 @@ namespace device {
 namespace instance {

 using namespace ck::tensor_layout::convolution;
+using namespace ck::conv_tensor_rearrange_op;

 using BF16 = ck::bhalf_t;
 using F16  = ck::half_t;
@@ -28,17 +29,12 @@ using device_image_to_column_bf16_instances = std::tuple<
        //#####################|        Dim|         |           |            |  Size| Block| Block|   Cluster|    Per|
        //#####################|    Spatial|         |           |            |      |      |      |   Lengths| Vector|
        //#####################|           |         |           |            |      |      |      |          |       |
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,    64,     8,     8,   S<8, 8>,     1>,
+        // generic instance
        DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,    64,    16,    16,   S<8, 8>,     1>,
        DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,    64,    32,    32,   S<8, 8>,     4>,
        DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,    64,    64,    64,   S<8, 8>,     8>,
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,   128,    16,    16,  S<8, 16>,     1>,
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,   128,    64,    64,  S<8, 16>,     1>,
        DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,   128,    32,    64,  S<8, 16>,     4>,
        DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,   128,    64,   128,  S<8, 16>,     8>,
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,   256,    16,    16, S<16, 16>,     1>,
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,   256,    64,    64, S<16, 16>,     1>,
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,   256,   128,   128, S<16, 16>,     1>,
        DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,   256,    64,    64, S<16, 16>,     4>,
        DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,   256,   128,   128, S<16, 16>,     4>,
        DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,   256,   128,   128, S<16, 16>,     8>
@@ -52,17 +48,13 @@ using device_image_to_column_f16_instances = std::tuple<
        //#####################|        Dim|         |           |            |  Size| Block| Block|   Cluster|    Per|
        //#####################|    Spatial|         |           |            |      |      |      |   Lengths| Vector|
        //#####################|           |         |           |            |      |      |      |          |       |
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,    64,     8,     8,   S<8, 8>,     1>,
+        // generic instance        
        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,    64,    16,    16,   S<8, 8>,     1>,
+
        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,    64,    32,    32,   S<8, 8>,     4>,
        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,    64,    64,    64,   S<8, 8>,     8>,
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,   128,    16,    16,  S<8, 16>,     1>,
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,   128,    64,    64,  S<8, 16>,     1>,
        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,   128,    32,    64,  S<8, 16>,     4>,
        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,   128,    64,   128,  S<8, 16>,     8>,
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,   256,    16,    16, S<16, 16>,     1>,
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,   256,    64,    64, S<16, 16>,     1>,
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,   256,   128,   128, S<16, 16>,     1>,
        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,   256,    64,    64, S<16, 16>,     4>,
        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,   256,   128,   128, S<16, 16>,     4>,
        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,   256,   128,   128, S<16, 16>,     8>
@@ -76,15 +68,11 @@ using device_image_to_column_f32_instances = std::tuple<
        //#####################|        Dim|         |           |            |  Size| Block| Block|   Cluster|    Per|
        //#####################|    Spatial|         |           |            |      |      |      |   Lengths| Vector|
        //#####################|           |         |           |            |      |      |      |          |       |
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F32,         F32,    64,     8,     8,   S<8, 8>,     1>,
+        // generic instance      
        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F32,         F32,    64,    16,    16,   S<8, 8>,     1>,
+
        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F32,         F32,    64,    32,    32,   S<8, 8>,     4>,
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F32,         F32,   128,    16,    16,  S<8, 16>,     1>,
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F32,         F32,   128,    64,    64,  S<8, 16>,     1>,
        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F32,         F32,   128,    32,    64,  S<8, 16>,     4>,
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F32,         F32,   256,    16,    16, S<16, 16>,     1>,
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F32,         F32,   256,    64,    64, S<16, 16>,     1>,
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F32,         F32,   256,   128,   128, S<16, 16>,     1>,
        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F32,         F32,   256,    64,    64, S<16, 16>,     4>,
        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F32,         F32,   256,   128,   128, S<16, 16>,     4>
    // clang-format on
@@ -97,17 +85,13 @@ using device_image_to_column_i8_instances = std::tuple<
        //#####################|        Dim|         |           |            |  Size| Block| Block|   Cluster|    Per|
        //#####################|    Spatial|         |           |            |      |      |      |   Lengths| Vector|
        //#####################|           |         |           |            |      |      |      |          |       |
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,    64,     8,     8,   S<8, 8>,     1>,
+        // generic instance
        DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,    64,    16,    16,   S<8, 8>,     1>,
+
        DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,    64,    32,    32,   S<8, 8>,     4>,
        DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,    64,    64,    64,   S<8, 8>,     8>,
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   128,    16,    16,  S<8, 16>,     1>,
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   128,    64,    64,  S<8, 16>,     1>,
        DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   128,    32,    64,  S<8, 16>,     4>,
        DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   128,    64,   128,  S<8, 16>,     8>,
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   256,    16,    16, S<16, 16>,     1>,
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   256,    64,    64, S<16, 16>,     1>,
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   256,   128,   128, S<16, 16>,     1>,
        DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   256,    64,    64, S<16, 16>,     4>,
        DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   256,   128,   128, S<16, 16>,     4>,
        DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   256,   128,   128, S<16, 16>,     8>,

--- a/library/include/ck/library/tensor_operation_instance/gpu/convolution_backward_data.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/convolution_backward_data.hpp
@@ -240,11 +240,13 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceConvBw
        if constexpr(NumDimSpatial == 1 && is_same_v<InLayout, NWC> && is_same_v<WeiLayout, KXC> &&
                     is_same_v<OutLayout, NWK>)
        {
+#ifdef CK_ENABLE_FP32
            if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
                         is_same_v<OutDataType, float>)
            {
                add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instances(op_ptrs);
            }
+#endif
 #ifdef CK_ENABLE_FP16
            if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
                         is_same_v<OutDataType, half_t>)
@@ -267,17 +269,23 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceConvBw
            }
 #endif
        }
-        else if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, NHWC> &&
-                          is_same_v<WeiLayout, KYXC> && is_same_v<OutLayout, NHWK>)
+        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, NHWC> &&
+                     is_same_v<WeiLayout, KYXC> && is_same_v<OutLayout, NHWK>)
        {
+#ifdef CK_ENABLE_FP32
            if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
                         is_same_v<OutDataType, float>)
            {
                add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances(op_ptrs);
-#ifdef DL_KERNELS
-                add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instances(op_ptrs);
+            }
 #endif
+#if defined(DL_KERNELS) && defined(CK_ENABLE_FP32)
+            if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
+                         is_same_v<OutDataType, float>)
+            {
+                add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instances(op_ptrs);
            }
+#endif
 #ifdef CK_ENABLE_FP16
            if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
                         is_same_v<OutDataType, half_t>)
@@ -306,14 +314,16 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceConvBw
            }
 #endif
        }
-        else if constexpr(NumDimSpatial == 3 && is_same_v<InLayout, NDHWC> &&
-                          is_same_v<WeiLayout, KZYXC> && is_same_v<OutLayout, NDHWK>)
+        if constexpr(NumDimSpatial == 3 && is_same_v<InLayout, NDHWC> &&
+                     is_same_v<WeiLayout, KZYXC> && is_same_v<OutLayout, NDHWK>)
        {
+#ifdef CK_ENABLE_FP32
            if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
                         is_same_v<OutDataType, float>)
            {
                add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instances(op_ptrs);
            }
+#endif
 #ifdef CK_ENABLE_FP16
            if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
                         is_same_v<OutDataType, half_t>)

--- a/library/include/ck/library/tensor_operation_instance/gpu/convolution_forward.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/convolution_forward.hpp
@@ -98,30 +98,31 @@ struct DeviceOperationInstanceFactory<
        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, NHWC> &&
                     is_same_v<WeiLayout, KYXC> && is_same_v<OutLayout, NHWK>)
        {
+#ifdef CK_ENABLE_FP32
            if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
                         is_same_v<OutDataType, float>)
            {
                add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(op_ptrs);
            }
+#endif
 #ifdef CK_ENABLE_FP16
-            else if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
-                              is_same_v<OutDataType, half_t>)
+            if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
+                         is_same_v<OutDataType, half_t>)
            {
                add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(op_ptrs);
                add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(op_ptrs);
            }
 #endif
 #ifdef CK_ENABLE_BF16
-            else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
-                              is_same_v<WeiDataType, ck::bhalf_t> &&
-                              is_same_v<OutDataType, ck::bhalf_t>)
+            if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                         is_same_v<WeiDataType, ck::bhalf_t> && is_same_v<OutDataType, ck::bhalf_t>)
            {
                add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(op_ptrs);
            }
 #endif
 #ifdef CK_ENABLE_INT8
-            else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
-                              is_same_v<OutDataType, int8_t>)
+            if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
+                         is_same_v<OutDataType, int8_t>)
            {
                add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(op_ptrs);
            }

--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp
@@ -23,12 +23,17 @@ void add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances(
        DeviceGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);

-void add_device_gemm_dl_dpp8_f16_f16_f16_km_kn_mn_instances(
+void add_device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instances(
    std::vector<std::unique_ptr<
        DeviceGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);

-void add_device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instances(
+void add_device_gemm_dpp_f16_f16_f16_km_kn_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_dpp_f16_f16_f16_km_kn_mn_irregular_instances(
    std::vector<std::unique_ptr<
        DeviceGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);
@@ -38,12 +43,17 @@ void add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances(
        DeviceGemm<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);

-void add_device_gemm_dl_dpp8_f16_f16_f16_km_nk_mn_instances(
+void add_device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instances(
    std::vector<std::unique_ptr<
        DeviceGemm<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);

-void add_device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instances(
+void add_device_gemm_dpp_f16_f16_f16_km_nk_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_dpp_f16_f16_f16_km_nk_mn_irregular_instances(
    std::vector<std::unique_ptr<
        DeviceGemm<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);
@@ -53,12 +63,17 @@ void add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances(
        DeviceGemm<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);

-void add_device_gemm_dl_dpp8_f16_f16_f16_mk_kn_mn_instances(
+void add_device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instances(
    std::vector<std::unique_ptr<
        DeviceGemm<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);

-void add_device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instances(
+void add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_irregular_instances(
    std::vector<std::unique_ptr<
        DeviceGemm<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);
@@ -68,12 +83,17 @@ void add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances(
        DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);

-void add_device_gemm_dl_dpp8_f16_f16_f16_mk_nk_mn_instances(
+void add_device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instances(
    std::vector<std::unique_ptr<
        DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);

-void add_device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instances(
+void add_device_gemm_dpp_f16_f16_f16_mk_nk_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_dpp_f16_f16_f16_mk_nk_mn_irregular_instances(
    std::vector<std::unique_ptr<
        DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);
@@ -207,6 +227,10 @@ void add_device_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(
        DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
        instances);

+void add_device_gemm_xdl_c_shuffle_lds_direct_load_f16_f16_f16_mk_nk_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
 #endif
 #ifdef CK_ENABLE_BF16
 void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instances(
@@ -269,6 +293,26 @@ void add_device_gemm_xdl_f32_f32_f32_mk_nk_mn_instances(
    std::vector<std::unique_ptr<
        DeviceGemm<Row, Col, Row, F32, F32, F32, PassThrough, PassThrough, PassThrough>>>&
        instances);
+
+void add_device_gemm_xdl_c_shuffle_lds_direct_load_f32_f32_f32_km_kn_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Row, Row, F32, F32, F32, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_c_shuffle_lds_direct_load_f32_f32_f32_km_nk_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Col, Row, F32, F32, F32, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_c_shuffle_lds_direct_load_f32_f32_f32_mk_kn_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Row, Row, F32, F32, F32, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_c_shuffle_lds_direct_load_f32_f32_f32_mk_nk_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Col, Row, F32, F32, F32, PassThrough, PassThrough, PassThrough>>>&
+        instances);
 #endif
 #ifdef CK_ENABLE_FP64
 void add_device_gemm_xdl_f64_f64_f64_km_kn_mn_instances(
@@ -292,6 +336,34 @@ void add_device_gemm_xdl_f64_f64_f64_mk_nk_mn_instances(
        DeviceGemm<Row, Col, Row, F64, F64, F64, PassThrough, PassThrough, PassThrough>>>&
        instances);
 #endif
+#ifdef CK_ENABLE_FP8
+void add_device_gemm_xdl_c_shuffle_f8_f8_f8_km_kn_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Row, Row, F8, F8, F8, PassThrough, PassThrough, PassThrough>>>& instances);
+
+void add_device_gemm_xdl_c_shuffle_f8_f8_f8_km_nk_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Col, Row, F8, F8, F8, PassThrough, PassThrough, PassThrough>>>& instances);
+
+void add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Row, Row, F8, F8, F8, PassThrough, PassThrough, PassThrough>>>& instances);
+
+void add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_nk_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Col, Row, F8, F8, F8, PassThrough, PassThrough, PassThrough>>>& instances);
+
+void add_device_gemm_xdl_c_shuffle_f16_f8_f16_mk_kn_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Row, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_c_shuffle_f16_f8_f16_mk_nk_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+#endif
+
 template <typename ALayout,
          typename BLayout,
          typename CLayout,
@@ -329,38 +401,46 @@ struct DeviceOperationInstanceFactory<
            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
                         is_same_v<CLayout, Row>)
            {
-                add_device_gemm_xdl_f32_f32_f32_mk_kn_mn_instances(op_ptrs);
+                /// add_device_gemm_xdl_f32_f32_f32_mk_kn_mn_instances(op_ptrs);
 #ifdef DL_KERNELS
                add_device_gemm_dl_f32_f32_f32_mk_kn_mn_instances(op_ptrs);
 #endif
                add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instances(op_ptrs);
+                add_device_gemm_xdl_c_shuffle_lds_direct_load_f32_f32_f32_mk_kn_mn_instances(
+                    op_ptrs);
            }
            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
                              is_same_v<CLayout, Row>)
            {
-                add_device_gemm_xdl_f32_f32_f32_mk_nk_mn_instances(op_ptrs);
+                /// add_device_gemm_xdl_f32_f32_f32_mk_nk_mn_instances(op_ptrs);
 #ifdef DL_KERNELS
                add_device_gemm_dl_f32_f32_f32_mk_nk_mn_instances(op_ptrs);
 #endif
                add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instances(op_ptrs);
+                add_device_gemm_xdl_c_shuffle_lds_direct_load_f32_f32_f32_mk_nk_mn_instances(
+                    op_ptrs);
            }
            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
                              is_same_v<CLayout, Row>)
            {
-                add_device_gemm_xdl_f32_f32_f32_km_kn_mn_instances(op_ptrs);
+                /// add_device_gemm_xdl_f32_f32_f32_km_kn_mn_instances(op_ptrs);
 #ifdef DL_KERNELS
                add_device_gemm_dl_f32_f32_f32_km_kn_mn_instances(op_ptrs);
 #endif
                add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instances(op_ptrs);
+                add_device_gemm_xdl_c_shuffle_lds_direct_load_f32_f32_f32_km_kn_mn_instances(
+                    op_ptrs);
            }
            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
                              is_same_v<CLayout, Row>)
            {
-                add_device_gemm_xdl_f32_f32_f32_km_nk_mn_instances(op_ptrs);
+                /// add_device_gemm_xdl_f32_f32_f32_km_nk_mn_instances(op_ptrs);
 #ifdef DL_KERNELS
                add_device_gemm_dl_f32_f32_f32_km_nk_mn_instances(op_ptrs);
 #endif
                add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instances(op_ptrs);
+                add_device_gemm_xdl_c_shuffle_lds_direct_load_f32_f32_f32_km_nk_mn_instances(
+                    op_ptrs);
            }
        }
 #ifdef CK_ENABLE_FP16
@@ -370,45 +450,51 @@ struct DeviceOperationInstanceFactory<
            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
                         is_same_v<CLayout, Row>)
            {
-                add_device_gemm_xdl_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
+                /// add_device_gemm_xdl_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
 #ifdef DL_KERNELS
                add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
                add_device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instances(op_ptrs);
-                add_device_gemm_dl_dpp8_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
+                add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
+                add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_irregular_instances(op_ptrs);
 #endif
                add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
            }
            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
                              is_same_v<CLayout, Row>)
            {
-                add_device_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
+                /// add_device_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
 #ifdef DL_KERNELS
                add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
                add_device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instances(op_ptrs);
-                add_device_gemm_dl_dpp8_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
+                add_device_gemm_dpp_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
+                add_device_gemm_dpp_f16_f16_f16_mk_nk_mn_irregular_instances(op_ptrs);
 #endif
                add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
                add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
+                add_device_gemm_xdl_c_shuffle_lds_direct_load_f16_f16_f16_mk_nk_mn_instances(
+                    op_ptrs);
            }
            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
                              is_same_v<CLayout, Row>)
            {
-                add_device_gemm_xdl_f16_f16_f16_km_kn_mn_instances(op_ptrs);
+                /// add_device_gemm_xdl_f16_f16_f16_km_kn_mn_instances(op_ptrs);
 #ifdef DL_KERNELS
                add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances(op_ptrs);
                add_device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instances(op_ptrs);
-                add_device_gemm_dl_dpp8_f16_f16_f16_km_kn_mn_instances(op_ptrs);
+                add_device_gemm_dpp_f16_f16_f16_km_kn_mn_instances(op_ptrs);
+                add_device_gemm_dpp_f16_f16_f16_km_kn_mn_irregular_instances(op_ptrs);
 #endif
                add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances(op_ptrs);
            }
            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
                              is_same_v<CLayout, Row>)
            {
-                add_device_gemm_xdl_f16_f16_f16_km_nk_mn_instances(op_ptrs);
+                /// add_device_gemm_xdl_f16_f16_f16_km_nk_mn_instances(op_ptrs);
 #ifdef DL_KERNELS
                add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances(op_ptrs);
                add_device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instances(op_ptrs);
-                add_device_gemm_dl_dpp8_f16_f16_f16_km_nk_mn_instances(op_ptrs);
+                add_device_gemm_dpp_f16_f16_f16_km_nk_mn_instances(op_ptrs);
+                add_device_gemm_dpp_f16_f16_f16_km_nk_mn_irregular_instances(op_ptrs);
 #endif
                add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(op_ptrs);
            }
@@ -481,6 +567,46 @@ struct DeviceOperationInstanceFactory<
 #endif
            }
        }
+#endif
+#ifdef CK_ENABLE_FP8
+        else if constexpr(is_same_v<ADataType, ck::f8_t> && is_same_v<BDataType, ck::f8_t> &&
+                          is_same_v<CDataType, ck::f8_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_nk_mn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_c_shuffle_f8_f8_f8_km_kn_mn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_c_shuffle_f8_f8_f8_km_nk_mn_instances(op_ptrs);
+            }
+        }
+        else if constexpr(is_same_v<ADataType, ck::half_t> && is_same_v<BDataType, ck::f8_t> &&
+                          is_same_v<CDataType, ck::half_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_c_shuffle_f16_f8_f16_mk_kn_mn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_c_shuffle_f16_f8_f16_mk_nk_mn_instances(op_ptrs);
+            }
+        }
 #endif
        return op_ptrs;
    }