Merge remote-tracking branch 'origin/develop' into simple_gemm_dl

0f1e8187 · Jing Zhang · e5863fd6 · b0568b72 · 0f1e8187 · 0f1e8187
Commit 0f1e8187 authored Nov 05, 2023 by Jing Zhang
20 changed files
--- a/include/ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp
@@ -17,15 +17,18 @@
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp"
 #include "ck/host_utility/io.hpp"

 namespace ck {
 namespace tensor_operation {
 namespace device {

-// Image to column for input layout NDHWC:
-//   input : image converted to the gemm problem [N * Do * Ho * Wo, Z * Y * X * C]
-//   output : image [N, Di, Hi, Wi, C]
+// Column to Image:
+//   input : gemm form [G, N * Do * Ho * Wo, Z * Y * X * C]
+//   output : input image [G, N, Di, Hi, Wi, C]
+//   input : gemm form [N * Do * Ho * Wo, G, Z * Y * X * C]
+//   output : input image [N, Di, Hi, Wi, G, C]
 template <index_t NDimSpatial,
          typename ImageLayout,
          typename InputDataType,
@@ -43,6 +46,14 @@ struct DeviceColumnToImageImpl
                                       OutputDataType,
                                       conv_tensor_rearrange_op::ColumnToImage>
 {
+    static constexpr bool is_NSpatialGC =
+        std::is_same_v<ImageLayout, tensor_layout::convolution::NWGC> ||
+        std::is_same_v<ImageLayout, tensor_layout::convolution::NHWGC> ||
+        std::is_same_v<ImageLayout, tensor_layout::convolution::NDHWGC>;
+    static constexpr bool is_GNSpatialC =
+        std::is_same_v<ImageLayout, tensor_layout::convolution::GNWC> ||
+        std::is_same_v<ImageLayout, tensor_layout::convolution::GNHWC> ||
+        std::is_same_v<ImageLayout, tensor_layout::convolution::GNDHWC>;

    static constexpr auto I0 = Number<0>{};
    static constexpr auto I1 = Number<1>{};
@@ -90,7 +101,7 @@ struct DeviceColumnToImageImpl
                            const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
                            const std::array<index_t, NDimSpatial>& output_spatial_lengths,
                            const std::array<index_t, NDimSpatial>& conv_filter_strides,
-                            const std::array<index_t, 2>& gemm_m_k_strides,
+                            const std::array<index_t, 3>& gemm_g_m_k_strides,
                            const std::array<index_t, NDimSpatial>& independent_filters,
                            const std::array<index_t, NDimSpatial>& effs)
    {
@@ -100,23 +111,23 @@ struct DeviceColumnToImageImpl
            C * ck::accumulate_n<index_t>(
                    filter_spatial_lengths.begin(), NDimSpatial, 1, std::multiplies<>());

-        const index_t NStride = DoHoWo * gemm_m_k_strides[I0] * gemm_m_k_strides[I1];
+        const index_t NStride = DoHoWo * gemm_g_m_k_strides[I1] * gemm_g_m_k_strides[I2];
        // Calculate the appropriate stride for each set of independent filters
        // in each dimension
-        const index_t WStride =
-            math::integer_divide_ceil(effs[XIdx], conv_filter_strides[XIdx]) * gemm_m_k_strides[I0];
+        const index_t WStride = math::integer_divide_ceil(effs[XIdx], conv_filter_strides[XIdx]) *
+                                gemm_g_m_k_strides[I1];
        const index_t HStride = math::integer_divide_ceil(effs[YIdx], conv_filter_strides[YIdx]) *
-                                output_spatial_lengths[XIdx] * gemm_m_k_strides[I0];
+                                output_spatial_lengths[XIdx] * gemm_g_m_k_strides[I1];
        const index_t DStride = math::integer_divide_ceil(effs[ZIdx], conv_filter_strides[ZIdx]) *
                                output_spatial_lengths[YIdx] * output_spatial_lengths[XIdx] *
-                                gemm_m_k_strides[I0];
+                                gemm_g_m_k_strides[I1];
        // Create descriptor for independent filters in each dimension and
        // then merge them into column form
        if constexpr(NDimSpatial == 1)
        {
            const auto desc_gemm_form =
                make_naive_tensor_descriptor(make_tuple(N, independent_filters[XIdx], CZYX),
-                                             make_tuple(NStride, WStride, gemm_m_k_strides[I1]));
+                                             make_tuple(NStride, WStride, gemm_g_m_k_strides[I2]));
            const auto desc_gemm_form_merged_filters = transform_tensor_descriptor(
                desc_gemm_form,
                make_tuple(make_merge_transform(make_tuple(N, independent_filters[XIdx])),
@@ -130,7 +141,7 @@ struct DeviceColumnToImageImpl
        {
            const auto desc_gemm_form = make_naive_tensor_descriptor(
                make_tuple(N, independent_filters[YIdx], independent_filters[XIdx], CZYX),
-                make_tuple(NStride, HStride, WStride, gemm_m_k_strides[I1]));
+                make_tuple(NStride, HStride, WStride, gemm_g_m_k_strides[I2]));
            const auto desc_gemm_form_merged_filters = transform_tensor_descriptor(
                desc_gemm_form,
                make_tuple(make_merge_transform(
@@ -149,7 +160,7 @@ struct DeviceColumnToImageImpl
                           independent_filters[YIdx],
                           independent_filters[XIdx],
                           CZYX),
-                make_tuple(NStride, DStride, HStride, WStride, gemm_m_k_strides[I1]));
+                make_tuple(NStride, DStride, HStride, WStride, gemm_g_m_k_strides[I2]));
            const auto desc_gemm_form_merged_filters = transform_tensor_descriptor(
                desc_gemm_form,
                make_tuple(make_merge_transform(make_tuple(N,
@@ -252,34 +263,38 @@ struct DeviceColumnToImageImpl
        decltype(BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, InputGridDesc>(
            InputGridDesc{}))>;

-    using GridwiseTensorRearrangeKernel = GridwiseTensorRearrange<InputGridDesc,
-                                                                  InputDataType,
-                                                                  OutputGridDesc,
-                                                                  OutputDataType,
-                                                                  BlockSize,
-                                                                  MPerBlock,
-                                                                  KPerBlock,
-                                                                  ThreadClusterLengths,
-                                                                  ScalarPerVector,
-                                                                  InMemoryDataOperationEnum::Add,
-                                                                  Block2ETileMap>;
+    using GridwiseTensorRearrangeKernel =
+        GridwiseTensorRearrange<InputGridDesc,
+                                InputDataType,
+                                OutputGridDesc,
+                                OutputDataType,
+                                BlockSize,
+                                MPerBlock,
+                                KPerBlock,
+                                ThreadClusterLengths,
+                                ScalarPerVector,
+                                InMemoryDataOperationEnum::Add,
+                                Block2ETileMap,
+                                ComputePtrOffsetOfStridedBatch<I0>>;

    struct Argument : public BaseArgument
    {
        Argument(const void* p_in, // input image
                 void* p_out,      // output image
+                 const ck::index_t G,
                 const ck::index_t N,
                 const ck::index_t C,
                 const std::array<index_t, NDimSpatial>& input_spatial_lengths,
                 const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
                 const std::array<index_t, NDimSpatial>& output_spatial_lengths,
                 const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
-                 const std::array<index_t, 2>& gemm_m_k_strides,
+                 const std::array<index_t, 3>& gemm_g_m_k_strides,
                 const std::array<index_t, NDimSpatial>& conv_filter_strides,
                 const std::array<index_t, NDimSpatial>& conv_filter_dilations,
                 const std::array<index_t, NDimSpatial>& input_left_pads,
                 const std::array<index_t, NDimSpatial>& input_right_pads)
-            : C_(C),
+            : G_(G),
+              C_(C),
              X_(filter_spatial_lengths[NDimSpatial - I1]),
              p_in_{static_cast<const InputDataType*>(p_in)},
              p_out_{static_cast<OutputDataType*>(p_out)},
@@ -289,6 +304,9 @@ struct DeviceColumnToImageImpl
              input_left_pads_{input_left_pads},
              input_right_pads_{input_right_pads}
        {
+            compute_ptr_offset_of_batch_.BatchStrideA_ = gemm_g_m_k_strides[I0];
+            compute_ptr_offset_of_batch_.BatchStrideC_ = image_g_n_c_wis_strides[I0];
+
            const index_t x_eff =
                (filter_spatial_lengths[XIdx] - 1) * conv_filter_dilations[XIdx] + 1;
            const index_t y_eff =
@@ -354,7 +372,7 @@ struct DeviceColumnToImageImpl
                                                    filter_spatial_lengths,
                                                    output_spatial_lengths,
                                                    conv_filter_strides,
-                                                    gemm_m_k_strides,
+                                                    gemm_g_m_k_strides,
                                                    independent_filters,
                                                    effs);
                        const auto out_grid_desc_m_k =
@@ -387,10 +405,9 @@ struct DeviceColumnToImageImpl
                        // Memory offsets to next set of independent filters,
                        // move to independent filters in each dimension
                        const index_t in_offset =
-                            x_idx * gemm_m_k_strides[0] +
-                            y_idx * gemm_m_k_strides[0] * output_spatial_lengths[XIdx] +
-                            z_idx * gemm_m_k_strides[0] * output_spatial_lengths[YIdx] *
-                                output_spatial_lengths[XIdx];
+                            (x_idx + y_idx * output_spatial_lengths[XIdx] +
+                             z_idx * output_spatial_lengths[YIdx] * output_spatial_lengths[XIdx]) *
+                            gemm_g_m_k_strides[I1];
                        // Move to independent filters in appropriate dimensions
                        const index_t out_offset =
                            x_offset_with_pad * image_g_n_c_wis_strides[spatial_offset + XIdx] +
@@ -417,6 +434,7 @@ struct DeviceColumnToImageImpl
            }
        }

+        const ck::index_t G_;
        const ck::index_t C_;
        const ck::index_t X_;

@@ -434,6 +452,8 @@ struct DeviceColumnToImageImpl

        std::vector<const InputDataType*> p_in_container_;
        std::vector<OutputDataType*> p_out_container_;
+
+        ComputePtrOffsetOfStridedBatch<I0> compute_ptr_offset_of_batch_;
    };

    struct Invoker : public BaseInvoker
@@ -451,6 +471,7 @@ struct DeviceColumnToImageImpl
                                                        OutputGridDesc,
                                                        OutputDataType,
                                                        Block2ETileMap,
+                                                        ComputePtrOffsetOfStridedBatch<I0>,
                                                        GridwiseTensorRearrangeKernel>;

            // Execute each set of independent filters
@@ -460,7 +481,7 @@ struct DeviceColumnToImageImpl
                    BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, InputGridDesc>(
                        arg.out_grid_desc_m_k_container_[i]);
                const index_t grid_size =
-                    block_2_tile_map.CalculateGridSize(arg.in_grid_desc_m_k_container_[i]);
+                    block_2_tile_map.CalculateGridSize(arg.in_grid_desc_m_k_container_[i]) * arg.G_;
                elapsed_time += launch_and_time_kernel(stream_config,
                                                       kernel,
                                                       dim3(grid_size),
@@ -470,7 +491,9 @@ struct DeviceColumnToImageImpl
                                                       arg.p_in_container_[i],
                                                       arg.out_grid_desc_m_k_container_[i],
                                                       arg.p_out_container_[i],
-                                                       block_2_tile_map);
+                                                       arg.G_,
+                                                       block_2_tile_map,
+                                                       arg.compute_ptr_offset_of_batch_);
            }
            return elapsed_time;
        }
@@ -485,8 +508,7 @@ struct DeviceColumnToImageImpl
    bool IsSupportedArgument(const Argument& arg)
    {
        using namespace tensor_layout::convolution;
-        if constexpr(!(std::is_same_v<ImageLayout, GNWC> || std::is_same_v<ImageLayout, GNHWC> ||
-                       std::is_same_v<ImageLayout, GNDHWC>))
+        if constexpr(!(is_NSpatialGC || is_GNSpatialC))
        {
            return false;
        }
@@ -534,13 +556,14 @@ struct DeviceColumnToImageImpl

    static auto MakeArgument(const void* p_in, // input image
                             void* p_out,      // output image
+                             const ck::index_t G,
                             const ck::index_t N,
                             const ck::index_t C,
                             const std::array<index_t, NDimSpatial>& input_spatial_lengths,
                             const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
                             const std::array<index_t, NDimSpatial>& output_spatial_lengths,
                             const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
-                             const std::array<index_t, 2>& gemm_m_k_strides,
+                             const std::array<index_t, 3>& gemm_g_m_k_strides,
                             const std::array<index_t, NDimSpatial>& conv_filter_strides,
                             const std::array<index_t, NDimSpatial>& conv_filter_dilations,
                             const std::array<index_t, NDimSpatial>& input_left_pads,
@@ -548,13 +571,14 @@ struct DeviceColumnToImageImpl
    {
        return Argument{static_cast<const InputDataType*>(p_in),
                        static_cast<OutputDataType*>(p_out),
+                        G,
                        N,
                        C,
                        input_spatial_lengths,
                        filter_spatial_lengths,
                        output_spatial_lengths,
                        image_g_n_c_wis_strides,
-                        gemm_m_k_strides,
+                        gemm_g_m_k_strides,
                        conv_filter_strides,
                        conv_filter_dilations,
                        input_left_pads,
@@ -566,13 +590,14 @@ struct DeviceColumnToImageImpl
    std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const void* p_in, // input image
                        void* p_out,      // output image
+                        const ck::index_t G,
                        const ck::index_t N,
                        const ck::index_t C,
                        const std::array<index_t, NDimSpatial>& input_spatial_lengths,
                        const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
                        const std::array<index_t, NDimSpatial>& output_spatial_lengths,
                        const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
-                        const std::array<index_t, 2>& gemm_m_k_strides,
+                        const std::array<index_t, 3>& gemm_g_m_k_strides,
                        const std::array<index_t, NDimSpatial>& conv_filter_strides,
                        const std::array<index_t, NDimSpatial>& conv_filter_dilations,
                        const std::array<index_t, NDimSpatial>& input_left_pads,
@@ -580,13 +605,14 @@ struct DeviceColumnToImageImpl
    {
        return std::make_unique<Argument>(static_cast<const InputDataType*>(p_in),
                                          static_cast<OutputDataType*>(p_out),
+                                          G,
                                          N,
                                          C,
                                          input_spatial_lengths,
                                          filter_spatial_lengths,
                                          output_spatial_lengths,
                                          image_g_n_c_wis_strides,
-                                          gemm_m_k_strides,
+                                          gemm_g_m_k_strides,
                                          conv_filter_strides,
                                          conv_filter_dilations,
                                          input_left_pads,

--- a/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp
@@ -145,7 +145,8 @@ template <index_t NumDimM,
          index_t CShuffleNXdlPerWavePerShuffle,
          typename CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
          index_t CDEBlockTransferScalarPerVector_NPerBlock,
-          LoopScheduler LoopSched = make_default_loop_scheduler()>
+          typename ComputeDataType = ADataType,
+          LoopScheduler LoopSched  = make_default_loop_scheduler()>
 struct DeviceContractionMultipleD_Xdl_CShuffle
    : public DeviceContractionMultipleD<NumDimM,
                                        NumDimN,
@@ -156,7 +157,8 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
                                        EDataType,
                                        AElementwiseOperation,
                                        BElementwiseOperation,
-                                        CDEElementwiseOperation>
+                                        CDEElementwiseOperation,
+                                        ComputeDataType>
 {
    using DeviceOp = DeviceContractionMultipleD_Xdl_CShuffle;

@@ -310,8 +312,6 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
    using DsGridDesc_M_N = remove_cvref_t<decltype(MakeDsGridDescriptor_M_N({{}}, {{}}))>;
    using EGridDesc_M_N  = decltype(MakeEGridDescriptor_M_N({}, {}));

-    using ComputeDataType = ADataType;
-
    // GridwiseGemm
    using GridwiseGemm = GridwiseGemmMultipleD_xdl_cshuffle<
        ADataType, // TODO: distinguish A/B datatype

--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp
@@ -184,7 +184,8 @@ struct DeviceGemmXdl : public DeviceGemm<ALayout,
                return false;
            }
        }
-        else if(ck::get_device_name() == "gfx90a" || ck::get_device_name() == "gfx940")
+        else if(ck::get_device_name() == "gfx90a" || ck::get_device_name() == "gfx940" ||
+                ck::get_device_name() == "gfx941" || ck::get_device_name() == "gfx942")
        {
            if constexpr(!(is_same_v<AccDataType, float> || is_same_v<AccDataType, float> ||
                           is_same_v<AccDataType, int32_t> || is_same_v<AccDataType, double>))

--- a/include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp
@@ -15,15 +15,18 @@
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp"
 #include "ck/host_utility/io.hpp"

 namespace ck {
 namespace tensor_operation {
 namespace device {

-// Image to column for input layout NDHWC:
-//   input : input image [N, Di, Hi, Wi, C]
-//   output : gemm form [N * Do * Ho * Wo, Z * Y * X * C]
+// Image to column:
+//   input : input image [G, N, Di, Hi, Wi, C]
+//   output : gemm form [G * N * Do * Ho * Wo, Z * Y * X * C]
+//   input : input image [N, Di, Hi, Wi, G, C]
+//   output : gemm form [N * Do * Ho * Wo * G, Z * Y * X * C]
 template <index_t NDimSpatial,
          typename ImageLayout,
          typename InputDataType,
@@ -41,6 +44,14 @@ struct DeviceImageToColumnImpl
                                       OutputDataType,
                                       conv_tensor_rearrange_op::ImageToColumn>
 {
+    static constexpr bool is_NSpatialGC =
+        std::is_same_v<ImageLayout, tensor_layout::convolution::NWGC> ||
+        std::is_same_v<ImageLayout, tensor_layout::convolution::NHWGC> ||
+        std::is_same_v<ImageLayout, tensor_layout::convolution::NDHWGC>;
+    static constexpr bool is_GNSpatialC =
+        std::is_same_v<ImageLayout, tensor_layout::convolution::GNWC> ||
+        std::is_same_v<ImageLayout, tensor_layout::convolution::GNHWC> ||
+        std::is_same_v<ImageLayout, tensor_layout::convolution::GNDHWC>;

    static constexpr auto I0 = Number<0>{};
    static constexpr auto I1 = Number<1>{};
@@ -109,7 +120,7 @@ struct DeviceImageToColumnImpl
                          const ck::index_t C,
                          const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
                          const std::array<index_t, NDimSpatial>& output_spatial_lengths,
-                          const std::array<index_t, 2>& gemm_m_k_strides)
+                          const std::array<index_t, 3>& gemm_g_m_k_strides)
    {
        const index_t NDoHoWo =
            N * ck::accumulate_n<index_t>(
@@ -117,11 +128,10 @@ struct DeviceImageToColumnImpl
        const index_t CZYX =
            C * ck::accumulate_n<index_t>(
                    filter_spatial_lengths.begin(), NDimSpatial, 1, std::multiplies<>());
-        const auto desc_mraw_kraw = make_naive_tensor_descriptor(
-            make_tuple(NDoHoWo, CZYX), make_tuple(gemm_m_k_strides[I0], gemm_m_k_strides[I1]));

-        const auto desc_m_k = matrix_padder.PadADescriptor_M_K(desc_mraw_kraw);
-        return desc_m_k;
+        const auto desc_mraw_kraw = make_naive_tensor_descriptor(
+            make_tuple(NDoHoWo, CZYX), make_tuple(gemm_g_m_k_strides[I1], gemm_g_m_k_strides[I2]));
+        return matrix_padder.PadADescriptor_M_K(desc_mraw_kraw);
    }

    using InputGridDesc =
@@ -132,34 +142,38 @@ struct DeviceImageToColumnImpl
        decltype(BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, OutputGridDesc>(
            OutputGridDesc{}))>;

-    using GridwiseTensorRearrangeKernel = GridwiseTensorRearrange<InputGridDesc,
-                                                                  InputDataType,
-                                                                  OutputGridDesc,
-                                                                  OutputDataType,
-                                                                  BlockSize,
-                                                                  MPerBlock,
-                                                                  KPerBlock,
-                                                                  ThreadClusterLengths,
-                                                                  ScalarPerVector,
-                                                                  InMemoryDataOperationEnum::Set,
-                                                                  Block2ETileMap>;
+    using GridwiseTensorRearrangeKernel =
+        GridwiseTensorRearrange<InputGridDesc,
+                                InputDataType,
+                                OutputGridDesc,
+                                OutputDataType,
+                                BlockSize,
+                                MPerBlock,
+                                KPerBlock,
+                                ThreadClusterLengths,
+                                ScalarPerVector,
+                                InMemoryDataOperationEnum::Set,
+                                Block2ETileMap,
+                                ComputePtrOffsetOfStridedBatch<I0>>;

    struct Argument : public BaseArgument
    {
        Argument(const void* p_in, // input image
                 void* p_out,      // gemm form
+                 const ck::index_t G,
                 const ck::index_t N,
                 const ck::index_t C,
                 const std::array<index_t, NDimSpatial>& input_spatial_lengths,
                 const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
                 const std::array<index_t, NDimSpatial>& output_spatial_lengths,
                 const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
-                 const std::array<index_t, 2>& gemm_m_k_strides,
+                 const std::array<index_t, 3>& gemm_g_m_k_strides,
                 const std::array<index_t, NDimSpatial>& conv_filter_strides,
                 const std::array<index_t, NDimSpatial>& conv_filter_dilations,
                 const std::array<index_t, NDimSpatial>& input_left_pads,
                 const std::array<index_t, NDimSpatial>& input_right_pads)
-            : C_(C),
+            : G_(G),
+              C_(C),
              X_(filter_spatial_lengths[NDimSpatial - I1]),
              p_in_{static_cast<const InputDataType*>(p_in)},
              p_out_{static_cast<OutputDataType*>(p_out)},
@@ -176,14 +190,16 @@ struct DeviceImageToColumnImpl
                                                        filter_spatial_lengths,
                                                        output_spatial_lengths,
                                                        image_g_n_c_wis_strides,
-
                                                        conv_filter_strides,
                                                        conv_filter_dilations,
                                                        input_left_pads,
                                                        input_right_pads);

            out_grid_desc_m_k_ = MakeOutDescriptor_M_K(
-                N, C, filter_spatial_lengths, output_spatial_lengths, gemm_m_k_strides);
+                N, C, filter_spatial_lengths, output_spatial_lengths, gemm_g_m_k_strides);
+
+            compute_ptr_offset_of_batch_.BatchStrideA_ = image_g_n_c_wis_strides[I0];
+            compute_ptr_offset_of_batch_.BatchStrideC_ = gemm_g_m_k_strides[I0];
        }

        void Print() const
@@ -192,6 +208,7 @@ struct DeviceImageToColumnImpl
            std::cout << out_grid_desc_m_k_ << std::endl;
        }

+        const ck::index_t G_;
        const ck::index_t C_;
        const ck::index_t X_;

@@ -206,6 +223,8 @@ struct DeviceImageToColumnImpl

        InputGridDesc in_grid_desc_m_k_;
        OutputGridDesc out_grid_desc_m_k_;
+
+        ComputePtrOffsetOfStridedBatch<I0> compute_ptr_offset_of_batch_;
    };

    struct Invoker : public BaseInvoker
@@ -220,12 +239,14 @@ struct DeviceImageToColumnImpl
            const auto block_2_tile_map =
                BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, OutputGridDesc>(
                    arg.out_grid_desc_m_k_);
-            const index_t grid_size = block_2_tile_map.CalculateGridSize(arg.out_grid_desc_m_k_);
-            const auto kernel       = kernel_tensor_rearrange<InputGridDesc,
+            const index_t grid_size =
+                block_2_tile_map.CalculateGridSize(arg.out_grid_desc_m_k_) * arg.G_;
+            const auto kernel = kernel_tensor_rearrange<InputGridDesc,
                                                        InputDataType,
                                                        OutputGridDesc,
                                                        OutputDataType,
                                                        Block2ETileMap,
+                                                        ComputePtrOffsetOfStridedBatch<I0>,
                                                        GridwiseTensorRearrangeKernel>;

            float elapsed_time = launch_and_time_kernel(stream_config,
@@ -237,7 +258,9 @@ struct DeviceImageToColumnImpl
                                                        arg.p_in_,
                                                        arg.out_grid_desc_m_k_,
                                                        arg.p_out_,
-                                                        block_2_tile_map);
+                                                        arg.G_,
+                                                        block_2_tile_map,
+                                                        arg.compute_ptr_offset_of_batch_);
            return elapsed_time;
        }

@@ -250,9 +273,7 @@ struct DeviceImageToColumnImpl

    bool IsSupportedArgument(const Argument& arg)
    {
-        using namespace tensor_layout::convolution;
-        if constexpr(!(std::is_same_v<ImageLayout, GNWC> || std::is_same_v<ImageLayout, GNHWC> ||
-                       std::is_same_v<ImageLayout, GNDHWC>))
+        if constexpr(!(is_NSpatialGC || is_GNSpatialC))
        {
            return false;
        }
@@ -295,13 +316,14 @@ struct DeviceImageToColumnImpl

    static auto MakeArgument(const void* p_in, // input image
                             void* p_out,      // gemm form
+                             const ck::index_t G,
                             const ck::index_t N,
                             const ck::index_t C,
                             const std::array<index_t, NDimSpatial>& input_spatial_lengths,
                             const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
                             const std::array<index_t, NDimSpatial>& output_spatial_lengths,
                             const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
-                             const std::array<index_t, 2>& gemm_m_k_strides,
+                             const std::array<index_t, 3>& gemm_g_m_k_strides,
                             const std::array<index_t, NDimSpatial>& conv_filter_strides,
                             const std::array<index_t, NDimSpatial>& conv_filter_dilations,
                             const std::array<index_t, NDimSpatial>& input_left_pads,
@@ -309,13 +331,14 @@ struct DeviceImageToColumnImpl
    {
        return Argument{static_cast<const InputDataType*>(p_in),
                        static_cast<OutputDataType*>(p_out),
+                        G,
                        N,
                        C,
                        input_spatial_lengths,
                        filter_spatial_lengths,
                        output_spatial_lengths,
                        image_g_n_c_wis_strides,
-                        gemm_m_k_strides,
+                        gemm_g_m_k_strides,
                        conv_filter_strides,
                        conv_filter_dilations,
                        input_left_pads,
@@ -327,13 +350,14 @@ struct DeviceImageToColumnImpl
    std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const void* p_in, // input image
                        void* p_out,      // gemm form
+                        const ck::index_t G,
                        const ck::index_t N,
                        const ck::index_t C,
                        const std::array<index_t, NDimSpatial>& input_spatial_lengths,
                        const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
                        const std::array<index_t, NDimSpatial>& output_spatial_lengths,
                        const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
-                        const std::array<index_t, 2>& gemm_m_k_strides,
+                        const std::array<index_t, 3>& gemm_g_m_k_strides,
                        const std::array<index_t, NDimSpatial>& conv_filter_strides,
                        const std::array<index_t, NDimSpatial>& conv_filter_dilations,
                        const std::array<index_t, NDimSpatial>& input_left_pads,
@@ -341,13 +365,14 @@ struct DeviceImageToColumnImpl
    {
        return std::make_unique<Argument>(static_cast<const InputDataType*>(p_in),
                                          static_cast<OutputDataType*>(p_out),
+                                          G,
                                          N,
                                          C,
                                          input_spatial_lengths,
                                          filter_spatial_lengths,
                                          output_spatial_lengths,
                                          image_g_n_c_wis_strides,
-                                          gemm_m_k_strides,
+                                          gemm_g_m_k_strides,
                                          conv_filter_strides,
                                          conv_filter_dilations,
                                          input_left_pads,

--- a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
@@ -186,6 +186,25 @@ struct Bilinear
        y = type_convert<half_t>(alpha_ * x0 + beta_ * ck::type_convert<float>(x1));
    };

+    template <>
+    __host__ __device__ constexpr void
+    operator()<bhalf_t, bhalf_t, bhalf_t>(bhalf_t& y, const bhalf_t& x0, const bhalf_t& x1) const
+    {
+        const float x0_tmp = type_convert<float>(x0);
+        const float x1_tmp = type_convert<float>(x1);
+        const float y_tmp  = alpha_ * x0_tmp + beta_ * x1_tmp;
+        y                  = type_convert<bhalf_t>(y_tmp);
+    };
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()<bhalf_t, float, bhalf_t>(bhalf_t& y, const float& x0, const bhalf_t& x1) const
+    {
+        const float x1_tmp = ck::type_convert<float>(x1);
+        const float y_tmp  = alpha_ * x0 + beta_ * x1_tmp;
+        y                  = y_tmp;
+    };
+
    template <>
    __host__ __device__ constexpr void operator()<std::int8_t, std::int32_t, std::int8_t>(
        std::int8_t& y, const std::int32_t& x0, const std::int8_t& x1) const

--- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
@@ -311,6 +311,71 @@ struct AddAddFastGelu
    }
 };

+// E = Relu(alpha1 * C + alpha2 * D0 + D1)
+struct ScaleAddScaleAddRelu
+{
+
+    ScaleAddScaleAddRelu(const float alpha1 = 1.f, const float alpha2 = 1.f)
+        : alpha1_(alpha1), alpha2_(alpha2)
+    {
+    }
+
+    template <typename E, typename C, typename D0, typename D1>
+    __host__ __device__ constexpr void
+    operator()(E& e, const C& c, const D0& d0, const D1& d1) const;
+
+    template <>
+    __host__ __device__ constexpr void operator()<float, float, float, float>(float& e,
+                                                                              const float& c,
+                                                                              const float& d0,
+                                                                              const float& d1) const
+    {
+        const float x = c * alpha1_ + alpha2_ * d0 + d1;
+        Relu{}.template operator()<float>(e, x);
+    }
+
+    template <>
+    __host__ __device__ constexpr void operator()<half_t, half_t, half_t, half_t>(
+        half_t& e, const half_t& c, const half_t& d0, const half_t& d1) const
+    {
+        const float x = type_convert<float>(c) * alpha1_ + alpha2_ * type_convert<float>(d0) +
+                        type_convert<float>(d1);
+
+        float result = 0;
+        Relu{}.template operator()<float>(result, x);
+
+        e = type_convert<half_t>(result);
+    }
+
+    template <>
+    __host__ __device__ constexpr void operator()<bhalf_t, bhalf_t, bhalf_t, bhalf_t>(
+        bhalf_t& e, const bhalf_t& c, const bhalf_t& d0, const bhalf_t& d1) const
+    {
+        const float x = type_convert<float>(c) * alpha1_ + alpha2_ * type_convert<float>(d0) +
+                        type_convert<float>(d1);
+
+        float result = 0;
+        Relu{}.template operator()<float>(result, x);
+
+        e = type_convert<bhalf_t>(result);
+    }
+
+    template <>
+    __host__ __device__ constexpr void operator()<int8_t, int8_t, float, float>(
+        int8_t& e, const int8_t& c, const float& d0, const float& d1) const
+    {
+        const float x = type_convert<float>(c) * alpha1_ + alpha2_ * d0 + d1;
+
+        float result = 0;
+        Relu{}.template operator()<float>(result, x);
+
+        e = type_convert<int8_t>(result);
+    }
+
+    const float alpha1_;
+    const float alpha2_;
+};
+
 struct Normalize
 {
    // FIXME: is double absolutely necessary?

--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -33,6 +33,12 @@ struct PassThrough
        y = type_convert<float>(x);
    }

+    template <>
+    __host__ __device__ void operator()<double, float>(double& y, const float& x) const
+    {
+        y = type_convert<double>(x);
+    }
+
    template <>
    __host__ __device__ void operator()<float, float>(float& y, const float& x) const
    {
@@ -69,6 +75,12 @@ struct PassThrough
        y = type_convert<bhalf_t>(x);
    }

+    template <>
+    __host__ __device__ void operator()<float, bhalf_t>(float& y, const bhalf_t& x) const
+    {
+        y = type_convert<float>(x);
+    }
+
    template <>
    __host__ __device__ void operator()<bhalf_t, half_t>(bhalf_t& y, const half_t& x) const
    {
@@ -225,6 +237,20 @@ struct Scale
    template <typename Y, typename X>
    __host__ __device__ void operator()(Y& y, const X& x) const;

+    template <>
+    __host__ __device__ void operator()<half_t, half_t>(half_t& y, const half_t& x) const
+    {
+        y = ck::type_convert<half_t>(scale_) * x;
+    };
+
+    template <>
+    __host__ __device__ void operator()<bhalf_t, bhalf_t>(bhalf_t& y, const bhalf_t& x) const
+    {
+        const float x_tmp = ck::type_convert<float>(x);
+        const float y_tmp = scale_ * x_tmp;
+        y                 = ck::type_convert<bhalf_t>(y_tmp);
+    };
+
    template <>
    __host__ __device__ void operator()<float, float>(float& y, const float& x) const
    {

--- a/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp
@@ -21,6 +21,7 @@ template <typename InputGridDesc,
          typename OutputGridDesc,
          typename OutputDataType,
          typename Block2ETileMap,
+          typename ComputePtrOffsetOfStridedBatch,
          typename GridwiseTensorRearrangeKernel>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
@@ -30,13 +31,20 @@ __global__ void
                                const InputDataType* __restrict__ p_in_global,
                                const OutputGridDesc out_grid_desc,
                                OutputDataType* __restrict__ p_out_global,
-                                const Block2ETileMap block_2_tile_map)
+                                const index_t batch_count,
+                                const Block2ETileMap block_2_tile_map,
+                                const ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) ||             \
    defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx1030__) || defined(__gfx1100__) || \
    defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx941__) || defined(__gfx942__))
-    GridwiseTensorRearrangeKernel::Run(
-        in_grid_desc, p_in_global, out_grid_desc, p_out_global, block_2_tile_map);
+    GridwiseTensorRearrangeKernel::Run(in_grid_desc,
+                                       p_in_global,
+                                       out_grid_desc,
+                                       p_out_global,
+                                       batch_count,
+                                       block_2_tile_map,
+                                       compute_ptr_offset_of_batch);
 #else
    ignore = in_grid_desc;
    ignore = p_in_global;
@@ -56,7 +64,8 @@ template <typename InputGridDesc,
          typename ThreadClusterLengths,
          index_t ScalarPerVector,
          InMemoryDataOperationEnum DstInMemOp,
-          typename Block2ETileMap>
+          typename Block2ETileMap,
+          typename ComputePtrOffsetOfStridedBatch>
 struct GridwiseTensorRearrange
 {

@@ -69,7 +78,9 @@ struct GridwiseTensorRearrange
                               const InputDataType* __restrict__ p_in_global,
                               const OutputGridDesc& out_grid_desc,
                               OutputDataType* __restrict__ p_out_global,
-                               const Block2ETileMap& block_2_tile_map)
+                               const index_t batch_count,
+                               const Block2ETileMap& block_2_tile_map,
+                               const ComputePtrOffsetOfStridedBatch& compute_ptr_offset_of_batch)
    {
        const auto block_work_idx =
            block_2_tile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id()));
@@ -80,12 +91,6 @@ struct GridwiseTensorRearrange
        const index_t k_block_data_idx_on_grid =
            __builtin_amdgcn_readfirstlane(block_work_idx[I1] * KPerBlock);

-        // Global Memory
-        const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_in_global, in_grid_desc.GetElementSpaceSize());
-        auto out_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_out_global, out_grid_desc.GetElementSpaceSize());
-
        auto copy_global_to_global =
            ThreadGroupTensorSliceTransfer_v7<ThisThreadBlock,
                                              Tuple<InputDataType>,
@@ -108,6 +113,22 @@ struct GridwiseTensorRearrange
                make_tuple(make_multi_index(m_block_data_idx_on_grid, k_block_data_idx_on_grid)),
                tensor_operation::element_wise::PassThrough{}};

+        const index_t num_blocks_per_batch =
+            __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
+        const index_t g_idx =
+            __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
+
+        // Global Memory
+        const index_t a_batch_offset =
+            __builtin_amdgcn_readfirstlane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
+        const index_t c_batch_offset =
+            __builtin_amdgcn_readfirstlane(compute_ptr_offset_of_batch.GetCPtrOffset(g_idx));
+
+        const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_in_global + a_batch_offset, in_grid_desc.GetElementSpaceSize());
+        auto out_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_out_global + c_batch_offset, out_grid_desc.GetElementSpaceSize());
+
        copy_global_to_global.Run(
            tie(in_grid_desc), tie(in_global_buf), tie(out_grid_desc), tie(out_global_buf));
    }

--- a/include/ck/utility/type_convert.hpp
+++ b/include/ck/utility/type_convert.hpp
@@ -100,6 +100,8 @@ template <>
 inline __host__ __device__ f8_t type_convert<f8_t, float>(float x)
 {
 #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+    float max_fp8 = 240.0f;
+    x             = x > max_fp8 ? max_fp8 : (x < -max_fp8 ? -max_fp8 : x);
    union
    {
        float fval;

--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_column_to_image.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_column_to_image.hpp
@@ -19,9 +19,7 @@ namespace host {
 * \brief Reference implementation for column to image.
 *
 * Input tensor descriptor has [N * Do * Ho * Wo, Z * Y * X * C] data layout.
- * Memory layout is the same.
 * Output tensor descriptor has [G, N, C, Di, Hi, Wi] data layout.
- * G must be equal to 1. Memory layout is [G, N, Di, Hi, Wi, C].
 *
 * \tparam NDimSpatial Number of spatial dimensions.
 * \tparam ImageLayout Image Layout.
@@ -95,18 +93,19 @@ struct ReferenceColumnToImage : public device::BaseOperator
        float Run(const Argument& arg)
        {
            if(!(arg.output_.GetNumOfDimension() == NDimSpatial + 3 &&
-                 arg.input_.GetNumOfDimension() == 2))
+                 arg.input_.GetNumOfDimension() == 3))
            {
                throw std::runtime_error("wrong! inconsistent dimension");
            }

+            const index_t G = arg.output_.GetLengths()[0];
            const index_t N = arg.output_.GetLengths()[1];
            const index_t C = arg.output_.GetLengths()[2];

            if constexpr(NDimSpatial == 1)
            {
                const index_t Wo = arg.output_spatial_lengths_[0];
-                auto func        = [&](auto n) {
+                auto func        = [&](auto g, auto n) {
                    for(index_t wo = 0; wo < Wo; ++wo)
                    {
                        index_t row    = n * Wo + wo;
@@ -123,9 +122,10 @@ struct ReferenceColumnToImage : public device::BaseOperator
                                if(wi >= 0 &&
                                   ck::type_convert<std::size_t>(wi) < arg.output_.GetLengths()[3])
                                {
-                                    float v_in  = ck::type_convert<float>(arg.input_(row, column));
-                                    float v_out = ck::type_convert<float>(arg.output_(0, n, c, wi));
-                                    arg.output_(0, n, c, wi) =
+                                    float v_in =
+                                        ck::type_convert<float>(arg.input_(g, row, column));
+                                    float v_out = ck::type_convert<float>(arg.output_(g, n, c, wi));
+                                    arg.output_(g, n, c, wi) =
                                        ck::type_convert<OutDataType>(v_in + v_out);
                                }
                                column++;
@@ -134,7 +134,7 @@ struct ReferenceColumnToImage : public device::BaseOperator
                    }
                };

-                make_ParallelTensorFunctor(func, N)(std::thread::hardware_concurrency());
+                make_ParallelTensorFunctor(func, G, N)(std::thread::hardware_concurrency());

                return 0;
            }
@@ -143,7 +143,7 @@ struct ReferenceColumnToImage : public device::BaseOperator
                const index_t Ho = arg.output_spatial_lengths_[0];
                const index_t Wo = arg.output_spatial_lengths_[1];

-                auto func = [&](auto n) {
+                auto func = [&](auto g, auto n) {
                    for(index_t ho = 0; ho < Ho; ++ho)
                    {
                        for(index_t wo = 0; wo < Wo; ++wo)
@@ -176,10 +176,10 @@ struct ReferenceColumnToImage : public device::BaseOperator
                                               arg.output_.GetLengths()[4])
                                        {
                                            float v_in =
-                                                ck::type_convert<float>(arg.input_(row, column));
+                                                ck::type_convert<float>(arg.input_(g, row, column));
                                            float v_out = ck::type_convert<float>(
-                                                arg.output_(0, n, c, hi, wi));
-                                            arg.output_(0, n, c, hi, wi) =
+                                                arg.output_(g, n, c, hi, wi));
+                                            arg.output_(g, n, c, hi, wi) =
                                                ck::type_convert<OutDataType>(v_in + v_out);
                                        }
                                        column++;
@@ -190,7 +190,7 @@ struct ReferenceColumnToImage : public device::BaseOperator
                    }
                };

-                make_ParallelTensorFunctor(func, N)(std::thread::hardware_concurrency());
+                make_ParallelTensorFunctor(func, G, N)(std::thread::hardware_concurrency());

                return 0;
            }
@@ -200,7 +200,7 @@ struct ReferenceColumnToImage : public device::BaseOperator
                const index_t Ho = arg.output_spatial_lengths_[1];
                const index_t Wo = arg.output_spatial_lengths_[2];

-                auto func = [&](auto n) {
+                auto func = [&](auto g, auto n) {
                    for(index_t d_o = 0; d_o < Do; ++d_o)
                    {
                        for(index_t ho = 0; ho < Ho; ++ho)
@@ -245,10 +245,10 @@ struct ReferenceColumnToImage : public device::BaseOperator
                                                       arg.output_.GetLengths()[5])
                                                {
                                                    float v_in = ck::type_convert<float>(
-                                                        arg.input_(row, column));
+                                                        arg.input_(g, row, column));
                                                    float v_out = ck::type_convert<float>(
-                                                        arg.output_(0, n, c, di, hi, wi));
-                                                    arg.output_(0, n, c, di, hi, wi) =
+                                                        arg.output_(g, n, c, di, hi, wi));
+                                                    arg.output_(g, n, c, di, hi, wi) =
                                                        ck::type_convert<OutDataType>(v_in + v_out);
                                                }
                                                column++;
@@ -261,7 +261,7 @@ struct ReferenceColumnToImage : public device::BaseOperator
                    }
                };

-                make_ParallelTensorFunctor(func, N)(std::thread::hardware_concurrency());
+                make_ParallelTensorFunctor(func, G, N)(std::thread::hardware_concurrency());

                return 0;
            }
@@ -303,8 +303,9 @@ struct ReferenceColumnToImage : public device::BaseOperator
            C * ck::accumulate_n<index_t>(
                    arg.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());

-        if(!(arg.input_.GetLengths()[0] == static_cast<std::size_t>(NDoHoWo) &&
-             arg.input_.GetLengths()[1] == static_cast<std::size_t>(CZYX)))
+        if(!(arg.input_.GetLengths()[0] == static_cast<std::size_t>(G) &&
+             arg.input_.GetLengths()[1] == static_cast<std::size_t>(NDoHoWo) &&
+             arg.input_.GetLengths()[2] == static_cast<std::size_t>(CZYX)))
        {
            return false;
        }

--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_contraction.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_contraction.hpp
@@ -23,6 +23,7 @@ template <ck::index_t NumDimM,
          typename BDataType,
          typename CDataType,
          typename AccDataType,
+          typename ComputeDataType,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          ck::enable_if_t<NumDimM == 2 && NumDimN == 2 && NumDimK == 2, bool> = false>
@@ -69,19 +70,24 @@ struct ReferenceContraction_M2_N2_K2 : public ck::tensor_operation::device::Base
                {
                    for(ck::index_t k1 = 0; k1 < K1; ++k1)
                    {
+                        // Simulate the possible casting when ComputeDataType is different than the
+                        // A/B data types
+                        ComputeDataType v_a_compute_input =
+                            ck::type_convert<ComputeDataType>(arg.a_ms_ks_(m0, m1, k0, k1));
+                        ComputeDataType v_b_compute_input =
+                            ck::type_convert<ComputeDataType>(arg.b_ns_ks_(n0, n1, k0, k1));
+
                        AccDataType v_a;
                        AccDataType v_b;

-                        arg.a_element_op_(
-                            v_a, ck::type_convert<const AccDataType>(arg.a_ms_ks_(m0, m1, k0, k1)));
-                        arg.b_element_op_(
-                            v_b, ck::type_convert<const AccDataType>(arg.b_ns_ks_(n0, n1, k0, k1)));
+                        arg.a_element_op_(v_a, ck::type_convert<AccDataType>(v_a_compute_input));
+                        arg.b_element_op_(v_b, ck::type_convert<AccDataType>(v_b_compute_input));

                        v_acc += v_a * v_b;
                    }
                }

-                arg.c_ms_ns_(m0, m1, n0, n1) = v_acc;
+                arg.c_ms_ns_(m0, m1, n0, n1) = ck::type_convert<CDataType>(v_acc);
            };

            make_ParallelTensorFunctor(f_ms_ns,

--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp
@@ -42,6 +42,7 @@ template <ck::index_t NDimSpatial,
          typename InElementwiseOperation,
          typename WeiElementwiseOperation,
          typename OutElementwiseOperation,
+          ck::index_t NumDTensor                                                    = 0,
          typename std::enable_if<NDimSpatial >= 1 && NDimSpatial <= 3, bool>::type = false>
 struct ReferenceConvFwd : public device::BaseOperator
 {
@@ -57,10 +58,12 @@ struct ReferenceConvFwd : public device::BaseOperator
                 std::vector<ck::index_t> input_right_pads,
                 InElementwiseOperation in_element_op,
                 WeiElementwiseOperation wei_element_op,
-                 OutElementwiseOperation out_element_op)
+                 OutElementwiseOperation out_element_op,
+                 const std::array<Tensor<OutDataType>, NumDTensor>& d_tensors)
            : input_{input},
              weight_{weight},
              output_{output},
+              d_tensors_{d_tensors},
              conv_strides_{conv_filter_strides},
              conv_dilations_{conv_filter_dilations},
              in_left_pads_{input_left_pads},
@@ -75,6 +78,8 @@ struct ReferenceConvFwd : public device::BaseOperator
        const Tensor<WeiDataType>& weight_;
        Tensor<OutDataType>& output_;

+        const std::array<Tensor<OutDataType>, NumDTensor>& d_tensors_;
+
        std::vector<index_t> conv_strides_;
        std::vector<index_t> conv_dilations_;
        std::vector<index_t> in_left_pads_;
@@ -129,7 +134,26 @@ struct ReferenceConvFwd : public device::BaseOperator
                    }

                    OutDataType v_out;
-                    arg.out_element_op_(v_out, ck::type_convert<OutDataType>(v_acc));
+                    OutDataType v_acc_converted = ck::type_convert<OutDataType>(v_acc);
+                    if constexpr(NumDTensor == 0)
+                    {
+                        arg.out_element_op_(v_out, v_acc_converted);
+                    }
+                    else if constexpr(NumDTensor == 1)
+                    {
+                        arg.out_element_op_(v_out, v_acc_converted, arg.d_tensors_[0](g, n, k, wo));
+                    }
+                    else if constexpr(NumDTensor == 2)
+                    {
+                        arg.out_element_op_(v_out,
+                                            v_acc_converted,
+                                            arg.d_tensors_[0](g, n, k, wo),
+                                            arg.d_tensors_[1](g, n, k, wo));
+                    }
+                    else
+                    {
+                        throw std::runtime_error("Output ElementOp not supported in reference.");
+                    }
                    arg.output_(g, n, k, wo) = v_out;
                };

@@ -183,7 +207,27 @@ struct ReferenceConvFwd : public device::BaseOperator
                    }

                    OutDataType v_out;
-                    arg.out_element_op_(v_out, ck::type_convert<OutDataType>(v_acc));
+                    OutDataType v_acc_converted = ck::type_convert<OutDataType>(v_acc);
+                    if constexpr(NumDTensor == 0)
+                    {
+                        arg.out_element_op_(v_out, v_acc_converted);
+                    }
+                    else if constexpr(NumDTensor == 1)
+                    {
+                        arg.out_element_op_(
+                            v_out, v_acc_converted, arg.d_tensors_[0](g, n, k, ho, wo));
+                    }
+                    else if constexpr(NumDTensor == 2)
+                    {
+                        arg.out_element_op_(v_out,
+                                            v_acc_converted,
+                                            arg.d_tensors_[0](g, n, k, ho, wo),
+                                            arg.d_tensors_[1](g, n, k, ho, wo));
+                    }
+                    else
+                    {
+                        throw std::runtime_error("Output ElementOp not supported in reference.");
+                    }
                    arg.output_(g, n, k, ho, wo) = v_out;
                };

@@ -250,7 +294,27 @@ struct ReferenceConvFwd : public device::BaseOperator
                    }

                    OutDataType v_out;
-                    arg.out_element_op_(v_out, ck::type_convert<OutDataType>(v_acc));
+                    OutDataType v_acc_converted = ck::type_convert<OutDataType>(v_acc);
+                    if constexpr(NumDTensor == 0)
+                    {
+                        arg.out_element_op_(v_out, v_acc_converted);
+                    }
+                    else if constexpr(NumDTensor == 1)
+                    {
+                        arg.out_element_op_(
+                            v_out, v_acc_converted, arg.d_tensors_[0](g, n, k, d_o, ho, wo));
+                    }
+                    else if constexpr(NumDTensor == 2)
+                    {
+                        arg.out_element_op_(v_out,
+                                            v_acc_converted,
+                                            arg.d_tensors_[0](g, n, k, d_o, ho, wo),
+                                            arg.d_tensors_[1](g, n, k, d_o, ho, wo));
+                    }
+                    else
+                    {
+                        throw std::runtime_error("Output ElementOp not supported in reference.");
+                    }
                    arg.output_(g, n, k, d_o, ho, wo) = v_out;
                };

@@ -294,7 +358,8 @@ struct ReferenceConvFwd : public device::BaseOperator
                             std::vector<ck::index_t> input_right_pads,
                             InElementwiseOperation in_element_op,
                             WeiElementwiseOperation wei_element_op,
-                             OutElementwiseOperation out_element_op)
+                             OutElementwiseOperation out_element_op,
+                             const std::array<Tensor<OutDataType>, NumDTensor>& d_tensors = {})
    {
        return Argument{input,
                        weight,
@@ -305,7 +370,8 @@ struct ReferenceConvFwd : public device::BaseOperator
                        input_right_pads,
                        in_element_op,
                        wei_element_op,
-                        out_element_op};
+                        out_element_op,
+                        d_tensors};
    }

    static auto MakeInvoker() { return Invoker{}; }

--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp
@@ -19,9 +19,7 @@ namespace host {
 * \brief Reference implementation for image to column.
 *
 * Input tensor descriptor has [G, N, C, Di, Hi, Wi] data layout.
- * G must be equal to 1. Memory layout is [G, N, Di, Hi, Wi, C].
- * Output tensor descriptor has [N * Do * Ho * Wo, Z * Y * X * C] data layout.
- * Memory layout is the same.
+ * Output tensor descriptor has [G * N * Do * Ho * Wo, Z * Y * X * C] data layout.
 *
 * \tparam NDimSpatial Number of spatial dimensions.
 * \tparam ImageLayout Image Layout.
@@ -95,18 +93,19 @@ struct ReferenceImageToColumn : public device::BaseOperator
        float Run(const Argument& arg)
        {
            if(!(arg.input_.GetNumOfDimension() == NDimSpatial + 3 &&
-                 arg.output_.GetNumOfDimension() == 2))
+                 arg.output_.GetNumOfDimension() == 3))
            {
                throw std::runtime_error("wrong! inconsistent dimension");
            }

+            const index_t G = arg.input_.GetLengths()[0];
            const index_t N = arg.input_.GetLengths()[1];
            const index_t C = arg.input_.GetLengths()[2];

            if constexpr(NDimSpatial == 1)
            {
                const index_t Wo = arg.output_spatial_lengths_[0];
-                auto func        = [&](auto n, auto wo) {
+                auto func        = [&](auto g, auto n, auto wo) {
                    index_t row    = n * Wo + wo;
                    index_t column = 0;

@@ -121,15 +120,15 @@ struct ReferenceImageToColumn : public device::BaseOperator
                            if(wi >= 0 &&
                               ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[3])
                            {
-                                InDataType v_in          = arg.input_(0, n, c, wi);
-                                arg.output_(row, column) = ck::type_convert<OutDataType>(v_in);
+                                InDataType v_in             = arg.input_(g, n, c, wi);
+                                arg.output_(g, row, column) = ck::type_convert<OutDataType>(v_in);
                            }
                            column++;
                        }
                    }
                };

-                make_ParallelTensorFunctor(func, N, Wo)(std::thread::hardware_concurrency());
+                make_ParallelTensorFunctor(func, G, N, Wo)(std::thread::hardware_concurrency());

                return 0;
            }
@@ -138,7 +137,7 @@ struct ReferenceImageToColumn : public device::BaseOperator
                const index_t Ho = arg.output_spatial_lengths_[0];
                const index_t Wo = arg.output_spatial_lengths_[1];

-                auto func = [&](auto n, auto ho, auto wo) {
+                auto func = [&](auto g, auto n, auto ho, auto wo) {
                    index_t row    = n * Ho * Wo + ho * Wo + wo;
                    index_t column = 0;

@@ -162,8 +161,9 @@ struct ReferenceImageToColumn : public device::BaseOperator
                                   wi >= 0 &&
                                   ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[4])
                                {
-                                    InDataType v_in          = arg.input_(0, n, c, hi, wi);
-                                    arg.output_(row, column) = ck::type_convert<OutDataType>(v_in);
+                                    InDataType v_in = arg.input_(g, n, c, hi, wi);
+                                    arg.output_(g, row, column) =
+                                        ck::type_convert<OutDataType>(v_in);
                                }
                                column++;
                            }
@@ -171,7 +171,7 @@ struct ReferenceImageToColumn : public device::BaseOperator
                    }
                };

-                make_ParallelTensorFunctor(func, N, Ho, Wo)(std::thread::hardware_concurrency());
+                make_ParallelTensorFunctor(func, G, N, Ho, Wo)(std::thread::hardware_concurrency());

                return 0;
            }
@@ -181,7 +181,7 @@ struct ReferenceImageToColumn : public device::BaseOperator
                const index_t Ho = arg.output_spatial_lengths_[1];
                const index_t Wo = arg.output_spatial_lengths_[2];

-                auto func = [&](auto n, auto d_o, auto ho, auto wo) {
+                auto func = [&](auto g, auto n, auto d_o, auto ho, auto wo) {
                    index_t row    = n * Do * Ho * Wo + d_o * Ho * Wo + ho * Wo + wo;
                    index_t column = 0;

@@ -213,8 +213,8 @@ struct ReferenceImageToColumn : public device::BaseOperator
                                       ck::type_convert<std::size_t>(wi) <
                                           arg.input_.GetLengths()[5])
                                    {
-                                        InDataType v_in = arg.input_(0, n, c, di, hi, wi);
-                                        arg.output_(row, column) =
+                                        InDataType v_in = arg.input_(g, n, c, di, hi, wi);
+                                        arg.output_(g, row, column) =
                                            ck::type_convert<OutDataType>(v_in);
                                    }
                                    column++;
@@ -224,7 +224,7 @@ struct ReferenceImageToColumn : public device::BaseOperator
                    }
                };

-                make_ParallelTensorFunctor(func, N, Do, Ho, Wo)(
+                make_ParallelTensorFunctor(func, G, N, Do, Ho, Wo)(
                    std::thread::hardware_concurrency());

                return 0;
@@ -267,8 +267,9 @@ struct ReferenceImageToColumn : public device::BaseOperator
            C * ck::accumulate_n<index_t>(
                    arg.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());

-        if(!(arg.output_.GetLengths()[0] == static_cast<std::size_t>(NDoHoWo) &&
-             arg.output_.GetLengths()[1] == static_cast<std::size_t>(CZYX)))
+        if(!(arg.output_.GetLengths()[0] == static_cast<std::size_t>(G) &&
+             arg.output_.GetLengths()[1] == static_cast<std::size_t>(NDoHoWo) &&
+             arg.output_.GetLengths()[2] == static_cast<std::size_t>(CZYX)))
        {
            return false;
        }

--- a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
+++ b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
@@ -25,6 +25,8 @@ using BF8  = ck::bf8_t;

 using Empty_Tuple = ck::Tuple<>;

+using BF16_Tuple = ck::Tuple<BF16>;
+
 using F16_Tuple     = ck::Tuple<F16>;
 using F16_F16_Tuple = ck::Tuple<F16, F16>;


--- a/library/include/ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16  = ck::half_t;
+using BF16 = ck::bhalf_t;
+using F32  = float;
+using F64  = double;
+
+using F16_Tuple   = ck::Tuple<F16>;
+using BF16_Tuple  = ck::Tuple<BF16>;
+using F32_Tuple   = ck::Tuple<F32>;
+using F64_Tuple   = ck::Tuple<F64>;
+using Empty_Tuple = ck::Tuple<>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Bilinear    = ck::tensor_operation::element_wise::Bilinear;
+using Scale       = ck::tensor_operation::element_wise::Scale;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+template <typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CShuffleDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename ComputeDataType,
+          typename AElementwiseOp,
+          typename BElementwiseOp,
+          typename CDEElementwiseOp>
+using device_contraction_kk_instance = std::tuple<
+    // clang-format off
+        //#####################################| NumDimM| NumDimN| NumDimK|      AData|     BData|     AccData|         CShuffle|     DsData|     EData|              A|              B|              CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|         Compute|
+        //#####################################|        |        |        |       Type|      Type|        Type|         DataType|       Type|      Type|    Elementwise|    Elementwise|      Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|            Data|
+        //#####################################|        |        |        |           |          |            |                 |           |          |      Operation|      Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|            Type|
+        //#####################################|        |        |        |           |          |            |                 |           |          |               |               |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                |
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   256,    16,   4,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,    64,    64,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    32,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,    64,    64,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,    64,    32,    64,    16,   4,   4,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,               4, ComputeDataType>
+    // clang-format on
+    >;
+
+template <typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CShuffleDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename ComputeDataType,
+          typename AElementwiseOp,
+          typename BElementwiseOp,
+          typename CDEElementwiseOp>
+using device_contraction_kn_instance = std::tuple<
+    // clang-format off
+        //#####################################| NumDimM| NumDimN| NumDimK|      AData|     BData|     AccData|         CShuffle|     DsData|     EData|              A|              B|              CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|         Compute|
+        //#####################################|        |        |        |       Type|      Type|        Type|         DataType|       Type|      Type|    Elementwise|    Elementwise|      Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|            Data|
+        //#####################################|        |        |        |           |          |            |                 |           |          |      Operation|      Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|            Type|
+        //#####################################|        |        |        |           |          |            |                 |           |          |               |               |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                |
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   1,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   256,    16,   4,   1,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   256,    16,   4,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,   128,    16,   4,   1,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1,  8, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   1,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   4,   1,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1,  8>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           1,              S<1, 16, 1,  8>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   4,   1,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1,  8, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   1,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   1,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>
+    // clang-format on
+    >;
+
+template <typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CShuffleDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename ComputeDataType,
+          typename AElementwiseOp,
+          typename BElementwiseOp,
+          typename CDEElementwiseOp>
+using device_contraction_mk_instance = std::tuple<
+    // clang-format off
+        //#####################################| NumDimM| NumDimN| NumDimK|      AData|     BData|     AccData|         CShuffle|     DsData|     EData|              A|              B|              CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|         Compute|
+        //#####################################|        |        |        |       Type|      Type|        Type|         DataType|       Type|      Type|    Elementwise|    Elementwise|      Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|            Data|
+        //#####################################|        |        |        |           |          |            |                 |           |          |      Operation|      Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|            Type|
+        //#####################################|        |        |        |           |          |            |                 |           |          |               |               |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                |
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   256,   128,    16,   1,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   256,    16,   1,   4,   32,   32,    2,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   256,    16,   4,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,   128,    16,   1,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   1,   4,   32,   32,    2,    2,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   1,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   1,   4,   32,   32,    2,    2,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   1,   4,   32,   32,    2,    1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   1,   4,   32,   32,    1,    2,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>
+    // clang-format on
+    >;
+
+template <typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CShuffleDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename ComputeDataType,
+          typename AElementwiseOp,
+          typename BElementwiseOp,
+          typename CDEElementwiseOp>
+using device_contraction_mn_instance = std::tuple<
+    // clang-format off
+        //#####################################| NumDimM| NumDimN| NumDimK|      AData|     BData|     AccData|         CShuffle|     DsData|     EData|              A|              B|              CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|         Compute|
+        //#####################################|        |        |        |       Type|      Type|        Type|         DataType|       Type|      Type|    Elementwise|    Elementwise|      Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|            Data|
+        //#####################################|        |        |        |           |          |            |                 |           |          |      Operation|      Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|            Type|
+        //#####################################|        |        |        |           |          |            |                 |           |          |               |               |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                |
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   256,   128,    16,   1,   1,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   256,    16,   1,   1,   32,   32,    2,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   256,    16,   4,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,   128,    16,   1,   1,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1,  8, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   1,   1,   32,   32,    2,    2,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   1,   1,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1,  8>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           1,              S<1, 16, 1,  8>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   1,   1,   32,   32,    2,    2,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1,  8, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   1,   1,   32,   32,    2,    1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   1,   1,   32,   32,    1,    2,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>
+    // clang-format on
+    >;
+
+template <typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CShuffleDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename ComputeDataType,
+          typename AElementwiseOp,
+          typename BElementwiseOp,
+          typename CDEElementwiseOp>
+using device_contraction_f64_kk_instance = std::tuple<
+    // clang-format off
+        //#####################################| NumDimM| NumDimN| NumDimK|      AData|     BData|     AccData|         CShuffle|     DsData|     EData|              A|              B|              CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|         Compute|
+        //#####################################|        |        |        |       Type|      Type|        Type|         DataType|       Type|      Type|    Elementwise|    Elementwise|      Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|            Data|
+        //#####################################|        |        |        |           |          |            |                 |           |          |      Operation|      Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|            Type|
+        //#####################################|        |        |        |           |          |            |                 |           |          |               |               |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                |
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1, 16, 1,  8>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1,  8, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,    64,    64,    64,    16,   2,   2,   16,   16,    4,    4,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1,  8, 1,  8>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   2,   2,   16,   16,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   2,   2,   16,   16,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    32,    16,   2,   2,   16,   16,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1, 16, 1,  8>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    32,   128,    16,   2,   2,   16,   16,    2,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1,  8, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,    64,    64,    32,    16,   2,   2,   16,   16,    4,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1,  8, 1,  8>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,    64,    32,    64,    16,   2,   2,   16,   16,    2,    4,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,              S<1,  8, 1,  8>,               1, ComputeDataType>
+    // clang-format on
+    >;
+
+template <typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CShuffleDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename ComputeDataType,
+          typename AElementwiseOp,
+          typename BElementwiseOp,
+          typename CDEElementwiseOp>
+using device_contraction_f64_kn_instance = std::tuple<
+    // clang-format off
+        //#####################################| NumDimM| NumDimN| NumDimK|      AData|     BData|     AccData|         CShuffle|     DsData|     EData|              A|              B|              CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|         Compute|
+        //#####################################|        |        |        |       Type|      Type|        Type|         DataType|       Type|      Type|    Elementwise|    Elementwise|      Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|            Data|
+        //#####################################|        |        |        |           |          |            |                 |           |          |      Operation|      Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|            Type|
+        //#####################################|        |        |        |           |          |            |                 |           |          |               |               |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                |
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   2,   1,   16,   16,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   2,   1,   16,   16,    4,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1,  8>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1,  8>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   2,   1,   16,   16,    4,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1,  8, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1,  8, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   2,   1,   16,   16,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   2,   2,   16,   16,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   2,   1,   16,   16,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   2,   2,   16,   16,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1, ComputeDataType>
+    // clang-format on
+    >;
+
+template <typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CShuffleDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename ComputeDataType,
+          typename AElementwiseOp,
+          typename BElementwiseOp,
+          typename CDEElementwiseOp>
+using device_contraction_f64_mk_instance = std::tuple<
+    // clang-format off
+        //#####################################| NumDimM| NumDimN| NumDimK|      AData|     BData|     AccData|         CShuffle|     DsData|     EData|              A|              B|              CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|         Compute|
+        //#####################################|        |        |        |       Type|      Type|        Type|         DataType|       Type|      Type|    Elementwise|    Elementwise|      Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|            Data|
+        //#####################################|        |        |        |           |          |            |                 |           |          |      Operation|      Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|            Type|
+        //#####################################|        |        |        |           |          |            |                 |           |          |               |               |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                |
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   1,   2,   16,   16,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 16>,              1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 16>,              1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   1,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1,  8>,              1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1,  8>,              1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   1,   2,   16,   16,    4,    4,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1,  8, 1, 16>,              1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1,  8, 1, 16>,              1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   1,   2,   16,   16,    4,    2,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 16>,              1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   2,   2,   16,   16,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 16>,              1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   1,   2,   16,   16,    2,    4,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 16>,              1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   2,   2,   16,   16,    2,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         1,           1,           1,               S<1, 16, 1, 16>,              1, ComputeDataType>
+    // clang-format on
+    >;
+
+template <typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CShuffleDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename ComputeDataType,
+          typename AElementwiseOp,
+          typename BElementwiseOp,
+          typename CDEElementwiseOp>
+using device_contraction_f64_mn_instance = std::tuple<
+    // clang-format off
+        //#####################################| NumDimM| NumDimN| NumDimK|      AData|     BData|     AccData|         CShuffle|     DsData|     EData|              A|              B|              CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|         Compute|
+        //#####################################|        |        |        |       Type|      Type|        Type|         DataType|       Type|      Type|    Elementwise|    Elementwise|      Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|            Data|
+        //#####################################|        |        |        |           |          |            |                 |           |          |      Operation|      Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|            Type|
+        //#####################################|        |        |        |           |          |            |                 |           |          |               |               |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                |
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   1,   1,   16,   16,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   1,   1,   16,   16,    4,    4,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1,  8>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    64,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1,  8>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   1,   1,   16,   16,    4,    4,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1,  8, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    64,   128,    16,   2,   2,   16,   16,    4,    4,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1,  8, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   1,   1,   16,   16,    4,    2,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   2,   2,   16,   16,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   1,   1,   16,   16,    2,    4,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         0,           1,           1,              S<1, 16, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   2,   2,   16,   16,    2,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              1,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              1,         1,           1,           1,              S<1, 16, 1, 16>,               1, ComputeDataType>
+    // clang-format on
+    >;
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp
@@ -17,7 +17,6 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 #ifdef CK_ENABLE_FP32
-// float
 void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
                                                           2,
@@ -28,7 +27,8 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn
                                                           F32,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Bilinear>>>& instances);
+                                                           Bilinear,
+                                                           F32>>>& instances);

 void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -40,7 +40,8 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn
                                                           F32,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Bilinear>>>& instances);
+                                                           Bilinear,
+                                                           F32>>>& instances);

 void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -52,7 +53,8 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn
                                                           F32,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Bilinear>>>& instances);
+                                                           Bilinear,
+                                                           F32>>>& instances);

 void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -64,10 +66,115 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn
                                                           F32,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Bilinear>>>& instances);
-#endif
+                                                           Bilinear,
+                                                           F32>>>& instances);
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           F32_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F16>>>& instances);
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           F32_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F16>>>& instances);
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           F32_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F16>>>& instances);
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           F32_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F16>>>& instances);
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           F32_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           BF16>>>& instances);
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           F32_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           BF16>>>& instances);
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           F32_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           BF16>>>& instances);
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           F32_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           BF16>>>& instances);
+#endif // CK_ENABLE_FP32
+
 #ifdef CK_ENABLE_FP64
-// double
 void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
                                                           2,
@@ -78,7 +185,8 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn
                                                           F64,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Bilinear>>>& instances);
+                                                           Bilinear,
+                                                           F64>>>& instances);

 void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -90,7 +198,8 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn
                                                           F64,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Bilinear>>>& instances);
+                                                           Bilinear,
+                                                           F64>>>& instances);

 void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -102,7 +211,8 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn
                                                           F64,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Bilinear>>>& instances);
+                                                           Bilinear,
+                                                           F64>>>& instances);

 void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -114,8 +224,170 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn
                                                           F64,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Bilinear>>>& instances);
-#endif
+                                                           Bilinear,
+                                                           F64>>>& instances);
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F64,
+                                                           F64,
+                                                           F64_Tuple,
+                                                           F64,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances);
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F64,
+                                                           F64,
+                                                           F64_Tuple,
+                                                           F64,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances);
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F64,
+                                                           F64,
+                                                           F64_Tuple,
+                                                           F64,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances);
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F64,
+                                                           F64,
+                                                           F64_Tuple,
+                                                           F64,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances);
+#endif // CK_ENABLE_FP64
+
+#ifdef CK_ENABLE_FP16
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F16,
+                                                           F16,
+                                                           F16_Tuple,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances);
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F16,
+                                                           F16,
+                                                           F16_Tuple,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances);
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F16,
+                                                           F16,
+                                                           F16_Tuple,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances);
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F16,
+                                                           F16,
+                                                           F16_Tuple,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances);
+#endif // CK_ENABLE_FP16
+
+#ifdef CK_ENABLE_BF16
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16_Tuple,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances);
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16_Tuple,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances);
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16_Tuple,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances);
+
+void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16_Tuple,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Bilinear,
+                                                           F32>>>& instances);
+#endif // CK_ENABLE_FP16
+
 // Contraction + Bilinear
 template <index_t NumDimM,
          index_t NumDimN,
@@ -123,7 +395,8 @@ template <index_t NumDimM,
          typename ADataType,
          typename BDataType,
          typename DDataType,
-          typename EDataType>
+          typename EDataType,
+          typename ComputeDataType>
 struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContractionMultipleD<
    NumDimM,
    NumDimN,
@@ -134,7 +407,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
    EDataType,
    ck::tensor_operation::element_wise::PassThrough,
    ck::tensor_operation::element_wise::PassThrough,
-    ck::tensor_operation::element_wise::Bilinear>>
+    ck::tensor_operation::element_wise::Bilinear,
+    ComputeDataType>>
 {
    using DeviceOp = DeviceContractionMultipleD<NumDimM,
                                                NumDimN,
@@ -145,45 +419,125 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
                                                EDataType,
                                                ck::tensor_operation::element_wise::PassThrough,
                                                ck::tensor_operation::element_wise::PassThrough,
-                                                ck::tensor_operation::element_wise::Bilinear>;
+                                                ck::tensor_operation::element_wise::Bilinear,
+                                                ComputeDataType>;

    static auto GetInstances()
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
 #ifdef CK_ENABLE_FP32
        if constexpr(is_same_v<ADataType, float> && is_same_v<BDataType, float> &&
-                     is_same_v<DDataType, float> && is_same_v<EDataType, float>)
+                     is_same_v<EDataType, float>)
        {
            if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2)
            {
-                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance(
-                    op_ptrs);
-                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance(
-                    op_ptrs);
-                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance(
-                    op_ptrs);
-                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance(
-                    op_ptrs);
+                if constexpr(is_same_v<ComputeDataType, float>)
+                {
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance(
+                        op_ptrs);
+                }
+                else if constexpr(is_same_v<ComputeDataType, ck::half_t>)
+                {
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance(
+                        op_ptrs);
+                }
+                else if constexpr(is_same_v<ComputeDataType, ck::bhalf_t>)
+                {
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance(
+                        op_ptrs);
+                }
            }
        }
-#endif
+#endif // CK_ENABLE_FP32
 #ifdef CK_ENABLE_FP64
        if constexpr(is_same_v<ADataType, double> && is_same_v<BDataType, double> &&
-                     is_same_v<DDataType, double> && is_same_v<EDataType, double>)
+                     is_same_v<EDataType, double>)
+        {
+            if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2)
+            {
+                if constexpr(is_same_v<ComputeDataType, double>)
+                {
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance(
+                        op_ptrs);
+                }
+                else if constexpr(is_same_v<ComputeDataType, float>)
+                {
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance(
+                        op_ptrs);
+                }
+            }
+        }
+#endif // CK_ENABLE_FP64
+#ifdef CK_ENABLE_FP16
+        if constexpr(is_same_v<ADataType, ck::half_t> && is_same_v<BDataType, ck::half_t> &&
+                     is_same_v<EDataType, ck::half_t>)
+        {
+            if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2)
+            {
+                if constexpr(is_same_v<ComputeDataType, float>)
+                {
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance(
+                        op_ptrs);
+                }
+            }
+        }
+#endif // CK_ENABLE_FP16
+#ifdef CK_ENABLE_BF16
+        if constexpr(is_same_v<ADataType, ck::bhalf_t> && is_same_v<BDataType, ck::bhalf_t> &&
+                     is_same_v<EDataType, ck::bhalf_t>)
        {
            if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2)
            {
-                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance(
-                    op_ptrs);
-                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance(
-                    op_ptrs);
-                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance(
-                    op_ptrs);
-                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance(
-                    op_ptrs);
+                if constexpr(is_same_v<ComputeDataType, float>)
+                {
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance(
+                        op_ptrs);
+                    add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance(
+                        op_ptrs);
+                }
            }
        }
-#endif
+#endif // CK_ENABLE_BF16
        return op_ptrs;
    }
 };

--- a/library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp
@@ -17,7 +17,6 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 #ifdef CK_ENABLE_FP32
-// float
 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
                                                           2,
@@ -28,7 +27,8 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instanc
                                                           F32,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Scale>>>& instances);
+                                                           Scale,
+                                                           F32>>>& instances);

 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -40,7 +40,8 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instanc
                                                           F32,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Scale>>>& instances);
+                                                           Scale,
+                                                           F32>>>& instances);

 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -52,7 +53,8 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instanc
                                                           F32,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Scale>>>& instances);
+                                                           Scale,
+                                                           F32>>>& instances);

 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -64,10 +66,115 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instanc
                                                           F32,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Scale>>>& instances);
-#endif
+                                                           Scale,
+                                                           F32>>>& instances);
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           Empty_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F16>>>& instances);
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           Empty_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F16>>>& instances);
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           Empty_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F16>>>& instances);
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           Empty_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F16>>>& instances);
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           Empty_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           BF16>>>& instances);
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           Empty_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           BF16>>>& instances);
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           Empty_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           BF16>>>& instances);
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F32,
+                                                           F32,
+                                                           Empty_Tuple,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           BF16>>>& instances);
+#endif // CK_ENABLE_FP32
+
 #ifdef CK_ENABLE_FP64
-// double
 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
                                                           2,
@@ -78,7 +185,8 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instanc
                                                           F64,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Scale>>>& instances);
+                                                           Scale,
+                                                           F64>>>& instances);

 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -90,7 +198,8 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instanc
                                                           F64,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Scale>>>& instances);
+                                                           Scale,
+                                                           F64>>>& instances);

 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -102,7 +211,8 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instanc
                                                           F64,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Scale>>>& instances);
+                                                           Scale,
+                                                           F64>>>& instances);

 void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
@@ -114,15 +224,178 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instanc
                                                           F64,
                                                           PassThrough,
                                                           PassThrough,
-                                                           Scale>>>& instances);
-#endif
+                                                           Scale,
+                                                           F64>>>& instances);
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F64,
+                                                           F64,
+                                                           Empty_Tuple,
+                                                           F64,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F32>>>& instances);
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F64,
+                                                           F64,
+                                                           Empty_Tuple,
+                                                           F64,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F32>>>& instances);
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F64,
+                                                           F64,
+                                                           Empty_Tuple,
+                                                           F64,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F32>>>& instances);
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F64,
+                                                           F64,
+                                                           Empty_Tuple,
+                                                           F64,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F32>>>& instances);
+#endif // CK_ENABLE_FP64
+
+#ifdef CK_ENABLE_FP16
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F16,
+                                                           F16,
+                                                           Empty_Tuple,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F32>>>& instances);
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F16,
+                                                           F16,
+                                                           Empty_Tuple,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F32>>>& instances);
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F16,
+                                                           F16,
+                                                           Empty_Tuple,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F32>>>& instances);
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           F16,
+                                                           F16,
+                                                           Empty_Tuple,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F32>>>& instances);
+#endif // CK_ENABLE_FP16
+
+#ifdef CK_ENABLE_BF16
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           BF16,
+                                                           BF16,
+                                                           Empty_Tuple,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F32>>>& instances);
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           BF16,
+                                                           BF16,
+                                                           Empty_Tuple,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F32>>>& instances);
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           BF16,
+                                                           BF16,
+                                                           Empty_Tuple,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F32>>>& instances);
+
+void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance(
+    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                           2,
+                                                           2,
+                                                           BF16,
+                                                           BF16,
+                                                           Empty_Tuple,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           Scale,
+                                                           F32>>>& instances);
+#endif // CK_ENABLE_FP16
+
 // Contraction + Scale
 template <index_t NumDimM,
          index_t NumDimN,
          index_t NumDimK,
          typename ADataType,
          typename BDataType,
-          typename EDataType>
+          typename EDataType,
+          typename ComputeDataType>
 struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContractionMultipleD<
    NumDimM,
    NumDimN,
@@ -133,7 +406,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
    EDataType,
    ck::tensor_operation::element_wise::PassThrough,
    ck::tensor_operation::element_wise::PassThrough,
-    ck::tensor_operation::element_wise::Scale>>
+    ck::tensor_operation::element_wise::Scale,
+    ComputeDataType>>
 {
    using DeviceOp = DeviceContractionMultipleD<NumDimM,
                                                NumDimN,
@@ -144,7 +418,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
                                                EDataType,
                                                ck::tensor_operation::element_wise::PassThrough,
                                                ck::tensor_operation::element_wise::PassThrough,
-                                                ck::tensor_operation::element_wise::Scale>;
+                                                ck::tensor_operation::element_wise::Scale,
+                                                ComputeDataType>;

    static auto GetInstances()
    {
@@ -155,34 +430,113 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
        {
            if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2)
            {
-                add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance(
-                    op_ptrs);
-                add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance(
-                    op_ptrs);
-                add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance(
-                    op_ptrs);
-                add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance(
-                    op_ptrs);
+                if constexpr(is_same_v<ComputeDataType, float>)
+                {
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance(
+                        op_ptrs);
+                }
+                else if constexpr(is_same_v<ComputeDataType, ck::half_t>)
+                {
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance(
+                        op_ptrs);
+                }
+                else if constexpr(is_same_v<ComputeDataType, ck::bhalf_t>)
+                {
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance(
+                        op_ptrs);
+                }
            }
        }
-#endif
+#endif // CK_ENABLE_FP32
 #ifdef CK_ENABLE_FP64
        if constexpr(is_same_v<ADataType, double> && is_same_v<BDataType, double> &&
                     is_same_v<EDataType, double>)
        {
            if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2)
            {
-                add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance(
-                    op_ptrs);
-                add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance(
-                    op_ptrs);
-                add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance(
-                    op_ptrs);
-                add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance(
-                    op_ptrs);
+                if constexpr(is_same_v<ComputeDataType, double>)
+                {
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance(
+                        op_ptrs);
+                }
+                else if constexpr(is_same_v<ComputeDataType, float>)
+                {
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance(
+                        op_ptrs);
+                }
+            }
+        }
+#endif // CK_ENABLE_FP64
+#ifdef CK_ENABLE_FP16
+        if constexpr(is_same_v<ADataType, ck::half_t> && is_same_v<BDataType, ck::half_t> &&
+                     is_same_v<EDataType, ck::half_t>)
+        {
+            if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2)
+            {
+                if constexpr(is_same_v<ComputeDataType, float>)
+                {
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance(
+                        op_ptrs);
+                }
+            }
+        }
+#endif // CK_ENABLE_FP16
+#ifdef CK_ENABLE_BF16
+        if constexpr(is_same_v<ADataType, ck::bhalf_t> && is_same_v<BDataType, ck::bhalf_t> &&
+                     is_same_v<EDataType, ck::bhalf_t>)
+        {
+            if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2)
+            {
+                if constexpr(is_same_v<ComputeDataType, float>)
+                {
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance(
+                        op_ptrs);
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance(
+                        op_ptrs);
+                }
            }
        }
-#endif
+#endif // CK_ENABLE_BF16
        return op_ptrs;
    }
 };

--- a/library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp
@@ -19,109 +19,214 @@ namespace instance {

 using namespace ck::conv_tensor_rearrange_op;

+// GNWC/GNHWC/GNDHWC
 // Image to Column
-// nhwc, 1d
-void add_device_image_to_column_nwc_1d_bf16_instances(
+// GNWC, 1d
+void add_device_image_to_column_gnwc_1d_bf16_instances(
    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, BF16, BF16, ImageToColumn>>>&
        instances);

-void add_device_image_to_column_nwc_1d_f16_instances(
+void add_device_image_to_column_gnwc_1d_f16_instances(
    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F16, F16, ImageToColumn>>>&
        instances);

-void add_device_image_to_column_nwc_1d_f32_instances(
+void add_device_image_to_column_gnwc_1d_f32_instances(
    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F32, F32, ImageToColumn>>>&
        instances);

-void add_device_image_to_column_nwc_1d_i8_instances(
+void add_device_image_to_column_gnwc_1d_i8_instances(
    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, int8_t, int8_t, ImageToColumn>>>&
        instances);
-// nhwc, 2d
-void add_device_image_to_column_nhwc_2d_bf16_instances(
+// GNHWC, 2d
+void add_device_image_to_column_gnhwc_2d_bf16_instances(
    std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, BF16, BF16, ImageToColumn>>>&
        instances);

-void add_device_image_to_column_nhwc_2d_f16_instances(
+void add_device_image_to_column_gnhwc_2d_f16_instances(
    std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F16, F16, ImageToColumn>>>&
        instances);

-void add_device_image_to_column_nhwc_2d_f32_instances(
+void add_device_image_to_column_gnhwc_2d_f32_instances(
    std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F32, F32, ImageToColumn>>>&
        instances);

-void add_device_image_to_column_nhwc_2d_i8_instances(
+void add_device_image_to_column_gnhwc_2d_i8_instances(
    std::vector<
        std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, int8_t, int8_t, ImageToColumn>>>&
        instances);
-// nhwc, 3d
-void add_device_image_to_column_ndhwc_3d_bf16_instances(
+// GNDHWC, 3d
+void add_device_image_to_column_gndhwc_3d_bf16_instances(
    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, BF16, BF16, ImageToColumn>>>&
        instances);

-void add_device_image_to_column_ndhwc_3d_f16_instances(
+void add_device_image_to_column_gndhwc_3d_f16_instances(
    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F16, F16, ImageToColumn>>>&
        instances);

-void add_device_image_to_column_ndhwc_3d_f32_instances(
+void add_device_image_to_column_gndhwc_3d_f32_instances(
    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F32, F32, ImageToColumn>>>&
        instances);

-void add_device_image_to_column_ndhwc_3d_i8_instances(
+void add_device_image_to_column_gndhwc_3d_i8_instances(
    std::vector<
        std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, int8_t, int8_t, ImageToColumn>>>&
        instances);

 // Column to Image
-// nhwc, 1d
-void add_device_column_to_image_nwc_1d_bf16_instances(
+// GNWC, 1d
+void add_device_column_to_image_gnwc_1d_bf16_instances(
    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, BF16, BF16, ColumnToImage>>>&
        instances);

-void add_device_column_to_image_nwc_1d_f16_instances(
+void add_device_column_to_image_gnwc_1d_f16_instances(
    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F16, F16, ColumnToImage>>>&
        instances);

-void add_device_column_to_image_nwc_1d_f32_instances(
+void add_device_column_to_image_gnwc_1d_f32_instances(
    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F32, F32, ColumnToImage>>>&
        instances);

-void add_device_column_to_image_nwc_1d_i8_instances(
+void add_device_column_to_image_gnwc_1d_i8_instances(
    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, int8_t, int8_t, ColumnToImage>>>&
        instances);
-// nhwc, 2d
-void add_device_column_to_image_nhwc_2d_bf16_instances(
+// GNHWC, 2d
+void add_device_column_to_image_gnhwc_2d_bf16_instances(
    std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, BF16, BF16, ColumnToImage>>>&
        instances);

-void add_device_column_to_image_nhwc_2d_f16_instances(
+void add_device_column_to_image_gnhwc_2d_f16_instances(
    std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F16, F16, ColumnToImage>>>&
        instances);

-void add_device_column_to_image_nhwc_2d_f32_instances(
+void add_device_column_to_image_gnhwc_2d_f32_instances(
    std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F32, F32, ColumnToImage>>>&
        instances);

-void add_device_column_to_image_nhwc_2d_i8_instances(
+void add_device_column_to_image_gnhwc_2d_i8_instances(
    std::vector<
        std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, int8_t, int8_t, ColumnToImage>>>&
        instances);
-// nhwc, 3d
-void add_device_column_to_image_ndhwc_3d_bf16_instances(
+// GNDHWC, 3d
+void add_device_column_to_image_gndhwc_3d_bf16_instances(
    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, BF16, BF16, ColumnToImage>>>&
        instances);

-void add_device_column_to_image_ndhwc_3d_f16_instances(
+void add_device_column_to_image_gndhwc_3d_f16_instances(
    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F16, F16, ColumnToImage>>>&
        instances);

-void add_device_column_to_image_ndhwc_3d_f32_instances(
+void add_device_column_to_image_gndhwc_3d_f32_instances(
    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F32, F32, ColumnToImage>>>&
        instances);

-void add_device_column_to_image_ndhwc_3d_i8_instances(
+void add_device_column_to_image_gndhwc_3d_i8_instances(
    std::vector<
        std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, int8_t, int8_t, ColumnToImage>>>&
        instances);
+// NWGC/NHWGC/NDHWGC
+// Image to Column
+// NWGC, 1d
+void add_device_image_to_column_nwgc_1d_bf16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, BF16, BF16, ImageToColumn>>>&
+        instances);
+
+void add_device_image_to_column_nwgc_1d_f16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, F16, F16, ImageToColumn>>>&
+        instances);
+
+void add_device_image_to_column_nwgc_1d_f32_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, F32, F32, ImageToColumn>>>&
+        instances);
+
+void add_device_image_to_column_nwgc_1d_i8_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, int8_t, int8_t, ImageToColumn>>>&
+        instances);
+// NHWGC, 2d
+void add_device_image_to_column_nhwgc_2d_bf16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, BF16, BF16, ImageToColumn>>>&
+        instances);
+
+void add_device_image_to_column_nhwgc_2d_f16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, F16, F16, ImageToColumn>>>&
+        instances);
+
+void add_device_image_to_column_nhwgc_2d_f32_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, F32, F32, ImageToColumn>>>&
+        instances);
+
+void add_device_image_to_column_nhwgc_2d_i8_instances(
+    std::vector<
+        std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, int8_t, int8_t, ImageToColumn>>>&
+        instances);
+// NDHWGC, 3d
+void add_device_image_to_column_ndhwgc_3d_bf16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, BF16, BF16, ImageToColumn>>>&
+        instances);
+
+void add_device_image_to_column_ndhwgc_3d_f16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, F16, F16, ImageToColumn>>>&
+        instances);
+
+void add_device_image_to_column_ndhwgc_3d_f32_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, F32, F32, ImageToColumn>>>&
+        instances);
+
+void add_device_image_to_column_ndhwgc_3d_i8_instances(
+    std::vector<
+        std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, int8_t, int8_t, ImageToColumn>>>&
+        instances);
+
+// Column to Image
+// NWGC, 1d
+void add_device_column_to_image_nwgc_1d_bf16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, BF16, BF16, ColumnToImage>>>&
+        instances);
+
+void add_device_column_to_image_nwgc_1d_f16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, F16, F16, ColumnToImage>>>&
+        instances);
+
+void add_device_column_to_image_nwgc_1d_f32_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, F32, F32, ColumnToImage>>>&
+        instances);
+
+void add_device_column_to_image_nwgc_1d_i8_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, int8_t, int8_t, ColumnToImage>>>&
+        instances);
+// NHWGC, 2d
+void add_device_column_to_image_nhwgc_2d_bf16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, BF16, BF16, ColumnToImage>>>&
+        instances);
+
+void add_device_column_to_image_nhwgc_2d_f16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, F16, F16, ColumnToImage>>>&
+        instances);
+
+void add_device_column_to_image_nhwgc_2d_f32_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, F32, F32, ColumnToImage>>>&
+        instances);
+
+void add_device_column_to_image_nhwgc_2d_i8_instances(
+    std::vector<
+        std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, int8_t, int8_t, ColumnToImage>>>&
+        instances);
+// NDHWGC, 3d
+void add_device_column_to_image_ndhwgc_3d_bf16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, BF16, BF16, ColumnToImage>>>&
+        instances);
+
+void add_device_column_to_image_ndhwgc_3d_f16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, F16, F16, ColumnToImage>>>&
+        instances);
+
+void add_device_column_to_image_ndhwgc_3d_f32_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, F32, F32, ColumnToImage>>>&
+        instances);
+
+void add_device_column_to_image_ndhwgc_3d_i8_instances(
+    std::vector<
+        std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, int8_t, int8_t, ColumnToImage>>>&
+        instances);

 template <ck::index_t NumDimSpatial,
          typename ImageLayout,
@@ -151,60 +256,120 @@ struct DeviceOperationInstanceFactory<
            {
                if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
                {
-                    add_device_image_to_column_nwc_1d_f32_instances(op_ptrs);
+                    add_device_image_to_column_gnwc_1d_f32_instances(op_ptrs);
                }
                else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
                {
-                    add_device_image_to_column_nwc_1d_f16_instances(op_ptrs);
+                    add_device_image_to_column_gnwc_1d_f16_instances(op_ptrs);
                }
                else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
                                  is_same_v<OutDataType, ck::bhalf_t>)
                {
-                    add_device_image_to_column_nwc_1d_bf16_instances(op_ptrs);
+                    add_device_image_to_column_gnwc_1d_bf16_instances(op_ptrs);
                }
                else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
                {
-                    add_device_image_to_column_nwc_1d_i8_instances(op_ptrs);
+                    add_device_image_to_column_gnwc_1d_i8_instances(op_ptrs);
                }
            }
            else if constexpr(NumDimSpatial == 2 && is_same_v<ImageLayout, GNHWC>)
            {
                if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
                {
-                    add_device_image_to_column_nhwc_2d_f32_instances(op_ptrs);
+                    add_device_image_to_column_gnhwc_2d_f32_instances(op_ptrs);
                }
                else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
                {
-                    add_device_image_to_column_nhwc_2d_f16_instances(op_ptrs);
+                    add_device_image_to_column_gnhwc_2d_f16_instances(op_ptrs);
                }
                else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
                                  is_same_v<OutDataType, ck::bhalf_t>)
                {
-                    add_device_image_to_column_nhwc_2d_bf16_instances(op_ptrs);
+                    add_device_image_to_column_gnhwc_2d_bf16_instances(op_ptrs);
                }
                else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
                {
-                    add_device_image_to_column_nhwc_2d_i8_instances(op_ptrs);
+                    add_device_image_to_column_gnhwc_2d_i8_instances(op_ptrs);
                }
            }
            else if constexpr(NumDimSpatial == 3 && is_same_v<ImageLayout, GNDHWC>)
            {
                if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
                {
-                    add_device_image_to_column_ndhwc_3d_f32_instances(op_ptrs);
+                    add_device_image_to_column_gndhwc_3d_f32_instances(op_ptrs);
                }
                else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
                {
-                    add_device_image_to_column_ndhwc_3d_f16_instances(op_ptrs);
+                    add_device_image_to_column_gndhwc_3d_f16_instances(op_ptrs);
                }
                else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
                                  is_same_v<OutDataType, ck::bhalf_t>)
                {
-                    add_device_image_to_column_ndhwc_3d_bf16_instances(op_ptrs);
+                    add_device_image_to_column_gndhwc_3d_bf16_instances(op_ptrs);
                }
                else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
                {
-                    add_device_image_to_column_ndhwc_3d_i8_instances(op_ptrs);
+                    add_device_image_to_column_gndhwc_3d_i8_instances(op_ptrs);
+                }
+            }
+            else if constexpr(NumDimSpatial == 1 && is_same_v<ImageLayout, NWGC>)
+            {
+                if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
+                {
+                    add_device_image_to_column_nwgc_1d_f32_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
+                {
+                    add_device_image_to_column_nwgc_1d_f16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                                  is_same_v<OutDataType, ck::bhalf_t>)
+                {
+                    add_device_image_to_column_nwgc_1d_bf16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
+                {
+                    add_device_image_to_column_nwgc_1d_i8_instances(op_ptrs);
+                }
+            }
+            else if constexpr(NumDimSpatial == 2 && is_same_v<ImageLayout, NHWGC>)
+            {
+                if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
+                {
+                    add_device_image_to_column_nhwgc_2d_f32_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
+                {
+                    add_device_image_to_column_nhwgc_2d_f16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                                  is_same_v<OutDataType, ck::bhalf_t>)
+                {
+                    add_device_image_to_column_nhwgc_2d_bf16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
+                {
+                    add_device_image_to_column_nhwgc_2d_i8_instances(op_ptrs);
+                }
+            }
+            else if constexpr(NumDimSpatial == 3 && is_same_v<ImageLayout, NDHWGC>)
+            {
+                if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
+                {
+                    add_device_image_to_column_ndhwgc_3d_f32_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
+                {
+                    add_device_image_to_column_ndhwgc_3d_f16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                                  is_same_v<OutDataType, ck::bhalf_t>)
+                {
+                    add_device_image_to_column_ndhwgc_3d_bf16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
+                {
+                    add_device_image_to_column_ndhwgc_3d_i8_instances(op_ptrs);
                }
            }
        }
@@ -214,60 +379,120 @@ struct DeviceOperationInstanceFactory<
            {
                if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
                {
-                    add_device_column_to_image_nwc_1d_f32_instances(op_ptrs);
+                    add_device_column_to_image_gnwc_1d_f32_instances(op_ptrs);
                }
                else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
                {
-                    add_device_column_to_image_nwc_1d_f16_instances(op_ptrs);
+                    add_device_column_to_image_gnwc_1d_f16_instances(op_ptrs);
                }
                else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
                                  is_same_v<OutDataType, ck::bhalf_t>)
                {
-                    add_device_column_to_image_nwc_1d_bf16_instances(op_ptrs);
+                    add_device_column_to_image_gnwc_1d_bf16_instances(op_ptrs);
                }
                else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
                {
-                    add_device_column_to_image_nwc_1d_i8_instances(op_ptrs);
+                    add_device_column_to_image_gnwc_1d_i8_instances(op_ptrs);
                }
            }
            else if constexpr(NumDimSpatial == 2 && is_same_v<ImageLayout, GNHWC>)
            {
                if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
                {
-                    add_device_column_to_image_nhwc_2d_f32_instances(op_ptrs);
+                    add_device_column_to_image_gnhwc_2d_f32_instances(op_ptrs);
                }
                else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
                {
-                    add_device_column_to_image_nhwc_2d_f16_instances(op_ptrs);
+                    add_device_column_to_image_gnhwc_2d_f16_instances(op_ptrs);
                }
                else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
                                  is_same_v<OutDataType, ck::bhalf_t>)
                {
-                    add_device_column_to_image_nhwc_2d_bf16_instances(op_ptrs);
+                    add_device_column_to_image_gnhwc_2d_bf16_instances(op_ptrs);
                }
                else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
                {
-                    add_device_column_to_image_nhwc_2d_i8_instances(op_ptrs);
+                    add_device_column_to_image_gnhwc_2d_i8_instances(op_ptrs);
                }
            }
            else if constexpr(NumDimSpatial == 3 && is_same_v<ImageLayout, GNDHWC>)
            {
                if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
                {
-                    add_device_column_to_image_ndhwc_3d_f32_instances(op_ptrs);
+                    add_device_column_to_image_gndhwc_3d_f32_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
+                {
+                    add_device_column_to_image_gndhwc_3d_f16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                                  is_same_v<OutDataType, ck::bhalf_t>)
+                {
+                    add_device_column_to_image_gndhwc_3d_bf16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
+                {
+                    add_device_column_to_image_gndhwc_3d_i8_instances(op_ptrs);
+                }
+            }
+            else if constexpr(NumDimSpatial == 1 && is_same_v<ImageLayout, NWGC>)
+            {
+                if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
+                {
+                    add_device_column_to_image_nwgc_1d_f32_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
+                {
+                    add_device_column_to_image_nwgc_1d_f16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                                  is_same_v<OutDataType, ck::bhalf_t>)
+                {
+                    add_device_column_to_image_nwgc_1d_bf16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
+                {
+                    add_device_column_to_image_nwgc_1d_i8_instances(op_ptrs);
+                }
+            }
+            else if constexpr(NumDimSpatial == 2 && is_same_v<ImageLayout, NHWGC>)
+            {
+                if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
+                {
+                    add_device_column_to_image_nhwgc_2d_f32_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
+                {
+                    add_device_column_to_image_nhwgc_2d_f16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                                  is_same_v<OutDataType, ck::bhalf_t>)
+                {
+                    add_device_column_to_image_nhwgc_2d_bf16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
+                {
+                    add_device_column_to_image_nhwgc_2d_i8_instances(op_ptrs);
+                }
+            }
+            else if constexpr(NumDimSpatial == 3 && is_same_v<ImageLayout, NDHWGC>)
+            {
+                if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
+                {
+                    add_device_column_to_image_ndhwgc_3d_f32_instances(op_ptrs);
                }
                else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
                {
-                    add_device_column_to_image_ndhwc_3d_f16_instances(op_ptrs);
+                    add_device_column_to_image_ndhwgc_3d_f16_instances(op_ptrs);
                }
                else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
                                  is_same_v<OutDataType, ck::bhalf_t>)
                {
-                    add_device_column_to_image_ndhwc_3d_bf16_instances(op_ptrs);
+                    add_device_column_to_image_ndhwgc_3d_bf16_instances(op_ptrs);
                }
                else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
                {
-                    add_device_column_to_image_ndhwc_3d_i8_instances(op_ptrs);
+                    add_device_column_to_image_ndhwgc_3d_i8_instances(op_ptrs);
                }
            }
        }

--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_instance.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using BF16 = ck::bhalf_t;
+using F16  = ck::half_t;
+using F32  = float;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using namespace ck::tensor_layout::convolution;
+
+using PassThrough          = ck::tensor_operation::element_wise::PassThrough;
+using ScaleAddScaleAddRelu = ck::tensor_operation::element_wise::ScaleAddScaleAddRelu;
+
+static constexpr auto ConvFwdDefault =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+
+static constexpr auto ConvFwd1x1P0 = ConvolutionForwardSpecialization::Filter1x1Pad0;
+
+static constexpr auto ConvFwd1x1S1P0 = ConvolutionForwardSpecialization::Filter1x1Stride1Pad0;
+
+static constexpr auto ConvFwdOddC =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::OddC;
+
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionForwardSpecialization ConvSpec>
+using device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_bf16_instances = std::tuple<
+    // clang-format off
+        //########################################|     NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|             CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //########################################|    Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise|     Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|       Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |            |            |                |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        // generic instance
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32, BF16, ck::Tuple<BF16, BF16>, BF16, PassThrough, PassThrough, ScaleAddScaleAddRelu,  ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,
+        // instances for small conv.K and conv.C
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32, BF16, ck::Tuple<BF16, BF16>, BF16, PassThrough, PassThrough, ScaleAddScaleAddRelu,  ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32, BF16, ck::Tuple<BF16, BF16>, BF16, PassThrough, PassThrough, ScaleAddScaleAddRelu,  ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32, BF16, ck::Tuple<BF16, BF16>, BF16, PassThrough, PassThrough, ScaleAddScaleAddRelu,  ConvSpec, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>
+    // clang-format on
+    >;
+
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionForwardSpecialization ConvSpec>
+using device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_f16_instances = std::tuple<
+    // clang-format off
+        //########################################|     NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|             CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //########################################|    Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise|     Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|       Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |            |            |                |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        // generic instance
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32, F16, ck::Tuple<F16, F16>, F16, PassThrough, PassThrough, ScaleAddScaleAddRelu,  ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,
+        // instances for small conv.K and conv.C
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32, F16, ck::Tuple<F16, F16>, F16, PassThrough, PassThrough, ScaleAddScaleAddRelu,  ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32, F16, ck::Tuple<F16, F16>, F16, PassThrough, PassThrough, ScaleAddScaleAddRelu,  ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32, F16, ck::Tuple<F16, F16>, F16, PassThrough, PassThrough, ScaleAddScaleAddRelu,  ConvSpec, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>
+    // clang-format on
+    >;
+
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionForwardSpecialization ConvSpec>
+using device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_f32_instances = std::tuple<
+    // clang-format off
+        //########################################|     NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|             CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //########################################|    Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise|     Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|       Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |            |            |                |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        // generic instance
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32, F32, ck::Tuple<F32, F32>, F32, PassThrough, PassThrough, ScaleAddScaleAddRelu,  ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,         1,           1,           1,              S<1,  8, 1,  8>,              1>,
+        // instances for small conv.K and conv.C
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32, F32, ck::Tuple<F32, F32>, F32, PassThrough, PassThrough, ScaleAddScaleAddRelu,  ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              1>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32, F32, ck::Tuple<F32, F32>, F32, PassThrough, PassThrough, ScaleAddScaleAddRelu,  ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
+
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F32,   F32,     F32, F32, ck::Tuple<F32, F32>, F32, PassThrough, PassThrough, ScaleAddScaleAddRelu,  ConvSpec, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>
+    // clang-format on
+    >;
+
+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionForwardSpecialization ConvSpec>
+using device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_int8_instances = std::tuple<
+    // clang-format off
+        //########################################|     NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|                 Ds| EData|           A|           B|             CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //########################################|    Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|           DataType|  Type| Elementwise| Elementwise|     Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //########################################|           |       |       |            |       |      |      |        |         |                   |      |   Operation|   Operation|       Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //########################################|           |       |       |            |       |      |      |        |         |                   |      |            |            |                |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        // generic instance
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, ck::Tuple<F32, F32>, int8_t, PassThrough, PassThrough, ScaleAddScaleAddRelu,  ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,
+        // instances for small conv.K and conv.C
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, ck::Tuple<F32, F32>, int8_t, PassThrough, PassThrough, ScaleAddScaleAddRelu,  ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, ck::Tuple<F32, F32>, int8_t, PassThrough, PassThrough, ScaleAddScaleAddRelu,  ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout, int8_t, int8_t, int32_t, int8_t, ck::Tuple<F32, F32>, int8_t, PassThrough, PassThrough, ScaleAddScaleAddRelu,  ConvSpec, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>
+    // clang-format on
+    >;
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scaleadd_scaleadd_relu.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scaleadd_scaleadd_relu.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <vector>
+#include <memory>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using PassThrough          = ck::tensor_operation::element_wise::PassThrough;
+using ScaleAddScaleAddRelu = ck::tensor_operation::element_wise::ScaleAddScaleAddRelu;
+
+#ifdef CK_ENABLE_BF16
+// grouped conv3d forward, NDHWGC/GKZYXC/NDHWGK
+void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
+                                                              NDHWGC,
+                                                              GKZYXC,
+                                                              ck::Tuple<NDHWGK, NDHWGK>,
+                                                              NDHWGK,
+                                                              BF16,
+                                                              BF16,
+                                                              ck::Tuple<BF16, BF16>,
+                                                              BF16,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              ScaleAddScaleAddRelu>>>& instances);
+#endif
+
+#ifdef CK_ENABLE_FP16
+void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
+                                                              NDHWGC,
+                                                              GKZYXC,
+                                                              ck::Tuple<NDHWGK, NDHWGK>,
+                                                              NDHWGK,
+                                                              F16,
+                                                              F16,
+                                                              ck::Tuple<F16, F16>,
+                                                              F16,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              ScaleAddScaleAddRelu>>>& instances);
+#endif
+
+#ifdef CK_ENABLE_FP32
+void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
+                                                              NDHWGC,
+                                                              GKZYXC,
+                                                              ck::Tuple<NDHWGK, NDHWGK>,
+                                                              NDHWGK,
+                                                              F32,
+                                                              F32,
+                                                              ck::Tuple<F32, F32>,
+                                                              F32,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              ScaleAddScaleAddRelu>>>& instances);
+#endif
+
+#ifdef CK_ENABLE_INT8
+void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_int8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
+                                                              NDHWGC,
+                                                              GKZYXC,
+                                                              ck::Tuple<NDHWGK, NDHWGK>,
+                                                              NDHWGK,
+                                                              int8_t,
+                                                              int8_t,
+                                                              ck::Tuple<F32, F32>,
+                                                              int8_t,
+                                                              PassThrough,
+                                                              PassThrough,
+                                                              ScaleAddScaleAddRelu>>>& instances);
+#endif
+
+template <ck::index_t NumDimSpatial,
+          typename InLayout,
+          typename WeiLayout,
+          typename DLayouts,
+          typename OutLayout,
+          typename InDataType,
+          typename WeiDataType,
+          typename DDataTypes,
+          typename OutDataType,
+          typename ComputeType>
+struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
+    NumDimSpatial,
+    InLayout,
+    WeiLayout,
+    DLayouts,
+    OutLayout,
+    InDataType,
+    WeiDataType,
+    DDataTypes,
+    OutDataType,
+    ck::tensor_operation::element_wise::PassThrough,
+    ck::tensor_operation::element_wise::PassThrough,
+    ck::tensor_operation::element_wise::ScaleAddScaleAddRelu,
+    ComputeType>>
+{
+    using DeviceOp =
+        DeviceGroupedConvFwdMultipleD<NumDimSpatial,
+                                      InLayout,
+                                      WeiLayout,
+                                      DLayouts,
+                                      OutLayout,
+                                      InDataType,
+                                      WeiDataType,
+                                      DDataTypes,
+                                      OutDataType,
+                                      ck::tensor_operation::element_wise::PassThrough,
+                                      ck::tensor_operation::element_wise::PassThrough,
+                                      ck::tensor_operation::element_wise::ScaleAddScaleAddRelu,
+                                      ComputeType>;
+
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+        if constexpr(NumDimSpatial == 3 && is_same_v<InLayout, NDHWGC> &&
+                     is_same_v<WeiLayout, GKZYXC> && is_same_v<OutLayout, NDHWGK>)
+        {
+#ifdef CK_ENABLE_FP32
+            if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
+                         is_same_v<OutDataType, float>)
+            {
+                add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+                    op_ptrs);
+            }
+#endif
+#ifdef CK_ENABLE_FP16
+            if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
+                         is_same_v<OutDataType, half_t> && is_same_v<ComputeType, half_t>)
+            {
+                add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f16_instances(
+                    op_ptrs);
+            }
+#endif
+#ifdef CK_ENABLE_BF16
+            if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                         is_same_v<WeiDataType, ck::bhalf_t> && is_same_v<OutDataType, ck::bhalf_t>)
+            {
+                add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
+                    op_ptrs);
+            }
+#endif
+#ifdef CK_ENABLE_INT8
+            if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
+                         is_same_v<OutDataType, int8_t>)
+            {
+                add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_int8_instances(
+                    op_ptrs);
+            }
+#endif
+        }
+
+        return op_ptrs;
+    }
+};
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck