Commit 705d5a08 authored by Jun Liu

Merge branch 'develop' into amd-develop

parents d4ad52d6 8f84a012
@@ -37,7 +37,8 @@ __global__ void
             index_t StrideC,
             typename GridwiseGemm::Block2CTileMap block_mapping)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     constexpr index_t shared_size = GridwiseGemm::GetSharedMemoryNumberOfByte();
     __shared__ uint8_t p_shared[shared_size];
......
@@ -194,7 +194,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
           StrideC{StrideC_},
           MPadded{CalculateMPadded(M_)},
           NPadded{CalculateNPadded(N_)},
-          K0{CalculateK0(K)}
+          K0{CalculateK0(K_)}
     {
     }
@@ -383,7 +383,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
     __host__ static constexpr bool CalculateHasMainKBlockLoop(index_t K)
     {
-        const index_t num_loop = K / (K0PerBlock * K1);
+        const index_t num_loop = math::integer_divide_ceil(K, K0PerBlock * K1);
         return GridwiseGemmPipe::CalculateHasMainLoop(num_loop);
     }
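With GemmK now padded, the number of main-loop iterations must round up instead of truncating, otherwise the tail tile introduced by padding is never counted. A standalone sketch of the difference (integer_divide_ceil here is a stand-in for ck::math::integer_divide_ceil):

    #include <cassert>

    constexpr int integer_divide_ceil(int a, int b) { return (a + b - 1) / b; }

    int main()
    {
        constexpr int K0PerBlock = 4, K1 = 8; // one loop iteration consumes 32 elements of K
        constexpr int K = 100;                // K is not a multiple of 32
        assert(K / (K0PerBlock * K1) == 3);                   // old: drops the partial tail
        assert(integer_divide_ceil(K, K0PerBlock * K1) == 4); // new: counts the padded tail
        return 0;
    }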
@@ -840,7 +840,25 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3_ext
             }
         }();

-        if constexpr(GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding)
+        if constexpr(GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding)
+        {
+            const auto K0Pad = math::integer_divide_ceil(K0, K0PerBlock) * K0PerBlock;
+            const auto KPad  = K0Pad * K1Value;
+
+            const auto a_grid_desc_m_kpad = transform_tensor_descriptor(
+                a_grid_desc_m_k,
+                make_tuple(make_pass_through_transform(M), make_right_pad_transform(K, KPad - K)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            return transform_tensor_descriptor(
+                a_grid_desc_m_kpad,
+                make_tuple(make_unmerge_transform(make_tuple(K0Pad, K1Value)),
+                           make_right_pad_transform(M, MPad - M)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+        }
+        else if constexpr(GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding)
         {
             return transform_tensor_descriptor(
                 a_grid_desc_m_k,
@@ -874,7 +892,26 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3_ext
             }
         }();

-        if constexpr(GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding)
+        if constexpr(GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding)
+        {
+            const auto K0Pad = math::integer_divide_ceil(K0, K0PerBlock) * K0PerBlock;
+            const auto KPad  = K0Pad * K1Value;
+
+            const auto b_grid_desc_kpad_n = transform_tensor_descriptor(
+                b_grid_desc_k_n,
+                make_tuple(make_right_pad_transform(K, KPad - K), make_pass_through_transform(N)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            return transform_tensor_descriptor(
+                b_grid_desc_kpad_n,
+                make_tuple(make_unmerge_transform(make_tuple(K0Pad, K1Value)),
+                           make_right_pad_transform(N, NPad - N)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+        }
+        else if constexpr(GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding)
         {
             return transform_tensor_descriptor(
                 b_grid_desc_k_n,
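Both new MNKPadding branches pad K identically: K0 is rounded up to a multiple of K0PerBlock, and the padded K is recovered as K0Pad * K1Value before the right-pad and unmerge transforms. A standalone check with made-up sizes (assuming K0 = ceil(K / K1Value), with integer_divide_ceil standing in for ck::math::integer_divide_ceil):

    constexpr int integer_divide_ceil(int a, int b) { return (a + b - 1) / b; }

    int main()
    {
        constexpr int K = 100, K1Value = 8, K0PerBlock = 4;
        constexpr int K0    = integer_divide_ceil(K, K1Value);                  // 13
        constexpr int K0Pad = integer_divide_ceil(K0, K0PerBlock) * K0PerBlock; // 16
        constexpr int KPad  = K0Pad * K1Value;                                  // 128
        static_assert(K0Pad % K0PerBlock == 0 && KPad >= K);
        // make_right_pad_transform(K, KPad - K) pads K by 28 elements, then the
        // unmerge splits KPad into (K0Pad, K1Value) = (16, 8).
        return 0;
    }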
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/multi_index_transform_helper.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp"
#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
namespace ck {
template <typename InputGridDesc,
typename InputDataType,
typename OutputGridDesc,
typename OutputDataType,
index_t BlockSize,
index_t MPerBlock,
index_t KPerBlock,
typename ThreadClusterLengths,
index_t ScalarPerVector,
typename Block2ETileMap>
struct GridwiseImageToColumn
{
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
using ThisThreadBlock = ThisThreadBlock<BlockSize>;
__device__ static void Run(const InputGridDesc& in_grid_desc,
const InputDataType* __restrict__ p_in_global,
const OutputGridDesc& out_grid_desc,
OutputDataType* __restrict__ p_out_global,
const Block2ETileMap& block_2_tile_map)
{
const auto block_work_idx =
block_2_tile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id()));
const index_t m_block_data_idx_on_grid =
__builtin_amdgcn_readfirstlane(block_work_idx[I0] * MPerBlock);
const index_t k_block_data_idx_on_grid =
__builtin_amdgcn_readfirstlane(block_work_idx[I1] * KPerBlock);
// Global Memory
const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_in_global, in_grid_desc.GetElementSpaceSize());
auto out_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_out_global, out_grid_desc.GetElementSpaceSize());
auto copy_global_to_global = ThreadGroupTensorSliceTransfer_v7<
ThisThreadBlock,
Tuple<InputDataType>,
Tuple<OutputDataType>,
decltype(tie(in_grid_desc)),
decltype(tie(out_grid_desc)),
tensor_operation::element_wise::PassThrough,
Sequence<static_cast<index_t>(InMemoryDataOperationEnum::Set)>,
Sequence<MPerBlock, KPerBlock>,
ThreadClusterLengths,
Sequence<0, 1>,
Sequence<0, 1>,
I1,
ScalarPerVector,
Sequence<true>,
Sequence<true>>{
in_grid_desc,
make_tuple(make_multi_index(m_block_data_idx_on_grid, k_block_data_idx_on_grid)),
out_grid_desc,
make_tuple(make_multi_index(m_block_data_idx_on_grid, k_block_data_idx_on_grid)),
tensor_operation::element_wise::PassThrough{}};
copy_global_to_global.Run(
tie(in_grid_desc), tie(in_global_buf), tie(out_grid_desc), tie(out_global_buf));
}
__host__ static constexpr bool CheckValidity(const InputGridDesc& in_grid_desc,
const OutputGridDesc& out_grid_desc)
{
if(in_grid_desc.GetLength(I0) % MPerBlock != 0 ||
in_grid_desc.GetLength(I1) % KPerBlock != 0)
return false;
if(out_grid_desc.GetLength(I0) % MPerBlock != 0 ||
out_grid_desc.GetLength(I1) % KPerBlock != 0)
return false;
return true;
}
};
} // namespace ck
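Each workgroup in the kernel above copies exactly one MPerBlock x KPerBlock tile, with the tile origin derived from the 1-D block id through the Block2ETileMap. A host-side sketch of the same origin arithmetic (hypothetical sizes; a plain row-major block-to-tile map is assumed in place of the CK map):

    #include <cstdio>

    int main()
    {
        constexpr int MPerBlock = 64, KPerBlock = 32;
        constexpr int M = 128, K = 64;        // tile-aligned, as CheckValidity requires
        constexpr int grid_k = K / KPerBlock; // tiles along K
        for(int bid = 0; bid < (M / MPerBlock) * grid_k; ++bid)
        {
            const int m_origin = (bid / grid_k) * MPerBlock; // block_work_idx[I0] * MPerBlock
            const int k_origin = (bid % grid_k) * KPerBlock; // block_work_idx[I1] * KPerBlock
            std::printf("block %d -> tile origin (%d, %d)\n", bid, m_origin, k_origin);
        }
        return 0;
    }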
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/amd_gemm_dpp.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/utility/inner_product_dpp8.hpp"
#include "ck/utility/math.hpp"
namespace ck {
/**
* Threadwise contraction using dot instructions with DPP8 modifier.
*
* Assumptions:
* 1. `AThreadDesc_TK0_TM0_TM1_TK1`, `BThreadDesc_TK0_TN0_TN1_TK1`, `CThreadDesc_TM0_TM1_TN0_TN1`
* are known at compile-time;
* 2. `AOriginIdx`, `BOriginIdx`, `COriginIdx` are known at compile-time;
* 3. `TM0` is equal to 1 and `TN0` is equal to 1;
* 4. When `ShareA` is set (unset, respectively), `TM1` (`TN1`, respectively) is divisible by
* the size of the lane group (`dpp8::lane_group_size`).
*/
template <typename FloatA,
typename FloatB,
typename FloatC,
typename AThreadDesc_TK0_TM0_TM1_TK1,
typename BThreadDesc_TK0_TN0_TN1_TK1,
typename CThreadDesc_TM0_TM1_TN0_TN1,
typename TKLengths,
typename TMLengths,
typename TNLengths,
bool ShareA,
typename enable_if<AThreadDesc_TK0_TM0_TM1_TK1::IsKnownAtCompileTime() &&
BThreadDesc_TK0_TN0_TN1_TK1::IsKnownAtCompileTime() &&
CThreadDesc_TM0_TM1_TN0_TN1::IsKnownAtCompileTime(),
bool>::type = false>
struct ThreadwiseContractionDlDpp8_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1
{
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr index_t TK0 = TKLengths{}[I0];
static constexpr index_t TK1 = TKLengths{}[I1];
static constexpr index_t TM0 = TMLengths{}[I0];
static constexpr index_t TM1 = TMLengths{}[I1];
static constexpr index_t TN0 = TNLengths{}[I0];
static constexpr index_t TN1 = TNLengths{}[I1];
static_assert(TM0 == 1 && TN0 == 1);
static_assert((ShareA && TM1 % dpp8::lane_group_size == 0) ||
(!ShareA && TN1 % dpp8::lane_group_size == 0));
static constexpr index_t shared_elems_per_lane =
ShareA ? TM1 / dpp8::lane_group_size : TN1 / dpp8::lane_group_size;
__device__ constexpr ThreadwiseContractionDlDpp8_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1()
{
static_assert(AThreadDesc_TK0_TM0_TM1_TK1::IsKnownAtCompileTime() &&
BThreadDesc_TK0_TN0_TN1_TK1::IsKnownAtCompileTime() &&
CThreadDesc_TM0_TM1_TN0_TN1::IsKnownAtCompileTime(),
"wrong! Desc should be known at compile-time");
static_assert(TKLengths::Size() == 2 && TMLengths::Size() == 2 && TNLengths::Size() == 2,
"wrong! TKLengths, TMLengths and TNLengths must each have 2 elements");
}
template <typename ABuffer,
typename AOriginIdx,
typename BBuffer,
typename BOriginIdx,
typename CBuffer,
typename COriginIdx>
__device__ static void Run(const ABuffer& a_buf,
AOriginIdx,
const BBuffer& b_buf,
BOriginIdx,
CBuffer& c_buf,
COriginIdx)
{
static_assert(is_known_at_compile_time<remove_cvref_t<AOriginIdx>>::value &&
is_known_at_compile_time<remove_cvref_t<BOriginIdx>>::value &&
is_known_at_compile_time<remove_cvref_t<COriginIdx>>::value,
"wrong! AOriginIdx, BOriginIdx, COriginIdx should be known at compile-time");
static_assert(
is_same<remove_cvref_t<typename ABuffer::type>, remove_cvref_t<FloatA>>::value &&
is_same<remove_cvref_t<typename BBuffer::type>, remove_cvref_t<FloatB>>::value &&
is_same<remove_cvref_t<typename CBuffer::type>, remove_cvref_t<FloatC>>::value,
"wrong! inconsistent type");
constexpr auto a_origin_idx = to_multi_index(AOriginIdx{});
constexpr auto b_origin_idx = to_multi_index(BOriginIdx{});
constexpr auto c_origin_idx = to_multi_index(COriginIdx{});
static_for<0, TK0, 1>{}([&](auto tk0) {
static_for<0, TM1, 1>{}([&](auto tm1) {
static_for<0, TN1, 1>{}([&](auto tn1) {
vector_type<FloatA, TK1> a_vec;
vector_type<FloatB, TK1> b_vec;
static_for<0, TK1, 1>{}([&](auto tk1) {
constexpr index_t local_tm1 = ShareA ? tm1 % shared_elems_per_lane : tm1;
constexpr index_t a_offset = AThreadDesc_TK0_TM0_TM1_TK1{}.CalculateOffset(
a_origin_idx + make_multi_index(tk0, 0, local_tm1, tk1));
constexpr index_t local_tn1 = ShareA ? tn1 : tn1 % shared_elems_per_lane;
constexpr index_t b_offset = BThreadDesc_TK0_TN0_TN1_TK1{}.CalculateOffset(
b_origin_idx + make_multi_index(tk0, 0, local_tn1, tk1));
a_vec.template AsType<FloatA>()(tk1) = a_buf[Number<a_offset>{}];
b_vec.template AsType<FloatB>()(tk1) = b_buf[Number<b_offset>{}];
});
using a_vector_t = typename vector_type<FloatA, TK1>::type;
using b_vector_t = typename vector_type<FloatB, TK1>::type;
constexpr index_t c_offset = CThreadDesc_TM0_TM1_TN0_TN1{}.CalculateOffset(
c_origin_idx + make_multi_index(0, tm1, 0, tn1));
constexpr int src_lane =
ShareA ? (tm1 / shared_elems_per_lane) % dpp8::lane_group_size
: (tn1 / shared_elems_per_lane) % dpp8::lane_group_size;
dpp8::inner_product_dpp<a_vector_t, b_vector_t, FloatC, src_lane, ShareA>(
a_vec.template AsType<a_vector_t>()[I0],
b_vec.template AsType<b_vector_t>()[I0],
c_buf(Number<c_offset>{}));
});
});
});
}
};
} // namespace ck
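The ShareA bookkeeping in Run above is easiest to see with numbers: each lane stores only TM1 / lane_group_size rows of A locally and borrows the rest from its DPP8 lane group. A standalone sketch of the index arithmetic (hypothetical TM1; lane_group_size is 8 as in dpp8::lane_group_size):

    #include <cstdio>

    int main()
    {
        constexpr int lane_group_size       = 8;
        constexpr int TM1                   = 16;                    // divisible by 8, per the static_assert
        constexpr int shared_elems_per_lane = TM1 / lane_group_size; // 2 rows held locally
        for(int tm1 = 0; tm1 < TM1; ++tm1)
        {
            const int local_tm1 = tm1 % shared_elems_per_lane;                     // local register offset
            const int src_lane  = (tm1 / shared_elems_per_lane) % lane_group_size; // lane supplying the row
            std::printf("tm1 %2d -> local offset %d from lane %d\n", tm1, local_tm1, src_lane);
        }
        return 0;
    }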
@@ -104,13 +104,13 @@ struct ThreadwiseTensorSliceTransfer_v6r1
             // apply pointwise operation
             static_for<0, ScalarPerVector, 1>{}([&](auto i) {
-                SrcData v;
+                DstData v;

                 // apply element-wise operation
                 element_op_(v, src_vector_container.template AsType<SrcData>()[i]);

                 // apply type convert
-                dst_vector_container.template AsType<DstData>()(i) = type_convert<DstData>(v);
+                dst_vector_container.template AsType<DstData>()(i) = v;
             });

             const bool is_dst_valid =
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/amd_gemm_dpp.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/utility/math.hpp"
namespace ck {
enum struct DppInstr
{
dpp8_f16_16x16x2 = 0,
dpp8_f16_8x32x2,
dpp8_f16_32x8x2
};
/**
* Structure representing DPP GEMM executed by a single wavefront.
*
* Each structure instantiation must contain the following fields:
* - wave_size - number of threads that execute a single DPP GEMM operation, usually equal to the
* number of threads in a wavefront;
* - lanegroup_size - number of threads (lanes) that share data using DPP instruction modifier,
* it's 8 in case of DPP8;
* - m_per_wave - size along M dimension of matrix C that is processed in a single DPP GEMM
* operation;
* - n_per_wave - size along N dimension of matrix C that is processed in a single DPP GEMM
* operation;
* - m_per_lanegroup - size along M dimension that is processed by a single lanegroup;
* - n_per_lanegroup - size along N dimension that is processed by a single lanegroup;
* - m_per_thread - size along M dimension of the tile calculated by a single thread;
* - n_per_thread - size along N dimension of the tile calculated by a single thread;
* - k_per_dpp - size along K dimension that is reduced in a single DPP GEMM operation;
* - share_a - indicates whether we share matrix A or matrix B between lanes using DPP modifiers.
*
 * Not all the combinations are supported now; for current restrictions see the static asserts
 * in the DppSelector's constructor.
*/
template <DppInstr instr>
struct dpp_type;
template <>
struct dpp_type<DppInstr::dpp8_f16_32x8x2>
{
static constexpr index_t wave_size = 32;
static constexpr index_t lanegroup_size = 8;
static constexpr index_t m_per_wave = 32;
static constexpr index_t n_per_wave = 8;
static constexpr index_t m_per_lanegroup = 8;
static constexpr index_t n_per_lanegroup = 8;
static constexpr index_t m_per_thread = 8;
static constexpr index_t n_per_thread = 1;
static constexpr index_t k_per_dpp = 2;
static constexpr bool share_a = true;
using BaseType = half_t;
template <index_t MPerDpp, index_t NPerDpp, class ADataType, class BDataType, class CDataType>
__device__ void run(const ADataType& a, const BDataType& b, CDataType& reg_c) const
{
dpp8::DppLanegroupGemm<m_per_thread,
n_per_thread,
k_per_dpp,
BaseType,
ADataType,
BDataType,
CDataType,
share_a>{}
.Run(a, b, reg_c);
}
};
template <>
struct dpp_type<DppInstr::dpp8_f16_8x32x2>
{
static constexpr index_t wave_size = 32;
static constexpr index_t lanegroup_size = 8;
static constexpr index_t m_per_wave = 8;
static constexpr index_t n_per_wave = 32;
static constexpr index_t m_per_lanegroup = 8;
static constexpr index_t n_per_lanegroup = 8;
static constexpr index_t m_per_thread = 8;
static constexpr index_t n_per_thread = 1;
static constexpr index_t k_per_dpp = 2;
static constexpr bool share_a = true;
using BaseType = half_t;
template <index_t MPerDpp, index_t NPerDpp, class ADataType, class BDataType, class CDataType>
__device__ void run(const ADataType& a, const BDataType& b, CDataType& reg_c) const
{
dpp8::DppLanegroupGemm<m_per_thread,
n_per_thread,
k_per_dpp,
BaseType,
ADataType,
BDataType,
CDataType,
share_a>{}
.Run(a, b, reg_c);
}
};
template <>
struct dpp_type<DppInstr::dpp8_f16_16x16x2>
{
static constexpr index_t wave_size = 32;
static constexpr index_t lanegroup_size = 8;
static constexpr index_t m_per_wave = 16;
static constexpr index_t n_per_wave = 16;
static constexpr index_t m_per_lanegroup = 8;
static constexpr index_t n_per_lanegroup = 8;
static constexpr index_t m_per_thread = 8;
static constexpr index_t n_per_thread = 1;
static constexpr index_t k_per_dpp = 2;
static constexpr bool share_a = true;
using BaseType = half_t;
template <index_t MPerDpp, index_t NPerDpp, class ADataType, class BDataType, class CDataType>
__device__ void run(const ADataType& a, const BDataType& b, CDataType& reg_c) const
{
dpp8::DppLanegroupGemm<m_per_thread,
n_per_thread,
k_per_dpp,
BaseType,
ADataType,
BDataType,
CDataType,
share_a>{}
.Run(a, b, reg_c);
}
};
template <typename BaseType, index_t MPerDpp, index_t NPerDpp>
struct DppSelector
{
template <typename BaseType_, index_t MPerDpp_, index_t NPerDpp_>
static constexpr auto GetDpp();
template <>
static constexpr auto GetDpp<half_t, 8, 32>()
{
return DppInstr::dpp8_f16_8x32x2;
}
template <>
static constexpr auto GetDpp<half_t, 16, 16>()
{
return DppInstr::dpp8_f16_16x16x2;
}
template <>
static constexpr auto GetDpp<half_t, 32, 8>()
{
return DppInstr::dpp8_f16_32x8x2;
}
static constexpr auto selected_dpp = dpp_type<GetDpp<BaseType, MPerDpp, NPerDpp>()>{};
__host__ __device__ constexpr DppSelector()
{
static_assert(selected_dpp.m_per_wave % selected_dpp.m_per_lanegroup == 0);
static_assert(selected_dpp.n_per_wave % selected_dpp.n_per_lanegroup == 0);
static_assert(selected_dpp.k_per_dpp % 2 == 0);
static_assert(selected_dpp.wave_size % selected_dpp.lanegroup_size == 0);
constexpr index_t num_dpp_per_wave = selected_dpp.wave_size / selected_dpp.lanegroup_size;
constexpr index_t num_wave_c_elems = selected_dpp.m_per_wave * selected_dpp.n_per_wave;
constexpr index_t num_dpp_c_elems =
selected_dpp.m_per_lanegroup * selected_dpp.n_per_lanegroup;
static_assert(num_wave_c_elems % num_dpp_c_elems == 0);
static_assert(num_dpp_per_wave == num_wave_c_elems / num_dpp_c_elems);
if constexpr(selected_dpp.share_a)
{
static_assert(selected_dpp.m_per_lanegroup == selected_dpp.m_per_thread);
static_assert(selected_dpp.n_per_lanegroup % selected_dpp.n_per_thread == 0);
static_assert(selected_dpp.n_per_lanegroup / selected_dpp.n_per_thread ==
selected_dpp.lanegroup_size);
}
else
{
static_assert(selected_dpp.m_per_lanegroup % selected_dpp.n_per_thread == 0);
static_assert(selected_dpp.m_per_lanegroup / selected_dpp.n_per_thread ==
selected_dpp.lanegroup_size);
static_assert(selected_dpp.n_per_lanegroup == selected_dpp.n_per_thread);
}
// Below checks come from the restrictions of the current implementation, could be removed
// in the future when the implementation is more generalized.
static_assert(selected_dpp.share_a);
static_assert(selected_dpp.n_per_thread == 1);
static_assert(selected_dpp.m_per_thread == selected_dpp.lanegroup_size);
static_assert(selected_dpp.m_per_lanegroup == selected_dpp.m_per_thread);
static_assert(selected_dpp.n_per_lanegroup ==
selected_dpp.n_per_thread * selected_dpp.lanegroup_size);
}
static constexpr index_t GetK1PerDpp() { return selected_dpp.k_per_dpp; }
};
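The constructor checks above tie out numerically for each instruction variant; a standalone compile-time sketch for the 16x16 case (plain ints mirroring the dpp8_f16_16x16x2 fields):

    int main()
    {
        constexpr int wave_size = 32, lanegroup_size = 8;
        constexpr int m_per_wave = 16, n_per_wave = 16;
        constexpr int m_per_lanegroup = 8, n_per_lanegroup = 8;
        constexpr int m_per_thread = 8, n_per_thread = 1;
        // 4 lanegroups per wave, each owning one 8x8 sub-tile of the 16x16 C tile
        static_assert(wave_size / lanegroup_size ==
                      (m_per_wave * n_per_wave) / (m_per_lanegroup * n_per_lanegroup));
        // with share_a, the 8 lanes of a group jointly cover the 8 columns of the sub-tile
        static_assert(m_per_lanegroup == m_per_thread);
        static_assert(n_per_lanegroup / n_per_thread == lanegroup_size);
        return 0;
    }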
template <typename BaseType, index_t MPerDpp, index_t NPerDpp, index_t KPack>
struct DppGemm
{
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
static constexpr auto I4 = Number<4>{};
static constexpr auto I5 = Number<5>{};
using CIndex = MultiIndex<2>;
using CIndex4D = MultiIndex<4>;
__host__ __device__ constexpr DppGemm()
{
static_assert(MPerDpp == 8 || MPerDpp == 16 || MPerDpp == 32,
"MPerDpp must be either 8, 16 or 32.");
static_assert(NPerDpp == 8 || NPerDpp == 16 || NPerDpp == 32,
"NPerDpp must be either 8, 16 or 32.");
static_assert(KPack % dpp_instr.k_per_dpp == 0, "KPack must be divisible by k_per_dpp.");
}
__device__ static constexpr index_t GetRegSizePerDpp()
{
return MPerDpp * NPerDpp / dpp_instr.wave_size;
}
template <class ADataType, class BDataType, class CDataType>
__device__ void
Run(const ADataType& p_a_wave, const BDataType& p_b_wave, CDataType& p_c_thread) const
{
static_assert(is_same<BaseType, double>::value || is_same<BaseType, float>::value ||
is_same<BaseType, half_t>::value || is_same<BaseType, bhalf_t>::value ||
is_same<BaseType, int8_t>::value || is_same<BaseType, f8_t>::value,
"base BaseType must be double, float, half, bfloat16, and int8_t!");
static_for<0, KPack / dpp_instr.k_per_dpp, 1>{}([&](auto k) {
dpp_instr.template run<MPerDpp, NPerDpp>(p_a_wave[k], p_b_wave[k], p_c_thread);
});
}
__device__ static auto GetLaneIdInWave()
{
return get_thread_local_1d_id() % dpp_instr.wave_size;
}
__device__ static auto GetWaveId() { return get_thread_local_1d_id() / dpp_instr.wave_size; }
__device__ static auto GetLaneIdInLaneGroup()
{
return get_thread_local_1d_id() % dpp_instr.lanegroup_size;
}
__device__ static auto GetLaneGroupIdInWave()
{
return GetLaneIdInWave() / dpp_instr.lanegroup_size;
}
__device__ static auto GetDppOpIdx()
{
const auto lanegroupId = GetLaneGroupIdInWave();
constexpr auto lanegroup_idx_1d_to_dpp_idx_2d_adaptor = make_single_stage_tensor_adaptor(
make_tuple(
make_merge_transform(make_tuple(dpp_instr.m_per_wave / dpp_instr.m_per_lanegroup,
dpp_instr.n_per_wave / dpp_instr.n_per_lanegroup))),
make_tuple(Sequence<0, 1>{}),
make_tuple(Sequence<0>{}));
const auto dpp_idx = lanegroup_idx_1d_to_dpp_idx_2d_adaptor.CalculateBottomIndex(
make_multi_index(lanegroupId));
const auto m_dpp_idx = dpp_idx[I0];
const auto n_dpp_idx = dpp_idx[I1];
return make_tuple(m_dpp_idx, n_dpp_idx);
}
__host__ __device__ static auto CalculateAThreadOriginDataIndex_K_M()
{
const auto laneId = get_thread_local_1d_id();
const auto wave_row = laneId / dpp_instr.n_per_wave;
auto m_idx = dpp_instr.m_per_thread * wave_row + GetLaneIdInLaneGroup();
return make_tuple(0, m_idx % dpp_instr.m_per_wave);
}
__host__ __device__ static auto CalculateBThreadOriginDataIndex_K_N()
{
const auto laneId = get_thread_local_1d_id();
return make_tuple(0, laneId % dpp_instr.n_per_wave);
}
__device__ static CIndex GetBeginOfThreadBlk()
{
const auto dpp_op_idx = GetDppOpIdx();
const auto m_dpp_op_idx = dpp_op_idx[I0];
const auto n_dpp_op_idx = dpp_op_idx[I1];
index_t n_offset = n_dpp_op_idx * dpp_instr.n_per_lanegroup + GetLaneIdInLaneGroup();
index_t m_offset = m_dpp_op_idx * dpp_instr.m_per_lanegroup;
return CIndex{m_offset, n_offset};
}
static constexpr auto dpp = DppSelector<BaseType, MPerDpp, NPerDpp>{};
static constexpr auto dpp_instr = dpp.selected_dpp;
static constexpr auto K0PerDpp = 1;
static constexpr auto K1PerDpp = dpp.GetK1PerDpp();
__host__ __device__ static constexpr auto GetCMNThreadBlkLengths()
{
return make_tuple(Number<dpp_instr.m_per_thread>{}, Number<dpp_instr.n_per_thread>{});
}
};
} // namespace ck
@@ -164,6 +164,7 @@ template <
           index_t BK1,
           index_t GemmMPerBlock,
           index_t GemmNPerBlock,
+          index_t GemmKPerBlock,
           bool DoPadGemmM,
           bool DoPadGemmN>
 struct TransformConvBwdDataToGemm_v1
@@ -308,9 +309,6 @@ struct TransformConvBwdDataToGemm_v1
         const auto YDotSlice = math::integer_divide_ceil(Y - i_ytilde, YTilde);
         const auto XDotSlice = math::integer_divide_ceil(X - i_xtilde, XTilde);

-        const index_t AK0 =
-            math::integer_divide_ceil(ZDotSlice * YDotSlice * XDotSlice * K, AK1);
-
         if constexpr(NDimSpatial == 2)
         {
             // A: output tensor
@@ -367,9 +365,11 @@ struct TransformConvBwdDataToGemm_v1
             const auto out_gemmk_gemmm_padded_grid_desc =
                 ck::tensor_operation::device::PadTensorDescriptor(
                     out_gemmk_gemmmraw_grid_desc,
-                    make_tuple(AK1, GemmMPerBlock),
+                    make_tuple(GemmKPerBlock, GemmMPerBlock),
                     Sequence<true, DoPadGemmM>{});

+            const index_t AK0 = out_gemmk_gemmm_padded_grid_desc.GetLength(I0) / AK1;
+
             const auto out_gemmak0_gemmm_gemmak1_grid_desc = transform_tensor_descriptor(
                 out_gemmk_gemmm_padded_grid_desc,
                 make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
@@ -460,9 +460,11 @@ struct TransformConvBwdDataToGemm_v1
             const auto out_gemmk_gemmm_padded_grid_desc =
                 ck::tensor_operation::device::PadTensorDescriptor(
                     out_gemmk_gemmmraw_grid_desc,
-                    make_tuple(AK1, GemmMPerBlock),
+                    make_tuple(GemmKPerBlock, GemmMPerBlock),
                     Sequence<true, DoPadGemmM>{});

+            const index_t AK0 = out_gemmk_gemmm_padded_grid_desc.GetLength(I0) / AK1;
+
             const auto out_gemmak0_gemmm_gemmak1_grid_desc = transform_tensor_descriptor(
                 out_gemmk_gemmm_padded_grid_desc,
                 make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
@@ -568,9 +570,6 @@ struct TransformConvBwdDataToGemm_v1
         const auto YDotSlice = math::integer_divide_ceil(Y - i_ytilde, YTilde);
         const auto XDotSlice = math::integer_divide_ceil(X - i_xtilde, XTilde);

-        const index_t BK0 =
-            math::integer_divide_ceil(ZDotSlice * YDotSlice * XDotSlice * K, BK1);
-
         // B weight tensor
         if constexpr(NDimSpatial == 2)
         {
@@ -617,9 +616,11 @@ struct TransformConvBwdDataToGemm_v1
             const auto wei_gemmk_gemmn_padded_grid_desc =
                 ck::tensor_operation::device::PadTensorDescriptor(
                     wei_gemmk_gemmnraw_grid_desc,
-                    make_tuple(BK1, GemmNPerBlock),
+                    make_tuple(GemmKPerBlock, GemmNPerBlock),
                     Sequence<true, DoPadGemmN>{});

+            const index_t BK0 = wei_gemmk_gemmn_padded_grid_desc.GetLength(I0) / BK1;
+
             const auto wei_gemmbk0_gemmn_gemmbk1_grid_desc = transform_tensor_descriptor(
                 wei_gemmk_gemmn_padded_grid_desc,
                 make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
@@ -690,17 +691,19 @@ struct TransformConvBwdDataToGemm_v1
                 make_tuple(Sequence<1, 2, 3, 0>{}, Sequence<4>{}),
                 make_tuple(Sequence<0>{}, Sequence<1>{}));

-            const auto wei_gemmk_gemm_padded_grid_desc =
+            const auto wei_gemmk_gemmn_padded_grid_desc =
                 ck::tensor_operation::device::PadTensorDescriptor(
                     wei_gemmk_gemmnraw_grid_desc,
-                    make_tuple(BK1, GemmNPerBlock),
+                    make_tuple(GemmKPerBlock, GemmNPerBlock),
                     Sequence<true, DoPadGemmN>{});

+            const index_t BK0 = wei_gemmk_gemmn_padded_grid_desc.GetLength(I0) / BK1;
+
             const auto wei_gemmbk0_gemm_gemmbk1_grid_desc = transform_tensor_descriptor(
-                wei_gemmk_gemm_padded_grid_desc,
-                make_tuple(
-                    make_unmerge_transform(make_tuple(BK0, BK1)),
-                    make_pass_through_transform(wei_gemmk_gemm_padded_grid_desc.GetLength(I1))),
+                wei_gemmk_gemmn_padded_grid_desc,
+                make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
+                           make_pass_through_transform(
+                               wei_gemmk_gemmn_padded_grid_desc.GetLength(I1))),
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
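Recomputing AK0/BK0 from the padded descriptor keeps the unmerge exact: AK0 * AK1 now equals the GemmKPerBlock-padded K rather than the raw K product padded only to a multiple of AK1. A numeric sketch (sizes made up; integer_divide_ceil stands in for the PadTensorDescriptor rounding):

    #include <cassert>

    constexpr int integer_divide_ceil(int a, int b) { return (a + b - 1) / b; }

    int main()
    {
        const int gemm_k_raw    = 100; // e.g. ZDotSlice * YDotSlice * XDotSlice * K
        const int GemmKPerBlock = 32, AK1 = 8;
        const int gemm_k_pad = integer_divide_ceil(gemm_k_raw, GemmKPerBlock) * GemmKPerBlock; // 128
        const int AK0        = gemm_k_pad / AK1;                                               // 16
        assert(AK0 * AK1 == gemm_k_pad); // unmerge (AK0, AK1) covers the padded K exactly
        // previously K was padded only to a multiple of AK1 (here 104), so the K dimension
        // was not guaranteed to be a multiple of GemmKPerBlock
        return 0;
    }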
......
@@ -5,17 +5,63 @@
 #include "ck/utility/common_header.hpp"
 #include "ck/utility/math.hpp"
-#include "ck/utility/amd_gemm_dpp.hpp"
+#include "ck/utility/inner_product_dpp8.hpp"

 namespace ck {
 namespace dpp8 {

-/// Number of lanes that can share data using DPP8 modifiers.
-constexpr index_t lane_group_size = 8;
-
-__device__ index_t get_lane_group_local_idx() { return threadIdx.x / lane_group_size; }
-
-__device__ index_t get_thread_idx_in_lane_group() { return threadIdx.x % lane_group_size; }
+template <class ABDataType>
+struct dpp_datatypes;
+
+template <>
+struct dpp_datatypes<half_t>
+{
+    // Dot product of `half2_t` and `half2_t` to get `float`. Reducing 2 elements from K in a
+    // single instruction.
+    using a_dtype = half_t;
+    using b_dtype = half_t;
+    using c_dtype = float;
+
+    static constexpr index_t k_per_instr = 2;
+};
+
+template <index_t MPerThread,
+          index_t NPerThread,
+          index_t KPerThread,
+          class BaseInputType,
+          class AVecDataType,
+          class BVecDataType,
+          class CVecDataType,
+          bool ShareA>
+struct DppLanegroupGemm
+{
+    using datatypes_conf = dpp_datatypes<BaseInputType>;
+
+    using ADataType = typename datatypes_conf::a_dtype;
+    using BDataType = typename datatypes_conf::b_dtype;
+    using CDataType = typename datatypes_conf::c_dtype;
+
+    __device__ void Run(const AVecDataType& a_vec, const BVecDataType& b_vec, CVecDataType& c_vec)
+    {
+        constexpr index_t num_c_elems_per_thread = ShareA ? MPerThread : NPerThread;
+
+        const vector_type<ADataType, KPerThread> a_vector{a_vec};
+        const vector_type<BDataType, KPerThread> b_vector{b_vec};
+
+        static_for<0, num_c_elems_per_thread, 1>{}([&](auto c_idx) {
+            float c = c_vec.template AsType<CDataType>()(c_idx);
+            // Next `c_idx` implies that we need to pull data from the next lane.
+            constexpr index_t source_lane = c_idx;
+            static_for<0, KPerThread / datatypes_conf::k_per_instr, 1>{}([&](auto k_chunk) {
+                const auto a_k_vec = a_vector.template AsType<AVecDataType>()[k_chunk];
+                const auto b_k_vec = b_vector.template AsType<BVecDataType>()[k_chunk];
+                ck::dpp8::
+                    inner_product_dpp<AVecDataType, BVecDataType, CDataType, source_lane, ShareA>(
+                        a_k_vec, b_k_vec, c);
+            });
+            c_vec.template AsType<CDataType>()(c_idx) = c;
+        });
+    }
+};

 } // namespace dpp8
......
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

 #include "amd_gemm_dpp.hpp"
 #include "data_type.hpp"
 #include "type_convert.hpp"
@@ -10,6 +11,9 @@ namespace ck {
 namespace dpp8 {

+/// Number of lanes that can share data using DPP8 modifiers.
+constexpr index_t lane_group_size = 8;
+
 template <int SrcLaneIdx>
 __device__ void inline_v_dot2c_dpp8_instr(const half2_t& a, const half2_t& b, float& c);
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_adaptor.hpp"
namespace ck {
enum struct LoopScheduler
{
Default,
Interwave,
};
constexpr LoopScheduler make_default_loop_scheduler()
{
#if CK_EXPERIMENTAL_DEFAULT_TO_INTER_WAVE_SCHEDULING
return LoopScheduler::Interwave;
#else
return LoopScheduler::Default;
#endif // if CK_EXPERIMENTAL_DEFAULT_TO_INTER_WAVE_SCHEDULING
}
} // namespace ck
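Because the choice is a constexpr folded from the build flag, downstream code can branch on it at compile time. A minimal host-side sketch of consuming the enum (the include path is illustrative):

    #include <cstdio>
    #include "loop_scheduler.hpp" // hypothetical path for the header above

    int main()
    {
        constexpr auto sched = ck::make_default_loop_scheduler();
        std::printf("default inter-wave scheduling: %s\n",
                    sched == ck::LoopScheduler::Interwave ? "on" : "off");
        return 0;
    }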
@@ -116,7 +116,15 @@ struct Max
     template <typename T>
     __host__ __device__ static constexpr T GetIdentityValue()
     {
-        return NumericLimits<T>::Lowest();
+        if constexpr(is_same_v<T, bhalf_t>)
+        {
+            float val = NumericLimits<float>::Lowest();
+            return type_convert<bhalf_t>(val);
+        }
+        else
+        {
+            return NumericLimits<T>::Lowest();
+        }
     };

     __host__ __device__ static constexpr bool
@@ -138,6 +146,15 @@ struct Max
             a = b;
     }

+    __host__ __device__ inline constexpr void operator()(bhalf_t& a, bhalf_t b) const
+    {
+        float a_ = type_convert<float>(a);
+        float b_ = type_convert<float>(b);
+
+        if(a_ < b_)
+            a = b;
+    }
+
     template <typename T>
     __host__ __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
     {
@@ -152,6 +169,18 @@ struct Max
             changed = true;
         }
     }
+
+    __host__ __device__ inline constexpr void operator()(bhalf_t& a, bhalf_t b, bool& changed) const
+    {
+        float a_ = type_convert<float>(a);
+        float b_ = type_convert<float>(b);
+
+        if(a_ < b_)
+        {
+            a = b;
+            changed = true;
+        }
+    }
 };

 struct Min
@@ -159,6 +188,15 @@ struct Min
     template <typename T>
     __host__ __device__ static constexpr T GetIdentityValue()
     {
+        if constexpr(is_same_v<T, bhalf_t>)
+        {
+            float val = NumericLimits<float>::Max();
+            return type_convert<bhalf_t>(val);
+        }
+        else
+        {
+            return NumericLimits<T>::Max();
+        }
         return NumericLimits<T>::Max();
     };
@@ -181,6 +219,15 @@ struct Min
             a = b;
     }

+    __host__ __device__ inline constexpr void operator()(bhalf_t& a, bhalf_t b) const
+    {
+        float a_ = type_convert<float>(a);
+        float b_ = type_convert<float>(b);
+
+        if(a_ > b_)
+            a = b;
+    }
+
     template <typename T>
     __host__ __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
     {
@@ -195,6 +242,18 @@ struct Min
             changed = true;
         }
     }
+
+    __host__ __device__ inline constexpr void operator()(bhalf_t& a, bhalf_t b, bool& changed) const
+    {
+        float a_ = type_convert<float>(a);
+        float b_ = type_convert<float>(b);
+
+        if(a_ > b_)
+        {
+            a = b;
+            changed = true;
+        }
+    }
 };

 struct AMax
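The float detour in the new bhalf_t overloads exists because bhalf_t is stored as a raw 16-bit pattern with no meaningful ordering of its own: comparing the bit patterns directly mis-orders negative values. A small standalone check (bf16_to_float is a stand-in for ck::type_convert<float> on bhalf_t):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    float bf16_to_float(std::uint16_t x)
    {
        std::uint32_t bits = std::uint32_t(x) << 16; // bf16 is the high half of a float
        float f;
        std::memcpy(&f, &bits, sizeof(f));
        return f;
    }

    int main()
    {
        const std::uint16_t a = 0xBF80; // -1.0 in bf16
        const std::uint16_t b = 0x3F80; // +1.0 in bf16
        assert(!(a < b));                            // raw 16-bit compare: wrong order
        assert(bf16_to_float(a) < bf16_to_float(b)); // compare as float: correct
        return 0;
    }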
......
@@ -92,11 +92,11 @@ struct ReferenceGemm : public device::BaseOperator
                     ck::type_convert<AccDataType>(v_a) * ck::type_convert<AccDataType>(v_b);
             }

-            AccDataType v_c;
+            CDataType v_c;

             arg.c_element_op_(v_c, v_acc);

-            arg.c_m_n_(m, n) = ck::type_convert<CDataType>(v_c);
+            arg.c_m_n_(m, n) = v_c;
         };

         make_ParallelTensorFunctor(
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <type_traits>
#include <sstream>
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/library/utility/host_tensor.hpp"
namespace ck {
namespace tensor_operation {
namespace host {
/**
* \brief Reference implementation for image to column.
*
* Tensor descriptor has [G, N, C, Di, Hi, Wi] data layout.
* G must be equal to 1. Memory layout is [G, N, Di, Hi, Wi, C].
*
* \tparam NDimSpatial Number of spatial dimensions.
* \tparam InputLayout Input Layout.
* \tparam InDataType Input Data Type.
* \tparam OutDataType Output Data Type.
*/
template <ck::index_t NDimSpatial,
typename InputLayout,
typename InDataType,
typename OutDataType,
typename std::enable_if<NDimSpatial >= 1 && NDimSpatial <= 3, bool>::type = false>
struct ReferenceImageToColumn : public device::BaseOperator
{
// Argument
struct Argument : public device::BaseArgument
{
public:
Argument(const Tensor<InDataType>& input,
Tensor<OutDataType>& output,
std::vector<ck::index_t> filter_spatial_lengths,
std::vector<ck::index_t> conv_filter_strides,
std::vector<ck::index_t> conv_filter_dilations,
std::vector<ck::index_t> input_left_pads,
std::vector<ck::index_t> input_right_pads)
: input_{input},
output_{output},
conv_strides_{conv_filter_strides},
conv_dilations_{conv_filter_dilations},
in_left_pads_{input_left_pads},
in_right_pads_{input_right_pads},
filter_spatial_lengths_{filter_spatial_lengths}
{
initOutputSpatialLengths();
}
const Tensor<InDataType>& input_;
Tensor<OutDataType>& output_;
std::vector<index_t> conv_strides_;
std::vector<index_t> conv_dilations_;
std::vector<index_t> in_left_pads_;
std::vector<index_t> in_right_pads_;
std::vector<index_t> filter_spatial_lengths_;
std::vector<index_t> output_spatial_lengths_;
private:
void initOutputSpatialLengths()
{
constexpr auto input_offset_to_spatial = 3;
for(ck::index_t i = 0; i < NDimSpatial; ++i)
{
// XEff = (X - 1) * conv_dilation_w + 1;
// Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
const ck::index_t x_eff = (filter_spatial_lengths_[i] - 1) * conv_dilations_[i] + 1;
output_spatial_lengths_.push_back(
(input_.GetLengths()[i + input_offset_to_spatial] + in_left_pads_[i] +
in_right_pads_[i] - x_eff) /
conv_strides_[i] +
1);
}
}
};
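The loop in initOutputSpatialLengths applies the standard convolution shape rule quoted in its comment; a quick standalone check with sample numbers:

    #include <cassert>

    int main()
    {
        const int Wi = 10, X = 3, dilation = 2, stride = 2, left_pad = 1, right_pad = 1;
        const int x_eff = (X - 1) * dilation + 1;                           // effective filter width: 5
        const int Wo    = (Wi + left_pad + right_pad - x_eff) / stride + 1; // (12 - 5) / 2 + 1
        assert(Wo == 4);
        return 0;
    }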
struct Invoker : public device::BaseInvoker
{
using Argument = ReferenceImageToColumn::Argument;
float Run(const Argument& arg)
{
if(!(arg.input_.GetNumOfDimension() == NDimSpatial + 3 &&
arg.output_.GetNumOfDimension() == 2))
{
throw std::runtime_error("wrong! inconsistent dimension");
}
const index_t N = arg.input_.GetLengths()[1];
const index_t C = arg.input_.GetLengths()[2];
if constexpr(NDimSpatial == 1)
{
const index_t Wo = arg.output_spatial_lengths_[0];
auto func = [&](auto n, auto wo) {
index_t row = n * Wo + wo;
index_t column = 0;
for(index_t x = 0; x < arg.filter_spatial_lengths_[0]; ++x)
{
auto wi = static_cast<ck::long_index_t>(wo * arg.conv_strides_[0]) +
static_cast<ck::long_index_t>(x * arg.conv_dilations_[0]) -
static_cast<ck::long_index_t>(arg.in_left_pads_[0]);
for(index_t c = 0; c < C; ++c)
{
if(wi >= 0 &&
ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[3])
{
InDataType v_in = arg.input_(0, n, c, wi);
arg.output_(row, column) = ck::type_convert<OutDataType>(v_in);
}
column++;
}
}
};
make_ParallelTensorFunctor(func, N, Wo)(std::thread::hardware_concurrency());
return 0;
}
else if constexpr(NDimSpatial == 2)
{
const index_t Ho = arg.output_spatial_lengths_[0];
const index_t Wo = arg.output_spatial_lengths_[1];
auto func = [&](auto n, auto ho, auto wo) {
index_t row = n * Ho * Wo + ho * Wo + wo;
index_t column = 0;
for(index_t y = 0; y < arg.filter_spatial_lengths_[0]; ++y)
{
auto hi = static_cast<ck::long_index_t>(ho * arg.conv_strides_[0]) +
static_cast<ck::long_index_t>(y * arg.conv_dilations_[0]) -
static_cast<ck::long_index_t>(arg.in_left_pads_[0]);
for(index_t x = 0; x < arg.filter_spatial_lengths_[1]; ++x)
{
auto wi = static_cast<ck::long_index_t>(wo * arg.conv_strides_[1]) +
static_cast<ck::long_index_t>(x * arg.conv_dilations_[1]) -
static_cast<ck::long_index_t>(arg.in_left_pads_[1]);
for(index_t c = 0; c < C; ++c)
{
if(hi >= 0 &&
ck::type_convert<std::size_t>(hi) < arg.input_.GetLengths()[3] &&
wi >= 0 &&
ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[4])
{
InDataType v_in = arg.input_(0, n, c, hi, wi);
arg.output_(row, column) = ck::type_convert<OutDataType>(v_in);
}
column++;
}
}
}
};
make_ParallelTensorFunctor(func, N, Ho, Wo)(std::thread::hardware_concurrency());
return 0;
}
else if constexpr(NDimSpatial == 3)
{
const index_t Do = arg.output_spatial_lengths_[0];
const index_t Ho = arg.output_spatial_lengths_[1];
const index_t Wo = arg.output_spatial_lengths_[2];
auto func = [&](auto n, auto d_o, auto ho, auto wo) {
index_t row = n * Do * Ho * Wo + d_o * Ho * Wo + ho * Wo + wo;
index_t column = 0;
for(index_t z = 0; z < arg.filter_spatial_lengths_[0]; ++z)
{
auto di = static_cast<ck::long_index_t>(d_o * arg.conv_strides_[0]) +
static_cast<ck::long_index_t>(z * arg.conv_dilations_[0]) -
static_cast<ck::long_index_t>(arg.in_left_pads_[0]);
for(index_t y = 0; y < arg.filter_spatial_lengths_[1]; ++y)
{
auto hi = static_cast<ck::long_index_t>(ho * arg.conv_strides_[1]) +
static_cast<ck::long_index_t>(y * arg.conv_dilations_[1]) -
static_cast<ck::long_index_t>(arg.in_left_pads_[1]);
for(index_t x = 0; x < arg.filter_spatial_lengths_[2]; ++x)
{
auto wi =
static_cast<ck::long_index_t>(wo * arg.conv_strides_[2]) +
static_cast<ck::long_index_t>(x * arg.conv_dilations_[2]) -
static_cast<ck::long_index_t>(arg.in_left_pads_[2]);
for(index_t c = 0; c < C; ++c)
{
if(di >= 0 &&
ck::type_convert<std::size_t>(di) <
arg.input_.GetLengths()[3] &&
hi >= 0 &&
ck::type_convert<std::size_t>(hi) <
arg.input_.GetLengths()[4] &&
wi >= 0 &&
ck::type_convert<std::size_t>(wi) <
arg.input_.GetLengths()[5])
{
InDataType v_in = arg.input_(0, n, c, di, hi, wi);
arg.output_(row, column) =
ck::type_convert<OutDataType>(v_in);
}
column++;
}
}
}
}
};
make_ParallelTensorFunctor(func, N, Do, Ho, Wo)(
std::thread::hardware_concurrency());
return 0;
}
}
float Run(const device::BaseArgument* p_arg,
const StreamConfig& /*stream_config*/ = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
}
};
static constexpr bool IsValidCompilationParameter()
{
using namespace tensor_layout::convolution;
if constexpr(!(std::is_same_v<InputLayout, GNWC> || std::is_same_v<InputLayout, GNHWC> ||
std::is_same_v<InputLayout, GNDHWC>))
{
return false;
}
if constexpr(!(NDimSpatial >= 1 && NDimSpatial <= 3))
{
return false;
}
return true;
}
bool IsSupportedArgument(const Argument& arg)
{
const ck::index_t G = arg.input_.GetLengths()[0];
const ck::index_t N = arg.input_.GetLengths()[1];
const ck::index_t C = arg.input_.GetLengths()[2];
const index_t NDoHoWo =
N * ck::accumulate_n<index_t>(
arg.output_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
const index_t CZYX =
C * ck::accumulate_n<index_t>(
arg.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
if(!(arg.output_.GetLengths()[0] == static_cast<std::size_t>(NDoHoWo) &&
arg.output_.GetLengths()[1] == static_cast<std::size_t>(CZYX)))
{
return false;
}
if(G != 1)
{
return false;
}
return true;
}
bool IsSupportedArgument(const device::BaseArgument* p_arg) override
{
return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
}
static auto MakeArgument(const Tensor<InDataType>& input,
Tensor<OutDataType>& output,
std::vector<ck::index_t> filter_spatial_lengths,
std::vector<ck::index_t> conv_filter_strides,
std::vector<ck::index_t> conv_filter_dilations,
std::vector<ck::index_t> input_left_pads,
std::vector<ck::index_t> input_right_pads)
{
return Argument{input,
output,
filter_spatial_lengths,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads};
}
static auto MakeInvoker() { return Invoker{}; }
virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
{
return std::make_unique<Invoker>(Invoker{});
}
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "ReferenceImageToColumn"
<< std::endl;
// clang-format on
return str.str();
}
};
} // namespace host
} // namespace tensor_operation
} // namespace ck
@@ -53,7 +53,16 @@ struct ReferenceMaxPoolBwd : public device::BaseOperator
         {
             int index = arg.indices_.mData[i];
             if(index >= 0 && index < din_length)
-                buf[index] += ck::type_convert<ConputeDataType>(arg.dout_.mData[i]);
+            {
+                if constexpr(is_same_v<ConputeDataType, bhalf_t>)
+                {
+                    float buf_val = ck::type_convert<float>(buf[index]);
+                    buf_val += ck::type_convert<float>(arg.dout_.mData[i]);
+                    buf[index] = ck::type_convert<ConputeDataType>(buf_val);
+                }
+                else
+                    buf[index] += ck::type_convert<ConputeDataType>(arg.dout_.mData[i]);
+            }
         }

         for(int i = 0; i < din_length; ++i)
......
@@ -256,10 +256,12 @@ struct ReferencePoolingFwd : public device::BaseOperator
                 for(ck::index_t y = 0; y < arg.window_spatial_lengths_[0]; ++y)
                 {
-                    ck::index_t hi = ho * arg.window_strides_[0] + y - arg.in_left_pads_[0];
+                    ck::index_t hi = ho * arg.window_strides_[0] +
+                                     y * arg.window_dilations_[0] - arg.in_left_pads_[0];
                     for(ck::index_t x = 0; x < arg.window_spatial_lengths_[1]; ++x)
                     {
-                        ck::index_t wi = wo * arg.window_strides_[1] + x - arg.in_left_pads_[1];
+                        ck::index_t wi = wo * arg.window_strides_[1] +
+                                         x * arg.window_dilations_[1] - arg.in_left_pads_[1];
                         if(hi >= 0 &&
                            hi < static_cast<ck::index_t>(arg.in_.mDesc.GetLengths()[2]) &&
                            wi >= 0 &&
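The fix folds window dilation into the input index; previously a dilated window sampled adjacent rows and columns as if the dilation were 1. A quick check with sample numbers:

    #include <cassert>

    int main()
    {
        const int ho = 2, stride = 2, dilation = 3, left_pad = 1, y = 1;
        const int hi_old = ho * stride + y - left_pad;            // 4: ignores dilation
        const int hi_new = ho * stride + y * dilation - left_pad; // 6: steps by the dilation
        assert(hi_old == 4 && hi_new == 6);
        return 0;
    }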
......
@@ -31,6 +31,9 @@ using F64_Tuple = ck::Tuple<F64>;
 using F32_Tuple = ck::Tuple<F32>;
 using I32_Tuple = ck::Tuple<I32>;
 using I32_F32_Tuple = ck::Tuple<I32, F32>;
+using I8_Tuple = ck::Tuple<I8>;
+using F32_F32_Tuple = ck::Tuple<F32, F32>;

 // GEMM layout
 using Row = ck::tensor_layout::gemm::RowMajor;
@@ -95,9 +98,11 @@ using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu;
 using AddReluAdd = ck::tensor_operation::element_wise::AddReluAdd;
 using FastGelu = ck::tensor_operation::element_wise::FastGelu;
 using AddMultiply = ck::tensor_operation::element_wise::AddMultiply;
+using MultiplyAdd = ck::tensor_operation::element_wise::MultiplyAdd;
 using ScaleAdd = ck::tensor_operation::element_wise::ScaleAdd;
 using Gelu = ck::tensor_operation::element_wise::Gelu;
 using Swish = ck::tensor_operation::element_wise::Swish;
+using Add = ck::tensor_operation::element_wise::Add;

 template <typename Activation>
 using Activation_Mul_Clamp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<Activation>;
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/tensor_operation/gpu/device/device_avgpool_bwd.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
#ifdef CK_ENABLE_FP16
void add_device_avgpool_bwd_ndhwc_f16_instances(
std::vector<std::unique_ptr<DeviceAvgPoolBwd<3, F16, F16, NDHWC, NDHWC>>>&);
#endif
#ifdef CK_ENABLE_BF16
void add_device_avgpool_bwd_ndhwc_bf16_instances(
std::vector<std::unique_ptr<DeviceAvgPoolBwd<3, BF16, BF16, NDHWC, NDHWC>>>&);
#endif
#ifdef CK_ENABLE_FP32
void add_device_avgpool_bwd_ndhwc_f32_instances(
std::vector<std::unique_ptr<DeviceAvgPoolBwd<3, F32, F32, NDHWC, NDHWC>>>&);
#endif
template <typename DOutDataType, typename DInDataType, typename InLayout, typename OutLayout>
struct DeviceOperationInstanceFactory<
ck::tensor_operation::device::
DeviceAvgPoolBwd<3, DOutDataType, DInDataType, InLayout, OutLayout>>
{
using DeviceOp = DeviceAvgPoolBwd<3, DOutDataType, DInDataType, InLayout, OutLayout>;
static auto GetInstances()
{
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
if constexpr(is_same_v<InLayout, NDHWC> && is_same_v<OutLayout, NDHWC>)
{
#ifdef CK_ENABLE_FP16
if constexpr(is_same_v<DOutDataType, F16> && is_same_v<DInDataType, F16>)
add_device_avgpool_bwd_ndhwc_f16_instances(op_ptrs);
#endif
#ifdef CK_ENABLE_BF16
else if constexpr(is_same_v<DOutDataType, BF16> && is_same_v<DInDataType, BF16>)
add_device_avgpool_bwd_ndhwc_bf16_instances(op_ptrs);
#endif
#ifdef CK_ENABLE_FP32
else if constexpr(is_same_v<DOutDataType, F32> && is_same_v<DInDataType, F32>)
add_device_avgpool_bwd_ndhwc_f32_instances(op_ptrs);
#endif
}
return op_ptrs;
}
};
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
@@ -23,7 +23,7 @@ void add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances(
         DeviceGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
         instances);

-void add_device_gemm_dl_dpp8_f16_f16_f16_km_kn_mn_instances(
+void add_device_gemm_dpp_f16_f16_f16_km_kn_mn_instances(
     std::vector<std::unique_ptr<
         DeviceGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
         instances);
@@ -38,7 +38,7 @@ void add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances(
         DeviceGemm<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
         instances);

-void add_device_gemm_dl_dpp8_f16_f16_f16_km_nk_mn_instances(
+void add_device_gemm_dpp_f16_f16_f16_km_nk_mn_instances(
     std::vector<std::unique_ptr<
         DeviceGemm<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
         instances);
@@ -53,7 +53,7 @@ void add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances(
         DeviceGemm<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
         instances);

-void add_device_gemm_dl_dpp8_f16_f16_f16_mk_kn_mn_instances(
+void add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_instances(
     std::vector<std::unique_ptr<
         DeviceGemm<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
         instances);
@@ -68,7 +68,7 @@ void add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances(
         DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
         instances);

-void add_device_gemm_dl_dpp8_f16_f16_f16_mk_nk_mn_instances(
+void add_device_gemm_dpp_f16_f16_f16_mk_nk_mn_instances(
     std::vector<std::unique_ptr<
         DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
         instances);
@@ -374,7 +374,7 @@ struct DeviceOperationInstanceFactory<
 #ifdef DL_KERNELS
                 add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
                 add_device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instances(op_ptrs);
-                add_device_gemm_dl_dpp8_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
+                add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
 #endif
                 add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
             }
@@ -385,7 +385,7 @@ struct DeviceOperationInstanceFactory<
 #ifdef DL_KERNELS
                 add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
                 add_device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instances(op_ptrs);
-                add_device_gemm_dl_dpp8_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
+                add_device_gemm_dpp_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
 #endif
                 add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
                 add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
@@ -397,7 +397,7 @@ struct DeviceOperationInstanceFactory<
 #ifdef DL_KERNELS
                 add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances(op_ptrs);
                 add_device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instances(op_ptrs);
-                add_device_gemm_dl_dpp8_f16_f16_f16_km_kn_mn_instances(op_ptrs);
+                add_device_gemm_dpp_f16_f16_f16_km_kn_mn_instances(op_ptrs);
 #endif
                 add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances(op_ptrs);
             }
@@ -408,7 +408,7 @@ struct DeviceOperationInstanceFactory<
 #ifdef DL_KERNELS
                 add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances(op_ptrs);
                 add_device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instances(op_ptrs);
-                add_device_gemm_dl_dpp8_f16_f16_f16_km_nk_mn_instances(op_ptrs);
+                add_device_gemm_dpp_f16_f16_f16_km_nk_mn_instances(op_ptrs);
 #endif
                 add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(op_ptrs);
             }
......
@@ -69,6 +69,58 @@ void add_device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance
                                                    PassThrough,
                                                    Bilinear>>>& instances);

+void add_device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_kn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleD<Row,
+                                                    Row,
+                                                    Row_Tuple,
+                                                    Row,
+                                                    I8,
+                                                    I8,
+                                                    I8_Tuple,
+                                                    I8,
+                                                    PassThrough,
+                                                    PassThrough,
+                                                    Bilinear>>>& instances);
+
+void add_device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_nk_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleD<Row,
+                                                    Col,
+                                                    Row_Tuple,
+                                                    Row,
+                                                    I8,
+                                                    I8,
+                                                    I8_Tuple,
+                                                    I8,
+                                                    PassThrough,
+                                                    PassThrough,
+                                                    Bilinear>>>& instances);
+
+void add_device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_kn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleD<Col,
+                                                    Row,
+                                                    Row_Tuple,
+                                                    Row,
+                                                    I8,
+                                                    I8,
+                                                    I8_Tuple,
+                                                    I8,
+                                                    PassThrough,
+                                                    PassThrough,
+                                                    Bilinear>>>& instances);
+
+void add_device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_nk_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleD<Col,
+                                                    Col,
+                                                    Row_Tuple,
+                                                    Row,
+                                                    I8,
+                                                    I8,
+                                                    I8_Tuple,
+                                                    I8,
+                                                    PassThrough,
+                                                    PassThrough,
+                                                    Bilinear>>>& instances);
+
 // GEMM + Bilinear
 template <typename ALayout,
           typename BLayout,
@@ -135,6 +187,30 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
                     op_ptrs);
             }
         }
+        else if constexpr(is_same_v<ADataType, std::int8_t> && is_same_v<BDataType, std::int8_t> &&
+                          is_same_v<DDataType, std::int8_t> && is_same_v<EDataType, std::int8_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<DLayout, Row> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_kn_mn_mn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                              is_same_v<DLayout, Row> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_nk_mn_mn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
+                              is_same_v<DLayout, Row> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_kn_mn_mn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
+                              is_same_v<DLayout, Row> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_nk_mn_mn_instances(op_ptrs);
+            }
+        }

         return op_ptrs;
     }
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstdlib>
#include <vector>
#include <memory>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_gemm_multiply_add_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instances(
std::vector<std::unique_ptr<DeviceGemmMultipleD<Row,
Row,
Row_Row_Tuple,
Row,
F16,
F16,
F16_F16_Tuple,
F16,
PassThrough,
PassThrough,
MultiplyAdd>>>&);
void add_device_gemm_multiply_add_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instances(
std::vector<std::unique_ptr<DeviceGemmMultipleD<Row,
Col,
Row_Row_Tuple,
Row,
F16,
F16,
F16_F16_Tuple,
F16,
PassThrough,
PassThrough,
MultiplyAdd>>>&);
void add_device_gemm_multiply_add_xdl_c_shuffle_f16_f8_f32_f32_f16_mk_kn_mn_mn_mn_instances(
std::vector<std::unique_ptr<DeviceGemmMultipleD<Row,
Row,
Row_Row_Tuple,
Row,
F16,
F8,
F32_F32_Tuple,
F16,
PassThrough,
PassThrough,
MultiplyAdd>>>&);
void add_device_gemm_multiply_add_xdl_c_shuffle_f16_f8_f32_f32_f16_mk_nk_mn_mn_mn_instances(
std::vector<std::unique_ptr<DeviceGemmMultipleD<Row,
Col,
Row_Row_Tuple,
Row,
F16,
F8,
F32_F32_Tuple,
F16,
PassThrough,
PassThrough,
MultiplyAdd>>>&);
// GEMM + Multiply + Add
template <typename ALayout,
typename BLayout,
typename D0Layout,
typename D1Layout,
typename ELayout,
typename ADataType,
typename BDataType,
typename D0DataType,
typename D1DataType,
typename EDataType>
struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMultipleD<
ALayout,
BLayout,
ck::Tuple<D0Layout, D1Layout>,
ELayout,
ADataType,
BDataType,
ck::Tuple<D0DataType, D1DataType>,
EDataType,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::MultiplyAdd>>
{
using DeviceOp = DeviceGemmMultipleD<ALayout,
BLayout,
ck::Tuple<D0Layout, D1Layout>,
ELayout,
ADataType,
BDataType,
ck::Tuple<D0DataType, D1DataType>,
EDataType,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::MultiplyAdd>;
static auto GetInstances()
{
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
is_same_v<D0DataType, half_t> && is_same_v<D1DataType, half_t> &&
is_same_v<EDataType, half_t>)
{
if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
is_same_v<D0Layout, Row> && is_same_v<D1Layout, Row> &&
is_same_v<ELayout, Row>)
{
add_device_gemm_multiply_add_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instances(
op_ptrs);
}
else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
is_same_v<D0Layout, Row> && is_same_v<D1Layout, Row> &&
is_same_v<ELayout, Row>)
{
add_device_gemm_multiply_add_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instances(
op_ptrs);
}
}
if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, f8_t> &&
is_same_v<D0DataType, float> && is_same_v<D1DataType, float> &&
is_same_v<EDataType, half_t>)
{
if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
is_same_v<D0Layout, Row> && is_same_v<D1Layout, Row> &&
is_same_v<ELayout, Row>)
{
add_device_gemm_multiply_add_xdl_c_shuffle_f16_f8_f32_f32_f16_mk_kn_mn_mn_mn_instances(
op_ptrs);
}
else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
is_same_v<D0Layout, Row> && is_same_v<D1Layout, Row> &&
is_same_v<ELayout, Row>)
{
add_device_gemm_multiply_add_xdl_c_shuffle_f16_f8_f32_f32_f16_mk_nk_mn_mn_mn_instances(
op_ptrs);
}
}
return op_ptrs;
}
};
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
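For reference, the MultiplyAdd epilogue combines the GEMM result with the two D tensors element-wise; a plain sketch of one element, assuming CK's usual definition y = x0 * x1 + x2 for element_wise::MultiplyAdd:

    #include <cassert>

    int main()
    {
        const float c = 2.0f, d0 = 3.0f, d1 = 1.0f; // GEMM output and the two D operands
        const float e = c * d0 + d1;                // MultiplyAdd: e = c * d0 + d1
        assert(e == 7.0f);
        return 0;
    }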