Commit d27e0691 authored by Chao Liu

Merge remote-tracking branch 'upstream/develop' into merge_upstream_1129

also fix regression
parents 0a7174ad a2969aa8
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_adaptor.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/warp/dpp_gemm.hpp"
namespace ck {
/**
* Blockwise GEMM that uses the DPP instruction modifier to limit the amount of data loaded by
* each thread, sharing data between the threads of a lanegroup.
*
* In every iteration, each wave calculates a C tile of size `MPerDpp` * `NPerDpp`; there are
* `MRepeat` iterations along the `M` dimension and `NRepeat` along the `N` dimension.
* In total, the algorithm runs using
* `MPerBlock / (MRepeat * MPerDpp) * NPerBlock / (NRepeat * NPerDpp)` waves.
*/
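// A sizing sketch with illustrative numbers (not a configuration taken from this file):
// for MPerBlock = NPerBlock = 128, MPerDpp = NPerDpp = 32 and MRepeat = NRepeat = 2,
// MWaves = 128 / (2 * 32) = 2 and NWaves = 128 / (2 * 32) = 2, so the block is executed by
// 2 * 2 = 4 waves and each wave accumulates a 2 x 2 grid of 32 x 32 C sub-tiles.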
template <index_t BlockSize,
typename ABDataType,
typename AccDataType,
typename AK0MK1BlockDesc,
typename BK0NK1BlockDesc,
index_t MPerDpp,
index_t NPerDpp,
index_t MRepeat,
index_t NRepeat,
index_t KPack>
struct BlockwiseGemmDpp_ak0mak1_bk0nbk1_m0n0m1n1m2n2
{
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
using ThisThreadBlock = ThisThreadBlock<BlockSize>;
static constexpr index_t WaveSize = get_warp_size();
static constexpr index_t MPerBlock = AK0MK1BlockDesc{}.GetLength(I1);
static constexpr index_t NPerBlock = BK0NK1BlockDesc{}.GetLength(I1);
static constexpr index_t KPerBlock =
BK0NK1BlockDesc{}.GetLength(I0) * BK0NK1BlockDesc{}.GetLength(I2);
static constexpr index_t A_K0 = AK0MK1BlockDesc{}.GetLength(I0);
static constexpr index_t B_K0 = BK0NK1BlockDesc{}.GetLength(I0);
static constexpr index_t A_K1 = AK0MK1BlockDesc{}.GetLength(I2);
static constexpr index_t B_K1 = BK0NK1BlockDesc{}.GetLength(I2);
static constexpr auto dpp_gemm = DppGemm<ABDataType, MPerDpp, NPerDpp, KPack>{};
static constexpr index_t KPerThread = KPerBlock / dpp_gemm.K0PerDpp;
static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerDpp);
static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerDpp);
StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr,
AccDataType,
MRepeat * NRepeat,
dpp_gemm.GetRegSizePerDpp(),
true>
c_thread_buf_;
__host__ __device__ constexpr auto& GetCThreadBuffer() { return c_thread_buf_; }
__device__ static auto GetWaveIdx()
{
const index_t thread_id = ThisThreadBlock::GetThreadId();
constexpr auto threadid_to_wave_idx_adaptor = make_single_stage_tensor_adaptor(
make_tuple(make_merge_transform(make_tuple(MWaves, NWaves, WaveSize))),
make_tuple(Sequence<0, 1, 2>{}),
make_tuple(Sequence<0>{}));
return threadid_to_wave_idx_adaptor.CalculateBottomIndex(make_multi_index(thread_id));
}
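    // For example (illustrative numbers): with MWaves = NWaves = 2 and WaveSize = 64,
    // threads 0-63 map to wave index (0, 0), threads 64-127 to (0, 1), threads 128-191 to
    // (1, 0) and threads 192-255 to (1, 1); the last component is the lane id within the wave.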
__device__ static auto CalculateAThreadOriginDataIndex_M0_M1_M2_K()
{
const auto wave_idx = GetWaveIdx();
const auto waveId_m = wave_idx[I0];
const auto dpp_a_idx = dpp_gemm.CalculateAThreadOriginDataIndex_K_M();
const auto dpp_a_idx_k = dpp_a_idx[I0];
const auto dpp_a_idx_m = dpp_a_idx[I1];
return make_multi_index(0, waveId_m, dpp_a_idx_m, KPerThread * dpp_a_idx_k);
}
__device__ static auto CalculateBThreadOriginDataIndex_N0_N1_N2_K()
{
const auto wave_idx = GetWaveIdx();
const auto waveId_n = wave_idx[I1];
const auto dpp_b_idx = dpp_gemm.CalculateBThreadOriginDataIndex_K_N();
const auto dpp_b_idx_k = dpp_b_idx[I0];
const auto dpp_b_idx_n = dpp_b_idx[I1];
return make_multi_index(0, waveId_n, dpp_b_idx_n, KPerThread * dpp_b_idx_k);
}
template <index_t m0, index_t n0>
__device__ static auto CalculateCThreadOriginDataIndex(Number<m0>, Number<n0>)
{
const auto wave_idx = GetWaveIdx();
const auto waveId_m = wave_idx[I0];
const auto waveId_n = wave_idx[I1];
const auto blk_idx = dpp_gemm.GetBeginOfThreadBlk();
const auto blk_m_offset = blk_idx[I0];
const auto blk_n_offset = blk_idx[I1];
constexpr auto mrepeat_mwave_MPerDpp_to_m_adaptor = make_single_stage_tensor_adaptor(
make_tuple(make_unmerge_transform(make_tuple(MRepeat, MWaves, MPerDpp))),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0, 1, 2>{}));
constexpr auto nrepeat_nwave_NPerDpp_to_n_adaptor = make_single_stage_tensor_adaptor(
make_tuple(make_unmerge_transform(make_tuple(NRepeat, NWaves, NPerDpp))),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0, 1, 2>{}));
const index_t c_thread_m = mrepeat_mwave_MPerDpp_to_m_adaptor.CalculateBottomIndex(
make_tuple(m0, waveId_m, blk_m_offset))[I0];
const index_t c_thread_n = nrepeat_nwave_NPerDpp_to_n_adaptor.CalculateBottomIndex(
make_tuple(n0, waveId_n, blk_n_offset))[I0];
return make_tuple(c_thread_m, c_thread_n);
}
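    // Equivalently, the two adaptors above compute (a sketch of the unmerge transforms):
    //   c_thread_m = (m0 * MWaves + waveId_m) * MPerDpp + blk_m_offset
    //   c_thread_n = (n0 * NWaves + waveId_n) * NPerDpp + blk_n_offset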
__host__ __device__ BlockwiseGemmDpp_ak0mak1_bk0nbk1_m0n0m1n1m2n2()
{
static_assert(AK0MK1BlockDesc::IsKnownAtCompileTime() &&
BK0NK1BlockDesc::IsKnownAtCompileTime(),
"Wrong! Block descriptors should be known at the time of compilation.");
#if defined(__HIP_DEVICE_COMPILE__)
// The host wave size can differ from the device one, so this assert could fail on the host,
// but it only matters for the device.
static_assert(ThisThreadBlock::GetNumOfThread() == MWaves * NWaves * WaveSize,
"ThisThreadBlock::GetNumOfThread() != MWaves * NWaves * WaveSize\n");
#endif
static_assert(MPerBlock % (MPerDpp * MRepeat) == 0,
"Invalid parameters. MPerBlock must be divisible by MPerDpp * MRepeat.");
static_assert(NPerBlock % (NPerDpp * NRepeat) == 0,
"Invalid parameters. NPerBlock must be divisible by NPerDpp * NRepeat.");
}
__host__ __device__ static constexpr auto GetCThreadDescriptor_M0_N0_M1_N1_M2_N2()
{
constexpr auto c_m_n_tblk_lens = dpp_gemm.GetCMNThreadBlkLengths();
constexpr auto M = c_m_n_tblk_lens[I0];
constexpr auto N = c_m_n_tblk_lens[I1];
return make_naive_tensor_descriptor_packed(
make_tuple(Number<MRepeat>{}, Number<NRepeat>{}, I1, I1, M, N));
}
__host__ __device__ static constexpr auto GetCThreadDescriptor_G_M0_N0_M1_N1_M2_N2()
{
constexpr auto c_m_n_tblk_lens = dpp_gemm.GetCMNThreadBlkLengths();
constexpr auto M = c_m_n_tblk_lens[I0];
constexpr auto N = c_m_n_tblk_lens[I1];
return make_naive_tensor_descriptor_packed(
make_tuple(I1, Number<MRepeat>{}, Number<NRepeat>{}, I1, I1, M, N));
}
__host__ __device__ static constexpr auto GetCBlockDescriptor_M0_N0_M1_N1_M2_N2()
{
constexpr auto c_block_desc_m0_n0_m1_n1_m2_n2 =
make_naive_tensor_descriptor_packed(make_tuple(Number<MRepeat>{},
Number<NRepeat>{},
Number<MWaves>{},
Number<NWaves>{},
Number<MPerDpp>{},
Number<NPerDpp>{}));
return c_block_desc_m0_n0_m1_n1_m2_n2;
}
__host__ __device__ static constexpr auto GetCBlockDescriptor_G_M0_N0_M1_N1_M2_N2()
{
constexpr auto c_block_desc_g_m0_n0_m1_n1_m2_n2 =
make_naive_tensor_descriptor_packed(make_tuple(I1,
Number<MRepeat>{},
Number<NRepeat>{},
Number<MWaves>{},
Number<NWaves>{},
Number<MPerDpp>{},
Number<NPerDpp>{}));
return c_block_desc_g_m0_n0_m1_n1_m2_n2;
}
template <typename CGridDesc_M_N>
__host__ __device__ static constexpr auto
MakeCGridDescriptor_M0_N0_M1_N1_M2_N2(const CGridDesc_M_N& c_grid_desc_m_n)
{
const auto M = c_grid_desc_m_n.GetLength(I0);
const auto N = c_grid_desc_m_n.GetLength(I1);
const auto c_grid_desc_m0_n0_m1_n1_m2_n2 = transform_tensor_descriptor(
c_grid_desc_m_n,
make_tuple(make_unmerge_transform(make_tuple(M / (MWaves * MPerDpp), MWaves, MPerDpp)),
make_unmerge_transform(make_tuple(N / (NWaves * NPerDpp), NWaves, NPerDpp))),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 2, 4>{}, Sequence<1, 3, 5>{}));
return c_grid_desc_m0_n0_m1_n1_m2_n2;
}
template <typename CGridDesc_G_M_N>
__host__ __device__ static constexpr auto
MakeCGridDescriptor_G_M0_N0_M1_N1_M2_N2(const CGridDesc_G_M_N& c_grid_desc_g_m_n)
{
const auto G = c_grid_desc_g_m_n.GetLength(I0);
const auto M = c_grid_desc_g_m_n.GetLength(I1);
const auto N = c_grid_desc_g_m_n.GetLength(I2);
const auto c_grid_desc_g_m0_n0_m1_n1_m2_n2 = transform_tensor_descriptor(
c_grid_desc_g_m_n,
make_tuple(make_pass_through_transform(G),
make_unmerge_transform(make_tuple(M / (MWaves * MPerDpp), MWaves, MPerDpp)),
make_unmerge_transform(make_tuple(N / (NWaves * NPerDpp), NWaves, NPerDpp))),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1, 3, 5>{}, Sequence<2, 4, 6>{}));
return c_grid_desc_g_m0_n0_m1_n1_m2_n2;
}
__host__ __device__ static constexpr auto MakeABlockDescriptor_M0_M1_M2_K()
{
return transform_tensor_descriptor(
AK0MK1BlockDesc{},
make_tuple(
make_merge_transform_v3_division_mod(make_tuple(Number<A_K0>{}, Number<A_K1>{})),
make_unmerge_transform(
make_tuple(Number<MRepeat>{}, Number<MWaves>{}, Number<MPerDpp>{}))),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}),
make_tuple(Sequence<3>{}, Sequence<0, 1, 2>{}));
}
__host__ __device__ static constexpr auto MakeBBlockDescriptor_N0_N1_N2_K()
{
return transform_tensor_descriptor(
BK0NK1BlockDesc{},
make_tuple(
make_merge_transform_v3_division_mod(make_tuple(Number<B_K0>{}, Number<B_K1>{})),
make_unmerge_transform(
make_tuple(Number<NRepeat>{}, Number<NWaves>{}, Number<NPerDpp>{}))),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}),
make_tuple(Sequence<3>{}, Sequence<0, 1, 2>{}));
}
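    // Both Make*BlockDescriptor functions above follow the same pattern: the K0 and K1 axes of
    // the K0 x M x K1 (or K0 x N x K1) block tile are merged into a single K axis, while the M
    // (or N) axis is unmerged into (Repeat, Waves, PerDpp) so that each wave and thread can
    // address its own slice.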
static constexpr auto a_block_desc_m0_m1_m2_k = MakeABlockDescriptor_M0_M1_M2_K();
static constexpr auto b_block_desc_n0_n1_n2_k = MakeBBlockDescriptor_N0_N1_N2_K();
template <typename ABlockBuffer, typename BBlockBuffer, typename CThreadBuffer>
__device__ void Run(const ABlockBuffer& a_block_buf,
const BBlockBuffer& b_block_buf,
CThreadBuffer& c_thread_buf) const
{
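        // Loop sketch: for each m0 repeat, copy one A slice from LDS into registers; for each n0
        // repeat, copy one B slice; then feed the two register slices to dpp_gemm.Run() in
        // chunks of KPack elements along K, accumulating into c_thread_buf.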
auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ABDataType>(
a_thread_desc_.GetElementSpaceSize());
auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ABDataType>(
b_thread_desc_.GetElementSpaceSize());
static_for<0, MRepeat, 1>{}([&](auto m0) {
// read A
a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
make_tuple(m0, I0, I0, I0),
a_block_buf,
a_thread_desc_,
make_tuple(I0, I0, I0, I0),
a_thread_buf);
static_for<0, NRepeat, 1>{}([&](auto n0) {
// read B
b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
make_tuple(n0, I0, I0, I0),
b_block_buf,
b_thread_desc_,
make_tuple(I0, I0, I0, I0),
b_thread_buf);
static_for<0, KPerThread, KPack>{}([&](auto k) {
vector_type<ABDataType, KPack> a_thread_vec;
vector_type<ABDataType, KPack> b_thread_vec;
static_for<0, KPack, 1>{}([&](auto i) {
a_thread_vec.template AsType<ABDataType>()(i) = a_thread_buf
[Number<a_thread_desc_.CalculateOffset(make_tuple(0, 0, 0, k + i))>{}];
b_thread_vec.template AsType<ABDataType>()(i) = b_thread_buf
[Number<b_thread_desc_.CalculateOffset(make_tuple(0, 0, 0, k + i))>{}];
});
using dpp_input_type =
typename vector_type<ABDataType, dpp_gemm.K1PerDpp>::type;
constexpr index_t c_offset =
c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
dpp_gemm.template Run(a_thread_vec.template AsType<dpp_input_type>(),
b_thread_vec.template AsType<dpp_input_type>(),
c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
});
});
});
}
protected:
// A[M0, M1, M2, KPerThread]
static constexpr auto a_thread_desc_ =
make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number<KPerThread>{}));
// B[N0, N1, N2, KPerThread]
static constexpr auto b_thread_desc_ =
make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number<KPerThread>{}));
// C[M, N, NumRegDpp]
static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed(
make_tuple(Number<MRepeat>{}, Number<NRepeat>{}, dpp_gemm.GetRegSizePerDpp()));
using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<ABDataType,
ABDataType,
decltype(a_block_desc_m0_m1_m2_k),
decltype(a_thread_desc_),
Sequence<1, 1, 1, KPerThread>,
Sequence<0, 1, 2, 3>,
3,
A_K1,
A_K1>;
using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<ABDataType,
ABDataType,
decltype(b_block_desc_n0_n1_n2_k),
decltype(b_thread_desc_),
Sequence<1, 1, 1, KPerThread>,
Sequence<0, 1, 2, 3>,
3,
B_K1,
B_K1>;
AThreadCopy a_thread_copy_{CalculateAThreadOriginDataIndex_M0_M1_M2_K()};
BThreadCopy b_thread_copy_{CalculateBThreadOriginDataIndex_N0_N1_N2_K()};
};
} // namespace ck
@@ -221,49 +221,102 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle
         auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatB>(
             b_thread_desc_.GetElementSpaceSize());
 
-        static_for<0, KPerBlock / WmmaK, 1>{}([&](auto k) { // k=0,1,2 instead of k=0,kpack*1, ...
-            static_for<0, MRepeat, 1>{}([&](auto m0) {
-                // read A
-                a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1,
-                                   make_tuple(Number<k * WmmaK / A_K1>{}, m0, I0, I0, I0),
-                                   a_block_buf,
-                                   a_thread_desc_,
-                                   make_tuple(I0, m0, I0, I0, I0),
-                                   a_thread_buf);
-
-                static_for<0, NRepeat, 1>{}([&](auto n0) {
-                    // read B
-                    b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1,
-                                       make_tuple(Number<k * WmmaK / B_K1>{}, n0, I0, I0, I0),
-                                       b_block_buf,
-                                       b_thread_desc_,
-                                       make_tuple(I0, n0, I0, I0, I0),
-                                       b_thread_buf);
-                    vector_type<FloatA, WmmaK> a_thread_vec;
-                    vector_type<FloatB, WmmaK> b_thread_vec;
-
-                    static_for<0, WmmaK, 1>{}([&](auto i) {
-                        a_thread_vec.template AsType<FloatA>()(i) =
-                            a_thread_buf[Number<a_thread_desc_.CalculateOffset(
-                                make_tuple(i / A_K1, m0, 0, 0, i % A_K1))>{}];
-                        b_thread_vec.template AsType<FloatB>()(i) =
-                            b_thread_buf[Number<b_thread_desc_.CalculateOffset(
-                                make_tuple(i / B_K1, n0, 0, 0, i % B_K1))>{}];
-                    });
-
-                    using wmma_input_type_a = typename vector_type<FloatA, WmmaK>::type;
-                    using wmma_input_type_b = typename vector_type<FloatB, WmmaK>::type;
-
-                    constexpr index_t c_offset =
-                        c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
-
-                    wmma_gemm.template Run(
-                        a_thread_vec.template AsType<wmma_input_type_a>()(Number<0>{}),
-                        b_thread_vec.template AsType<wmma_input_type_b>()(Number<0>{}),
-                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
-                });
-            });
-        });
+        // basic intrinsic to determine loopover direction
+        if constexpr(MRepeat < NRepeat)
+        {
+            static_for<0, KPerBlock / WmmaK, 1>{}(
+                [&](auto k) { // k=0,1,2 instead of k=0,kpack*1, ...
+                    static_for<0, MRepeat, 1>{}([&](auto m0) {
+                        // read A
+                        a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1,
+                                           make_tuple(Number<k * WmmaK / A_K1>{}, m0, I0, I0, I0),
+                                           a_block_buf,
+                                           a_thread_desc_,
+                                           make_tuple(I0, m0, I0, I0, I0),
+                                           a_thread_buf);
+
+                        static_for<0, NRepeat, 1>{}([&](auto n0) {
+                            // read B
+                            b_thread_copy_.Run(
+                                b_block_desc_k0_n0_n1_n2_k1,
+                                make_tuple(Number<k * WmmaK / B_K1>{}, n0, I0, I0, I0),
+                                b_block_buf,
+                                b_thread_desc_,
+                                make_tuple(I0, n0, I0, I0, I0),
+                                b_thread_buf);
+                            vector_type<FloatA, WmmaK> a_thread_vec;
+                            vector_type<FloatB, WmmaK> b_thread_vec;
+
+                            static_for<0, WmmaK, 1>{}([&](auto i) {
+                                a_thread_vec.template AsType<FloatA>()(i) =
+                                    a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                        make_tuple(i / A_K1, m0, 0, 0, i % A_K1))>{}];
+                                b_thread_vec.template AsType<FloatB>()(i) =
+                                    b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                                        make_tuple(i / B_K1, n0, 0, 0, i % B_K1))>{}];
+                            });
+
+                            using wmma_input_type_a = typename vector_type<FloatA, WmmaK>::type;
+                            using wmma_input_type_b = typename vector_type<FloatB, WmmaK>::type;
+
+                            constexpr index_t c_offset =
+                                c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                            wmma_gemm.template Run(
+                                a_thread_vec.template AsType<wmma_input_type_a>()(Number<0>{}),
+                                b_thread_vec.template AsType<wmma_input_type_b>()(Number<0>{}),
+                                c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                        });
+                    });
+                });
+        }
+        else
+        {
+            static_for<0, KPerBlock / WmmaK, 1>{}(
+                [&](auto k) { // k=0,1,2 instead of k=0,kpack*1, ...
+                    static_for<0, NRepeat, 1>{}([&](auto n0) {
+                        // read B
+                        b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1,
+                                           make_tuple(Number<k * WmmaK / B_K1>{}, n0, I0, I0, I0),
+                                           b_block_buf,
+                                           b_thread_desc_,
+                                           make_tuple(I0, n0, I0, I0, I0),
+                                           b_thread_buf);
+
+                        static_for<0, MRepeat, 1>{}([&](auto m0) {
+                            // read A
+                            a_thread_copy_.Run(
+                                a_block_desc_k0_m0_m1_m2_k1,
+                                make_tuple(Number<k * WmmaK / A_K1>{}, m0, I0, I0, I0),
+                                a_block_buf,
+                                a_thread_desc_,
+                                make_tuple(I0, m0, I0, I0, I0),
+                                a_thread_buf);
+
+                            vector_type<FloatA, WmmaK> a_thread_vec;
+                            vector_type<FloatB, WmmaK> b_thread_vec;
+
+                            static_for<0, WmmaK, 1>{}([&](auto i) {
+                                a_thread_vec.template AsType<FloatA>()(i) =
+                                    a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                        make_tuple(i / A_K1, m0, 0, 0, i % A_K1))>{}];
+                                b_thread_vec.template AsType<FloatB>()(i) =
+                                    b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                                        make_tuple(i / B_K1, n0, 0, 0, i % B_K1))>{}];
+                            });
+
+                            using wmma_input_type_a = typename vector_type<FloatA, WmmaK>::type;
+                            using wmma_input_type_b = typename vector_type<FloatB, WmmaK>::type;
+
+                            constexpr index_t c_offset =
+                                c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                            wmma_gemm.template Run(
+                                a_thread_vec.template AsType<wmma_input_type_a>()(Number<0>{}),
+                                b_thread_vec.template AsType<wmma_input_type_b>()(Number<0>{}),
+                                c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                        });
+                    });
+                });
+        }
     }
 
 protected:
@@ -4,27 +4,13 @@
 #pragma once
 
 #include "ck/utility/common_header.hpp"
+#include "ck/utility/loop_scheduler.hpp"
 #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
 #include "ck/tensor_operation/gpu/warp/xdlops_gemm.hpp"
 #include "ck/tensor_description/tensor_adaptor.hpp"
 
 namespace ck {
 
-enum struct LoopScheduler
-{
-    Default,
-    Interwave,
-};
-
-constexpr LoopScheduler make_default_loop_scheduler()
-{
-#if CK_EXPERIMENTAL_DEFAULT_TO_INTER_WAVE_SCHEDULING
-    return LoopScheduler::Interwave;
-#else
-    return LoopScheduler::Default;
-#endif // if CK_EXPERIMENTAL_DEFAULT_TO_INTER_WAVE_SCHEDULING
-}
-
 template <index_t MNXdlPerWave, index_t MNWaves, index_t MNPerXdl, typename TileDesc_K0_MN_K1>
 __host__ __device__ static constexpr auto
 MakeGemmMmaTileDescriptor_MN0_MN1_MN2_K(const TileDesc_K0_MN_K1&)
@@ -42,7 +28,8 @@ MakeGemmMmaTileDescriptor_MN0_MN1_MN2_K(const TileDesc_K0_MN_K1&)
 }
 
 template <index_t BlockSize,
-          typename FloatAB,
+          typename FloatA,
+          typename FloatB,
           typename FloatAcc,
           typename AK0MK1BlockDesc,
           typename BK0NK1BlockDesc,
@@ -72,7 +59,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
     static constexpr index_t A_K1 = AK0MK1BlockDesc{}.GetLength(I2);
     static constexpr index_t B_K1 = BK0NK1BlockDesc{}.GetLength(I2);
 
-    static constexpr auto xdlops_gemm = XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack>{};
+    static constexpr auto xdlops_gemm = XdlopsGemm<FloatA, MPerXDL, NPerXDL, KPack, FloatB>{};
 
     static constexpr index_t KPerThread = KPerBlock / xdlops_gemm.K0PerXdlops;
@@ -309,9 +296,9 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                         const BBlockBuffer& b_block_buf,
                         CThreadBuffer& c_thread_buf) const
     {
-        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAB>(
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatA>(
             a_thread_desc_.GetElementSpaceSize());
-        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAB>(
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatB>(
            b_thread_desc_.GetElementSpaceSize());
 
         static_for<0, MRepeat, 1>{}([&](auto m0) {
@@ -333,27 +320,29 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                                    b_thread_buf);
 
                 static_for<0, KPerThread, KPack>{}([&](auto k) {
-                    vector_type<FloatAB, KPack> a_thread_vec;
-                    vector_type<FloatAB, KPack> b_thread_vec;
+                    vector_type<FloatA, KPack> a_thread_vec;
+                    vector_type<FloatB, KPack> b_thread_vec;
 
                     static_for<0, KPack, 1>{}([&](auto i) {
-                        a_thread_vec.template AsType<FloatAB>()(i) =
+                        a_thread_vec.template AsType<FloatA>()(i) =
                             a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                 make_multi_index(0, 0, 0, k + i))>{}];
-                        b_thread_vec.template AsType<FloatAB>()(i) =
+                        b_thread_vec.template AsType<FloatB>()(i) =
                             b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                 make_multi_index(0, 0, 0, k + i))>{}];
                     });
 
-                    using mfma_input_type =
-                        typename vector_type<FloatAB, xdlops_gemm.K1PerXdlops>::type;
+                    using mfma_input_type_a =
+                        typename vector_type<FloatA, xdlops_gemm.K1PerXdlops>::type;
+                    using mfma_input_type_b =
+                        typename vector_type<FloatB, xdlops_gemm.K1PerXdlops>::type;
 
                     constexpr index_t c_offset =
                         c_thread_desc_.CalculateOffset(make_multi_index(m0, n0, 0));
 
                     xdlops_gemm.template Run(
-                        a_thread_vec.template AsType<mfma_input_type>(),
-                        b_thread_vec.template AsType<mfma_input_type>(),
+                        a_thread_vec.template AsType<mfma_input_type_a>(),
+                        b_thread_vec.template AsType<mfma_input_type_b>(),
                         c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
                 });
             });
@@ -373,8 +362,8 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
     static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed(
         make_tuple(Number<MRepeat>{}, Number<NRepeat>{}, xdlops_gemm.GetRegSizePerXdlops()));
 
-    using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatAB,
-                                                         FloatAB,
+    using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatA,
+                                                         FloatA,
                                                          decltype(a_block_desc_m0_m1_m2_k),
                                                          decltype(a_thread_desc_),
                                                          Sequence<1, 1, 1, KPerThread>,
@@ -383,8 +372,8 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                                                          A_K1,
                                                          A_K1>;
 
-    using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatAB,
-                                                         FloatAB,
+    using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatB,
+                                                         FloatB,
                                                          decltype(b_block_desc_n0_n1_n2_k),
                                                          decltype(b_thread_desc_),
                                                          Sequence<1, 1, 1, KPerThread>,
@@ -402,7 +391,8 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
 // the latest ROCm release. For unsupported compilers, inter-wave loop scheduler falls back to the
 // default loop scheduler which is given by the macro CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING=0
 template <index_t BlockSize,
-          typename FloatAB,
+          typename FloatA,
+          typename FloatB,
           typename FloatAcc,
           typename AK0MK1BlockDesc,
           typename BK0NK1BlockDesc,
@@ -414,7 +404,8 @@ template <index_t BlockSize,
           index_t NumMacClusters = CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING_MAC_CLUSTERS>
 struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
     : public BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
-                                                                 FloatAB,
+                                                                 FloatA,
+                                                                 FloatB,
                                                                  FloatAcc,
                                                                  AK0MK1BlockDesc,
                                                                  BK0NK1BlockDesc,
@@ -425,7 +416,8 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                                                                  KPack>
 {
     using Base = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
-                                                                     FloatAB,
+                                                                     FloatA,
+                                                                     FloatB,
                                                                      FloatAcc,
                                                                      AK0MK1BlockDesc,
                                                                      BK0NK1BlockDesc,
@@ -457,9 +449,9 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                         const BBlockBuffer& b_block_buf,
                         CThreadBuffer& c_thread_buf) const
     {
-        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAB>(
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatA>(
            a_thread_desc_.GetElementSpaceSize());
-        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAB>(
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatB>(
            b_thread_desc_.GetElementSpaceSize());
 
         static_for<0, KPerThread, KPerInnerLoop>{}([&](auto k) {
@@ -496,20 +488,22 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
             static_for<0, KPerInnerLoop, KPack>{}([&](auto k_) {
                 static_for<0, MRepeat, 1>{}([&](auto m0) {
                     static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        vector_type<FloatAB, KPack> a_thread_vec;
-                        vector_type<FloatAB, KPack> b_thread_vec;
+                        vector_type<FloatA, KPack> a_thread_vec;
+                        vector_type<FloatB, KPack> b_thread_vec;
 
                         static_for<0, KPack, 1>{}([&](auto i) {
-                            a_thread_vec.template AsType<FloatAB>()(i) =
+                            a_thread_vec.template AsType<FloatA>()(i) =
                                 a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                     make_tuple(m0, 0, 0, k_ + i))>{}];
-                            b_thread_vec.template AsType<FloatAB>()(i) =
+                            b_thread_vec.template AsType<FloatB>()(i) =
                                 b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                     make_tuple(n0, 0, 0, k_ + i))>{}];
                         });
 
-                        using mfma_input_type =
-                            typename vector_type<FloatAB, xdlops_gemm.K1PerXdlops>::type;
+                        using mfma_input_type_a =
+                            typename vector_type<FloatA, xdlops_gemm.K1PerXdlops>::type;
+                        using mfma_input_type_b =
+                            typename vector_type<FloatB, xdlops_gemm.K1PerXdlops>::type;
 
                         constexpr index_t c_offset =
                             c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
@@ -531,8 +525,8 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                         // TODO: insert setprio in more precise manner since we
                         // could have more than >1 MFMA instructions in single call
                         xdlops_gemm.template Run(
-                            a_thread_vec.template AsType<mfma_input_type>(),
-                            b_thread_vec.template AsType<mfma_input_type>(),
+                            a_thread_vec.template AsType<mfma_input_type_a>(),
+                            b_thread_vec.template AsType<mfma_input_type_b>(),
                             c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
                         if constexpr(k_.value == 0 && m0.value == 0 && n0.value == 0)
                         {
@@ -558,8 +552,8 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
     static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor_packed(
         make_tuple(Number<NRepeat>{}, I1, I1, Number<KPerInnerLoop>{}));
 
-    using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatAB,
-                                                         FloatAB,
+    using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatA,
+                                                         FloatA,
                                                          decltype(a_block_desc_m0_m1_m2_k),
                                                          decltype(a_thread_desc_),
                                                          Sequence<1, 1, 1, KPerInnerLoop>,
@@ -568,8 +562,8 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                                                          A_K1,
                                                          A_K1>;
 
-    using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatAB,
-                                                         FloatAB,
+    using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatB,
+                                                         FloatB,
                                                          decltype(b_block_desc_n0_n1_n2_k),
                                                          decltype(b_thread_desc_),
                                                          Sequence<1, 1, 1, KPerInnerLoop>,
@@ -585,7 +579,8 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
 };
 
 template <index_t BlockSize,
-          typename FloatAB,
+          typename FloatA,
+          typename FloatB,
           typename FloatAcc,
           typename AK0MK1BlockDesc,
           typename BK0NK1BlockDesc,
@@ -600,7 +595,8 @@ constexpr auto BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector()
     if constexpr(LoopSched == LoopScheduler::Default)
     {
        return BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
-                                                                   FloatAB,
+                                                                   FloatA,
+                                                                   FloatB,
                                                                    FloatAcc,
                                                                    AK0MK1BlockDesc,
                                                                    BK0NK1BlockDesc,
@@ -613,7 +609,8 @@ constexpr auto BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector()
     else if constexpr(LoopSched == LoopScheduler::Interwave)
    {
        return BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
-                                                                            FloatAB,
+                                                                            FloatA,
+                                                                            FloatB,
                                                                             FloatAcc,
                                                                             AK0MK1BlockDesc,
                                                                             BK0NK1BlockDesc,
@@ -635,26 +632,27 @@ constexpr auto BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector()
  * 3. configurable k index starting position and step size after each FMA/XDL instruction
  */
-template <index_t BlockSize,
-          typename FloatAB,
-          typename FloatAcc,
-          typename ATileDesc,
-          typename BTileDesc,
-          typename AMmaTileDesc,
-          typename BMmaTileDesc,
-          index_t MPerBlock,
-          index_t NPerBlock,
-          index_t KPerBlock,
-          index_t MPerXDL,
-          index_t NPerXDL,
-          index_t MRepeat,
-          index_t NRepeat,
-          index_t KPack,
-          bool TransposeC = false,
-          index_t AMmaKStride =
-              KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, TransposeC>{}.K0PerXdlops,
-          index_t BMmaKStride =
-              KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, TransposeC>{}.K0PerXdlops>
+template <
+    index_t BlockSize,
+    typename FloatAB,
+    typename FloatAcc,
+    typename ATileDesc,
+    typename BTileDesc,
+    typename AMmaTileDesc,
+    typename BMmaTileDesc,
+    index_t MPerBlock,
+    index_t NPerBlock,
+    index_t KPerBlock,
+    index_t MPerXDL,
+    index_t NPerXDL,
+    index_t MRepeat,
+    index_t NRepeat,
+    index_t KPack,
+    bool TransposeC = false,
+    index_t AMmaKStride =
+        KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops,
+    index_t BMmaKStride =
+        KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
 struct BlockwiseGemmXdlops_v2
 {
     static constexpr auto I0 = Number<0>{};
@@ -671,7 +669,8 @@ struct BlockwiseGemmXdlops_v2
     static constexpr index_t A_K1 = ATileDesc{}.GetLength(I2);
     static constexpr index_t B_K1 = BTileDesc{}.GetLength(I2);
 
-    static constexpr auto xdlops_gemm = XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, TransposeC>{};
+    static constexpr auto xdlops_gemm =
+        XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{};
 
     static constexpr index_t KPerThread = KPerBlock / xdlops_gemm.K0PerXdlops;
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
namespace ck {
/**
* Transfer that uses direct load instructions to copy data from global to LDS memory.
*
* Traditional loads first copy data from global to registers, and then from registers to LDS.
* Direct loads skip the intermediate step: data is copied directly from global to LDS
* without using additional registers.
*
* However, the instruction has limitations:
* - each thread must copy exactly a single DWORD - 4 bytes;
* - threads within a single wavefront must write consecutive DWORDs into LDS
* (the data in global memory does not need to be contiguous; each thread may have its own offset).
*
* To make sure that all the transfers have finished, the `waitcnt` instruction must be used with
* `vmcnt` instead of `lgkmcnt`.
*
* Limitations of the transfer class:
* - `SrcData` must be the same as `DstData` - no possibility to convert the data type in flight;
* - `DstVectorDim` must be the last dimension;
* - `SrcVectorDim` must be the last dimension if `ScalarPerVector` is greater than 1;
* - `ScalarPerVector` times the number of bytes of `DstData` must be equal to a single DWORD = 4B
* (for example, if `DstData` is fp32, then `ScalarPerVector` must be 1; if `DstData` is fp16,
* `ScalarPerVector` must be 2);
* - if `ScalarPerVector` is greater than 1, the contiguous dimension in src and dst must be
* the same dimension;
* - threads in a wavefront must write contiguous data to LDS (when wavefront size is 64,
* they must write 64 contiguous DWORDs) - `ThreadClusterLengths` must be prepared in such a way
* to guarantee that.
*/
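// A hypothetical configuration sketch (the numbers below are illustrative, not taken from this
// file): for fp32 data, ScalarPerVector = 1 and a [K0PerBlock, MPerBlock, K1] = [4, 128, 8]
// block slice loaded by 128 threads, ThreadClusterLengths = Sequence<1, 16, 8> satisfies the
// wavefront-contiguity rule, since threads 0-63 then cover 64 consecutive DWORDs in LDS.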
template <typename ThreadGroup,
typename BlockSliceLengths,
typename ThreadClusterLengths,
typename SrcData,
typename DstData,
typename SrcDesc,
typename DstDesc,
index_t SrcVectorDim,
index_t DstVectorDim,
index_t ScalarPerVector>
struct ThreadGroupTensorSliceTransfer_DirectLoad
{
static constexpr index_t nDim = remove_reference_t<SrcDesc>::GetNumOfDimension();
using Index = MultiIndex<nDim>;
using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{}));
using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{}));
static constexpr auto I0 = Number<0>{};
static constexpr auto block_slice_lengths = BlockSliceLengths{};
static constexpr auto thread_cluster_lengths = ThreadClusterLengths{};
static constexpr auto thread_single_load_size = generate_sequence(
detail::lambda_scalar_per_access<DstVectorDim, ScalarPerVector>{}, Number<nDim>{});
    // After a load, each thread advances by `thread_steps` instead of moving to the next elements,
    // which makes the whole wavefront load contiguous memory, as required for direct loads.
static constexpr auto thread_steps = thread_cluster_lengths * thread_single_load_size;
static constexpr auto thread_slice_lengths = block_slice_lengths / thread_steps;
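    // Example (illustrative): with BlockSliceLengths = [4, 128, 8], ThreadClusterLengths =
    // [1, 16, 8] and ScalarPerVector = 1, thread_single_load_size = [1, 1, 1], thread_steps =
    // [1, 16, 8] and thread_slice_lengths = [4, 8, 1], i.e. each thread issues 4 * 8 = 32
    // single-DWORD loads.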
static __device__ constexpr bool AreThreadClusterLengthsValid()
{
// Make sure that ThreadClusterLengths are set in a way that allows for contiguous writes to
// LDS by the threads from a single wavefront.
// Examples (assuming 64 threads in a wavefront, 128 in a thread block):
// 1. BlockSliceLengths = [K0PerBlock, MPerBlock, K1PerBlock] = [4, 128, 8],
// data type = fp32 -> ScalarPerVector = 1
// INVALID: ThreadClusterLengths = [4, 4, 8] since in the first iteration, threads 0-31
// write [0, 0, 0] - [0, 3, 7] and thread 32 writes [1, 0, 0] instead of
// [0, 4, 0].
// VALID: ThreadClusterLengths = [2, 8, 8] or [1, 16, 8] since in the first iteration,
// threads 0-63 write [0, 0, 0] - [0, 7, 7] -> 64 consecutive elements (DWORDs).
// 2. BlockSliceLengths = [K0PerBlock, MPerBlock, K1PerBlock] = [4, 128, 8],
// data type = fp16 -> ScalarPerVector = 2
// NOTE: ThreadClusterLengths must take into account that each thread writes two
// elements (single DWORD) along the contiguous dimension.
// INVALID: ThreadClusterLengths = [4, 4, 8] since each 8 threads would try to write
// 8 * 2 elements of K1PerBlock and there are only 8;
// ThreadClusterLengths = [4, 8, 4] since in the first iteration, threads 0-31
// write [0, 0, 0] - [0, 7, 7] (7 since each writes 2 elements) and thread 32
// writes [1, 0, 0] instead of [0, 8, 0].
// VALID: ThreadClusterLengths = [4, 16, 4] or [2, 32, 4] or [1, 64, 4] since in the
// first iteration, threads 0-63 write [0, 0, 0] - [0, 15, 7] -> 128 consecutive
// elements = 64 consecutive DWORDs.
int num_contiguous_dwords = 1;
bool is_contiguous = true;
static_for<0, nDim, 1>{}([&](auto i) {
if(is_contiguous)
{
num_contiguous_dwords *= thread_cluster_lengths[nDim - i - 1];
}
if(thread_slice_lengths[nDim - i - 1] > 1)
{
is_contiguous = false;
}
});
constexpr index_t wavefront_size = get_warp_size();
const bool wave_contiguous = num_contiguous_dwords % wavefront_size == 0;
bool thread_slice_lengths_correct = true;
static_for<0, nDim, 1>{}([&](auto i) {
if(thread_slice_lengths[i] <= 0)
{
thread_slice_lengths_correct = false;
}
});
return wave_contiguous && thread_slice_lengths_correct;
}
__device__ constexpr ThreadGroupTensorSliceTransfer_DirectLoad(
const SrcDesc& src_desc,
const Index& src_block_slice_origin,
const DstDesc& dst_desc,
const Index& dst_block_slice_origin)
{
static_assert(ck::is_same_v<SrcData, DstData>,
"Direct load transfer does not support datatypes conversion. Source and "
"destination data types must be the same.");
static_assert(
DstVectorDim == nDim - 1,
"Direct load transfer requires the destination vector dimension to be the last one.");
static_assert(ScalarPerVector == 1 || SrcVectorDim == DstVectorDim,
"When loading more than one element per thread at once, the contiguous "
"dimension must be the same between source and destination.");
constexpr auto dword_bytes = 4;
constexpr auto bytes_per_thread_load = ScalarPerVector * sizeof(SrcData);
static_assert(bytes_per_thread_load == dword_bytes,
"Direct load transfer requires each thread to load exactly a single "
"DWORD of data.");
static_assert(nDim == remove_cvref_t<SrcDesc>::GetNumOfDimension() &&
nDim == remove_cvref_t<DstDesc>::GetNumOfDimension() &&
nDim == ThreadClusterLengths::Size(),
"Inconsistent number of dimensions across lengths and descriptors.");
static_assert(ThreadGroup::GetNumOfThread() >= thread_cluster_desc_.GetElementSize(),
"The number of threads cannot be less than the number of elements in "
"thread cluster lengths.");
static_assert(
AreThreadClusterLengthsValid(),
"Thread cluster lengths are incorrect. They must be set in a way that allows a single "
"wavefront to write contiguous DWORDs into LDS memory. ");
const auto thread_cluster_idx =
thread_cluster_desc_.CalculateBottomIndex(make_multi_index(ThreadGroup::GetThreadId()));
const auto thread_data_idx_begin = thread_cluster_idx * thread_single_load_size;
SetSrcSliceOrigin(src_desc, src_block_slice_origin + thread_data_idx_begin);
SetDstSliceOrigin(dst_desc, dst_block_slice_origin + thread_data_idx_begin);
}
__device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
{
src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx);
src_slice_origin_ = src_slice_origin_idx;
}
__device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
{
dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx);
dst_slice_origin_ = dst_slice_origin_idx;
}
__device__ void ResetDstSliceWindow(const DstDesc& dst_desc)
{
dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_);
}
template <typename SrcBuffer, typename DstBuffer>
__device__ void Run(const SrcDesc& src_desc,
const SrcBuffer& src_buf,
const DstDesc& dst_desc,
DstBuffer& dst_buf)
{
static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Global,
"Source data must come from a global memory buffer.");
static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum::Lds,
"Destination data must be stored in an LDS memory buffer.");
static_assert(
ck::is_same_v<remove_cvref_t<typename SrcBuffer::type>, remove_cvref_t<SrcData>>,
"SrcBuffer and SrcData data types must be consistent.");
static_assert(
ck::is_same_v<remove_cvref_t<typename DstBuffer::type>, remove_cvref_t<DstData>>,
"DstBuffer and DstData data types must be consistent.");
constexpr auto dst_access_lengths = thread_slice_lengths;
const auto dst_forward_steps = generate_steps(1);
const auto dst_backward_steps = generate_steps(-1);
const auto src_forward_steps = generate_steps(1);
const auto src_backward_steps = generate_steps(-1);
// Loop over the destination block and copy data.
static_ford<decltype(dst_access_lengths)>{}([&](auto ordered_dst_access_idx) {
const auto src_offset = src_coord_.GetOffset();
const auto dst_offset = dst_coord_.GetOffset();
            // Check that the src data is not in the logical padding area.
const bool is_src_valid =
coordinate_has_valid_offset_assuming_top_index_is_valid(src_desc, src_coord_);
src_buf.template DirectCopyToLds<remove_cvref_t<decltype(dst_buf)>, ScalarPerVector>(
dst_buf, src_offset, dst_offset, is_src_valid);
constexpr auto move_on_dim = [&]() constexpr
{
StaticallyIndexedArray<bool, nDim> move_on_dim_;
static_for<0, nDim, 1>{}([&](auto i) {
move_on_dim_(i) = ordered_dst_access_idx[i] < dst_access_lengths[i] - 1;
static_for<i + 1, nDim, 1>{}([&](auto j) {
move_on_dim_(i) &= ordered_dst_access_idx[j] == dst_access_lengths[j] - 1;
});
});
return move_on_dim_;
}
();
// Decide whether to move forward or backward.
constexpr auto forward_sweep = [&]() {
StaticallyIndexedArray<bool, nDim> forward_sweep_;
forward_sweep_(I0) = true;
static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_dst_access_idx[I0];
static_for<1, i, 1>{}([&](auto j) {
tmp = tmp * dst_access_lengths[j] + ordered_dst_access_idx[j];
});
forward_sweep_(i) = tmp % 2 == 0;
});
return forward_sweep_;
}();
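            // forward_sweep implements a zig-zag traversal: a dimension is swept forward when the
            // combined index of the more significant dimensions is even and backward when it is
            // odd, so consecutive accesses differ by a single step along one dimension.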
static_for<0, nDim, 1>{}([&](auto i) {
if constexpr(move_on_dim[i])
{
if constexpr(forward_sweep[i])
{
move_tensor_coordinate(dst_desc, dst_coord_, dst_forward_steps[i]);
move_tensor_coordinate(src_desc, src_coord_, src_forward_steps[i]);
}
else
{
move_tensor_coordinate(dst_desc, dst_coord_, dst_backward_steps[i]);
move_tensor_coordinate(src_desc, src_coord_, src_backward_steps[i]);
}
}
});
});
        // Reset the destination slice since the entire buffer has already been filled.
ResetDstSliceWindow(dst_desc);
}
__device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, const Index& step)
{
src_slice_origin_ = src_slice_origin_ + step;
src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_);
}
__device__ auto generate_steps(int sign) const
{
return generate_tuple(
[&](auto i) {
Index step;
static_for<0, nDim, 1>{}(
[&](auto j) { step(j) = (i.value == j.value) ? sign * thread_steps[i] : 0; });
return step;
},
Number<nDim>{});
}
private:
static constexpr auto thread_cluster_desc_ = make_cluster_descriptor(ThreadClusterLengths{});
SrcCoord src_coord_;
DstCoord dst_coord_;
Index src_slice_origin_;
Index dst_slice_origin_;
};
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r2.hpp"
#include "ck/utility/is_detected.hpp"
namespace ck {
// Thread-group level multi-source, multi-destination tensor slice data movement
// Assume:
// 1. All sources and destinations are DynamicBuffer
// 2. Same VectorDim and ScalarPerVector for all sources and destinations
// 3. DstInMemOps are per destination tensor
// 4. ThreadTransferSrcResetCoordinateAfterRunFlags are per source tensor
// 5. ThreadTransferDstResetCoordinateAfterRunFlags are per destination tensor
//
// Does the following things to avoid the scratch memory issue:
// 1. Passes tensor descriptors by reference (or tuple of references)
// 2. Does not keep a reference to the tensor descriptor
// 3. Does not construct a new tensor coordinate when calling Run()
template <typename ThreadGroup,
typename SrcDatas,
typename DstDatas,
typename SrcDescs,
typename DstDescs,
typename ElementwiseOperation,
typename DstInMemOps, // Sequence<InMemoryDataOperationEnum ...>
typename SliceLengths,
typename ThreadClusterLengths,
typename ThreadClusterArrangeOrder,
typename SrcDimAccessOrder,
typename DstDimAccessOrder,
index_t SrcVectorDim,
index_t DstVectorDim,
index_t SrcScalarPerVector,
index_t DstScalarPerVector,
typename ThreadTransferSrcResetCoordinateAfterRunFlags,
typename ThreadTransferDstResetCoordinateAfterRunFlags>
struct ThreadGroupTensorSliceTransfer_v7r2
{
static constexpr index_t nDim =
remove_cvref_t<tuple_element_t<0, SrcDescs>>::GetNumOfDimension();
static constexpr index_t nSrc = remove_cvref_t<SrcDescs>::Size();
static constexpr index_t nDst = remove_cvref_t<DstDescs>::Size();
using Index = MultiIndex<nDim>;
static constexpr auto thread_slice_lengths = SliceLengths{} / ThreadClusterLengths{};
__device__ constexpr ThreadGroupTensorSliceTransfer_v7r2(
const SrcDescs& src_descs,
const StaticallyIndexedArray<Index, nSrc>& src_block_slice_origins,
const DstDescs& dst_descs,
const StaticallyIndexedArray<Index, nDst>& dst_block_slice_origins,
const ElementwiseOperation& element_op)
: threadwise_transfer_(src_descs,
StaticallyIndexedArray<Index, nSrc>{},
dst_descs,
StaticallyIndexedArray<Index, nDst>{},
element_op)
{
static_assert(nSrc == SrcDatas::Size() && nSrc == SrcDescs::Size() &&
nSrc == ThreadTransferSrcResetCoordinateAfterRunFlags::Size() &&
nDst == DstDatas::Size() && nDst == DstDescs::Size() &&
nDst == ThreadTransferDstResetCoordinateAfterRunFlags::Size(),
"wrong!");
static_for<0, nSrc, 1>{}([&](auto i) {
static_assert(
nDim == remove_cvref_t<tuple_element_t<i.value, SrcDescs>>::GetNumOfDimension(),
"wrong!");
});
static_for<0, nDst, 1>{}([&](auto i) {
static_assert(
nDim == remove_cvref_t<tuple_element_t<i.value, DstDescs>>::GetNumOfDimension(),
"wrong!");
});
static_assert(nDim == ThreadClusterLengths::Size() &&
nDim == ThreadClusterArrangeOrder::Size() &&
nDim == SrcDimAccessOrder::Size() && nDim == DstDimAccessOrder::Size(),
"wrong! nDim not consistent");
static_assert(
is_same<SliceLengths, decltype(thread_slice_lengths * ThreadClusterLengths{})>{},
"wrong! threads should be mapped to cover entire slicing window");
static_assert(ThreadGroup::GetNumOfThread() >= thread_cluster_desc_.GetElementSize(),
"wrong! ThreadGroup::GetNumOfThread() too small");
if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
{
const auto thread_cluster_idx = thread_cluster_desc_.CalculateBottomIndex(
make_multi_index(get_thread_local_1d_id()));
const auto thread_data_idx_begin = thread_cluster_idx * thread_slice_lengths;
const auto src_thread_slice_origins = generate_tuple(
[&](auto i) { return src_block_slice_origins[i] + thread_data_idx_begin; },
Number<nSrc>{});
const auto dst_thread_slice_origins = generate_tuple(
[&](auto i) { return dst_block_slice_origins[i] + thread_data_idx_begin; },
Number<nDst>{});
threadwise_transfer_.SetSrcSliceOrigins(src_descs, src_thread_slice_origins);
threadwise_transfer_.SetDstSliceOrigins(dst_descs, dst_thread_slice_origins);
}
}
template <typename SrcBuffers>
__device__ void RunRead(const SrcDescs& src_descs, const SrcBuffers& src_bufs)
{
if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
{
threadwise_transfer_.RunRead(src_descs, src_bufs);
}
}
template <typename T>
using is_tuple = decltype(std::declval<T&>().IsTuple());
template <typename DstBuffers>
__device__ void RunWrite(const DstDescs& dst_descs, DstBuffers dst_bufs)
{
if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
{
if constexpr(is_detected<is_tuple, decltype(dst_bufs)>::value)
threadwise_transfer_.RunWrite(dst_descs, dst_bufs);
else
threadwise_transfer_.RunWrite(dst_descs, tie(dst_bufs));
}
}
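    // Note: RunWrite() accepts either a tuple of destination buffers or a single buffer; a
    // non-tuple argument is wrapped with tie() so the threadwise transfer always receives a tuple.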
template <typename SrcBuffers, typename DstBuffers>
__device__ void Run(const SrcDescs& src_descs,
const SrcBuffers& src_bufs,
const DstDescs& dst_descs,
DstBuffers dst_bufs)
{
RunRead(src_descs, src_bufs);
RunWrite(dst_descs, dst_bufs);
}
template <index_t ISrc>
__device__ void
MoveSrcSliceWindow(const SrcDescs& src_descs, Number<ISrc> iSrc, const Index& step)
{
if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
{
threadwise_transfer_.MoveSrcSliceWindow(src_descs, iSrc, step);
}
}
__device__ void MoveSrcSliceWindow(const SrcDescs& src_descs, const Index& step)
{
static_for<0, SrcDescs::Size(), 1>{}(
[&](auto i) { MoveSrcSliceWindow(src_descs, i, step); });
}
template <index_t IDst>
__device__ void
MoveDstSliceWindow(const DstDescs& dst_descs, Number<IDst> iDst, const Index& step)
{
if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
{
threadwise_transfer_.MoveDstSliceWindow(dst_descs, iDst, step);
}
}
__device__ void MoveDstSliceWindow(const DstDescs& dst_descs, const Index& step)
{
static_for<0, DstDescs::Size(), 1>{}(
[&](auto i) { MoveDstSliceWindow(dst_descs, i, step); });
}
private:
static constexpr auto thread_cluster_desc_ =
make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
using ThreadwiseTransfer =
ThreadwiseTensorSliceTransfer_v7r2<SrcDatas,
DstDatas,
SrcDescs,
DstDescs,
ElementwiseOperation,
DstInMemOps,
decltype(thread_slice_lengths),
SrcDimAccessOrder,
DstDimAccessOrder,
SrcVectorDim,
DstVectorDim,
SrcScalarPerVector,
DstScalarPerVector,
ThreadTransferSrcResetCoordinateAfterRunFlags,
ThreadTransferDstResetCoordinateAfterRunFlags>;
ThreadwiseTransfer threadwise_transfer_;
};
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
namespace ck {
namespace conv_tensor_rearrange_op {
struct BaseConvTensorRearrangeOp
{
};
struct ImageToColumn : public BaseConvTensorRearrangeOp
{
static constexpr const char* name = "Image to Column";
};
struct ColumnToImage : public BaseConvTensorRearrangeOp
{
static constexpr const char* name = "Column to Image";
};
template <typename Op,
typename std::enable_if<std::is_base_of<BaseConvTensorRearrangeOp, Op>::value,
bool>::type = false>
std::ostream& operator<<(std::ostream& os, const BaseConvTensorRearrangeOp&)
{
os << Op::name;
return os;
}
} // namespace conv_tensor_rearrange_op
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <array>
#include "ck/tensor_operation/gpu/device/device_base.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
// GEMM:
// input : A0[M0, M1, ... K0, K1, ...], ...
// input : B0[N0, N1, ... K0, K1, ...], ...
// input : D0[M0, M1, ... N0, N1, ...], D1[M0, M1, ... N0, N1, ...], ...
// output : E[M0, M1, ... N0, N1, ...]
// C = a_op(A) * b_op(B)
// E = cde_op(C, D0, D1, ...)
// Assume:
// D0, D1, ... and E have the same layout
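// Example (illustrative): with NumDimM = NumDimN = NumDimK = 2, A is indexed as A[M0, M1, K0, K1],
// B as B[N0, N1, K0, K1], and E[M0, M1, N0, N1] = cde_op(a_op(A) * b_op(B), D0, D1, ...), i.e. a
// tensor contraction over the two K dimensions.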
template <index_t NumDimM,
index_t NumDimN,
index_t NumDimK,
typename AsDataType,
typename BsDataType,
typename DsDataType,
typename EDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CDEElementwiseOperation>
struct DeviceContractionMultipleABD : public BaseOperator
{
static constexpr index_t NumATensor = AsDataType::Size();
static constexpr index_t NumBTensor = BsDataType::Size();
static constexpr index_t NumDTensor = DsDataType::Size();
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(std::array<const void*, NumATensor> p_as,
std::array<const void*, NumBTensor> p_bs,
std::array<const void*, NumDTensor> p_ds,
void* p_e,
const std::array<std::vector<index_t>, NumATensor>& a_ms_ks_lengths,
const std::array<std::vector<index_t>, NumATensor>& a_ms_ks_strides,
const std::array<std::vector<index_t>, NumBTensor>& b_ns_ks_lengths,
const std::array<std::vector<index_t>, NumBTensor>& b_ns_ks_strides,
const std::array<std::vector<index_t>, NumDTensor>& d_ms_ns_lengths,
const std::array<std::vector<index_t>, NumDTensor>& d_ms_ns_strides,
const std::vector<index_t>& e_ms_ns_length,
const std::vector<index_t>& e_ms_ns_stride,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
@@ -33,7 +33,8 @@ template <index_t NumDimM,
           typename EDataType,
           typename AElementwiseOperation,
           typename BElementwiseOperation,
-          typename CDEElementwiseOperation>
+          typename CDEElementwiseOperation,
+          typename ComputeDataType = ADataType>
 struct DeviceContractionMultipleD : public BaseOperator
 {
     static constexpr index_t NumDTensor = DsDataType::Size();
@@ -12,21 +12,27 @@ namespace tensor_operation {
 namespace device {
 /**
- * \brief Image to column.
+ * \brief Convolution Tensor Rearrange.
  *
- * This Device operator converts image ([G, N, Di, Hi, Wi, C]) to the gemm
- * problem([N * Do * Ho * Wo, Z * Y * X * C]). G must be equal to 1.
+ * This Device operator supports converting an image to
+ * the GEMM representation (Image to Column) and
+ * converting a GEMM form to the image (Column to Image).
+ * Supported layouts:
+ *  [G, N, Di, Hi, Wi, C] <-> [G, N * Do * Ho * Wo, Z * Y * X * C]
+ *  [N, Di, Hi, Wi, G, C] <-> [N * Do * Ho * Wo, G, Z * Y * X * C]
  *
  * \tparam NDimSpatial Number of spatial dimensions.
- * \tparam InputLayout Input Layout.
+ * \tparam ImageLayout Input Layout.
  * \tparam InputDataType Input Data Type.
  * \tparam OutputDataType Output Data Type.
+ * \tparam ConvTensorRearrangeOp Operation type: ImageToColumn, ColumnToImage.
  */
 template <index_t NDimSpatial,
-          typename InputLayout,
+          typename ImageLayout,
           typename InputDataType,
-          typename OutputDataType>
-struct DeviceImageToColumn : public BaseOperator
+          typename OutputDataType,
+          typename ConvTensorRearrangeOp>
+struct DeviceConvTensorRearrange : public BaseOperator
 {
 /**
@@ -34,13 +40,14 @@ struct DeviceImageToColumn : public BaseOperator
  *
  * \param p_in A pointer to the device memory of the input image.
  * \param p_out A pointer to the device memory of the output.
+ * \param G Convolution number of groups.
  * \param N Convolution batch size.
  * \param C Convolution number of channels.
  * \param input_spatial_lengths Input spatial lengths.
  * \param filter_spatial_lengths Filter spatial lengths.
  * \param output_spatial_lengths Output spatial lengths.
- * \param input_g_n_c_wis_strides Input strides in order [G, N, C, D, H, W].
- * \param output_m_k_strides Output strides.
+ * \param image_g_n_c_wis_strides Image strides in order [G, N, C, D, H, W].
+ * \param gemm_g_m_k_strides Gemm form strides.
  * \param conv_filter_strides Convolution filter strides.
  * \param conv_filter_dilations Convolution filter dilations.
  * \param input_left_pads Convolution left pads.
@@ -50,13 +57,14 @@ struct DeviceImageToColumn : public BaseOperator
     virtual std::unique_ptr<BaseArgument>
     MakeArgumentPointer(const void* p_in,
                         void* p_out,
+                        const ck::index_t G,
                         const ck::index_t N,
                         const ck::index_t C,
                         const std::array<index_t, NDimSpatial>& input_spatial_lengths,
                         const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
                         const std::array<index_t, NDimSpatial>& output_spatial_lengths,
-                        const std::array<index_t, NDimSpatial + 3>& input_g_n_c_wis_strides,
-                        const std::array<index_t, 2>& output_m_k_strides,
+                        const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
+                        const std::array<index_t, 3>& gemm_g_m_k_strides,
                         const std::array<index_t, NDimSpatial>& conv_filter_strides,
                         const std::array<index_t, NDimSpatial>& conv_filter_dilations,
                         const std::array<index_t, NDimSpatial>& input_left_pads,
......
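The layout mapping above is easiest to see with concrete numbers. The assumed 2D example below (values are illustrative, not from this diff) computes the GEMM-form sizes implied by the image-to-column direction.
// Illustrative numbers only: [G, N, Hi, Wi, C] <-> [G, N * Ho * Wo, Y * X * C] for 2D.
const ck::index_t G = 2, N = 4, C = 64;
const ck::index_t Hi = 28, Wi = 28; // input spatial lengths
const ck::index_t Y = 3, X = 3;     // filter spatial lengths
const ck::index_t Ho = 26, Wo = 26; // output spatial lengths (stride 1, dilation 1, no padding)
const ck::index_t gemm_m = N * Ho * Wo; // 2704 rows per group
const ck::index_t gemm_k = Y * X * C;   // 576 columns per row
// ColumnToImage is the inverse direction: the [G, gemm_m, gemm_k] buffer is scattered back
// into the [G, N, Hi, Wi, C] image using the same indexing.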
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <memory>
#include <array>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <typename InDataTypeTuple,
typename OutDataTypeTuple,
typename ElementwiseOperation,
typename UnaryOperation,
typename Scale,
index_t NumDim>
struct DeviceElementwise : public BaseOperator
{
static constexpr int NumInput = InDataTypeTuple::Size();
static constexpr int NumOutput = OutDataTypeTuple::Size();
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const std::array<index_t, NumDim> lengths,
const std::array<std::array<index_t, NumDim>, NumInput> inStridesArray,
const std::array<std::array<index_t, NumDim>, NumOutput> outStridesArray,
const std::array<const void*, NumInput> in_dev_buffers,
const std::array<void*, NumOutput> out_dev_buffers,
ElementwiseOperation elementwise_op,
UnaryOperation unary_op,
Scale scale_op) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
template <typename InDataTypeTuple,
typename OutDataTypeTuple,
typename ElementwiseOperation,
typename UnaryOperation,
typename Scale,
index_t NumDim>
using DeviceElementwisePtr = std::unique_ptr<DeviceElementwise<InDataTypeTuple,
OutDataTypeTuple,
ElementwiseOperation,
UnaryOperation,
Scale,
NumDim>>;
} // namespace device
} // namespace tensor_operation
} // namespace ck
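As a rough illustration of this interface, a caller might set up a two-dimensional elementwise pass as sketched below; elementwise_op, p_in/p_out and the three operation objects are hypothetical names that are not defined in this header.
// (inside some setup function)
constexpr ck::index_t NumDim = 2;
const std::array<ck::index_t, NumDim> lengths = {1024, 768};
const std::array<std::array<ck::index_t, NumDim>, 1> in_strides  = {{{768, 1}}}; // packed row-major
const std::array<std::array<ck::index_t, NumDim>, 1> out_strides = {{{768, 1}}};
// elementwise_op stands for any concrete DeviceElementwise implementation (assumed to exist).
auto argument = elementwise_op.MakeArgumentPointer(lengths, in_strides, out_strides,
                                                   {p_in}, {p_out},
                                                   element_op, unary_op, scale_op);
auto invoker = elementwise_op.MakeInvokerPointer();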
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <array>
#include "ck/tensor_operation/gpu/device/device_base.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
// GEMM:
// input : A0[M, K], B0[K, N],
// input : D0[M, N], D1[M, N], ...
// output : E[M, N]
// C = a_op(A) * b_op(B)
// E = cde_op(C, D0, D1, ...)
// Assume:
// D0, D1, ... and E have the same layout
template <typename AsLayout,
typename BsLayout,
typename DsLayout,
typename ELayout,
typename AsDataType,
typename BsDataType,
typename DsDataType,
typename EDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CDEElementwiseOperation>
struct DeviceGemmMultipleABD : public BaseOperator
{
static constexpr index_t NumATensor = AsDataType::Size();
static constexpr index_t NumBTensor = BsDataType::Size();
static constexpr index_t NumDTensor = DsDataType::Size();
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(std::array<const void*, NumATensor> p_as,
std::array<const void*, NumBTensor> p_bs,
std::array<const void*, NumDTensor> p_ds,
void* p_e,
ck::index_t M,
ck::index_t N,
ck::index_t K,
std::array<ck::index_t, NumATensor> StrideAs,
std::array<ck::index_t, NumBTensor> StrideBs,
std::array<ck::index_t, NumDTensor> StrideDs,
ck::index_t StrideE,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
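A hedged usage sketch for this interface, assuming one A, one B and one auxiliary D tensor, all row-major; gemm_op, the device pointers and the elementwise-operation objects are assumed to already exist and are not defined in this header.
// (inside some setup function)
const ck::index_t M = 1024, N = 768, K = 512;
const std::array<ck::index_t, 1> stride_as = {K}; // row-major A[M, K]: leading dimension K
const std::array<ck::index_t, 1> stride_bs = {N}; // row-major B[K, N]: leading dimension N
const std::array<ck::index_t, 1> stride_ds = {N}; // D shares E's row-major layout
const ck::index_t stride_e = N;
// gemm_op, p_a, p_b, p_d0, p_e and the *_element_op objects are assumed (hypothetical names).
auto argument = gemm_op.MakeArgumentPointer({p_a}, {p_b}, {p_d0}, p_e,
                                            M, N, K,
                                            stride_as, stride_bs, stride_ds, stride_e,
                                            a_element_op, b_element_op, cde_element_op);
auto invoker = gemm_op.MakeInvokerPointer();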
@@ -20,7 +20,8 @@ template <typename ALayout,
           typename CDataType,
           typename AElementwiseOperation,
           typename BElementwiseOperation,
-          typename CElementwiseOperation>
+          typename CElementwiseOperation,
+          typename ComputeType = CDataType>
 struct DeviceGemmSplitK : public BaseOperator
 {
     virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
@@ -48,7 +49,8 @@ template <typename ALayout,
           typename CDataType,
           typename AElementwiseOperation,
           typename BElementwiseOperation,
-          typename CElementwiseOperation>
+          typename CElementwiseOperation,
+          typename ComputeType = CDataType>
 using DeviceGemmSplitKPtr = std::unique_ptr<DeviceGemmSplitK<ALayout,
                                                              BLayout,
                                                              CLayout,
@@ -57,7 +59,8 @@ using DeviceGemmSplitKPtr = std::unique_ptr<DeviceGemmSplitK<ALayout,
                                                              CDataType,
                                                              AElementwiseOperation,
                                                              BElementwiseOperation,
-                                                             CElementwiseOperation>>;
+                                                             CElementwiseOperation,
+                                                             ComputeType>>;
 } // namespace device
 } // namespace tensor_operation
......
@@ -29,7 +29,9 @@ template <ck::index_t NDimSpatial,
           typename EDataType,
           typename AElementwiseOperation,
           typename BElementwiseOperation,
-          typename CDEElementwiseOperation>
+          typename CDEElementwiseOperation,
+          typename AComputeType = ADataType,
+          typename BComputeType = AComputeType>
 struct DeviceGroupedConvBwdDataMultipleD : public BaseOperator
 {
     static constexpr index_t NumDTensor = DsDataType::Size();
......
@@ -20,7 +20,9 @@ template <ck::index_t NDimSpatial,
           typename OutDataType,
           typename InElementwiseOperation,
           typename WeiElementwiseOperation,
-          typename OutElementwiseOperation>
+          typename OutElementwiseOperation,
+          typename ComputeTypeA = InDataType,
+          typename ComputeTypeB = ComputeTypeA>
 struct DeviceGroupedConvBwdWeight : public BaseOperator
 {
     virtual std::unique_ptr<BaseArgument>
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <array>
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp"
#include "ck/utility/is_detected.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <typename T>
using is_tuple = decltype(std::declval<T&>().IsTuple());
/**
* \brief Grouped Convolution Forward
*
* \details
* input : input image A[G, N, C, Hi, Wi], A1[G, N, C, Hi, Wi]...
* input : weight B[G, K, C, Y, X], B1[G, K, C, Y, X]...
* input : D0[G, N, K, Ho, Wo], D1[G, N, K, Ho, Wo], ...
* output : output image E[G, N, K, Ho, Wo]
*
* C = a_op(A, A1...) * b_op(B, B1...)
* E = cde_op(C, D0, D1, ...)
*
* \tparam NDimSpatial Number of spatial dimensions.
* \tparam ALayout Input layout (also for a1, a2...).
* \tparam BLayout Weight layout (also for b1, b2...).
* \tparam DsLayout Ds layouts.
* \tparam ELayout Output layout.
* \tparam ADataType Input data type. Pass a tuple if there are multiple A tensors.
* \tparam BDataType Weight data type. Pass a tuple if there are multiple B tensors.
* \tparam DsDataType D data types.
* \tparam EDataType Output data type.
* \tparam AElementwiseOperation A elementwise operation.
* \tparam BElementwiseOperation B elementwise operation.
* \tparam CDEElementwiseOperation CDE elementwise operation.
* \tparam ComputeType Compute data type (defaults to ADataType, or to its first element when a tuple is passed).
*/
template <index_t NDimSpatial,
typename ALayout,
typename BLayout,
typename DsLayout,
typename ELayout,
typename ADataType,
typename BDataType,
typename DsDataType,
typename EDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CDEElementwiseOperation,
typename ComputeType =
decltype(UnpackDataType<is_detected<is_tuple, ADataType>::value,
Number<0>,
ADataType>())> // ComputeType is InputType by default (first
// in tuple for MultiAB), unpack if tuple was
// passed
struct DeviceGroupedConvFwdMultipleABD : public BaseOperator
{
static constexpr bool isMultiA = is_detected<is_tuple, ADataType>::value;
static constexpr bool isMultiB = is_detected<is_tuple, BDataType>::value;
static constexpr index_t NumATensor = GetNumABTensors<isMultiA, ADataType>();
static constexpr index_t NumBTensor = GetNumABTensors<isMultiB, BDataType>();
static constexpr index_t NumDTensor = DsDataType::Size();
static_assert(NumDTensor == DsLayout::Size(), "wrong! Inconsistent NumDTensor");
// If DataType is tuple, user has to pass std::array with pointers.
using APointers =
std::conditional_t<isMultiA, std::array<const void*, NumATensor>&, const void*>;
using BPointers =
std::conditional_t<isMultiB, std::array<const void*, NumBTensor>&, const void*>;
/**
* \brief Make argument pointer for grouped conv fwd.
*
* \param p_a A pointer to the input (std::array<const void*, NumA> with
*            pointers for multiple A).
* \param p_b A pointer to the weight (std::array<const void*, NumB> with
*            pointers for multiple B).
* \param p_ds Pointers to the Ds.
* \param p_e A pointer to the output.
* \param a_g_n_c_wis_lengths Input lengths [G, N, C, Spatial...] (for 3d).
* \param a_g_n_c_wis_strides Input strides [G, N, C, Spatial...] (for 3d).
* \param b_g_k_c_xs_lengths Weight lengths [G, K, C, Spatial...] (for 3d).
* \param b_g_k_c_xs_strides Weight strides [G, K, C, Spatial...] (for 3d).
* \param ds_g_n_k_wos_lengths Ds lengths [G, N, K, Spatial...] (for 3d).
* \param ds_g_n_k_wos_strides Ds strides [G, N, K, Spatial...] (for 3d).
* \param e_g_n_k_wos_lengths Output lengths [G, N, K, Spatial...] (for 3d).
* \param e_g_n_k_wos_strides Output strides [G, N, K, Spatial...] (for 3d).
* \param conv_filter_strides Convolution filter strides.
* \param conv_filter_dilations Convolution filter dilations.
* \param input_left_pads Input left paddings.
* \param input_right_pads Input right paddings.
* \param a_element_op A elementwise operation object.
* \param b_element_op B elementwise operation object.
* \param cde_element_op CDE elementwise operation object.
* \return Pointer to the argument.
*/
virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(
APointers p_a,
BPointers p_b,
const std::array<const void*, NumDTensor>& p_ds,
void* p_e,
const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,
const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>& ds_g_n_k_wos_lengths,
const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>& ds_g_n_k_wos_strides,
const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_lengths,
const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_strides,
const std::array<index_t, NDimSpatial>& conv_filter_strides,
const std::array<index_t, NDimSpatial>& conv_filter_dilations,
const std::array<index_t, NDimSpatial>& input_left_pads,
const std::array<index_t, NDimSpatial>& input_right_pads,
const AElementwiseOperation& a_element_op,
const BElementwiseOperation& b_element_op,
const CDEElementwiseOperation& cde_element_op) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
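To make the argument list concrete, the sketch below fills the descriptors for an assumed 2D grouped convolution with non-tuple A/B types and no D tensors. The stride packing chosen here (NHWGC image, GKYXC weight, NHWGK output) and names such as conv_op are illustrative assumptions, not part of this header.
// (inside some setup function)
constexpr ck::index_t NDimSpatial = 2;
const std::array<ck::index_t, NDimSpatial + 3> a_lengths = {2, 16, 64, 28, 28};        // [G, N, C, Hi, Wi]
const std::array<ck::index_t, NDimSpatial + 3> a_strides = {64, 100352, 1, 3584, 128}; // assumed NHWGC packing
const std::array<ck::index_t, NDimSpatial + 3> b_lengths = {2, 128, 64, 3, 3};         // [G, K, C, Y, X]
const std::array<ck::index_t, NDimSpatial + 3> b_strides = {73728, 576, 1, 192, 64};   // assumed GKYXC packing
const std::array<ck::index_t, NDimSpatial + 3> e_lengths = {2, 16, 128, 26, 26};       // [G, N, K, Ho, Wo]
const std::array<ck::index_t, NDimSpatial + 3> e_strides = {128, 173056, 1, 6656, 256};// assumed NHWGK packing
const std::array<ck::index_t, NDimSpatial> strides    = {1, 1};
const std::array<ck::index_t, NDimSpatial> dilations  = {1, 1};
const std::array<ck::index_t, NDimSpatial> left_pads  = {0, 0};
const std::array<ck::index_t, NDimSpatial> right_pads = {0, 0};
// conv_op, p_a, p_b, p_e and the *_element_op objects are assumed (hypothetical names).
auto argument = conv_op.MakeArgumentPointer(p_a, p_b, {}, p_e,
                                            a_lengths, a_strides,
                                            b_lengths, b_strides,
                                            {}, {}, // no D tensors
                                            e_lengths, e_strides,
                                            strides, dilations, left_pads, right_pads,
                                            a_element_op, b_element_op, cde_element_op);
auto invoker = conv_op.MakeInvokerPointer();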
@@ -3,21 +3,33 @@
 #pragma once
-#include <array>
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp"
-#include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-// Convolution Forward:
-// input : input image A[G, N, C, Hi, Wi],
-// input : weight B[G, K, C, Y, X],
-// input : D0[G, N, K, Ho, Wo], D1[G, N, K, Ho, Wo], ...
-// output : output image E[G, N, K, Ho, Wo]
-// C = a_op(A) * b_op(B)
-// E = cde_op(C, D0, D1, ...)
+/**
+ * \brief Grouped Convolution Forward
+ *
+ * \note This structure is deprecated (left for backwards compatibility). Please use
+ *       DeviceGroupedConvFwdMultipleABD.
+ *
+ * \tparam NDimSpatial Number of spatial dimensions.
+ * \tparam ALayout Input layout (also for a1, a2...).
+ * \tparam BLayout Weight layout (also for b1, b2...).
+ * \tparam DsLayout Ds layouts.
+ * \tparam ELayout Output layout.
+ * \tparam ADataType Input data type. Pass tuple if there is multiple A.
+ * \tparam BDataType Weight data type. Pass tuple if there is multiple B.
+ * \tparam DsDataType D data types.
+ * \tparam EDataType Output data type.
+ * \tparam AElementwiseOperation A elementwise operation.
+ * \tparam BElementwiseOperation B elementwise operation.
+ * \tparam CDEElementwiseOperation CDE elementwise operation.
+ * \tparam ComputeType Compute data type (default: ADataType, first if tuple passed).
+ */
 template <index_t NDimSpatial,
           typename ALayout,
           typename BLayout,
@@ -29,36 +41,26 @@ template <index_t NDimSpatial,
           typename EDataType,
           typename AElementwiseOperation,
           typename BElementwiseOperation,
-          typename CDEElementwiseOperation>
-struct DeviceGroupedConvFwdMultipleD : public BaseOperator
-{
-    static constexpr index_t NumDTensor = DsDataType::Size();
-    static_assert(NumDTensor == DsLayout::Size(), "wrong! Inconsistent NumDTensor");
-    virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(
-        const void* p_a, // input image
-        const void* p_b, // weight
-        const std::array<const void*, NumDTensor>& p_ds,
-        void* p_e, // output image
-        const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
-        const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
-        const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
-        const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,
-        const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>& ds_g_n_k_wos_lengths,
-        const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>& ds_g_n_k_wos_strides,
-        const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_lengths,
-        const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_strides,
-        const std::array<index_t, NDimSpatial>& conv_filter_strides,
-        const std::array<index_t, NDimSpatial>& conv_filter_dilations,
-        const std::array<index_t, NDimSpatial>& input_left_pads,
-        const std::array<index_t, NDimSpatial>& input_right_pads,
-        const AElementwiseOperation& a_element_op,
-        const BElementwiseOperation& b_element_op,
-        const CDEElementwiseOperation& cde_element_op) = 0;
-    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
-};
+          typename CDEElementwiseOperation,
+          typename ComputeType =
+              decltype(UnpackDataType<is_detected<is_tuple, ADataType>::value,
+                                      Number<0>,
+                                      ADataType>())> // ComputeType is InputType by default (first
+                                                     // in tuple for MultiAB), unpack if tuple was
+                                                     // passed
+using DeviceGroupedConvFwdMultipleD = DeviceGroupedConvFwdMultipleABD<NDimSpatial,
+                                                                      ALayout,
+                                                                      BLayout,
+                                                                      DsLayout,
+                                                                      ELayout,
+                                                                      ADataType,
+                                                                      BDataType,
+                                                                      DsDataType,
+                                                                      EDataType,
+                                                                      AElementwiseOperation,
+                                                                      BElementwiseOperation,
+                                                                      CDEElementwiseOperation,
+                                                                      ComputeType>;
 } // namespace device
 } // namespace tensor_operation
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <vector>
#include "ck/tensor_operation/gpu/device/device_base.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <typename DYDataType,
typename XDataType,
typename MeanInvStdDataType,
typename DGammaDataType,
typename DBetaDataType,
index_t Rank,
index_t NumReduceDim>
struct DeviceNormalizationBwdGammaBeta : public BaseOperator
{
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const std::vector<index_t> inLengths,
const std::vector<index_t> dyStrides,
const std::vector<index_t> xStrides,
const std::vector<index_t> meanStrides,
const std::vector<index_t> invStdStrides,
const std::vector<index_t> outLengths,
const std::vector<index_t> dgammaStrides,
const std::vector<index_t> dbetaStrides,
const std::vector<index_t> reduceDims,
const void* p_dy,
const void* p_x,
const void* p_mean,
const void* p_invStd,
void* p_dgamma,
void* p_dbeta) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
template <typename DYDataType,
typename XDataType,
typename MeanInvStdDataType,
typename DGammaDataType,
typename DBetaDataType,
index_t Rank,
index_t NumReduceDim>
using DeviceNormalizationBwdGammaBetaPtr =
std::unique_ptr<DeviceNormalizationBwdGammaBeta<DYDataType,
XDataType,
MeanInvStdDataType,
DGammaDataType,
DBetaDataType,
Rank,
NumReduceDim>>;
} // namespace device
} // namespace tensor_operation
} // namespace ck
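A minimal, hypothetical call sketch for this interface, assuming a layernorm-style [N, H] problem that reduces over the batch dimension and broadcasts the per-row mean/inv-std along H with a zero stride (an assumed convention); norm_bwd_op and the device pointers are assumed to exist and are not defined in this header.
// (inside some setup function)
const std::vector<ck::index_t> in_lengths      = {256, 1024}; // [N, H]
const std::vector<ck::index_t> dy_strides      = {1024, 1};
const std::vector<ck::index_t> x_strides       = {1024, 1};
const std::vector<ck::index_t> mean_strides    = {1, 0};      // per-row statistic, broadcast over H (assumed)
const std::vector<ck::index_t> inv_std_strides = {1, 0};
const std::vector<ck::index_t> out_lengths     = {1024};      // dgamma/dbeta shape [H]
const std::vector<ck::index_t> dgamma_strides  = {1};
const std::vector<ck::index_t> dbeta_strides   = {1};
const std::vector<ck::index_t> reduce_dims     = {0};         // reduce over N
// norm_bwd_op and the p_* pointers are assumed (hypothetical names).
auto argument = norm_bwd_op.MakeArgumentPointer(in_lengths,
                                                dy_strides, x_strides,
                                                mean_strides, inv_std_strides,
                                                out_lengths,
                                                dgamma_strides, dbeta_strides,
                                                reduce_dims,
                                                p_dy, p_x, p_mean, p_inv_std,
                                                p_dgamma, p_dbeta);
auto invoker = norm_bwd_op.MakeInvokerPointer();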
@@ -14,12 +14,12 @@ namespace device {
 template <typename XDataType,
           typename GammaDataType,
           typename BetaDataType,
-          typename ComputeDataType,
           typename YDataType,
+          typename SaveMeanInvStdDataType,
           typename YElementwiseOperation,
           index_t Rank,
           index_t NumReduceDim>
-struct DeviceNormalization : public BaseOperator
+struct DeviceNormalizationFwd : public BaseOperator
 {
     virtual std::unique_ptr<BaseArgument>
     MakeArgumentPointer(const std::vector<index_t> lengths,
@@ -27,6 +27,8 @@ struct DeviceNormalization : public BaseOperator
                         const std::vector<index_t> gammaStrides,
                         const std::vector<index_t> betaStrides,
                         const std::vector<index_t> yStrides,
+                        const std::vector<index_t> saveMeanStrides,
+                        const std::vector<index_t> saveInvStdStrides,
                         const std::vector<index_t> reduceDims,
                         double epsilon,
                         const void* p_x,
@@ -43,19 +45,19 @@ struct DeviceNormalization : public BaseOperator
 template <typename XDataType,
           typename GammaDataType,
           typename BetaDataType,
-          typename ComputeDataType,
           typename YDataType,
+          typename SaveMeanInvStdDataType,
           typename YElementwiseOperation,
           index_t Rank,
           index_t NumReduceDim>
-using DeviceNormalizationPtr = std::unique_ptr<DeviceNormalization<XDataType,
-                                                                   GammaDataType,
-                                                                   BetaDataType,
-                                                                   ComputeDataType,
-                                                                   YDataType,
-                                                                   YElementwiseOperation,
-                                                                   Rank,
-                                                                   NumReduceDim>>;
+using DeviceNormalizationFwdPtr = std::unique_ptr<DeviceNormalizationFwd<XDataType,
+                                                                         GammaDataType,
+                                                                         BetaDataType,
+                                                                         YDataType,
+                                                                         SaveMeanInvStdDataType,
+                                                                         YElementwiseOperation,
+                                                                         Rank,
+                                                                         NumReduceDim>>;
 } // namespace device
 } // namespace tensor_operation
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
namespace ck {
namespace tensor_operation {
namespace device {
enum struct GemmDlAlgorithm
{
Default, // Uses DOT vector instructions
Dpp8, // Uses DOT vector instructions with DPP8 SEL modifier to reduce data loads from LDS
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
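This enum is only a tag; as a hypothetical illustration (the device ops that consume it are not shown in this commit), a kernel template could take it as a non-type template parameter and branch on it at compile time.
// Hypothetical illustration only; `uses_dpp8` is not part of the library.
template <ck::tensor_operation::device::GemmDlAlgorithm Algo>
constexpr bool uses_dpp8()
{
    // Select the DPP8 code path only when the Dpp8 algorithm tag is requested.
    return Algo == ck::tensor_operation::device::GemmDlAlgorithm::Dpp8;
}
static_assert(uses_dpp8<ck::tensor_operation::device::GemmDlAlgorithm::Dpp8>());
static_assert(!uses_dpp8<ck::tensor_operation::device::GemmDlAlgorithm::Default>());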
@@ -185,7 +185,7 @@ struct DeviceBatchedGemmXdl : public DeviceBatchedGemm<ALayout,
                                                        AElementwiseOperation,
                                                        BElementwiseOperation,
                                                        CElementwiseOperation,
-                                                       GemmSpecialization::MNPadding,
+                                                       GemmSpecialization::MNKPadding,
                                                        MPerBlock,
                                                        NPerBlock,
                                                        K0PerBlock,
@@ -315,11 +315,6 @@ struct DeviceBatchedGemmXdl : public DeviceBatchedGemm<ALayout,
             return false;
         }
-        if(problem.K % K1 != 0)
-        {
-            return false;
-        }
         return GridwiseGemm::CheckValidity(problem);
     }
@@ -416,7 +411,12 @@ struct DeviceBatchedGemmXdl : public DeviceBatchedGemm<ALayout,
             << BlockSize << ", "
             << MPerBlock << ", "
             << NPerBlock << ", "
-            << K0PerBlock
+            << K0PerBlock << ", "
+            << K1 << ", "
+            << MPerXDL << ", "
+            << NPerXDL << ", "
+            << MXdlPerWave << ", "
+            << NXdlPerWave << ", "
             << ">"
             << " NumGemmKPrefetchStage: "
             << NumGemmKPrefetchStage << ", "
......