merge develop

2724c519 · Jing Zhang · 1fb4a474 · 2eb74a9c · 2724c519 · 2724c519
Commit 2724c519 authored Feb 24, 2024 by Jing Zhang
20 changed files
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
@@ -4,27 +4,13 @@
 #pragma once

 #include "ck/utility/common_header.hpp"
+#include "ck/utility/loop_scheduler.hpp"
 #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
 #include "ck/tensor_operation/gpu/warp/xdlops_gemm.hpp"
 #include "ck/tensor_description/tensor_adaptor.hpp"

 namespace ck {

-enum struct LoopScheduler
-{
-    Default,
-    Interwave,
-};
-
-constexpr LoopScheduler make_default_loop_scheduler()
-{
-#if CK_EXPERIMENTAL_DEFAULT_TO_INTER_WAVE_SCHEDULING
-    return LoopScheduler::Interwave;
-#else
-    return LoopScheduler::Default;
-#endif // if CK_EXPERIMENTAL_DEFAULT_TO_INTER_WAVE_SCHEDULING
-}
-
 template <index_t MNXdlPerWave, index_t MNWaves, index_t MNPerXdl, typename TileDesc_K0_MN_K1>
 __host__ __device__ static constexpr auto
 MakeGemmMmaTileDescriptor_MN0_MN1_MN2_K(const TileDesc_K0_MN_K1&)
@@ -42,7 +28,8 @@ MakeGemmMmaTileDescriptor_MN0_MN1_MN2_K(const TileDesc_K0_MN_K1&)
 }

 template <index_t BlockSize,
-          typename FloatAB,
+          typename FloatA,
+          typename FloatB,
          typename FloatAcc,
          typename AK0MK1BlockDesc,
          typename BK0NK1BlockDesc,
@@ -50,7 +37,9 @@ template <index_t BlockSize,
          index_t NPerXDL,
          index_t MRepeat,
          index_t NRepeat,
-          index_t KPack>
+          index_t KPack,
+          typename ComputeTypeA = FloatA,
+          typename ComputeTypeB = FloatB>
 struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
 {
    static constexpr auto I0 = Number<0>{};
@@ -72,7 +61,8 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
    static constexpr index_t A_K1 = AK0MK1BlockDesc{}.GetLength(I2);
    static constexpr index_t B_K1 = BK0NK1BlockDesc{}.GetLength(I2);

-    static constexpr auto xdlops_gemm = XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack>{};
+    static constexpr auto xdlops_gemm =
+        XdlopsGemm<ComputeTypeA, MPerXDL, NPerXDL, KPack, ComputeTypeB>{};

    static constexpr index_t KPerThread = KPerBlock / xdlops_gemm.K0PerXdlops;

@@ -308,9 +298,9 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                        const BBlockBuffer& b_block_buf,
                        CThreadBuffer& c_thread_buf) const
    {
-        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAB>(
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeTypeA>(
            a_thread_desc_.GetElementSpaceSize());
-        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAB>(
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeTypeB>(
            b_thread_desc_.GetElementSpaceSize());

        static_for<0, MRepeat, 1>{}([&](auto m0) {
@@ -332,25 +322,27 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                                   b_thread_buf);

                static_for<0, KPerThread, KPack>{}([&](auto k) {
-                    vector_type<FloatAB, KPack> a_thread_vec;
-                    vector_type<FloatAB, KPack> b_thread_vec;
+                    vector_type<ComputeTypeA, KPack> a_thread_vec;
+                    vector_type<ComputeTypeB, KPack> b_thread_vec;

                    static_for<0, KPack, 1>{}([&](auto i) {
-                        a_thread_vec.template AsType<FloatAB>()(i) = a_thread_buf
+                        a_thread_vec.template AsType<ComputeTypeA>()(i) = a_thread_buf
                            [Number<a_thread_desc_.CalculateOffset(make_tuple(0, 0, 0, k + i))>{}];
-                        b_thread_vec.template AsType<FloatAB>()(i) = b_thread_buf
+                        b_thread_vec.template AsType<ComputeTypeB>()(i) = b_thread_buf
                            [Number<b_thread_desc_.CalculateOffset(make_tuple(0, 0, 0, k + i))>{}];
                    });

-                    using mfma_input_type =
-                        typename vector_type<FloatAB, xdlops_gemm.K1PerXdlops>::type;
+                    using mfma_input_type_a =
+                        typename vector_type<ComputeTypeA, xdlops_gemm.K1PerXdlops>::type;
+                    using mfma_input_type_b =
+                        typename vector_type<ComputeTypeB, xdlops_gemm.K1PerXdlops>::type;

                    constexpr index_t c_offset =
                        c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));

                    xdlops_gemm.template Run(
-                        a_thread_vec.template AsType<mfma_input_type>(),
-                        b_thread_vec.template AsType<mfma_input_type>(),
+                        a_thread_vec.template AsType<mfma_input_type_a>(),
+                        b_thread_vec.template AsType<mfma_input_type_b>(),
                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
                });
            });
@@ -370,8 +362,8 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
    static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed(
        make_tuple(Number<MRepeat>{}, Number<NRepeat>{}, xdlops_gemm.GetRegSizePerXdlops()));

-    using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatAB,
-                                                         FloatAB,
+    using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatA,
+                                                         ComputeTypeA,
                                                         decltype(a_block_desc_m0_m1_m2_k),
                                                         decltype(a_thread_desc_),
                                                         Sequence<1, 1, 1, KPerThread>,
@@ -380,8 +372,8 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                                                         A_K1,
                                                         A_K1>;

-    using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatAB,
-                                                         FloatAB,
+    using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatB,
+                                                         ComputeTypeB,
                                                         decltype(b_block_desc_n0_n1_n2_k),
                                                         decltype(b_thread_desc_),
                                                         Sequence<1, 1, 1, KPerThread>,
@@ -399,7 +391,8 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
 // the latest ROCm release. For unsupported compilers, inter-wave loop scheduler falls back to the
 // default loop scheduler which is given by the macro CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING=0
 template <index_t BlockSize,
-          typename FloatAB,
+          typename FloatA,
+          typename FloatB,
          typename FloatAcc,
          typename AK0MK1BlockDesc,
          typename BK0NK1BlockDesc,
@@ -408,10 +401,13 @@ template <index_t BlockSize,
          index_t MRepeat,
          index_t NRepeat,
          index_t KPack,
+          typename ComputeTypeA  = FloatA,
+          typename ComputeTypeB  = FloatB,
          index_t NumMacClusters = CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING_MAC_CLUSTERS>
 struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
    : public BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
-                                                                 FloatAB,
+                                                                 FloatA,
+                                                                 FloatB,
                                                                 FloatAcc,
                                                                 AK0MK1BlockDesc,
                                                                 BK0NK1BlockDesc,
@@ -419,10 +415,13 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                                                                 NPerXDL,
                                                                 MRepeat,
                                                                 NRepeat,
-                                                                 KPack>
+                                                                 KPack,
+                                                                 ComputeTypeA,
+                                                                 ComputeTypeB>
 {
    using Base = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
-                                                                     FloatAB,
+                                                                     FloatA,
+                                                                     FloatB,
                                                                     FloatAcc,
                                                                     AK0MK1BlockDesc,
                                                                     BK0NK1BlockDesc,
@@ -430,7 +429,9 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                                                                     NPerXDL,
                                                                     MRepeat,
                                                                     NRepeat,
-                                                                     KPack>;
+                                                                     KPack,
+                                                                     ComputeTypeA,
+                                                                     ComputeTypeB>;

 #if CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING
    using Base::a_block_desc_m0_m1_m2_k;
@@ -454,9 +455,9 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                        const BBlockBuffer& b_block_buf,
                        CThreadBuffer& c_thread_buf) const
    {
-        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAB>(
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeTypeA>(
            a_thread_desc_.GetElementSpaceSize());
-        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAB>(
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeTypeB>(
            b_thread_desc_.GetElementSpaceSize());

        static_for<0, KPerThread, KPerInnerLoop>{}([&](auto k) {
@@ -493,20 +494,22 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
            static_for<0, KPerInnerLoop, KPack>{}([&](auto k_) {
                static_for<0, MRepeat, 1>{}([&](auto m0) {
                    static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        vector_type<FloatAB, KPack> a_thread_vec;
-                        vector_type<FloatAB, KPack> b_thread_vec;
+                        vector_type<ComputeTypeA, KPack> a_thread_vec;
+                        vector_type<ComputeTypeB, KPack> b_thread_vec;

                        static_for<0, KPack, 1>{}([&](auto i) {
-                            a_thread_vec.template AsType<FloatAB>()(i) =
+                            a_thread_vec.template AsType<ComputeTypeA>()(i) =
                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                    make_tuple(m0, 0, 0, k_ + i))>{}];
-                            b_thread_vec.template AsType<FloatAB>()(i) =
+                            b_thread_vec.template AsType<ComputeTypeB>()(i) =
                                b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                    make_tuple(n0, 0, 0, k_ + i))>{}];
                        });

-                        using mfma_input_type =
-                            typename vector_type<FloatAB, xdlops_gemm.K1PerXdlops>::type;
+                        using mfma_input_type_a =
+                            typename vector_type<ComputeTypeA, xdlops_gemm.K1PerXdlops>::type;
+                        using mfma_input_type_b =
+                            typename vector_type<ComputeTypeB, xdlops_gemm.K1PerXdlops>::type;

                        constexpr index_t c_offset =
                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
@@ -528,8 +531,8 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                        // TODO: insert setprio in more precise manner since we
                        // could have more than >1 MFMA instructions in single call
                        xdlops_gemm.template Run(
-                            a_thread_vec.template AsType<mfma_input_type>(),
-                            b_thread_vec.template AsType<mfma_input_type>(),
+                            a_thread_vec.template AsType<mfma_input_type_a>(),
+                            b_thread_vec.template AsType<mfma_input_type_b>(),
                            c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
                        if constexpr(k_.value == 0 && m0.value == 0 && n0.value == 0)
                        {
@@ -555,8 +558,8 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
    static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor_packed(
        make_tuple(Number<NRepeat>{}, I1, I1, Number<KPerInnerLoop>{}));

-    using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatAB,
-                                                         FloatAB,
+    using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatA,
+                                                         ComputeTypeA,
                                                         decltype(a_block_desc_m0_m1_m2_k),
                                                         decltype(a_thread_desc_),
                                                         Sequence<1, 1, 1, KPerInnerLoop>,
@@ -565,8 +568,8 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                                                         A_K1,
                                                         A_K1>;

-    using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatAB,
-                                                         FloatAB,
+    using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatB,
+                                                         ComputeTypeB,
                                                         decltype(b_block_desc_n0_n1_n2_k),
                                                         decltype(b_thread_desc_),
                                                         Sequence<1, 1, 1, KPerInnerLoop>,
@@ -582,7 +585,8 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
 };

 template <index_t BlockSize,
-          typename FloatAB,
+          typename FloatA,
+          typename FloatB,
          typename FloatAcc,
          typename AK0MK1BlockDesc,
          typename BK0NK1BlockDesc,
@@ -591,13 +595,16 @@ template <index_t BlockSize,
          index_t MRepeat,
          index_t NRepeat,
          index_t KPack,
-          LoopScheduler LoopSched>
+          LoopScheduler LoopSched,
+          typename ComputeTypeA = FloatA,
+          typename ComputeTypeB = FloatB>
 constexpr auto BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector()
 {
    if constexpr(LoopSched == LoopScheduler::Default)
    {
        return BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
-                                                                   FloatAB,
+                                                                   FloatA,
+                                                                   FloatB,
                                                                   FloatAcc,
                                                                   AK0MK1BlockDesc,
                                                                   BK0NK1BlockDesc,
@@ -605,12 +612,15 @@ constexpr auto BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector()
                                                                   NPerXDL,
                                                                   MRepeat,
                                                                   NRepeat,
-                                                                   KPack>{};
+                                                                   KPack,
+                                                                   ComputeTypeA,
+                                                                   ComputeTypeB>{};
    }
    else if constexpr(LoopSched == LoopScheduler::Interwave)
    {
        return BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
-                                                                            FloatAB,
+                                                                            FloatA,
+                                                                            FloatB,
                                                                            FloatAcc,
                                                                            AK0MK1BlockDesc,
                                                                            BK0NK1BlockDesc,
@@ -618,7 +628,9 @@ constexpr auto BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector()
                                                                            NPerXDL,
                                                                            MRepeat,
                                                                            NRepeat,
-                                                                            KPack>{};
+                                                                            KPack,
+                                                                            ComputeTypeA,
+                                                                            ComputeTypeB>{};
    }
 };

@@ -632,26 +644,27 @@ constexpr auto BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector()
 * 3. configurable k index starting position and step size after each FMA/XDL instruction
 */

-template <index_t BlockSize,
-          typename FloatAB,
-          typename FloatAcc,
-          typename ATileDesc,
-          typename BTileDesc,
-          typename AMmaTileDesc,
-          typename BMmaTileDesc,
-          index_t MPerBlock,
-          index_t NPerBlock,
-          index_t KPerBlock,
-          index_t MPerXDL,
-          index_t NPerXDL,
-          index_t MRepeat,
-          index_t NRepeat,
-          index_t KPack,
-          bool TransposeC = false,
-          index_t AMmaKStride =
-              KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, TransposeC>{}.K0PerXdlops,
-          index_t BMmaKStride =
-              KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, TransposeC>{}.K0PerXdlops>
+template <
+    index_t BlockSize,
+    typename FloatAB,
+    typename FloatAcc,
+    typename ATileDesc,
+    typename BTileDesc,
+    typename AMmaTileDesc,
+    typename BMmaTileDesc,
+    index_t MPerBlock,
+    index_t NPerBlock,
+    index_t KPerBlock,
+    index_t MPerXDL,
+    index_t NPerXDL,
+    index_t MRepeat,
+    index_t NRepeat,
+    index_t KPack,
+    bool TransposeC = false,
+    index_t AMmaKStride =
+        KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops,
+    index_t BMmaKStride =
+        KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
 struct BlockwiseGemmXdlops_v2
 {
    static constexpr auto I0 = Number<0>{};
@@ -668,7 +681,8 @@ struct BlockwiseGemmXdlops_v2
    static constexpr index_t A_K1 = ATileDesc{}.GetLength(I2);
    static constexpr index_t B_K1 = BTileDesc{}.GetLength(I2);

-    static constexpr auto xdlops_gemm = XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, TransposeC>{};
+    static constexpr auto xdlops_gemm =
+        XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{};

    static constexpr index_t KPerThread = KPerBlock / xdlops_gemm.K0PerXdlops;


--- a/include/ck/tensor_operation/gpu/block/blockwise_softmax.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_softmax.hpp
@@ -35,8 +35,8 @@ struct BlockwiseSoftmax
    static constexpr index_t MRepeat = ThreadSliceDesc_M_K{}.GetLength(I0);
    static constexpr index_t KRepeat = ThreadSliceDesc_M_K{}.GetLength(I1);

-    using ThreadSliceDesc_M = decltype(
-        make_naive_tensor_descriptor_packed(make_tuple(ThreadSliceDesc_M_K{}.GetLength(I0))));
+    using ThreadSliceDesc_M = decltype(make_naive_tensor_descriptor_packed(
+        make_tuple(ThreadSliceDesc_M_K{}.GetLength(I0))));

    using ThreadwiseMaxReduce = typename conditional<
        IgnoreNaN,

--- a/include/ck/tensor_operation/gpu/block/blockwise_welford.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_welford.hpp
@@ -4,7 +4,7 @@
 #pragma once

 #include "ck/tensor_description/cluster_descriptor.hpp"
-#include "ck/utility/reduction_common.hpp"
+#include "ck/utility/get_shift.hpp"

 namespace ck {

@@ -35,10 +35,11 @@ struct BlockwiseWelford
    static constexpr auto thread_cluster_desc =
        make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});

+    template <typename CountDataType>
    __device__ static inline void
-    Merge(T& mean_a, T& var_a, int& count_a, T mean_b, T var_b, int count_b)
+    Merge(T& mean_a, T& var_a, CountDataType& count_a, T mean_b, T var_b, CountDataType count_b)
    {
-        int count            = count_a + count_b;
+        CountDataType count  = count_a + count_b;
        T count_b_over_count = count == 0 ? type_convert<T>(0) : type_convert<T>(count_b) / count;
        T delta              = mean_b - mean_a;
        mean_a += delta * count_b_over_count;
@@ -46,11 +47,12 @@ struct BlockwiseWelford
        count_a = count;
    }

-    __device__ static void Run(T& mean_value, T& var_value, int& count)
+    template <typename CountDataType>
+    __device__ static void Run(T& mean_value, T& var_value, CountDataType& count)
    {
        __shared__ T mean_block_buf[BlockSize];
        __shared__ T var_block_buf[BlockSize];
-        __shared__ int count_block_buf[BlockSize];
+        __shared__ CountDataType count_block_buf[BlockSize];

        constexpr auto cluster_len_shift = get_shift<BufferLength_K>();

@@ -76,13 +78,13 @@ struct BlockwiseWelford
                index_t offset2 = block_buf_desc_m_k.CalculateOffset(thread_cluster_idx +
                                                                     make_tuple(0, indOffset));

-                T mean1    = mean_block_buf[offset1];
-                T var1     = var_block_buf[offset1];
-                int count1 = count_block_buf[offset1];
+                T mean1              = mean_block_buf[offset1];
+                T var1               = var_block_buf[offset1];
+                CountDataType count1 = count_block_buf[offset1];

-                T mean2    = mean_block_buf[offset2];
-                T var2     = var_block_buf[offset2];
-                int count2 = count_block_buf[offset2];
+                T mean2              = mean_block_buf[offset2];
+                T var2               = var_block_buf[offset2];
+                CountDataType count2 = count_block_buf[offset2];

                Merge(mean1, var1, count1, mean2, var2, count2);


--- a/include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp
+++ b/include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp
@@ -4,7 +4,7 @@
 #pragma once

 #include "ck/tensor_description/cluster_descriptor.hpp"
-#include "ck/utility/reduction_common.hpp"
+#include "ck/utility/get_shift.hpp"
 #include "ck/utility/reduction_functions_accumulate.hpp"

 namespace ck {

--- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_direct_load.hpp
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_direct_load.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_description/cluster_descriptor.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck {
+
+/**
+ * Transfer that uses direct load instructions to copy data from global to LDS memory.
+ *
+ * Traditional loads first copy data from global to registers, and then from registers to LDS.
+ * Direct loads do not need an intermediate step, data is copied directly from global to LDS,
+ * without the use of additional registers.
+ *
+ * However, the instruction has limitations:
+ * - each thread must copy exactly a single DWORD - 4 bytes;
+ * - threads within a single wavefront must write consecutive DWORDS into LDS,
+ *   (data in global do not need to be contiguous, each thread might have its own offset).
+ *
+ * To make sure that all the transfers have finished, the `waitcnt` instruction must be used with
+ * `vmcnt` instead of `lgkmcnt`.
+ *
+ * Limitations of the transfer class:
+ * - `SrcData` must be the same as `DstData` - no possibility to convert the data type in flight;
+ * - `DstVectorDim` must be the last dimension;
+ * - `SrcVectorDim` must be the last dimension if `ScalarPerVector` is greater than 1;
+ * - `ScalarPerVector` times the number of bytes of `DstData` must be equal to a single DWORD = 4B
+ *   (for example, if `DstData` is fp32, then `ScalarPerVector` must be 1; if `DstData` is fp16,
+ *   `ScalarPerVector` must be 2);
+ * - if `ScalarPerVector` is greater than 1, the contiguous dimension in src and dst must be
+ *   the same dimension;
+ * - threads in a wavefront must write contiguous data to LDS (when wavefront size is 64,
+ *   they must write 64 contiguous DWORDs) - `ThreadClusterLengths` must be prepared in such a way
+ *   to guarantee that.
+ */
+template <typename ThreadGroup,
+          typename BlockSliceLengths,
+          typename ThreadClusterLengths,
+          typename SrcData,
+          typename DstData,
+          typename SrcDesc,
+          typename DstDesc,
+          index_t SrcVectorDim,
+          index_t DstVectorDim,
+          index_t ScalarPerVector>
+struct ThreadGroupTensorSliceTransfer_DirectLoad
+{
+    static constexpr index_t nDim = remove_reference_t<SrcDesc>::GetNumOfDimension();
+    using Index                   = MultiIndex<nDim>;
+
+    using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{}));
+    using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{}));
+
+    using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{}));
+    using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{}));
+
+    static constexpr auto I0 = Number<0>{};
+
+    static constexpr auto block_slice_lengths    = BlockSliceLengths{};
+    static constexpr auto thread_cluster_lengths = ThreadClusterLengths{};
+
+    static constexpr auto thread_single_load_size = generate_sequence(
+        detail::lambda_scalar_per_access<DstVectorDim, ScalarPerVector>{}, Number<nDim>{});
+    // After a load, each thread moves by `thread_steps` instead of loading the next elements.
+    // This makes the whole wavefront load contiguous memory, which is required for direct loads.
+    static constexpr auto thread_steps         = thread_cluster_lengths * thread_single_load_size;
+    static constexpr auto thread_slice_lengths = block_slice_lengths / thread_steps;
+
+    // Compile-time check of the thread-cluster configuration. Returns true when both hold:
+    //  (a) the product of cluster lengths accumulated over the trailing dimensions is a multiple
+    //      of the wavefront size (get_warp_size()), and
+    //  (b) every entry of thread_slice_lengths is positive.
+    // NOTE(review): thread_slice_lengths is computed as block_slice_lengths / thread_steps; this
+    // function does not check that the division is exact - confirm callers guarantee that.
+    static __device__ constexpr bool AreThreadClusterLengthsValid()
+    {
+        // Make sure that ThreadClusterLengths are set in a way that allows for contiguous writes to
+        // LDS by the threads from a single wavefront.
+        // Examples (assuming 64 threads in a wavefront, 128 in a thread block):
+        // 1. BlockSliceLengths = [K0PerBlock, MPerBlock, K1PerBlock] = [4, 128, 8],
+        //    data type = fp32 -> ScalarPerVector = 1
+        //    INVALID: ThreadClusterLengths = [4, 4, 8] since in the first iteration, threads 0-31
+        //             write [0, 0, 0] - [0, 3, 7] and thread 32 writes [1, 0, 0] instead of
+        //             [0, 4, 0].
+        //    VALID: ThreadClusterLengths = [2, 8, 8] or [1, 16, 8] since in the first iteration,
+        //           threads 0-63 write [0, 0, 0] - [0, 7, 7] -> 64 consecutive elements (DWORDs).
+        // 2. BlockSliceLengths = [K0PerBlock, MPerBlock, K1PerBlock] = [4, 128, 8],
+        //    data type = fp16 -> ScalarPerVector = 2
+        //    NOTE: ThreadClusterLengths must take into account that each thread writes two
+        //          elements (single DWORD) along the contiguous dimension.
+        //    INVALID: ThreadClusterLengths = [4, 4, 8] since each 8 threads would try to write
+        //             8 * 2 elements of K1PerBlock and there are only 8;
+        //             ThreadClusterLengths = [4, 8, 4] since in the first iteration, threads 0-31
+        //             write [0, 0, 0] - [0, 7, 7] (7 since each writes 2 elements) and thread 32
+        //             writes [1, 0, 0] instead of [0, 8, 0].
+        //    VALID: ThreadClusterLengths = [4, 16, 4] or [2, 32, 4] or [1, 64, 4] since in the
+        //           first iteration, threads 0-63 write [0, 0, 0] - [0, 15, 7] -> 128 consecutive
+        //           elements = 64 consecutive DWORDs.
+        int num_contiguous_dwords = 1;
+        bool is_contiguous        = true;
+        // Walk the dimensions from the last one towards the first. Cluster lengths are multiplied
+        // in while is_contiguous holds; the first dimension whose per-thread slice length exceeds
+        // 1 is still counted, but every dimension before it is excluded.
+        static_for<0, nDim, 1>{}([&](auto i) {
+            if(is_contiguous)
+            {
+                num_contiguous_dwords *= thread_cluster_lengths[nDim - i - 1];
+            }
+            if(thread_slice_lengths[nDim - i - 1] > 1)
+            {
+                is_contiguous = false;
+            }
+        });
+        constexpr index_t wavefront_size = get_warp_size();
+        // The accumulated count must cover whole wavefronts.
+        const bool wave_contiguous       = num_contiguous_dwords % wavefront_size == 0;
+
+        // Reject configurations in which any dimension of thread_slice_lengths is zero or
+        // negative (checked over all dimensions, independently of the contiguity test above).
+        bool thread_slice_lengths_correct = true;
+        static_for<0, nDim, 1>{}([&](auto i) {
+            if(thread_slice_lengths[i] <= 0)
+            {
+                thread_slice_lengths_correct = false;
+            }
+        });
+
+        return wave_contiguous && thread_slice_lengths_correct;
+    }
+
+    // Constructor: validates the template configuration through static_asserts, then computes
+    // this thread's starting index and sets the source and destination slice origins.
+    //
+    // src_desc / dst_desc                - descriptors used to build the src / dst coordinates
+    // src_block_slice_origin             - origin of the block slice in the source
+    // dst_block_slice_origin             - origin of the block slice in the destination
+    __device__ constexpr ThreadGroupTensorSliceTransfer_DirectLoad(
+        const SrcDesc& src_desc,
+        const Index& src_block_slice_origin,
+        const DstDesc& dst_desc,
+        const Index& dst_block_slice_origin)
+
+    {
+        // No type conversion is performed by this transfer, so both data types must match.
+        static_assert(ck::is_same_v<SrcData, DstData>,
+                      "Direct load transfer does not support datatypes conversion. Source and "
+                      "destination data types must be the same.");
+
+        static_assert(
+            DstVectorDim == nDim - 1,
+            "Direct load transfer requires the destination vector dimension to be the last one.");
+
+        static_assert(ScalarPerVector == 1 || SrcVectorDim == DstVectorDim,
+                      "When loading more than one element per thread at once, the contiguous "
+                      "dimension must be the same between source and destination.");
+
+        // Each thread must move exactly 4 bytes: ScalarPerVector * sizeof(SrcData) == 4.
+        constexpr auto dword_bytes           = 4;
+        constexpr auto bytes_per_thread_load = ScalarPerVector * sizeof(SrcData);
+        static_assert(bytes_per_thread_load == dword_bytes,
+                      "Direct load transfer requires each thread to load exactly a single "
+                      "DWORD of data.");
+
+        // nDim itself is taken from SrcDesc, so in practice this constrains DstDesc and
+        // ThreadClusterLengths to the same rank.
+        static_assert(nDim == remove_cvref_t<SrcDesc>::GetNumOfDimension() &&
+                          nDim == remove_cvref_t<DstDesc>::GetNumOfDimension() &&
+                          nDim == ThreadClusterLengths::Size(),
+                      "Inconsistent number of dimensions across lengths and descriptors.");
+
+        // The thread group may contain more threads than the cluster uses, but never fewer.
+        static_assert(ThreadGroup::GetNumOfThread() >= thread_cluster_desc_.GetElementSize(),
+                      "The number of threads cannot be less than the number of elements in "
+                      "thread cluster lengths.");
+
+        static_assert(
+            AreThreadClusterLengthsValid(),
+            "Thread cluster lengths are incorrect. They must be set in a way that allows a single "
+            "wavefront to write contiguous DWORDs into LDS memory. ");
+
+        // Convert the thread id within the group into a multi-dimensional cluster index.
+        const auto thread_cluster_idx =
+            thread_cluster_desc_.CalculateBottomIndex(make_multi_index(ThreadGroup::GetThreadId()));
+
+        // Scale the cluster index by the per-load size to get this thread's first data index.
+        const auto thread_data_idx_begin = thread_cluster_idx * thread_single_load_size;
+
+        // Both origins are the block origin shifted by the same per-thread offset.
+        SetSrcSliceOrigin(src_desc, src_block_slice_origin + thread_data_idx_begin);
+        SetDstSliceOrigin(dst_desc, dst_block_slice_origin + thread_data_idx_begin);
+    }
+
+    // Anchors the source window at src_slice_origin_idx: the origin is remembered and the
+    // source tensor coordinate is rebuilt for it.
+    __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
+    {
+        src_slice_origin_ = src_slice_origin_idx;
+        src_coord_        = make_tensor_coordinate(src_desc, src_slice_origin_idx);
+    }
+
+    // Anchors the destination window at dst_slice_origin_idx: the origin is remembered (it is
+    // what ResetDstSliceWindow rewinds to) and the destination coordinate is rebuilt for it.
+    __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
+    {
+        dst_slice_origin_ = dst_slice_origin_idx;
+        dst_coord_        = make_tensor_coordinate(dst_desc, dst_slice_origin_idx);
+    }
+
+    // Rewinds the destination coordinate to the stored destination slice origin.
+    // The stored origin itself is left untouched.
+    __device__ void ResetDstSliceWindow(const DstDesc& dst_desc)
+    {
+        dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_);
+    }
+
+    // Copies this thread's part of the slice from src_buf to dst_buf, issuing one
+    // DirectCopyToLds call per index of the thread_slice_lengths iteration space.
+    //
+    // src_desc / dst_desc: descriptors used to validate and advance the coordinates.
+    // src_buf:             source buffer; must live in the Global address space.
+    // dst_buf:             destination buffer; must live in the LDS address space.
+    //
+    // On return the destination coordinate is rewound to its stored origin. The source
+    // coordinate is NOT rewound here; MoveSrcSliceWindow rebuilds it from the stored origin.
+    template <typename SrcBuffer, typename DstBuffer>
+    __device__ void Run(const SrcDesc& src_desc,
+                        const SrcBuffer& src_buf,
+                        const DstDesc& dst_desc,
+                        DstBuffer& dst_buf)
+    {
+        static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Global,
+                      "Source data must come from a global memory buffer.");
+        static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum::Lds,
+                      "Destination data must be stored in an LDS memory buffer.");
+
+        static_assert(
+            ck::is_same_v<remove_cvref_t<typename SrcBuffer::type>, remove_cvref_t<SrcData>>,
+            "SrcBuffer and SrcData data types must be consistent.");
+        static_assert(
+            ck::is_same_v<remove_cvref_t<typename DstBuffer::type>, remove_cvref_t<DstData>>,
+            "DstBuffer and DstData data types must be consistent.");
+
+        constexpr auto dst_access_lengths = thread_slice_lengths;
+
+        // Pre-built +/- one-step moves along every dimension, for both tensors.
+        const auto dst_forward_steps  = generate_steps(dst_desc, 1);
+        const auto dst_backward_steps = generate_steps(dst_desc, -1);
+        const auto src_forward_steps  = generate_steps(src_desc, 1);
+        const auto src_backward_steps = generate_steps(src_desc, -1);
+
+        // Loop over the destination block and copy data.
+        static_ford<decltype(dst_access_lengths)>{}([&](auto ordered_dst_access_idx) {
+            const auto src_offset = src_coord_.GetOffset();
+            const auto dst_offset = dst_coord_.GetOffset();
+
+            // Check if src data is not in the logic padding area.
+            const bool is_src_valid =
+                coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_);
+
+            src_buf.template DirectCopyToLds<remove_cvref_t<decltype(dst_buf)>, ScalarPerVector>(
+                dst_buf, src_offset, dst_offset, is_src_valid);
+
+            // move_on_dim[i] is true when dimension i has not reached its last index while
+            // every dimension after it has, i.e. i is the dimension to step along next.
+            constexpr auto move_on_dim = [&]() constexpr
+            {
+                StaticallyIndexedArray<bool, nDim> move_on_dim_;
+
+                static_for<0, nDim, 1>{}([&](auto i) {
+                    move_on_dim_(i) = ordered_dst_access_idx[i] < dst_access_lengths[i] - 1;
+
+                    static_for<i + 1, nDim, 1>{}([&](auto j) {
+                        move_on_dim_(i) &= ordered_dst_access_idx[j] == dst_access_lengths[j] - 1;
+                    });
+                });
+
+                return move_on_dim_;
+            }
+            ();
+
+            // Decide whether to move forward or backward.
+            // Dimension 0 always moves forward; dimension i moves forward when the index
+            // flattened over dimensions [0, i) is even and backward when it is odd.
+            constexpr auto forward_sweep = [&]() {
+                StaticallyIndexedArray<bool, nDim> forward_sweep_;
+
+                forward_sweep_(I0) = true;
+
+                static_for<1, nDim, 1>{}([&](auto i) {
+                    index_t tmp = ordered_dst_access_idx[I0];
+
+                    static_for<1, i, 1>{}([&](auto j) {
+                        tmp = tmp * dst_access_lengths[j] + ordered_dst_access_idx[j];
+                    });
+
+                    forward_sweep_(i) = tmp % 2 == 0;
+                });
+
+                return forward_sweep_;
+            }();
+
+            // Advance source and destination coordinates together along the selected dimension.
+            static_for<0, nDim, 1>{}([&](auto i) {
+                if constexpr(move_on_dim[i])
+                {
+                    if constexpr(forward_sweep[i])
+                    {
+                        move_tensor_coordinate(dst_desc, dst_coord_, dst_forward_steps[i]);
+                        move_tensor_coordinate(src_desc, src_coord_, src_forward_steps[i]);
+                    }
+                    else
+                    {
+                        move_tensor_coordinate(dst_desc, dst_coord_, dst_backward_steps[i]);
+                        move_tensor_coordinate(src_desc, src_coord_, src_backward_steps[i]);
+                    }
+                }
+            });
+        });
+
+        // Reset the destination slice since the entire buffer has been already filled.
+        ResetDstSliceWindow(dst_desc);
+    }
+
+    // Shifts the source window by `step`: the stored origin becomes origin + step and the
+    // source coordinate is rebuilt from that new origin.
+    __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, const Index& step)
+    {
+        SetSrcSliceOrigin(src_desc, src_slice_origin_ + step);
+    }
+
+    // Builds one tensor-coordinate step per dimension for `desc`.
+    // Step i moves by sign * thread_steps[i] along dimension i and by 0 along all others;
+    // pass sign = 1 for forward steps and sign = -1 for backward steps.
+    template <typename DescType>
+    __device__ auto generate_steps(const DescType& desc, int sign)
+    {
+        const auto make_step_along = [&](auto dim) {
+            Index unit_step;
+
+            static_for<0, nDim, 1>{}([&](auto k) {
+                if(dim.value == k.value)
+                {
+                    unit_step(k) = sign * thread_steps[dim];
+                }
+                else
+                {
+                    unit_step(k) = 0;
+                }
+            });
+
+            return make_tensor_coordinate_step(desc, unit_step);
+        };
+
+        return generate_tuple(make_step_along, Number<nDim>{});
+    }
+
+    private:
+    // Cluster descriptor built from ThreadClusterLengths; used to turn a 1-D thread id into a
+    // multi-dimensional cluster index.
+    static constexpr auto thread_cluster_desc_ = make_cluster_descriptor(ThreadClusterLengths{});
+
+    // Current tensor coordinates; advanced while Run() iterates.
+    SrcCoord src_coord_;
+    DstCoord dst_coord_;
+    // Window origins the coordinates are rebuilt from (see the Set*/Move*/Reset* methods).
+    Index src_slice_origin_;
+    Index dst_slice_origin_;
+};
+
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp
@@ -94,6 +94,21 @@ struct ThreadGroupTensorSliceTransfer_v4r1
        }
    }

+    // Re-anchors this thread's source window at block origin + cluster index * thread slice.
+    __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_block_slice_origin)
+    {
+        if(ThreadGroup::GetNumOfThread() != thread_cluster_desc_.GetElementSize() and
+           ThreadGroup::GetThreadId() >= thread_cluster_desc_.GetElementSize())
+        {
+            return; // this thread lies outside the thread cluster and takes no part
+        }
+        const auto cluster_idx = thread_cluster_desc_.CalculateBottomIndex(
+            make_multi_index(ThreadGroup::GetThreadId()));
+
+        threadwise_transfer_.SetSrcSliceOrigin(
+            src_desc, src_block_slice_origin + cluster_idx * thread_slice_lengths);
+    }
+
    template <typename SrcBuffer, index_t ThreadScratchId = 0>
    __device__ void RunRead(const SrcDesc& src_desc,
                            const SrcBuffer& src_buf,

--- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1r2.hpp
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1r2.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_description/cluster_descriptor.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1r2.hpp"
+
+namespace ck {
+
+// this version does following things to avoid scratch memory issue
+// 1. Use StaticallyIndexedArray instead of C array for thread buffer
+// 2. ThreadwiseTensorSliceTransfer_v3 does not keep reference to tensor descriptor
+// 3. ThreadwiseTensorSliceTransfer_v3::Run() does not construct new tensor coordinate
+// Thread-group wrapper around ThreadwiseTensorSliceTransfer_v6r1r2.
+//
+// The block-level slice (SliceLengths) is divided evenly over a thread cluster
+// (ThreadClusterLengths, ordered by ThreadClusterArrangeOrder). Every thread that belongs to
+// the cluster owns one tile of thread_slice_lengths elements and forwards all calls to its own
+// threadwise transfer; threads of the group beyond the cluster size do nothing.
+//
+// NOTE(review): the comment block preceding this template mentions
+// ThreadwiseTensorSliceTransfer_v3, while this struct wraps the v6r1r2 threadwise transfer;
+// it looks carried over from another file - confirm.
+template <typename ThreadGroup,
+          typename ElementwiseOperation,
+          typename SliceLengths,
+          typename ThreadClusterLengths,
+          typename ThreadClusterArrangeOrder,
+          typename SrcData,
+          typename DstData,
+          typename SrcDesc,
+          typename DstDesc,
+          typename DimAccessOrder,
+          index_t VectorDim,
+          index_t ScalarPerVector,
+          bool ThreadTransferSrcResetCoordinateAfterRun,
+          bool ThreadTransferDstResetCoordinateAfterRun>
+struct ThreadGroupTensorSliceTransfer_v6r1r2
+{
+    static constexpr index_t nDim = remove_reference_t<SrcDesc>::GetNumOfDimension();
+
+    // Tile handled by a single thread: the block slice divided by the cluster shape.
+    static constexpr auto thread_slice_lengths = SliceLengths{} / ThreadClusterLengths{};
+
+    using Index = MultiIndex<nDim>;
+
+    // Checks the configuration at compile time and, for participating threads, places the
+    // threadwise source/destination windows at block origin + this thread's tile offset.
+    __device__ constexpr ThreadGroupTensorSliceTransfer_v6r1r2(
+        const SrcDesc& src_desc,
+        const Index& src_block_slice_origin,
+        const DstDesc& dst_desc,
+        const Index& dst_block_slice_origin,
+        const ElementwiseOperation& element_op)
+        : threadwise_transfer_(src_desc,
+                               make_zero_multi_index<nDim>(),
+                               dst_desc,
+                               make_zero_multi_index<nDim>(),
+                               element_op)
+
+    {
+        static_assert(nDim == remove_cvref_t<SrcDesc>::GetNumOfDimension() &&
+                          nDim == remove_cvref_t<DstDesc>::GetNumOfDimension() &&
+                          nDim == ThreadClusterLengths::Size() &&
+                          nDim == ThreadClusterArrangeOrder::Size() &&
+                          nDim == DimAccessOrder::Size(),
+                      "wrong! nDim not consistent");
+
+        // The division above must be exact, otherwise part of the slice would be left out.
+        static_assert(
+            is_same<SliceLengths, decltype(thread_slice_lengths * ThreadClusterLengths{})>{},
+            "wrong! threads should be mapped to cover entire slicing window");
+
+        static_assert(ThreadGroup::GetNumOfThread() >= thread_cluster_desc_.GetElementSize(),
+                      "wrong! ThreadGroup::GetNumOfThread() too small");
+
+        if(IsThreadActive())
+        {
+            const auto tile_offset = CalculateThreadDataIdxBegin();
+
+            threadwise_transfer_.SetSrcSliceOrigin(src_desc, src_block_slice_origin + tile_offset);
+            threadwise_transfer_.SetDstSliceOrigin(dst_desc, dst_block_slice_origin + tile_offset);
+        }
+    }
+
+    // Runs the threadwise transfer (source -> destination) on participating threads.
+    template <typename SrcBuffer, typename DstBuffer, InMemoryDataOperationEnum DstInMemOp>
+    __device__ void Run(const SrcDesc& src_desc,
+                        const SrcBuffer& src_buf,
+                        const DstDesc& dst_desc,
+                        DstBuffer& dst_buf)
+    {
+        if(!IsThreadActive())
+        {
+            return;
+        }
+
+        threadwise_transfer_.template Run<SrcBuffer, DstBuffer, DstInMemOp>(
+            src_desc, src_buf, dst_desc, dst_buf);
+    }
+
+    // Moves the source window of participating threads by `step`.
+    __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, const Index& step)
+    {
+        if(!IsThreadActive())
+        {
+            return;
+        }
+
+        threadwise_transfer_.MoveSrcSliceWindow(src_desc, step);
+    }
+
+    // Moves the destination window of participating threads by `step`.
+    __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, const Index& step)
+    {
+        if(!IsThreadActive())
+        {
+            return;
+        }
+
+        threadwise_transfer_.MoveDstSliceWindow(dst_desc, step);
+    }
+
+    // Re-anchors the source window at a new block origin (the thread tile offset is added).
+    __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_block_slice_origin)
+    {
+        if(!IsThreadActive())
+        {
+            return;
+        }
+
+        threadwise_transfer_.SetSrcSliceOrigin(
+            src_desc, src_block_slice_origin + CalculateThreadDataIdxBegin());
+    }
+
+    // Re-anchors the destination window at a new block origin (the thread tile offset is added).
+    __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_block_slice_origin)
+    {
+        if(!IsThreadActive())
+        {
+            return;
+        }
+
+        threadwise_transfer_.SetDstSliceOrigin(
+            dst_desc, dst_block_slice_origin + CalculateThreadDataIdxBegin());
+    }
+
+    private:
+    static constexpr auto thread_cluster_desc_ =
+        make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
+
+    // True when the calling thread belongs to the thread cluster. The first comparison is
+    // kept so that the common "every thread participates" case needs no thread-id test.
+    __device__ static bool IsThreadActive()
+    {
+        return ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+               ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize();
+    }
+
+    // Offset of the calling thread's tile inside the block slice:
+    // cluster index of the thread multiplied element-wise by thread_slice_lengths.
+    __device__ static auto CalculateThreadDataIdxBegin()
+    {
+        const auto cluster_idx = thread_cluster_desc_.CalculateBottomIndex(
+            make_multi_index(ThreadGroup::GetThreadId()));
+
+        return cluster_idx * thread_slice_lengths;
+    }
+
+    using ThreadwiseTransfer =
+        ThreadwiseTensorSliceTransfer_v6r1r2<SrcData,
+                                             DstData,
+                                             SrcDesc,
+                                             DstDesc,
+                                             ElementwiseOperation,
+                                             decltype(thread_slice_lengths),
+                                             DimAccessOrder,
+                                             VectorDim,
+                                             ScalarPerVector,
+                                             ThreadTransferSrcResetCoordinateAfterRun,
+                                             ThreadTransferDstResetCoordinateAfterRun>;
+
+    ThreadwiseTransfer threadwise_transfer_;
+};
+
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r2.hpp
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r2.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_description/cluster_descriptor.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r2.hpp"
+#include "ck/utility/is_detected.hpp"
+
+namespace ck {
+
+// Thread-group level multi-source, multi-destination tensor slice data movement
+// Assume:
+//   1. All sources and destinations are DynamicBuffer
+//   2. Same VectorDim and ScalarPerVector for all sources and destinations
+//   3. DstInMemOps are per destination tensor
+//   4. ThreadTransferSrcResetCoordinateAfterRunFlags are per source tensor
+//   5. ThreadTransferDstResetCoordinateAfterRunFlags are per destination tensor
+//
+// Does following things to avoid scratch memory issue
+//   1. Pass tensor descriptors by reference (or tuple of references)
+//   2. Does not keep reference to tensor descriptor
+//   3. Does not construct new tensor coordinate when call Run()
+// Thread-group wrapper around ThreadwiseTensorSliceTransfer_v7r2.
+//
+// The block-level slice (SliceLengths) is divided evenly over a thread cluster
+// (ThreadClusterLengths, ordered by ThreadClusterArrangeOrder). Each thread of the cluster
+// owns one tile of thread_slice_lengths elements in every source and destination tensor and
+// forwards all calls to its own threadwise transfer; threads of the group beyond the cluster
+// size do nothing.
+template <typename ThreadGroup,
+          typename SrcDatas,
+          typename DstDatas,
+          typename SrcDescs,
+          typename DstDescs,
+          typename ElementwiseOperation,
+          typename DstInMemOps, // Sequence<InMemoryDataOperationEnum ...>
+          typename SliceLengths,
+          typename ThreadClusterLengths,
+          typename ThreadClusterArrangeOrder,
+          typename SrcDimAccessOrder,
+          typename DstDimAccessOrder,
+          index_t SrcVectorDim,
+          index_t DstVectorDim,
+          index_t SrcScalarPerVector,
+          index_t DstScalarPerVector,
+          typename ThreadTransferSrcResetCoordinateAfterRunFlags,
+          typename ThreadTransferDstResetCoordinateAfterRunFlags>
+struct ThreadGroupTensorSliceTransfer_v7r2
+{
+    // Dimensionality is taken from the first source; the constructor asserts that every other
+    // source and destination descriptor agrees with it.
+    static constexpr index_t nDim =
+        remove_cvref_t<tuple_element_t<0, SrcDescs>>::GetNumOfDimension();
+
+    static constexpr index_t nSrc = remove_cvref_t<SrcDescs>::Size();
+    static constexpr index_t nDst = remove_cvref_t<DstDescs>::Size();
+
+    using Index = MultiIndex<nDim>;
+
+    // Tile handled by a single thread: the block slice divided by the cluster shape.
+    static constexpr auto thread_slice_lengths = SliceLengths{} / ThreadClusterLengths{};
+
+    // Checks the configuration at compile time and, for participating threads, places every
+    // threadwise source/destination window at its block origin + this thread's tile offset.
+    __device__ constexpr ThreadGroupTensorSliceTransfer_v7r2(
+        const SrcDescs& src_descs,
+        const StaticallyIndexedArray<Index, nSrc>& src_block_slice_origins,
+        const DstDescs& dst_descs,
+        const StaticallyIndexedArray<Index, nDst>& dst_block_slice_origins,
+        const ElementwiseOperation& element_op)
+        : threadwise_transfer_(src_descs,
+                               StaticallyIndexedArray<Index, nSrc>{},
+                               dst_descs,
+                               StaticallyIndexedArray<Index, nDst>{},
+                               element_op)
+    {
+        static_assert(nSrc == SrcDatas::Size() && nSrc == SrcDescs::Size() &&
+                          nSrc == ThreadTransferSrcResetCoordinateAfterRunFlags::Size() &&
+                          nDst == DstDatas::Size() && nDst == DstDescs::Size() &&
+                          nDst == ThreadTransferDstResetCoordinateAfterRunFlags::Size(),
+                      "wrong!");
+
+        static_for<0, nSrc, 1>{}([&](auto i) {
+            static_assert(
+                nDim == remove_cvref_t<tuple_element_t<i.value, SrcDescs>>::GetNumOfDimension(),
+                "wrong!");
+        });
+
+        static_for<0, nDst, 1>{}([&](auto i) {
+            static_assert(
+                nDim == remove_cvref_t<tuple_element_t<i.value, DstDescs>>::GetNumOfDimension(),
+                "wrong!");
+        });
+
+        static_assert(nDim == ThreadClusterLengths::Size() &&
+                          nDim == ThreadClusterArrangeOrder::Size() &&
+                          nDim == SrcDimAccessOrder::Size() && nDim == DstDimAccessOrder::Size(),
+                      "wrong! nDim not consistent");
+
+        // The division above must be exact, otherwise part of the slice would be left out.
+        static_assert(
+            is_same<SliceLengths, decltype(thread_slice_lengths * ThreadClusterLengths{})>{},
+            "wrong! threads should be mapped to cover entire slicing window");
+
+        static_assert(ThreadGroup::GetNumOfThread() >= thread_cluster_desc_.GetElementSize(),
+                      "wrong! ThreadGroup::GetNumOfThread() too small");
+
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            // Use the thread-group id here, i.e. the same id the guard above was evaluated
+            // with. Mapping a different id (such as the raw block-local thread id) could
+            // hand CalculateBottomIndex a value outside the cluster whenever ThreadGroup
+            // numbers its threads differently from the block.
+            const auto thread_cluster_idx = thread_cluster_desc_.CalculateBottomIndex(
+                make_multi_index(ThreadGroup::GetThreadId()));
+
+            const auto thread_data_idx_begin = thread_cluster_idx * thread_slice_lengths;
+
+            const auto src_thread_slice_origins = generate_tuple(
+                [&](auto i) { return src_block_slice_origins[i] + thread_data_idx_begin; },
+                Number<nSrc>{});
+
+            const auto dst_thread_slice_origins = generate_tuple(
+                [&](auto i) { return dst_block_slice_origins[i] + thread_data_idx_begin; },
+                Number<nDst>{});
+
+            threadwise_transfer_.SetSrcSliceOrigins(src_descs, src_thread_slice_origins);
+            threadwise_transfer_.SetDstSliceOrigins(dst_descs, dst_thread_slice_origins);
+        }
+    }
+
+    // Read phase: forwards to the threadwise RunRead on participating threads.
+    template <typename SrcBuffers>
+    __device__ void RunRead(const SrcDescs& src_descs, const SrcBuffers& src_bufs)
+    {
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            threadwise_transfer_.RunRead(src_descs, src_bufs);
+        }
+    }
+
+    // Detection idiom: well-formed only for types that expose an IsTuple() member.
+    template <typename T>
+    using is_tuple = decltype(std::declval<T&>().IsTuple());
+
+    // Write phase: forwards to the threadwise RunWrite on participating threads.
+    // dst_bufs may be a tuple of buffers or a single buffer; a single buffer is wrapped with
+    // tie() so the threadwise transfer always receives a tuple.
+    template <typename DstBuffers>
+    __device__ void RunWrite(const DstDescs& dst_descs, DstBuffers dst_bufs)
+    {
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            if constexpr(is_detected<is_tuple, decltype(dst_bufs)>::value)
+                threadwise_transfer_.RunWrite(dst_descs, dst_bufs);
+            else
+                threadwise_transfer_.RunWrite(dst_descs, tie(dst_bufs));
+        }
+    }
+
+    // Convenience wrapper: RunRead followed by RunWrite.
+    template <typename SrcBuffers, typename DstBuffers>
+    __device__ void Run(const SrcDescs& src_descs,
+                        const SrcBuffers& src_bufs,
+                        const DstDescs& dst_descs,
+                        DstBuffers dst_bufs)
+    {
+        RunRead(src_descs, src_bufs);
+        RunWrite(dst_descs, dst_bufs);
+    }
+
+    // Moves the window of source tensor number ISrc by `step` on participating threads.
+    template <index_t ISrc>
+    __device__ void
+    MoveSrcSliceWindow(const SrcDescs& src_descs, Number<ISrc> iSrc, const Index& step)
+    {
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            threadwise_transfer_.MoveSrcSliceWindow(src_descs, iSrc, step);
+        }
+    }
+
+    // Moves the windows of all source tensors by the same `step`.
+    __device__ void MoveSrcSliceWindow(const SrcDescs& src_descs, const Index& step)
+    {
+        static_for<0, nSrc, 1>{}([&](auto i) { MoveSrcSliceWindow(src_descs, i, step); });
+    }
+
+    // Moves the window of destination tensor number IDst by `step` on participating threads.
+    template <index_t IDst>
+    __device__ void
+    MoveDstSliceWindow(const DstDescs& dst_descs, Number<IDst> iDst, const Index& step)
+    {
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            threadwise_transfer_.MoveDstSliceWindow(dst_descs, iDst, step);
+        }
+    }
+
+    // Moves the windows of all destination tensors by the same `step`.
+    __device__ void MoveDstSliceWindow(const DstDescs& dst_descs, const Index& step)
+    {
+        static_for<0, nDst, 1>{}([&](auto i) { MoveDstSliceWindow(dst_descs, i, step); });
+    }
+
+    private:
+    static constexpr auto thread_cluster_desc_ =
+        make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
+
+    using ThreadwiseTransfer =
+        ThreadwiseTensorSliceTransfer_v7r2<SrcDatas,
+                                           DstDatas,
+                                           SrcDescs,
+                                           DstDescs,
+                                           ElementwiseOperation,
+                                           DstInMemOps,
+                                           decltype(thread_slice_lengths),
+                                           SrcDimAccessOrder,
+                                           DstDimAccessOrder,
+                                           SrcVectorDim,
+                                           DstVectorDim,
+                                           SrcScalarPerVector,
+                                           DstScalarPerVector,
+                                           ThreadTransferSrcResetCoordinateAfterRunFlags,
+                                           ThreadTransferDstResetCoordinateAfterRunFlags>;
+
+    ThreadwiseTransfer threadwise_transfer_;
+};
+
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp
+++ b/include/ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+namespace ck {
+namespace conv_tensor_rearrange_op {
+
+// Common base of the rearrange operation tags; operator<< below uses it (through
+// std::is_base_of) to recognise which types it may print.
+struct BaseConvTensorRearrangeOp
+{
+};
+
+// Tag type for the image -> column rearrangement.
+struct ImageToColumn : public BaseConvTensorRearrangeOp
+{
+    static constexpr const char* name = "Image to Column";
+};
+
+// Tag type for the column -> image rearrangement.
+struct ColumnToImage : public BaseConvTensorRearrangeOp
+{
+    static constexpr const char* name = "Column to Image";
+};
+
+// Streams the human-readable name (Op::name) of a rearrange operation tag.
+//
+// Op has to appear in the parameter type: with a `const BaseConvTensorRearrangeOp&` parameter
+// it sits in a non-deduced position, so `os << ImageToColumn{}` could never select this
+// overload. The bare base type is excluded because it carries no `name` member.
+//
+// NOTE(review): this header uses std::ostream, std::enable_if, std::is_base_of and
+// std::is_same without including <ostream> / <type_traits> itself; it currently relies on the
+// including translation unit to provide them.
+template <typename Op,
+          typename std::enable_if<std::is_base_of<BaseConvTensorRearrangeOp, Op>::value &&
+                                      !std::is_same<BaseConvTensorRearrangeOp, Op>::value,
+                                  bool>::type = false>
+std::ostream& operator<<(std::ostream& os, const Op&)
+{
+    os << Op::name;
+    return os;
+}
+
+} // namespace conv_tensor_rearrange_op
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp
+++ b/include/ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp
@@ -19,8 +19,7 @@ getConvBackwardDataSpecializationString(const ConvolutionBackwardDataSpecializat
    switch(s)
    {
    case ConvolutionBackwardDataSpecialization::Default: return "Default";
-    case ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0:
-        return "FFilter1x1Stride1Pad0";
+    case ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0: return "Filter1x1Stride1Pad0";
    default: return "Unrecognized specialization!";
    }
 }

--- a/include/ck/tensor_operation/gpu/device/device_avgpool_bwd.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_avgpool_bwd.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <vector>
+
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+// Abstract device-operator interface for the backward pass of average pooling.
+// Concrete implementations provide the argument and invoker factories declared below.
+//
+// NOTE(review): std::unique_ptr is used here while only <vector> and device_base.hpp are
+// included; presumably <memory> arrives through device_base.hpp - confirm.
+template <index_t NDimSpatial,
+          typename DOutDataType,
+          typename DInDataType,
+          typename DOutLayout,
+          typename DInLayout>
+struct DeviceAvgPoolBwd : public BaseOperator
+{
+    // Creates the type-erased argument object for one invocation.
+    //
+    // p_dout / p_din:         read-only dout pointer and writable din pointer.
+    // dout_* / din_*:         lengths and strides of the two tensors (the "n_k_wos" suffix
+    //                         presumably names the dimension order - confirm with an
+    //                         implementation).
+    // window_k_c_xs_lengths:  pooling window lengths.
+    // window_strides, window_dilations, input_left_pads, input_right_pads: per-dimension
+    //                         window stride, dilation and padding values.
+    //
+    // NOTE(review): `din_n_k_wos_length` is singular while its sibling is
+    // `dout_n_k_wos_lengths`; looks like a naming slip.
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_dout,
+                        void* p_din,
+                        std::vector<ck::index_t> dout_n_k_wos_lengths,
+                        std::vector<ck::index_t> dout_n_k_wos_strides,
+                        std::vector<ck::index_t> din_n_k_wos_length,
+                        std::vector<ck::index_t> din_n_k_wos_strides,
+                        std::vector<ck::index_t> window_k_c_xs_lengths,
+                        std::vector<ck::index_t> window_strides,
+                        std::vector<ck::index_t> window_dilations,
+                        std::vector<ck::index_t> input_left_pads,
+                        std::vector<ck::index_t> input_right_pads) = 0;
+
+    // Creates the type-erased invoker for this operator.
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_base.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_base.hpp
@@ -59,7 +59,9 @@ struct BaseOperator

    virtual size_t GetWorkSpaceSize(const BaseArgument*) const { return 0; }

-    virtual void SetWorkSpacePointer(BaseArgument* p_arg, void* p_workspace) const
+    virtual void SetWorkSpacePointer(BaseArgument* p_arg,
+                                     void* p_workspace,
+                                     const StreamConfig& = StreamConfig{}) const
    {
        assert(p_arg);
        p_arg->p_workspace_ = p_workspace;

--- a/include/ck/tensor_operation/gpu/device/device_contraction_multiple_abd.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_contraction_multiple_abd.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <array>
+
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+// GEMM:
+//   input : A0[M0, M1, ... K0, K1, ...], ...
+//   input : B0[N0, N1, ... K0, K1, ...], ...
+//   input : D0[M0, M1, ... N0, N1, ...], D1[M0, M1, ... N0, N1, ...], ...
+//   output : E[M0, M1, ... N0, N1, ...]
+//   C = a_op(A) * b_op(B)
+//   E = cde_op(C, D0, D1, ...)
+// Assume:
+//   D0, D1, ... and E have the same layout
+// Abstract device-operator interface for a contraction with multiple A, B and D tensors.
+// The number of tensors of each kind is taken from the size of the corresponding data-type
+// list (AsDataType, BsDataType, DsDataType).
+//
+// NOTE(review): std::vector and std::unique_ptr are used here while only <array> and
+// device_base.hpp are included; presumably <vector> / <memory> arrive through
+// device_base.hpp - confirm.
+template <index_t NumDimM,
+          index_t NumDimN,
+          index_t NumDimK,
+          typename AsDataType,
+          typename BsDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation>
+struct DeviceContractionMultipleABD : public BaseOperator
+{
+    static constexpr index_t NumATensor = AsDataType::Size();
+    static constexpr index_t NumBTensor = BsDataType::Size();
+    static constexpr index_t NumDTensor = DsDataType::Size();
+
+    // Creates the type-erased argument object for one invocation.
+    //
+    // p_as / p_bs / p_ds: read-only pointers, one per A / B / D tensor.
+    // p_e:                writable pointer to the E tensor.
+    // *_lengths / *_strides: one lengths vector and one strides vector per tensor, in the
+    //                     same order as the pointer arrays; E has a single pair.
+    // a_element_op / b_element_op / cde_element_op: element-wise operation objects, passed
+    //                     by value.
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(std::array<const void*, NumATensor> p_as,
+                        std::array<const void*, NumBTensor> p_bs,
+                        std::array<const void*, NumDTensor> p_ds,
+                        void* p_e,
+                        const std::array<std::vector<index_t>, NumATensor>& a_ms_ks_lengths,
+                        const std::array<std::vector<index_t>, NumATensor>& a_ms_ks_strides,
+                        const std::array<std::vector<index_t>, NumBTensor>& b_ns_ks_lengths,
+                        const std::array<std::vector<index_t>, NumBTensor>& b_ns_ks_strides,
+                        const std::array<std::vector<index_t>, NumDTensor>& d_ms_ns_lengths,
+                        const std::array<std::vector<index_t>, NumDTensor>& d_ms_ns_strides,
+                        const std::vector<index_t>& e_ms_ns_length,
+                        const std::vector<index_t>& e_ms_ns_stride,
+                        AElementwiseOperation a_element_op,
+                        BElementwiseOperation b_element_op,
+                        CDEElementwiseOperation cde_element_op) = 0;
+
+    // Creates the type-erased invoker for this operator.
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp
@@ -33,7 +33,8 @@ template <index_t NumDimM,
          typename EDataType,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
-          typename CDEElementwiseOperation>
+          typename CDEElementwiseOperation,
+          typename ComputeDataType = ADataType>
 struct DeviceContractionMultipleD : public BaseOperator
 {
    static constexpr index_t NumDTensor = DsDataType::Size();

--- a/include/ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <array>
+
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+/**
+ * \brief Convolution Tensor Rearrange.
+ *
+ * This Device operator supports converting an image to
+ * the GEMM representation (Image to Column) and
+ * converting a GEMM form to the image (Column to Image).
+ * Supported layouts:
+ * [G, N, Di, Hi, Wi, C] <-> [G, N * Do * Ho * Wo, Z * Y * X * C]
+ * [N, Di, Hi, Wi, G, C] <-> [N * Do * Ho * Wo, G, Z * Y * X * C]
+ *
+ * This struct is an abstract interface: both member functions are pure virtual.
+ *
+ * \tparam NDimSpatial Number of spatial dimensions.
+ * \tparam ImageLayout Layout of the image tensor.
+ * \tparam InputDataType Input Data Type.
+ * \tparam OutputDataType Output Data Type.
+ * \tparam ConvTensorRearrangeOp Operation type: ImageToColumn, ColumnToImage.
+ */
+template <index_t NDimSpatial,
+          typename ImageLayout,
+          typename InputDataType,
+          typename OutputDataType,
+          typename ConvTensorRearrangeOp>
+struct DeviceConvTensorRearrange : public BaseOperator
+{
+
+    /**
+     * \brief Make argument pointer for the rearrange operation
+     *        (image to column or column to image, per ConvTensorRearrangeOp).
+     *
+     * \param p_in A pointer to the device memory of the input tensor.
+     * \param p_out A pointer to the device memory of the output.
+     * \param G Convolution number of groups.
+     * \param N Convolution batch size.
+     * \param C Convolution number of channels.
+     * \param input_spatial_lengths Input spatial lengths.
+     * \param filter_spatial_lengths Filter spatial lengths.
+     * \param output_spatial_lengths Output spatial lengths.
+     * \param image_g_n_c_wis_strides Image strides in order [G, N, C, D, H, W]
+     *        (NDimSpatial + 3 entries; D, H, W are the spatial part for the 3D case).
+     * \param gemm_g_m_k_strides Gemm form strides (3 entries; presumably in order
+     *        [G, M, K] as the parameter name suggests -- TODO confirm).
+     * \param conv_filter_strides Convolution filter strides.
+     * \param conv_filter_dilations Convolution filter dilations.
+     * \param input_left_pads Convolution left pads.
+     * \param input_right_pads Convolution right pads.
+     * \return Pointer to the argument.
+     */
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_in,
+                        void* p_out,
+                        const ck::index_t G,
+                        const ck::index_t N,
+                        const ck::index_t C,
+                        const std::array<index_t, NDimSpatial>& input_spatial_lengths,
+                        const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
+                        const std::array<index_t, NDimSpatial>& output_spatial_lengths,
+                        const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
+                        const std::array<index_t, 3>& gemm_g_m_k_strides,
+                        const std::array<index_t, NDimSpatial>& conv_filter_strides,
+                        const std::array<index_t, NDimSpatial>& conv_filter_dilations,
+                        const std::array<index_t, NDimSpatial>& input_left_pads,
+                        const std::array<index_t, NDimSpatial>& input_right_pads) = 0;
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0; // Pure virtual: returns an owning pointer to a BaseInvoker.
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_elementwise_scale.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_elementwise_scale.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <memory>
+#include <array>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+template <typename InDataTypeTuple,
+          typename OutDataTypeTuple,
+          typename ElementwiseOperation,
+          typename UnaryOperation,
+          typename Scale,
+          index_t NumDim>
+struct DeviceElementwise : public BaseOperator // Abstract interface: both member functions are pure virtual.
+{
+    static constexpr int NumInput  = InDataTypeTuple::Size();  // entry count of InDataTypeTuple; sizes the input arrays below
+    static constexpr int NumOutput = OutDataTypeTuple::Size(); // entry count of OutDataTypeTuple; sizes the output arrays below
+
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const std::array<index_t, NumDim> lengths, // one length per dimension
+                        const std::array<std::array<index_t, NumDim>, NumInput> inStridesArray, // NumDim strides for each input
+                        const std::array<std::array<index_t, NumDim>, NumOutput> outStridesArray, // NumDim strides for each output
+                        const std::array<const void*, NumInput> in_dev_buffers, // one read-only pointer per input
+                        const std::array<void*, NumOutput> out_dev_buffers, // one writable pointer per output
+                        ElementwiseOperation elementwise_op, // operation objects are taken by value
+                        UnaryOperation unary_op,
+                        Scale scale_op) = 0; // NOTE(review): how the three ops are combined is up to the implementation -- not visible here
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0; // Pure virtual: returns an owning pointer to a BaseInvoker.
+}; // struct DeviceElementwise
+
+template <typename InDataTypeTuple,
+          typename OutDataTypeTuple,
+          typename ElementwiseOperation,
+          typename UnaryOperation,
+          typename Scale,
+          index_t NumDim>
+using DeviceElementwisePtr = std::unique_ptr<DeviceElementwise<InDataTypeTuple,
+                                                               OutDataTypeTuple,
+                                                               ElementwiseOperation,
+                                                               UnaryOperation,
+                                                               Scale,
+                                                               NumDim>>; // owning pointer to the DeviceElementwise interface with the same template arguments
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_abd.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_abd.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <array>
+
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+// GEMM with multiple A, B and D tensors (abstract device-operator interface):
+//   input : A0[M, K], B0[K, N],
+//   input : D0[M, N], D1[M, N], ...
+//   output : E[M, N]
+//   C = a_op(A) * b_op(B)
+//   E = cde_op(C, D0, D1, ...)
+// Assume:
+//   D0, D1, ... and E have the same layout
+template <typename AsLayout,
+          typename BsLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename AsDataType,
+          typename BsDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation>
+struct DeviceGemmMultipleABD : public BaseOperator
+{
+    static constexpr index_t NumATensor = AsDataType::Size(); // entry count of AsDataType; sizes p_as and StrideAs
+    static constexpr index_t NumBTensor = BsDataType::Size(); // entry count of BsDataType; sizes p_bs and StrideBs
+    static constexpr index_t NumDTensor = DsDataType::Size(); // entry count of DsDataType; sizes p_ds and StrideDs
+
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(std::array<const void*, NumATensor> p_as, // one read-only pointer per A tensor
+                        std::array<const void*, NumBTensor> p_bs, // one read-only pointer per B tensor
+                        std::array<const void*, NumDTensor> p_ds, // one read-only pointer per D tensor
+                        void* p_e,                                // writable pointer for E
+                        ck::index_t M,
+                        ck::index_t N,
+                        ck::index_t K,
+                        std::array<ck::index_t, NumATensor> StrideAs, // one stride per A tensor
+                        std::array<ck::index_t, NumBTensor> StrideBs, // one stride per B tensor
+                        std::array<ck::index_t, NumDTensor> StrideDs, // one stride per D tensor
+                        ck::index_t StrideE,
+                        AElementwiseOperation a_element_op, // operation objects are taken by value
+                        BElementwiseOperation b_element_op,
+                        CDEElementwiseOperation cde_element_op) = 0;
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0; // Pure virtual: returns an owning pointer to a BaseInvoker.
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_gemm_splitk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_splitk.hpp
@@ -20,7 +20,8 @@ template <typename ALayout,
          typename CDataType,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
-          typename CElementwiseOperation>
+          typename CElementwiseOperation,
+          typename ComputeType = CDataType>
 struct DeviceGemmSplitK : public BaseOperator
 {
    virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
@@ -48,7 +49,8 @@ template <typename ALayout,
          typename CDataType,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
-          typename CElementwiseOperation>
+          typename CElementwiseOperation,
+          typename ComputeType = CDataType>
 using DeviceGemmSplitKPtr = std::unique_ptr<DeviceGemmSplitK<ALayout,
                                                             BLayout,
                                                             CLayout,
@@ -57,7 +59,8 @@ using DeviceGemmSplitKPtr = std::unique_ptr<DeviceGemmSplitK<ALayout,
                                                             CDataType,
                                                             AElementwiseOperation,
                                                             BElementwiseOperation,
-                                                             CElementwiseOperation>>;
+                                                             CElementwiseOperation,
+                                                             ComputeType>>;

 } // namespace device
 } // namespace tensor_operation

--- a/include/ck/tensor_operation/gpu/device/device_gemm_streamk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_streamk.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <vector>
+
+#include "device_base.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+template <typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation>
+struct DeviceGemmStreamK : public BaseOperator // Abstract interface: both member functions are pure virtual.
+{
+    virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a, // read-only A pointer
+                                                              const void* p_b, // read-only B pointer
+                                                              void* p_c,       // writable C pointer
+                                                              ck::index_t M,
+                                                              ck::index_t N,
+                                                              ck::index_t K,
+                                                              ck::index_t StrideA,
+                                                              ck::index_t StrideB,
+                                                              ck::index_t StrideC,
+                                                              AElementwiseOperation a_element_op, // operation objects are taken by value
+                                                              BElementwiseOperation b_element_op,
+                                                              CElementwiseOperation c_element_op,
+                                                              ck::index_t NumSKBlocks = 0) = 0; // NOTE(review): the meaning of 0 is not visible here; a default argument on a virtual function is chosen by the static type of the call, so overriders should repeat the same default
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0; // Pure virtual: returns an owning pointer to a BaseInvoker.
+};
+
+template <typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation>
+using DeviceGemmStreamKPtr = std::unique_ptr<DeviceGemmStreamK<ALayout,
+                                                               BLayout,
+                                                               CLayout,
+                                                               ADataType,
+                                                               BDataType,
+                                                               CDataType,
+                                                               AElementwiseOperation,
+                                                               BElementwiseOperation,
+                                                               CElementwiseOperation>>; // owning pointer to the DeviceGemmStreamK interface with the same template arguments
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp
@@ -29,7 +29,9 @@ template <ck::index_t NDimSpatial,
          typename EDataType,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
-          typename CDEElementwiseOperation>
+          typename CDEElementwiseOperation,
+          typename AComputeType = ADataType,
+          typename BComputeType = AComputeType>
 struct DeviceGroupedConvBwdDataMultipleD : public BaseOperator
 {
    static constexpr index_t NumDTensor = DsDataType::Size();