best performing kernel for GEMV codex problem with M=1 with inverted B matrix

930b2872 · Harisankar Sadasivan · a1e17d18 · a4f72a31 · 930b2872 · 930b2872
Commit 930b2872 authored Oct 11, 2023 by Harisankar Sadasivan
20 changed files
--- a/example/53_gemv_splitk/gemv_splitk_fp16.cpp
+++ b/example/53_gemv_splitk/gemv_splitk_fp16.cpp
@@ -10,7 +10,7 @@ using CDataType   = ck::half_t;
 using AccDataType = float;

 using ALayout = Row;
-using BLayout = Col;
+using BLayout = Row; // Col;
 using CLayout = Row;

 using AElementOp = PassThrough;
@@ -19,9 +19,9 @@ using CElementOp = PassThrough;

 static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;

-#define K1 8 // K1PerThread:2,4,8
-#define K0 4 // K0PerBlock:1,2,3,4...32
-#define N1 2 // Nperthread:2,4,8
+#define K1 4
+#define K0 3
+#define N1 2
 #define B 64 // block-size:64

 // clang-format off
@@ -31,7 +31,7 @@ using DeviceGemvInstance = ck::tensor_operation::device::deviceGemvDl/*
 // ######|          |          |          |            |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       | KBatch_K0_M0_M1_K1|   KBatch_K0_M0_M1_K1|   ArrangeOrder|          Order| KBatch_K0_M0_M1_K1   | ContiguousDimOrder| KBatch_K0_M0_M1_K1   |          Order|                 |                   |               Order|                |                   |
 // ######|          |          |          |            |        |        |        |            |            |            |               |      |      |      |      |   |           |           |       |                   |                     |               |               |                      |                   |                      |               |                 |                   |                    |                |                   |
       //< ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout,  AElementOp,  BElementOp,  CElementOp,  GemmMNPadding,    64,     1,    64,    32,  2,          1,          1,      1,      S<1, 1, 1, 2>,      S<32, 1,  1, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<1, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>,    S<1, 2, 0, 3>,             3,               2,         S<0, 1, 2, 3, 4, 5>,               5,                  1>;*/
-         < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout,  AElementOp,  BElementOp,  CElementOp,  GemmMNPadding,    B,     1,    B*N1,   K0,  K1,         1,          N1,      1,    S<1,1, 1, 1, K1>,   S<1,K0, 1,  1, 1>,S<0,1,2,3,4>,  S<0,1,2,3,4>,      S<1,1, 1, 1, K1>,     S<0,1,2,3,4>,     S<1,1, 1, 1, 2>,    S<0,1,2,3,4>,                4,               K1,        S<0, 1, 2, 3, 4, 5>,             5,                  N1>;
+         < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout,  AElementOp,  BElementOp,  CElementOp,  GemmMNPadding,    B,     1,    B*N1,   K0,  K1,         1,          N1,      1,    S<1,1, 1, 1, K1>,   S<1,K0, 1,  1, 1>,S<0,1,2,3,4>,  S<0,1,2,3,4>,      S<1,1, 1, 1, K1>,     S<0,1,2,3,4>,     S<1,1, 1, 1, 2>,    S<0,1,2,3,4>,                3,               N1,        S<0, 1, 2, 3, 4, 5>,             5,                  N1>;
 // clang-format on

 using ReferenceGemmInstance = ck::tensor_operation::host::

--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -7,20 +7,114 @@ add_custom_target(examples)

 function(add_example_executable EXAMPLE_NAME FILE_NAME)
    message("adding example ${EXAMPLE_NAME}")
-    add_executable(${EXAMPLE_NAME} ${FILE_NAME})
-    target_link_libraries(${EXAMPLE_NAME} PRIVATE utility)
-    add_test(NAME ${EXAMPLE_NAME} COMMAND $<TARGET_FILE:${EXAMPLE_NAME}> ${ARGN})
-    add_dependencies(examples ${EXAMPLE_NAME})
-    add_dependencies(check ${EXAMPLE_NAME})
-    rocm_install(TARGETS ${EXAMPLE_NAME} COMPONENT examples)
+    set(result 1)
+    if(DEFINED DTYPES)
+    foreach(source IN LISTS FILE_NAME)
+        set(test 0)
+        foreach(type IN LISTS DTYPES)
+            if(type MATCHES "fp16")
+                set(type1 "_f16")
+            elseif(type MATCHES "fp32")
+                set(type1 "_f32")
+            elseif(type MATCHES "fp8")
+                set(type1 "_f8")
+            elseif(type MATCHES "bf16")
+                set(type1 "_b16")
+            elseif(type MATCHES "fp64")
+                set(type1 "_f64")
+            elseif(type MATCHES "int8")
+                set(type1 "_i8")
+            endif()
+            if("${source}" MATCHES "${type}" OR "${source}" MATCHES "${type1}")
+                #if filename matches any selected type, exit type loop and do no exclude the file from the list
+                set(test 0)
+                break()
+            elseif((source MATCHES "fp8" OR source MATCHES "fp32" OR source MATCHES "fp64" OR source MATCHES "bf16" OR source MATCHES "int8" OR source MATCHES "fp16" OR
+                source MATCHES "_f8" OR source MATCHES "_f32" OR source MATCHES "_f64" OR source MATCHES "_i8" OR source MATCHES "_f16" OR source MATCHES "_b16") AND
+                NOT(source MATCHES type OR source MATCHES type1))
+                    #if filename contains a type which doesn't match any selected type, mark it for removal
+                    set(test 1)
+            endif()
+        endforeach()
+        if(test EQUAL 1)
+            message("removing example source file ${source} ")
+            list(REMOVE_ITEM FILE_NAME "${source}")
+        endif()
+    endforeach()
+    endif()
+    foreach(source IN LISTS FILE_NAME)
+        if(NOT DEFINED DL_KERNELS AND source MATCHES "_dl")
+            message("removing dl example ${source} ")
+            list(REMOVE_ITEM FILE_NAME "${source}")
+        endif()
+    endforeach()
+    #only continue if there are some source files left on the list
+    if(FILE_NAME)
+        add_executable(${EXAMPLE_NAME} ${FILE_NAME})
+        target_link_libraries(${EXAMPLE_NAME} PRIVATE utility)
+        add_test(NAME ${EXAMPLE_NAME} COMMAND $<TARGET_FILE:${EXAMPLE_NAME}> ${ARGN})
+        add_dependencies(examples ${EXAMPLE_NAME})
+        add_dependencies(check ${EXAMPLE_NAME})
+        rocm_install(TARGETS ${EXAMPLE_NAME} COMPONENT examples)
+        set(result 0)
+    endif()
+    #message("add_example returns ${result}")
+    set(result ${result} PARENT_SCOPE)
 endfunction(add_example_executable EXAMPLE_NAME)

 function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME)
    message("adding example ${EXAMPLE_NAME}")
-    add_executable(${EXAMPLE_NAME} ${FILE_NAME})
-    target_link_libraries(${EXAMPLE_NAME} PRIVATE utility)
-    add_dependencies(examples ${EXAMPLE_NAME})
-    rocm_install(TARGETS ${EXAMPLE_NAME} COMPONENT examples)
+    set(result 1)
+    if(DEFINED DTYPES)
+    foreach(source IN LISTS FILE_NAME)
+        set(test 0)
+        foreach(type IN LISTS DTYPES)
+                if(type MATCHES "fp16")
+                    set(type1 "_f16")
+                elseif(type MATCHES "fp32")
+                    set(type1 "_f32")
+                elseif(type MATCHES "fp8")
+                    set(type1 "_f8")
+                elseif(type MATCHES "bf16")
+                    set(type1 "_b16")
+                elseif(type MATCHES "fp64")
+                    set(type1 "_f64")
+                elseif(type MATCHES "int8")
+                    set(type1 "_i8")
+                endif()
+                if("${source}" MATCHES "${type}" OR "${source}" MATCHES "${type1}")
+                    #if filename matches any selected type, exit type loop and do no exclude the file from the list
+                    set(test 0)
+                    break()
+                elseif((source MATCHES "fp8" OR source MATCHES "fp32" OR source MATCHES "fp64" OR source MATCHES "bf16" OR source MATCHES "int8" OR source MATCHES "fp16" OR
+                  source MATCHES "_f8" OR source MATCHES "_f32" OR source MATCHES "_f64" OR source MATCHES "_i8" OR source MATCHES "_f16" OR source MATCHES "_b16") AND
+                  NOT(source MATCHES type OR source MATCHES type1))
+                    #if filename contains a type which doesn't match any selected type, mark it for removal
+                    set(test 1)
+                endif()
+        endforeach()
+        if(test EQUAL 1)
+            message("removing example ${source} ")
+            list(REMOVE_ITEM FILE_NAME "${source}")
+        endif()
+    endforeach()
+    endif()
+    foreach(source IN LISTS FILE_NAME)
+        if(NOT DEFINED DL_KERNELS AND source MATCHES "_dl")
+            message("removing dl example ${source} ")
+            list(REMOVE_ITEM FILE_NAME "${source}")
+        endif()
+    endforeach()
+    #only continue if there are some source files left on the list
+    if(FILE_NAME)
+        add_executable(${EXAMPLE_NAME} ${FILE_NAME})
+        target_link_libraries(${EXAMPLE_NAME} PRIVATE utility)
+        add_dependencies(examples ${EXAMPLE_NAME})
+        rocm_install(TARGETS ${EXAMPLE_NAME} COMPONENT examples)
+        set(result 0)
+    endif()
+    #message("add_example returns ${result}")
+    set(result ${result} PARENT_SCOPE)
 endfunction(add_example_executable_no_testing EXAMPLE_NAME)

 # add all example subdir

--- a/include/ck/host_utility/kernel_launch.hpp
+++ b/include/ck/host_utility/kernel_launch.hpp
@@ -34,6 +34,7 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
 #endif
        // warm up
        kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
+        hip_check_error(hipGetLastError());

        const int nrepeat = 10;
 #if DEBUG_LOG
@@ -50,6 +51,7 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
        for(int i = 0; i < nrepeat; ++i)
        {
            kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
+            hip_check_error(hipGetLastError());
        }

        hip_check_error(hipEventRecord(stop, stream_config.stream_id_));
@@ -64,11 +66,13 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
    else
    {
        kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
+        hip_check_error(hipGetLastError());

        return 0;
    }
 #else
    kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
+    hip_check_error(hipGetLastError());

    return 0;
 #endif
@@ -101,6 +105,7 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
        // warm up
        preprocess();
        kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
+        hip_check_error(hipGetLastError());

        const int nrepeat = 10;
 #if DEBUG_LOG
@@ -118,6 +123,7 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
        {
            preprocess();
            kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
+            hip_check_error(hipGetLastError());
        }

        hip_check_error(hipEventRecord(stop, stream_config.stream_id_));
@@ -133,11 +139,13 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
    {
        preprocess();
        kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
+        hip_check_error(hipGetLastError());

        return 0;
    }
 #else
    kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
+    hip_check_error(hipGetLastError());

    return 0;
 #endif

--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
@@ -28,7 +28,8 @@ MakeGemmMmaTileDescriptor_MN0_MN1_MN2_K(const TileDesc_K0_MN_K1&)
 }

 template <index_t BlockSize,
-          typename FloatAB,
+          typename FloatA,
+          typename FloatB,
          typename FloatAcc,
          typename AK0MK1BlockDesc,
          typename BK0NK1BlockDesc,
@@ -58,7 +59,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
    static constexpr index_t A_K1 = AK0MK1BlockDesc{}.GetLength(I2);
    static constexpr index_t B_K1 = BK0NK1BlockDesc{}.GetLength(I2);

-    static constexpr auto xdlops_gemm = XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack>{};
+    static constexpr auto xdlops_gemm = XdlopsGemm<FloatA, MPerXDL, NPerXDL, KPack, FloatB>{};

    static constexpr index_t KPerThread = KPerBlock / xdlops_gemm.K0PerXdlops;

@@ -294,9 +295,9 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                        const BBlockBuffer& b_block_buf,
                        CThreadBuffer& c_thread_buf) const
    {
-        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAB>(
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatA>(
            a_thread_desc_.GetElementSpaceSize());
-        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAB>(
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatB>(
            b_thread_desc_.GetElementSpaceSize());

        static_for<0, MRepeat, 1>{}([&](auto m0) {
@@ -318,25 +319,27 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                                   b_thread_buf);

                static_for<0, KPerThread, KPack>{}([&](auto k) {
-                    vector_type<FloatAB, KPack> a_thread_vec;
-                    vector_type<FloatAB, KPack> b_thread_vec;
+                    vector_type<FloatA, KPack> a_thread_vec;
+                    vector_type<FloatB, KPack> b_thread_vec;

                    static_for<0, KPack, 1>{}([&](auto i) {
-                        a_thread_vec.template AsType<FloatAB>()(i) = a_thread_buf
+                        a_thread_vec.template AsType<FloatA>()(i) = a_thread_buf
                            [Number<a_thread_desc_.CalculateOffset(make_tuple(0, 0, 0, k + i))>{}];
-                        b_thread_vec.template AsType<FloatAB>()(i) = b_thread_buf
+                        b_thread_vec.template AsType<FloatB>()(i) = b_thread_buf
                            [Number<b_thread_desc_.CalculateOffset(make_tuple(0, 0, 0, k + i))>{}];
                    });

-                    using mfma_input_type =
-                        typename vector_type<FloatAB, xdlops_gemm.K1PerXdlops>::type;
+                    using mfma_input_type_a =
+                        typename vector_type<FloatA, xdlops_gemm.K1PerXdlops>::type;
+                    using mfma_input_type_b =
+                        typename vector_type<FloatB, xdlops_gemm.K1PerXdlops>::type;

                    constexpr index_t c_offset =
                        c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));

                    xdlops_gemm.template Run(
-                        a_thread_vec.template AsType<mfma_input_type>(),
-                        b_thread_vec.template AsType<mfma_input_type>(),
+                        a_thread_vec.template AsType<mfma_input_type_a>(),
+                        b_thread_vec.template AsType<mfma_input_type_b>(),
                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
                });
            });
@@ -356,8 +359,8 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
    static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed(
        make_tuple(Number<MRepeat>{}, Number<NRepeat>{}, xdlops_gemm.GetRegSizePerXdlops()));

-    using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatAB,
-                                                         FloatAB,
+    using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatA,
+                                                         FloatA,
                                                         decltype(a_block_desc_m0_m1_m2_k),
                                                         decltype(a_thread_desc_),
                                                         Sequence<1, 1, 1, KPerThread>,
@@ -366,8 +369,8 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                                                         A_K1,
                                                         A_K1>;

-    using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatAB,
-                                                         FloatAB,
+    using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatB,
+                                                         FloatB,
                                                         decltype(b_block_desc_n0_n1_n2_k),
                                                         decltype(b_thread_desc_),
                                                         Sequence<1, 1, 1, KPerThread>,
@@ -385,7 +388,8 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
 // the latest ROCm release. For unsupported compilers, inter-wave loop scheduler falls back to the
 // default loop scheduler which is given by the macro CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING=0
 template <index_t BlockSize,
-          typename FloatAB,
+          typename FloatA,
+          typename FloatB,
          typename FloatAcc,
          typename AK0MK1BlockDesc,
          typename BK0NK1BlockDesc,
@@ -397,7 +401,8 @@ template <index_t BlockSize,
          index_t NumMacClusters = CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING_MAC_CLUSTERS>
 struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
    : public BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
-                                                                 FloatAB,
+                                                                 FloatA,
+                                                                 FloatB,
                                                                 FloatAcc,
                                                                 AK0MK1BlockDesc,
                                                                 BK0NK1BlockDesc,
@@ -408,7 +413,8 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                                                                 KPack>
 {
    using Base = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
-                                                                     FloatAB,
+                                                                     FloatA,
+                                                                     FloatB,
                                                                     FloatAcc,
                                                                     AK0MK1BlockDesc,
                                                                     BK0NK1BlockDesc,
@@ -440,9 +446,9 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                        const BBlockBuffer& b_block_buf,
                        CThreadBuffer& c_thread_buf) const
    {
-        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAB>(
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatA>(
            a_thread_desc_.GetElementSpaceSize());
-        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAB>(
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatB>(
            b_thread_desc_.GetElementSpaceSize());

        static_for<0, KPerThread, KPerInnerLoop>{}([&](auto k) {
@@ -479,20 +485,22 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
            static_for<0, KPerInnerLoop, KPack>{}([&](auto k_) {
                static_for<0, MRepeat, 1>{}([&](auto m0) {
                    static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        vector_type<FloatAB, KPack> a_thread_vec;
-                        vector_type<FloatAB, KPack> b_thread_vec;
+                        vector_type<FloatA, KPack> a_thread_vec;
+                        vector_type<FloatB, KPack> b_thread_vec;

                        static_for<0, KPack, 1>{}([&](auto i) {
-                            a_thread_vec.template AsType<FloatAB>()(i) =
+                            a_thread_vec.template AsType<FloatA>()(i) =
                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(
                                    make_tuple(m0, 0, 0, k_ + i))>{}];
-                            b_thread_vec.template AsType<FloatAB>()(i) =
+                            b_thread_vec.template AsType<FloatB>()(i) =
                                b_thread_buf[Number<b_thread_desc_.CalculateOffset(
                                    make_tuple(n0, 0, 0, k_ + i))>{}];
                        });

-                        using mfma_input_type =
-                            typename vector_type<FloatAB, xdlops_gemm.K1PerXdlops>::type;
+                        using mfma_input_type_a =
+                            typename vector_type<FloatA, xdlops_gemm.K1PerXdlops>::type;
+                        using mfma_input_type_b =
+                            typename vector_type<FloatB, xdlops_gemm.K1PerXdlops>::type;

                        constexpr index_t c_offset =
                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
@@ -514,8 +522,8 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                        // TODO: insert setprio in more precise manner since we
                        // could have more than >1 MFMA instructions in single call
                        xdlops_gemm.template Run(
-                            a_thread_vec.template AsType<mfma_input_type>(),
-                            b_thread_vec.template AsType<mfma_input_type>(),
+                            a_thread_vec.template AsType<mfma_input_type_a>(),
+                            b_thread_vec.template AsType<mfma_input_type_b>(),
                            c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
                        if constexpr(k_.value == 0 && m0.value == 0 && n0.value == 0)
                        {
@@ -541,8 +549,8 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
    static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor_packed(
        make_tuple(Number<NRepeat>{}, I1, I1, Number<KPerInnerLoop>{}));

-    using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatAB,
-                                                         FloatAB,
+    using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatA,
+                                                         FloatA,
                                                         decltype(a_block_desc_m0_m1_m2_k),
                                                         decltype(a_thread_desc_),
                                                         Sequence<1, 1, 1, KPerInnerLoop>,
@@ -551,8 +559,8 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                                                         A_K1,
                                                         A_K1>;

-    using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatAB,
-                                                         FloatAB,
+    using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatB,
+                                                         FloatB,
                                                         decltype(b_block_desc_n0_n1_n2_k),
                                                         decltype(b_thread_desc_),
                                                         Sequence<1, 1, 1, KPerInnerLoop>,
@@ -568,7 +576,8 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
 };

 template <index_t BlockSize,
-          typename FloatAB,
+          typename FloatA,
+          typename FloatB,
          typename FloatAcc,
          typename AK0MK1BlockDesc,
          typename BK0NK1BlockDesc,
@@ -583,7 +592,8 @@ constexpr auto BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector()
    if constexpr(LoopSched == LoopScheduler::Default)
    {
        return BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
-                                                                   FloatAB,
+                                                                   FloatA,
+                                                                   FloatB,
                                                                   FloatAcc,
                                                                   AK0MK1BlockDesc,
                                                                   BK0NK1BlockDesc,
@@ -596,7 +606,8 @@ constexpr auto BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector()
    else if constexpr(LoopSched == LoopScheduler::Interwave)
    {
        return BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
-                                                                            FloatAB,
+                                                                            FloatA,
+                                                                            FloatB,
                                                                            FloatAcc,
                                                                            AK0MK1BlockDesc,
                                                                            BK0NK1BlockDesc,
@@ -618,26 +629,27 @@ constexpr auto BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector()
 * 3. configurable k index starting position and step size after each FMA/XDL instruction
 */

-template <index_t BlockSize,
-          typename FloatAB,
-          typename FloatAcc,
-          typename ATileDesc,
-          typename BTileDesc,
-          typename AMmaTileDesc,
-          typename BMmaTileDesc,
-          index_t MPerBlock,
-          index_t NPerBlock,
-          index_t KPerBlock,
-          index_t MPerXDL,
-          index_t NPerXDL,
-          index_t MRepeat,
-          index_t NRepeat,
-          index_t KPack,
-          bool TransposeC = false,
-          index_t AMmaKStride =
-              KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, TransposeC>{}.K0PerXdlops,
-          index_t BMmaKStride =
-              KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, TransposeC>{}.K0PerXdlops>
+template <
+    index_t BlockSize,
+    typename FloatAB,
+    typename FloatAcc,
+    typename ATileDesc,
+    typename BTileDesc,
+    typename AMmaTileDesc,
+    typename BMmaTileDesc,
+    index_t MPerBlock,
+    index_t NPerBlock,
+    index_t KPerBlock,
+    index_t MPerXDL,
+    index_t NPerXDL,
+    index_t MRepeat,
+    index_t NRepeat,
+    index_t KPack,
+    bool TransposeC = false,
+    index_t AMmaKStride =
+        KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops,
+    index_t BMmaKStride =
+        KPack* XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
 struct BlockwiseGemmXdlops_v2
 {
    static constexpr auto I0 = Number<0>{};
@@ -654,7 +666,8 @@ struct BlockwiseGemmXdlops_v2
    static constexpr index_t A_K1 = ATileDesc{}.GetLength(I2);
    static constexpr index_t B_K1 = BTileDesc{}.GetLength(I2);

-    static constexpr auto xdlops_gemm = XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, TransposeC>{};
+    static constexpr auto xdlops_gemm =
+        XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{};

    static constexpr index_t KPerThread = KPerBlock / xdlops_gemm.K0PerXdlops;


--- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r2.hpp
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r2.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_description/cluster_descriptor.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r2.hpp"
+#include "ck/utility/is_detected.hpp"
+
+namespace ck {
+
+// Thread-group level multi-source, multi-destination tensor slice data movement
+// Assume:
+//   1. All sources and destinations are DynamicBuffer
+//   2. Same VectorDim and ScalerPerVector for all sources and destinations
+//   3. DstInMemOps are per destination tensor
+//   4. ThreadTransferSrcResetCoordinateAfterRunFlags are per source tensor
+//   5. ThreadTransferDstResetCoordinateAfterRunFlags are per destination tensor
+//
+// Does following things to avoid scratch memory issue
+//   1. Pass tensor descritpors by reference (or tuple of references)
+//   2. Does not keep reference to tensor descriptor
+//   3. Does not construct new tensor coordinate when call Run()
+template <typename ThreadGroup,
+          typename SrcDatas,
+          typename DstDatas,
+          typename SrcDescs,
+          typename DstDescs,
+          typename ElementwiseOperation,
+          typename DstInMemOps, // Sequence<InMemoryDataOperationEnum ...>
+          typename SliceLengths,
+          typename ThreadClusterLengths,
+          typename ThreadClusterArrangeOrder,
+          typename SrcDimAccessOrder,
+          typename DstDimAccessOrder,
+          index_t SrcVectorDim,
+          index_t DstVectorDim,
+          index_t SrcScalarPerVector,
+          index_t DstScalarPerVector,
+          typename ThreadTransferSrcResetCoordinateAfterRunFlags,
+          typename ThreadTransferDstResetCoordinateAfterRunFlags>
+struct ThreadGroupTensorSliceTransfer_v7r2
+{
+    static constexpr index_t nDim =
+        remove_cvref_t<tuple_element_t<0, SrcDescs>>::GetNumOfDimension();
+
+    static constexpr index_t nSrc = remove_cvref_t<SrcDescs>::Size();
+    static constexpr index_t nDst = remove_cvref_t<DstDescs>::Size();
+
+    using Index = MultiIndex<nDim>;
+
+    static constexpr auto thread_slice_lengths = SliceLengths{} / ThreadClusterLengths{};
+
+    __device__ constexpr ThreadGroupTensorSliceTransfer_v7r2(
+        const SrcDescs& src_descs,
+        const StaticallyIndexedArray<Index, nSrc>& src_block_slice_origins,
+        const DstDescs& dst_descs,
+        const StaticallyIndexedArray<Index, nDst>& dst_block_slice_origins,
+        const ElementwiseOperation& element_op)
+        : threadwise_transfer_(src_descs,
+                               StaticallyIndexedArray<Index, nSrc>{},
+                               dst_descs,
+                               StaticallyIndexedArray<Index, nDst>{},
+                               element_op)
+    {
+        static_assert(nSrc == SrcDatas::Size() && nSrc == SrcDescs::Size() &&
+                          nSrc == ThreadTransferSrcResetCoordinateAfterRunFlags::Size() &&
+                          nDst == DstDatas::Size() && nDst == DstDescs::Size() &&
+                          nDst == ThreadTransferDstResetCoordinateAfterRunFlags::Size(),
+                      "wrong!");
+
+        static_for<0, nSrc, 1>{}([&](auto i) {
+            static_assert(
+                nDim == remove_cvref_t<tuple_element_t<i.value, SrcDescs>>::GetNumOfDimension(),
+                "wrong!");
+        });
+
+        static_for<0, nDst, 1>{}([&](auto i) {
+            static_assert(
+                nDim == remove_cvref_t<tuple_element_t<i.value, DstDescs>>::GetNumOfDimension(),
+                "wrong!");
+        });
+
+        static_assert(nDim == ThreadClusterLengths::Size() &&
+                          nDim == ThreadClusterArrangeOrder::Size() &&
+                          nDim == SrcDimAccessOrder::Size() && nDim == DstDimAccessOrder::Size(),
+                      "wrong! nDim not consistent");
+
+        static_assert(
+            is_same<SliceLengths, decltype(thread_slice_lengths * ThreadClusterLengths{})>{},
+            "wrong! threads should be mapped to cover entire slicing window");
+
+        static_assert(ThreadGroup::GetNumOfThread() >= thread_cluster_desc_.GetElementSize(),
+                      "wrong! ThreadGroup::GetNumOfThread() too small");
+
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            const auto thread_cluster_idx = thread_cluster_desc_.CalculateBottomIndex(
+                make_multi_index(get_thread_local_1d_id()));
+
+            const auto thread_data_idx_begin = thread_cluster_idx * thread_slice_lengths;
+
+            const auto src_thread_slice_origins = generate_tuple(
+                [&](auto i) { return src_block_slice_origins[i] + thread_data_idx_begin; },
+                Number<nSrc>{});
+
+            const auto dst_thread_slice_origins = generate_tuple(
+                [&](auto i) { return dst_block_slice_origins[i] + thread_data_idx_begin; },
+                Number<nDst>{});
+
+            threadwise_transfer_.SetSrcSliceOrigins(src_descs, src_thread_slice_origins);
+            threadwise_transfer_.SetDstSliceOrigins(dst_descs, dst_thread_slice_origins);
+        }
+    }
+
+    template <typename SrcBuffers>
+    __device__ void RunRead(const SrcDescs& src_descs, const SrcBuffers& src_bufs)
+    {
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            threadwise_transfer_.RunRead(src_descs, src_bufs);
+        }
+    }
+
+    template <typename T>
+    using is_tuple = decltype(std::declval<T&>().IsTuple());
+
+    template <typename DstBuffers>
+    __device__ void RunWrite(const DstDescs& dst_descs, DstBuffers dst_bufs)
+    {
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            if constexpr(is_detected<is_tuple, decltype(dst_bufs)>::value)
+                threadwise_transfer_.RunWrite(dst_descs, dst_bufs);
+            else
+                threadwise_transfer_.RunWrite(dst_descs, tie(dst_bufs));
+        }
+    }
+
+    template <typename SrcBuffers, typename DstBuffers>
+    __device__ void Run(const SrcDescs& src_descs,
+                        const SrcBuffers& src_bufs,
+                        const DstDescs& dst_descs,
+                        DstBuffers dst_bufs)
+    {
+        RunRead(src_descs, src_bufs);
+        RunWrite(dst_descs, dst_bufs);
+    }
+
+    template <index_t ISrc>
+    __device__ void
+    MoveSrcSliceWindow(const SrcDescs& src_descs, Number<ISrc> iSrc, const Index& step)
+    {
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            threadwise_transfer_.MoveSrcSliceWindow(src_descs, iSrc, step);
+        }
+    }
+
+    __device__ void MoveSrcSliceWindow(const SrcDescs& src_descs, const Index& step)
+    {
+        static_for<0, SrcDescs::Size(), 1>{}(
+            [&](auto i) { MoveSrcSliceWindow(src_descs, i, step); });
+    }
+
+    template <index_t IDst>
+    __device__ void
+    MoveDstSliceWindow(const DstDescs& dst_descs, Number<IDst> iDst, const Index& step)
+    {
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            threadwise_transfer_.MoveDstSliceWindow(dst_descs, iDst, step);
+        }
+    }
+
+    __device__ void MoveDstSliceWindow(const DstDescs& dst_descs, const Index& step)
+    {
+        static_for<0, DstDescs::Size(), 1>{}(
+            [&](auto i) { MoveDstSliceWindow(dst_descs, i, step); });
+    }
+
+    private:
+    static constexpr auto thread_cluster_desc_ =
+        make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
+
+    using ThreadwiseTransfer =
+        ThreadwiseTensorSliceTransfer_v7r2<SrcDatas,
+                                           DstDatas,
+                                           SrcDescs,
+                                           DstDescs,
+                                           ElementwiseOperation,
+                                           DstInMemOps,
+                                           decltype(thread_slice_lengths),
+                                           SrcDimAccessOrder,
+                                           DstDimAccessOrder,
+                                           SrcVectorDim,
+                                           DstVectorDim,
+                                           SrcScalarPerVector,
+                                           DstScalarPerVector,
+                                           ThreadTransferSrcResetCoordinateAfterRunFlags,
+                                           ThreadTransferDstResetCoordinateAfterRunFlags>;
+
+    ThreadwiseTransfer threadwise_transfer_;
+};
+
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp
+++ b/include/ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+namespace ck {
+namespace conv_tensor_rearrange_op {
+
+struct BaseConvTensorRearrangeOp
+{
+};
+
+struct ImageToColumn : public BaseConvTensorRearrangeOp
+{
+    static constexpr const char* name = "Image to Column";
+};
+
+struct ColumnToImage : public BaseConvTensorRearrangeOp
+{
+    static constexpr const char* name = "Column to Image";
+};
+
+template <typename Op,
+          typename std::enable_if<std::is_base_of<BaseConvTensorRearrangeOp, Op>::value,
+                                  bool>::type = false>
+std::ostream& operator<<(std::ostream& os, const BaseConvTensorRearrangeOp&)
+{
+    os << Op::name;
+    return os;
+}
+
+} // namespace conv_tensor_rearrange_op
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_image_to_column.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_image_to_column.hpp
@@ -12,21 +12,26 @@ namespace tensor_operation {
 namespace device {

 /**
- * \brief Image to column.
+ * \brief Convolution Tensor Rearrange.
 *
- * This Device operator converts image ([G, N, Di, Hi, Wi, C]) to the gemm
- * problem([N * Do * Ho * Wo, Z *  Y * X * C]). G must be equal to 1.
+ * This Device operator supports conversion image ([G, N, Di, Hi, Wi, C]) to
+ * the gemm problem([N * Do * Ho * Wo, Z *  Y * X * C]) (Image to Column) and
+ * conversion gemm form to the image (Column to Image).
+ *
+ * Note that G must be equal to 1.
 *
 * \tparam NDimSpatial Number of spatial dimensions.
- * \tparam InputLayout Input Layout.
+ * \tparam ImageLayout Input Layout.
 * \tparam InputDataType Input Data Type.
 * \tparam OutputDataType Output Data Type.
+ * \tparam ConvTensorRearrangeOp Operation type: ImageToColumn, ColumnToImage.
 */
 template <index_t NDimSpatial,
-          typename InputLayout,
+          typename ImageLayout,
          typename InputDataType,
-          typename OutputDataType>
-struct DeviceImageToColumn : public BaseOperator
+          typename OutputDataType,
+          typename ConvTensorRearrangeOp>
+struct DeviceConvTensorRearrange : public BaseOperator
 {

    /**
@@ -39,8 +44,8 @@ struct DeviceImageToColumn : public BaseOperator
     * \param input_spatial_lengths Input spatial lengths.
     * \param filter_spatial_lengths Filter spatial lengths.
     * \param output_spatial_lengths Output spatial lengths.
-     * \param input_g_n_c_wis_strides Input strides in order [G, N, C, D, H, W].
-     * \param output_m_k_strides Output strides.
+     * \param image_g_n_c_wis_strides Image strides in order [G, N, C, D, H, W].
+     * \param gemm_m_k_strides Gemm form strides.
     * \param conv_filter_strides Convolution filter strides.
     * \param conv_filter_dilations Convolution filter dilations.
     * \param input_left_pads Convolution left pads.
@@ -55,8 +60,8 @@ struct DeviceImageToColumn : public BaseOperator
                        const std::array<index_t, NDimSpatial>& input_spatial_lengths,
                        const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
                        const std::array<index_t, NDimSpatial>& output_spatial_lengths,
-                        const std::array<index_t, NDimSpatial + 3>& input_g_n_c_wis_strides,
-                        const std::array<index_t, 2>& output_m_k_strides,
+                        const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
+                        const std::array<index_t, 2>& gemm_m_k_strides,
                        const std::array<index_t, NDimSpatial>& conv_filter_strides,
                        const std::array<index_t, NDimSpatial>& conv_filter_dilations,
                        const std::array<index_t, NDimSpatial>& input_left_pads,

--- a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_abd.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_abd.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <array>
+
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+// GEMM:
+//   input : A0[M, K], B0[K, N],
+//   input : D0[M, N], D1[M, N], ...
+//   output : E[M, N]
+//   C = a_op(A) * b_op(B)
+//   E = cde_op(C, D0, D1, ...)
+// Assume:
+//   D0, D1, ... and E have the same layout
+template <typename AsLayout,
+          typename BsLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename AsDataType,
+          typename BsDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation>
+struct DeviceGemmMultipleABD : public BaseOperator
+{
+    static constexpr index_t NumATensor = AsDataType::Size();
+    static constexpr index_t NumBTensor = BsDataType::Size();
+    static constexpr index_t NumDTensor = DsDataType::Size();
+
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(std::array<const void*, NumATensor> p_as,
+                        std::array<const void*, NumBTensor> p_bs,
+                        std::array<const void*, NumDTensor> p_ds,
+                        void* p_e,
+                        ck::index_t M,
+                        ck::index_t N,
+                        ck::index_t K,
+                        std::array<ck::index_t, NumATensor> StrideAs,
+                        std::array<ck::index_t, NumBTensor> StrideBs,
+                        std::array<ck::index_t, NumDTensor> StrideDs,
+                        ck::index_t StrideE,
+                        AElementwiseOperation a_element_op,
+                        BElementwiseOperation b_element_op,
+                        CDEElementwiseOperation cde_element_op) = 0;
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp
@@ -29,7 +29,9 @@ template <ck::index_t NDimSpatial,
          typename EDataType,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
-          typename CDEElementwiseOperation>
+          typename CDEElementwiseOperation,
+          typename AComputeType = ADataType,
+          typename BComputeType = AComputeType>
 struct DeviceGroupedConvBwdDataMultipleD : public BaseOperator
 {
    static constexpr index_t NumDTensor = DsDataType::Size();

--- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight.hpp
@@ -20,7 +20,9 @@ template <ck::index_t NDimSpatial,
          typename OutDataType,
          typename InElementwiseOperation,
          typename WeiElementwiseOperation,
-          typename OutElementwiseOperation>
+          typename OutElementwiseOperation,
+          typename ComputeTypeA = InDataType,
+          typename ComputeTypeB = ComputeTypeA>
 struct DeviceGroupedConvBwdWeight : public BaseOperator
 {
    virtual std::unique_ptr<BaseArgument>

--- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp
@@ -29,7 +29,8 @@ template <index_t NDimSpatial,
          typename EDataType,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
-          typename CDEElementwiseOperation>
+          typename CDEElementwiseOperation,
+          typename ComputeType = ADataType>
 struct DeviceGroupedConvFwdMultipleD : public BaseOperator
 {
    static constexpr index_t NumDTensor = DsDataType::Size();

--- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_splitk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_splitk.hpp
@@ -8,6 +8,57 @@ namespace ck {
 namespace tensor_operation {
 namespace device {

+///
+/// @brief      Structure representing single GEMM problem arguments.
+///
+///             The pointer to the vector of those structures is passed
+///             to the GroupedGEMM entry point kernel.
+///
+struct GroupedGemmKernelArguments
+{
+    __host__ __device__ GroupedGemmKernelArguments(const void* p_a_grid_,
+                                                   const void* p_b_grid_,
+                                                   void* p_c_grid_,
+                                                   index_t M_,
+                                                   index_t N_,
+                                                   index_t K_,
+                                                   index_t StrideA_,
+                                                   index_t StrideB_,
+                                                   index_t StrideC_)
+        : p_a_grid{p_a_grid_},
+          p_b_grid{p_b_grid_},
+          p_c_grid{p_c_grid_},
+          M{M_},
+          N{N_},
+          K{K_},
+          StrideA{StrideA_},
+          StrideB{StrideB_},
+          StrideC{StrideC_}
+    {
+    }
+
+    const void* p_a_grid;
+    const void* p_b_grid;
+    void* p_c_grid;
+    index_t M;
+    index_t N;
+    index_t K;
+    index_t StrideA;
+    index_t StrideB;
+    index_t StrideC;
+
+    void Print() const
+    {
+        std::cout << "arg {"
+                  << "M:" << M << ", "
+                  << "N:" << N << ", "
+                  << "K:" << K << ", "
+                  << "SA:" << StrideA << ", "
+                  << "SB:" << StrideB << ", "
+                  << "SC:" << StrideC << "}" << std::endl;
+    }
+};
+
 template <typename ALayout,
          typename BLayout,
          typename DsLayout,
@@ -31,7 +82,28 @@ struct DeviceGroupedGemmSplitK : public DeviceGroupedGemm<ALayout,
                                                          BElementwiseOperation,
                                                          CElementwiseOperation>
 {
-    virtual void SetKBatchSize(BaseArgument* p_arg, index_t kbatch) const = 0;
+    //----------------------------------------------------------------------------------------------
+    /// @brief      Sets the k batch size.
+    ///
+    /// @param      p_arg   Pointer to the Argument we're going to change.
+    /// @param[in]  kbatch  The kbatch value.
+    ///
+    virtual void SetKBatchSize([[maybe_unused]] BaseArgument* p_arg,
+                               [[maybe_unused]] index_t kbatch) const
+    {
+    }
+
+    //----------------------------------------------------------------------------------------------
+    /// @brief      Sets the device kernel arguments pointer.
+    ///
+    /// @param      p_arg              The pointer to the Argument we're going to update.
+    /// @param[in]  p_dev_kernel_args  The pointer to the device memory which contains kernel
+    ///                                arguments.
+    ///
+    virtual void SetDeviceKernelArgs([[maybe_unused]] BaseArgument* p_arg,
+                                     [[maybe_unused]] const void* p_dev_kernel_args) const
+    {
+    }
 };

 } // namespace device

--- a/include/ck/tensor_operation/gpu/device/device_softmax.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_softmax.hpp
@@ -22,22 +22,22 @@ template <typename InDataType,
          index_t NumReduceDim>
 struct DeviceSoftmax : public BaseOperator
 {
-    //
-    // @brief      Makes a pointer to Argument class.
-    //
-    // @param[in]  inLengths           Input tensor extent(s) from high to low dimension
-    // @param[in]  inStrides           Input tensor stride(s) from high to low dimension
-    // @param[in]  reduceDims          The dimension(s) the normalization operation is applied
-    // @param[in]  alpha               double type value
-    // @param[in]  beta                double type value
-    // @param[in]  in_dev              Typeless const pointer in device memory storing the input
-    //                                 tensor
-    // @param      out_dev             Typeless pointer in device memory storing the output tensor
-    // @param[in]  in_elementwise_op   The input elementwise operation.
-    // @param[in]  acc_elementwise_op  The accumulation elementwise operation.
-    //
-    // @return     Unique pointer to the Argument class.
-    //
+    ///
+    /// @brief      Makes a pointer to Argument class.
+    ///
+    /// @param[in]  inLengths           Input tensor extent(s) from high to low dimension
+    /// @param[in]  inStrides           Input tensor stride(s) from high to low dimension
+    /// @param[in]  reduceDims          The dimension(s) the normalization operation is applied
+    /// @param[in]  alpha               double type value
+    /// @param[in]  beta                double type value
+    /// @param[in]  in_dev              Typeless const pointer in device memory storing the input
+    ///                                 tensor
+    /// @param      out_dev             Typeless pointer in device memory storing the output tensor
+    /// @param[in]  in_elementwise_op   The input elementwise operation.
+    /// @param[in]  acc_elementwise_op  The accumulation elementwise operation.
+    ///
+    /// @return     Unique pointer to the Argument class.
+    ///
    virtual std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const std::vector<index_t> inLengths,
                        const std::vector<index_t> inStrides,

--- a/include/ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+#include "ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp"
+#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
+
+#include "ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp"
+
+#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp"
+#include "ck/host_utility/io.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+// Image to column for input layout NDHWC:
+//   input : image converted to the gemm problem [N * Do * Ho * Wo, Z * Y * X * C]
+//   output : image [N, Di, Hi, Wi, C]
+template <index_t NDimSpatial,
+          typename ImageLayout,
+          typename InputDataType,
+          typename OutputDataType,
+          index_t BlockSize,
+          index_t MPerBlock,
+          index_t KPerBlock,
+          typename ThreadClusterLengths,
+          index_t ScalarPerVector,
+          typename std::enable_if<NDimSpatial >= 1 && NDimSpatial <= 3, bool>::type = false>
+struct DeviceColumnToImageImpl
+    : public DeviceConvTensorRearrange<NDimSpatial,
+                                       ImageLayout,
+                                       InputDataType,
+                                       OutputDataType,
+                                       conv_tensor_rearrange_op::ColumnToImage>
+{
+
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+    static constexpr auto I2 = Number<2>{};
+
+    static constexpr auto ZIdx = Number<I0>{};
+    static constexpr auto YIdx = NDimSpatial == 1 ? I0 : Number<NDimSpatial - I2>{};
+    static constexpr auto XIdx = Number<NDimSpatial - I1>{};
+
+    static constexpr auto spatial_offset = Number<3>{};
+
+    static constexpr auto conv_to_gemm_transformer =
+        TransformConvFwdToGemm<NDimSpatial, ConvolutionForwardSpecialization::Default>{};
+    static constexpr auto matrix_padder =
+        MatrixPadder<GemmSpecialization::MKPadding, index_t, index_t, index_t>{
+            MPerBlock, 0 /* NPerBlock*/, KPerBlock};
+
+    // Calculate number of independent filters for given conv params
+    static index_t GetNumberOfIndependentFilters(const index_t input_spatial_len,
+                                                 const index_t left_pad,
+                                                 const index_t right_pad,
+                                                 const index_t filter_len,
+                                                 const index_t filter_stride,
+                                                 const index_t filter_dilation,
+                                                 const index_t image_offset)
+    {
+        const index_t x_eff = (filter_len - 1) * filter_dilation + 1;
+        const index_t next_filter_padded =
+            math::integer_divide_ceil(x_eff, filter_stride) * filter_stride;
+        // If filter_stride >= x_eff then each filter is independent
+        const index_t independent_filter_stride =
+            filter_stride >= x_eff ? filter_stride : next_filter_padded;
+        const index_t w_eff = input_spatial_len - image_offset + left_pad + right_pad - x_eff;
+        // There are no independent filters
+        if(w_eff < 0)
+            return 0;
+        const index_t independent_kernels_num = w_eff / independent_filter_stride + 1;
+        return independent_kernels_num;
+    }
+
+    // Make column form descriptor
+    static auto
+    MakeInputDescriptor_M_K(const ck::index_t N,
+                            const ck::index_t C,
+                            const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
+                            const std::array<index_t, NDimSpatial>& output_spatial_lengths,
+                            const std::array<index_t, NDimSpatial>& conv_filter_strides,
+                            const std::array<index_t, 2>& gemm_m_k_strides,
+                            const std::array<index_t, NDimSpatial>& independent_filters,
+                            const std::array<index_t, NDimSpatial>& effs)
+    {
+        const index_t DoHoWo = ck::accumulate_n<index_t>(
+            output_spatial_lengths.begin(), NDimSpatial, 1, std::multiplies<>());
+        const index_t CZYX =
+            C * ck::accumulate_n<index_t>(
+                    filter_spatial_lengths.begin(), NDimSpatial, 1, std::multiplies<>());
+
+        const index_t NStride = DoHoWo * gemm_m_k_strides[I0] * gemm_m_k_strides[I1];
+        // Calculate the appropriate stride for each set of independent filters
+        // in each dimension
+        const index_t WStride =
+            math::integer_divide_ceil(effs[XIdx], conv_filter_strides[XIdx]) * gemm_m_k_strides[I0];
+        const index_t HStride = math::integer_divide_ceil(effs[YIdx], conv_filter_strides[YIdx]) *
+                                output_spatial_lengths[XIdx] * gemm_m_k_strides[I0];
+        const index_t DStride = math::integer_divide_ceil(effs[ZIdx], conv_filter_strides[ZIdx]) *
+                                output_spatial_lengths[YIdx] * output_spatial_lengths[XIdx] *
+                                gemm_m_k_strides[I0];
+        // Create descriptor for independent filters in each dimension and
+        // then merge them into column form
+        if constexpr(NDimSpatial == 1)
+        {
+            const auto desc_gemm_form =
+                make_naive_tensor_descriptor(make_tuple(N, independent_filters[XIdx], CZYX),
+                                             make_tuple(NStride, WStride, gemm_m_k_strides[I1]));
+            const auto desc_gemm_form_merged_filters = transform_tensor_descriptor(
+                desc_gemm_form,
+                make_tuple(make_merge_transform(make_tuple(N, independent_filters[XIdx])),
+                           make_pass_through_transform(CZYX)),
+                make_tuple(Sequence<0, 1>{}, Sequence<2>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+            const auto desc_m_k = matrix_padder.PadADescriptor_M_K(desc_gemm_form_merged_filters);
+            return desc_m_k;
+        }
+        else if constexpr(NDimSpatial == 2)
+        {
+            const auto desc_gemm_form = make_naive_tensor_descriptor(
+                make_tuple(N, independent_filters[YIdx], independent_filters[XIdx], CZYX),
+                make_tuple(NStride, HStride, WStride, gemm_m_k_strides[I1]));
+            const auto desc_gemm_form_merged_filters = transform_tensor_descriptor(
+                desc_gemm_form,
+                make_tuple(make_merge_transform(
+                               make_tuple(N, independent_filters[YIdx], independent_filters[XIdx])),
+                           make_pass_through_transform(CZYX)),
+                make_tuple(Sequence<0, 1, 2>{}, Sequence<3>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+            const auto desc_m_k = matrix_padder.PadADescriptor_M_K(desc_gemm_form_merged_filters);
+            return desc_m_k;
+        }
+        else if constexpr(NDimSpatial == 3)
+        {
+            const auto desc_gemm_form = make_naive_tensor_descriptor(
+                make_tuple(N,
+                           independent_filters[ZIdx],
+                           independent_filters[YIdx],
+                           independent_filters[XIdx],
+                           CZYX),
+                make_tuple(NStride, DStride, HStride, WStride, gemm_m_k_strides[I1]));
+            const auto desc_gemm_form_merged_filters = transform_tensor_descriptor(
+                desc_gemm_form,
+                make_tuple(make_merge_transform(make_tuple(N,
+                                                           independent_filters[ZIdx],
+                                                           independent_filters[YIdx],
+                                                           independent_filters[XIdx])),
+                           make_pass_through_transform(CZYX)),
+                make_tuple(Sequence<0, 1, 2, 3>{}, Sequence<4>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+            const auto desc_m_k = matrix_padder.PadADescriptor_M_K(desc_gemm_form_merged_filters);
+            return desc_m_k;
+        }
+    }
+
+    // Use MakeADescriptor_M_K from grouped convolution forward
+    static auto
+    MakeOutDescriptor_M_K(const ck::index_t N,
+                          const ck::index_t C,
+                          const std::array<index_t, NDimSpatial>& input_spatial_lengths,
+                          const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
+                          const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
+                          const std::array<index_t, NDimSpatial>& conv_filter_strides,
+                          const std::array<index_t, NDimSpatial>& conv_filter_dilations,
+                          const std::array<index_t, NDimSpatial>& input_left_pads,
+                          const std::array<index_t, NDimSpatial>& input_right_pads,
+                          const std::array<index_t, NDimSpatial>& image_offsets,
+                          const std::array<index_t, NDimSpatial>& independent_filters,
+                          const std::array<index_t, NDimSpatial>& effs)
+    {
+        std::array<index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{1};
+        std::array<index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{1};
+        std::array<index_t, NDimSpatial + 3> c_g_n_k_wos_lengths{1};
+
+        auto copy = [](const auto& x, auto& y, index_t dst_offset) {
+            std::copy(x.begin(), x.end(), y.begin() + dst_offset);
+        };
+
+        copy(input_spatial_lengths, a_g_n_c_wis_lengths, spatial_offset);
+        copy(filter_spatial_lengths, b_g_k_c_xs_lengths, spatial_offset);
+        // Calculate descriptor only for independent filters
+        copy(independent_filters, c_g_n_k_wos_lengths, spatial_offset);
+
+        // fill only significant values (C and N)
+        a_g_n_c_wis_lengths[I1] = N;
+        a_g_n_c_wis_lengths[I2] = C;
+        b_g_k_c_xs_lengths[I2]  = C;
+        c_g_n_k_wos_lengths[I1] = N;
+
+        // Modify pads to apply offsets
+        std::array<index_t, NDimSpatial> input_left_pads_with_offset;
+        for(index_t i = 0; i < NDimSpatial; i++)
+        {
+            input_left_pads_with_offset[i] = math::max(0, input_left_pads[i] - image_offsets[i]);
+        }
+        // Modify input spatial lengths to apply offsets
+        for(index_t i = 0; i < NDimSpatial; i++)
+        {
+            a_g_n_c_wis_lengths[i + spatial_offset] -=
+                math::max(0, image_offsets[i] - input_left_pads[i]);
+        }
+
+        // Strides to next independent filters
+        std::array<index_t, NDimSpatial> independent_filter_strides;
+        for(index_t i = 0; i < NDimSpatial; i++)
+        {
+            index_t independent_filter_stride =
+                math::integer_divide_ceil(effs[i], conv_filter_strides[i]) * conv_filter_strides[i];
+            // If conv stride is greater than whole filter size, use conv stride
+            independent_filter_strides[i] = conv_filter_strides[i] >= effs[i]
+                                                ? conv_filter_strides[i]
+                                                : independent_filter_stride;
+        }
+
+        // Calculate image form descriptor for the modified convolution problem
+        const auto in_gemmmraw_gemmkraw_desc =
+            conv_to_gemm_transformer.template MakeADescriptor_M_K<ImageLayout>(
+                a_g_n_c_wis_lengths,
+                image_g_n_c_wis_strides,
+                b_g_k_c_xs_lengths,
+                {}, // not needed for A Descriptor
+                c_g_n_k_wos_lengths,
+                {}, // not needed for A Descriptor
+                // conv_filter_strides,
+                independent_filter_strides,
+                conv_filter_dilations,
+                input_left_pads_with_offset,
+                input_right_pads);
+
+        const auto in_gemmm_gemmk_desc =
+            matrix_padder.PadADescriptor_M_K(in_gemmmraw_gemmkraw_desc);
+        return in_gemmm_gemmk_desc;
+    }
+
+    using InputGridDesc =
+        remove_cvref_t<decltype(MakeInputDescriptor_M_K(1, 1, {}, {}, {}, {}, {}, {}))>;
+    using OutputGridDesc = remove_cvref_t<decltype(MakeOutDescriptor_M_K(
+        1, 1, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}))>;
+
+    using Block2ETileMap = remove_cvref_t<
+        decltype(BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, InputGridDesc>(
+            InputGridDesc{}))>;
+
+    using GridwiseTensorRearrangeKernel = GridwiseTensorRearrange<InputGridDesc,
+                                                                  InputDataType,
+                                                                  OutputGridDesc,
+                                                                  OutputDataType,
+                                                                  BlockSize,
+                                                                  MPerBlock,
+                                                                  KPerBlock,
+                                                                  ThreadClusterLengths,
+                                                                  ScalarPerVector,
+                                                                  InMemoryDataOperationEnum::Add,
+                                                                  Block2ETileMap>;
+
+    struct Argument : public BaseArgument
+    {
+        Argument(const void* p_in, // input image
+                 void* p_out,      // output image
+                 const ck::index_t N,
+                 const ck::index_t C,
+                 const std::array<index_t, NDimSpatial>& input_spatial_lengths,
+                 const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
+                 const std::array<index_t, NDimSpatial>& output_spatial_lengths,
+                 const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
+                 const std::array<index_t, 2>& gemm_m_k_strides,
+                 const std::array<index_t, NDimSpatial>& conv_filter_strides,
+                 const std::array<index_t, NDimSpatial>& conv_filter_dilations,
+                 const std::array<index_t, NDimSpatial>& input_left_pads,
+                 const std::array<index_t, NDimSpatial>& input_right_pads)
+            : C_(C),
+              X_(filter_spatial_lengths[NDimSpatial - I1]),
+              p_in_{static_cast<const InputDataType*>(p_in)},
+              p_out_{static_cast<OutputDataType*>(p_out)},
+              image_g_n_c_wis_strides_{image_g_n_c_wis_strides},
+              conv_filter_strides_{conv_filter_strides},
+              conv_filter_dilations_{conv_filter_dilations},
+              input_left_pads_{input_left_pads},
+              input_right_pads_{input_right_pads}
+        {
+            const index_t x_eff =
+                (filter_spatial_lengths[XIdx] - 1) * conv_filter_dilations[XIdx] + 1;
+            const index_t y_eff =
+                NDimSpatial < 2
+                    ? I1
+                    : (filter_spatial_lengths[YIdx] - 1) * conv_filter_dilations[YIdx] + 1;
+            const index_t z_eff =
+                NDimSpatial < 3
+                    ? I1
+                    : (filter_spatial_lengths[ZIdx] - 1) * conv_filter_dilations[ZIdx] + 1;
+
+            // Iterate over sets of independent filters
+            for(int z_img_offset = 0; z_img_offset < z_eff;
+                z_img_offset += conv_filter_strides[ZIdx])
+            {
+                for(int y_img_offset = 0; y_img_offset < y_eff;
+                    y_img_offset += conv_filter_strides[YIdx])
+                {
+                    for(int x_img_offset = 0; x_img_offset < x_eff;
+                        x_img_offset += conv_filter_strides[XIdx])
+                    {
+
+                        std::array<index_t, NDimSpatial> image_offsets;
+                        std::array<index_t, NDimSpatial> effs;
+                        // Calculate the starting offset for a given set of
+                        // independent filters
+                        if constexpr(NDimSpatial == 1)
+                        {
+                            image_offsets = {x_img_offset};
+                            effs          = {x_eff};
+                        }
+                        if constexpr(NDimSpatial == 2)
+                        {
+                            image_offsets = {y_img_offset, x_img_offset};
+                            effs          = {y_eff, x_eff};
+                        }
+                        else if constexpr(NDimSpatial == 3)
+                        {
+                            image_offsets = {z_img_offset, y_img_offset, x_img_offset};
+                            effs          = {z_eff, y_eff, x_eff};
+                        }
+
+                        std::array<index_t, NDimSpatial> independent_filters;
+                        for(index_t i = 0; i < NDimSpatial; i++)
+                        {
+                            independent_filters[i] =
+                                GetNumberOfIndependentFilters(input_spatial_lengths[i],
+                                                              input_left_pads[i],
+                                                              input_right_pads[i],
+                                                              filter_spatial_lengths[i],
+                                                              conv_filter_strides[i],
+                                                              conv_filter_dilations[i],
+                                                              image_offsets[i]);
+                        }
+                        const index_t independent_filters_acum = ck::accumulate_n<index_t>(
+                            independent_filters.begin(), NDimSpatial, 1, std::multiplies<>());
+                        if(independent_filters_acum <= 0)
+                            continue;
+
+                        const auto in_grid_desc_m_k =
+                            MakeInputDescriptor_M_K(N,
+                                                    C,
+                                                    filter_spatial_lengths,
+                                                    output_spatial_lengths,
+                                                    conv_filter_strides,
+                                                    gemm_m_k_strides,
+                                                    independent_filters,
+                                                    effs);
+                        const auto out_grid_desc_m_k =
+                            MakeOutDescriptor_M_K(N,
+                                                  C,
+                                                  input_spatial_lengths,
+                                                  filter_spatial_lengths,
+                                                  image_g_n_c_wis_strides,
+                                                  conv_filter_strides,
+                                                  conv_filter_dilations,
+                                                  input_left_pads,
+                                                  input_right_pads,
+                                                  image_offsets,
+                                                  independent_filters,
+                                                  effs);
+                        in_grid_desc_m_k_container_.push_back(in_grid_desc_m_k);
+                        out_grid_desc_m_k_container_.push_back(out_grid_desc_m_k);
+
+                        const index_t x_idx = x_img_offset / conv_filter_strides[XIdx];
+                        const index_t y_idx = y_img_offset / conv_filter_strides[YIdx];
+                        const index_t z_idx = z_img_offset / conv_filter_strides[ZIdx];
+
+                        const index_t x_offset_with_pad =
+                            math::max(0, x_img_offset - input_left_pads[XIdx]);
+                        const index_t y_offset_with_pad =
+                            math::max(0, y_img_offset - input_left_pads[YIdx]);
+                        const index_t z_offset_with_pad =
+                            math::max(0, z_img_offset - input_left_pads[ZIdx]);
+
+                        // Memory offsets to next set of independent filters,
+                        // move to independent filters in each dimension
+                        const index_t in_offset =
+                            x_idx * gemm_m_k_strides[0] +
+                            y_idx * gemm_m_k_strides[0] * output_spatial_lengths[XIdx] +
+                            z_idx * gemm_m_k_strides[0] * output_spatial_lengths[YIdx] *
+                                output_spatial_lengths[XIdx];
+                        // Move to independent filters in appropriate dimensions
+                        const index_t out_offset =
+                            x_offset_with_pad * image_g_n_c_wis_strides[spatial_offset + XIdx] +
+                            y_offset_with_pad * image_g_n_c_wis_strides[spatial_offset + YIdx] +
+                            z_offset_with_pad * image_g_n_c_wis_strides[spatial_offset + ZIdx];
+
+                        const InputDataType* p_in_with_offset =
+                            static_cast<const InputDataType*>(p_in) + in_offset;
+                        OutputDataType* p_out_with_offset =
+                            static_cast<OutputDataType*>(p_out) + out_offset;
+                        p_in_container_.push_back(p_in_with_offset);
+                        p_out_container_.push_back(p_out_with_offset);
+                    }
+                }
+            }
+        }
+
+        void Print() const
+        {
+            for(std::size_t i = 0; i < in_grid_desc_m_k_container_.size(); i++)
+            {
+                std::cout << in_grid_desc_m_k_container_[i] << std::endl;
+                std::cout << out_grid_desc_m_k_container_[i] << std::endl;
+            }
+        }
+
+        const ck::index_t C_;
+        const ck::index_t X_;
+
+        const InputDataType* p_in_;
+        OutputDataType* p_out_;
+
+        const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides_;
+        const std::array<index_t, NDimSpatial>& conv_filter_strides_;
+        const std::array<index_t, NDimSpatial>& conv_filter_dilations_;
+        const std::array<index_t, NDimSpatial>& input_left_pads_;
+        const std::array<index_t, NDimSpatial>& input_right_pads_;
+
+        std::vector<InputGridDesc> in_grid_desc_m_k_container_;
+        std::vector<OutputGridDesc> out_grid_desc_m_k_container_;
+
+        std::vector<const InputDataType*> p_in_container_;
+        std::vector<OutputDataType*> p_out_container_;
+    };
+
+    struct Invoker : public BaseInvoker
+    {
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            if(stream_config.log_level_ > 0)
+            {
+                arg.Print();
+            }
+
+            float elapsed_time = 0.f;
+            const auto kernel  = kernel_tensor_rearrange<InputGridDesc,
+                                                        InputDataType,
+                                                        OutputGridDesc,
+                                                        OutputDataType,
+                                                        Block2ETileMap,
+                                                        GridwiseTensorRearrangeKernel>;
+
+            // Execute each set of independent filters
+            for(std::size_t i = 0; i < arg.in_grid_desc_m_k_container_.size(); i++)
+            {
+                const auto block_2_tile_map =
+                    BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, InputGridDesc>(
+                        arg.out_grid_desc_m_k_container_[i]);
+                const index_t grid_size =
+                    block_2_tile_map.CalculateGridSize(arg.in_grid_desc_m_k_container_[i]);
+                elapsed_time += launch_and_time_kernel(stream_config,
+                                                       kernel,
+                                                       dim3(grid_size),
+                                                       dim3(BlockSize),
+                                                       0,
+                                                       arg.in_grid_desc_m_k_container_[i],
+                                                       arg.p_in_container_[i],
+                                                       arg.out_grid_desc_m_k_container_[i],
+                                                       arg.p_out_container_[i],
+                                                       block_2_tile_map);
+            }
+            return elapsed_time;
+        }
+
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        }
+    };
+
+    bool IsSupportedArgument(const Argument& arg)
+    {
+        using namespace tensor_layout::convolution;
+        if constexpr(!(std::is_same_v<ImageLayout, GNWC> || std::is_same_v<ImageLayout, GNHWC> ||
+                       std::is_same_v<ImageLayout, GNDHWC>))
+        {
+            return false;
+        }
+
+        const auto w_pad_left  = arg.input_left_pads_[NDimSpatial - I1];
+        const auto w_pad_right = arg.input_right_pads_[NDimSpatial - I1];
+        const auto dilation_x  = arg.conv_filter_dilations_[NDimSpatial - I1];
+        const auto stride_x    = arg.conv_filter_strides_[NDimSpatial - I1];
+        bool is_w_packed       = arg.image_g_n_c_wis_strides_[NDimSpatial + I2] == arg.C_;
+        bool is_c_packed       = arg.image_g_n_c_wis_strides_[I2] == 1;
+
+        // check vector acces with c not packed
+        if(!is_c_packed && ScalarPerVector != 1)
+            return false;
+        // check vector access of filter window row (only C if C is not packed)
+        if(!is_w_packed && arg.C_ % ScalarPerVector != 0)
+            return false;
+        // check vector access of filter window row (X * C)
+        if(arg.X_ * arg.C_ % ScalarPerVector != 0)
+            return false;
+        // check vector access of pads (w_pad_left/w_pad_right * C)
+        if(w_pad_left * arg.C_ % ScalarPerVector != 0 ||
+           w_pad_right * arg.C_ % ScalarPerVector != 0)
+            return false;
+        // check vector access of with stride and pad
+        if((w_pad_left != 0 || w_pad_right != 0) && stride_x > 1 && arg.C_ % ScalarPerVector != 0)
+            return false;
+        // check vector access of with dilation
+        if(dilation_x > 1 && arg.C_ % ScalarPerVector != 0)
+            return false;
+
+        bool valid = true;
+        for(std::size_t i = 0; i < arg.in_grid_desc_m_k_container_.size(); i++)
+        {
+            valid &= GridwiseTensorRearrangeKernel::CheckValidity(
+                arg.in_grid_desc_m_k_container_[i], arg.out_grid_desc_m_k_container_[i]);
+        }
+        return valid;
+    }
+
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
+    }
+
+    static auto MakeArgument(const void* p_in, // input image
+                             void* p_out,      // output image
+                             const ck::index_t N,
+                             const ck::index_t C,
+                             const std::array<index_t, NDimSpatial>& input_spatial_lengths,
+                             const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
+                             const std::array<index_t, NDimSpatial>& output_spatial_lengths,
+                             const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
+                             const std::array<index_t, 2>& gemm_m_k_strides,
+                             const std::array<index_t, NDimSpatial>& conv_filter_strides,
+                             const std::array<index_t, NDimSpatial>& conv_filter_dilations,
+                             const std::array<index_t, NDimSpatial>& input_left_pads,
+                             const std::array<index_t, NDimSpatial>& input_right_pads)
+    {
+        return Argument{static_cast<const InputDataType*>(p_in),
+                        static_cast<OutputDataType*>(p_out),
+                        N,
+                        C,
+                        input_spatial_lengths,
+                        filter_spatial_lengths,
+                        output_spatial_lengths,
+                        image_g_n_c_wis_strides,
+                        gemm_m_k_strides,
+                        conv_filter_strides,
+                        conv_filter_dilations,
+                        input_left_pads,
+                        input_right_pads};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_in, // input image
+                        void* p_out,      // output image
+                        const ck::index_t N,
+                        const ck::index_t C,
+                        const std::array<index_t, NDimSpatial>& input_spatial_lengths,
+                        const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
+                        const std::array<index_t, NDimSpatial>& output_spatial_lengths,
+                        const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
+                        const std::array<index_t, 2>& gemm_m_k_strides,
+                        const std::array<index_t, NDimSpatial>& conv_filter_strides,
+                        const std::array<index_t, NDimSpatial>& conv_filter_dilations,
+                        const std::array<index_t, NDimSpatial>& input_left_pads,
+                        const std::array<index_t, NDimSpatial>& input_right_pads) override
+    {
+        return std::make_unique<Argument>(static_cast<const InputDataType*>(p_in),
+                                          static_cast<OutputDataType*>(p_out),
+                                          N,
+                                          C,
+                                          input_spatial_lengths,
+                                          filter_spatial_lengths,
+                                          output_spatial_lengths,
+                                          image_g_n_c_wis_strides,
+                                          gemm_m_k_strides,
+                                          conv_filter_strides,
+                                          conv_filter_dilations,
+                                          input_left_pads,
+                                          input_right_pads);
+    }
+
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        // clang-format off
+        str << "DeviceColumnToImage"
+            << "<"
+            << BlockSize << ", "
+            << MPerBlock << ", "
+            << KPerBlock << ", "
+            << ScalarPerVector
+            << ">";
+        // clang-format on
+
+        return str.str();
+    }
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_xdl_cshuffle.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_multiple_abd.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+
+namespace ck {
+
+template <typename GridwiseGemm,
+          typename AsPointer,
+          typename BsPointer,
+          typename DsPointer,
+          typename EDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation,
+          typename AsGridDesc_AK0_M_AK1,
+          typename BsGridDesc_BK0_N_BK1,
+          typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
+          typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
+          typename Block2ETileMap,
+          bool HasMainKBlockLoop>
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+#endif
+        kernel_gemm_multiple_abd_xdl_cshuffle(
+            AsPointer p_as_grid,
+            BsPointer p_bs_grid,
+            DsPointer p_ds_grid,
+            EDataType* __restrict__ p_e_grid,
+            const AElementwiseOperation a_element_op,
+            const BElementwiseOperation b_element_op,
+            const CDEElementwiseOperation cde_element_op,
+            const AsGridDesc_AK0_M_AK1 as_grid_desc_ak0_m_ak1,
+            const BsGridDesc_BK0_N_BK1 bs_grid_desc_bk0_n_bk1,
+            const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+                ds_grid_desc_mblock_mperblock_nblock_nperblock,
+            const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+                e_grid_desc_mblock_mperblock_nblock_nperblock,
+            const Block2ETileMap block_2_etile_map)
+{
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
+    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+
+    GridwiseGemm::template Run<HasMainKBlockLoop>(p_as_grid,
+                                                  p_bs_grid,
+                                                  p_ds_grid,
+                                                  p_e_grid,
+                                                  p_shared,
+                                                  a_element_op,
+                                                  b_element_op,
+                                                  cde_element_op,
+                                                  as_grid_desc_ak0_m_ak1,
+                                                  bs_grid_desc_bk0_n_bk1,
+                                                  ds_grid_desc_mblock_mperblock_nblock_nperblock,
+                                                  e_grid_desc_mblock_mperblock_nblock_nperblock,
+                                                  block_2_etile_map);
+#else
+    ignore = p_as_grid;
+    ignore = p_bs_grid;
+    ignore = p_ds_grid;
+    ignore = p_e_grid;
+    ignore = a_element_op;
+    ignore = b_element_op;
+    ignore = cde_element_op;
+    ignore = as_grid_desc_ak0_m_ak1;
+    ignore = bs_grid_desc_bk0_n_bk1;
+    ignore = ds_grid_desc_mblock_mperblock_nblock_nperblock;
+    ignore = e_grid_desc_mblock_mperblock_nblock_nperblock;
+    ignore = block_2_etile_map;
+#endif
+}
+
+} // namespace ck
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+// GEMM:
+//   input : A[M, K]
+//   input : B[N, K]
+//   input : D0[M, N], D1[M, N], ...
+//   output : E[M, N]
+//   C = a_op(A) * b_op(B)
+//   E = cde_op(C, D0, D1, ...)
+// Assume:
+//   D0, D1, ... and E have the same layout
+template <typename AsLayout,
+          typename BsLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename AsDataType,
+          typename BsDataType,
+          typename AccDataType,
+          typename CShuffleDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation,
+          GemmSpecialization GemmSpec,
+          index_t NumGemmKPrefetchStage,
+          index_t BlockSize,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t AK1,
+          index_t BK1,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MXdlPerWave,
+          index_t NXdlPerWave,
+          typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          index_t ABlockTransferSrcVectorDim,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t ABlockTransferDstScalarPerVector_AK1,
+          index_t ABlockLdsExtraM,
+          typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          index_t BBlockTransferSrcVectorDim,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t BBlockTransferDstScalarPerVector_BK1,
+          index_t BBlockLdsExtraN,
+          index_t CShuffleMXdlPerWavePerShuffle,
+          index_t CShuffleNXdlPerWavePerShuffle,
+          typename CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          index_t CDEBlockTransferScalarPerVector_NPerBlock,
+          LoopScheduler LoopSched     = make_default_loop_scheduler(),
+          PipelineVersion PipelineVer = PipelineVersion::v1>
+struct DeviceGemmMultipleABD_Xdl_CShuffle : public DeviceGemmMultipleABD<AsLayout,
+                                                                         BsLayout,
+                                                                         DsLayout,
+                                                                         ELayout,
+                                                                         AsDataType,
+                                                                         BsDataType,
+                                                                         DsDataType,
+                                                                         EDataType,
+                                                                         AElementwiseOperation,
+                                                                         BElementwiseOperation,
+                                                                         CDEElementwiseOperation>
+{
+    using DeviceOp = DeviceGemmMultipleABD_Xdl_CShuffle;
+
+    static constexpr index_t NumATensor = AsDataType::Size();
+    static constexpr index_t NumBTensor = BsDataType::Size();
+    static constexpr index_t NumDTensor = DsDataType::Size();
+
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+    static constexpr auto I2 = Number<2>{};
+    static constexpr auto I3 = Number<3>{};
+
+#if 0
+    static constexpr auto matrix_padder =
+        MatrixPadder<GemmSpec, index_t, index_t, index_t>{MPerBlock, NPerBlock, KPerBlock};
+
+    static auto MakeAGridDescriptor_M_K(index_t MRaw, index_t KRaw, index_t StrideAs)
+    {
+        const auto a_grid_desc_mraw_kraw = [&]() {
+            if constexpr(is_same_v<tensor_layout::gemm::RowMajor, AsLayout>)
+            {
+                return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
+                                                    make_tuple(StrideAs, I1));
+            }
+            else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, AsLayout>)
+            {
+                return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
+                                                    make_tuple(I1, StrideAs));
+            }
+        }();
+
+        return matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw);
+    }
+
+    static auto MakeBGridDescriptor_N_K(index_t KRaw, index_t NRaw, index_t StrideBs)
+    {
+        const auto b_grid_desc_nraw_kraw = [&]() {
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, BsLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
+                                                    make_tuple(I1, StrideBs));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BsLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
+                                                    make_tuple(StrideBs, I1));
+            }
+        }();
+
+        return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw);
+    }
+
+    template <typename ELay>
+    static auto MakeEGridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t StrideE)
+    {
+        const auto e_grid_desc_mraw_nraw = [&]() {
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, ELay>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
+                                                    make_tuple(StrideE, I1));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, ELay>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
+                                                    make_tuple(I1, StrideE));
+            }
+        }();
+
+        return matrix_padder.PadCDescriptor_M_N(e_grid_desc_mraw_nraw);
+    }
+
+    static auto MakeDsGridDescriptor_M_N(const std::array<index_t, NumDTensor>& MRaws,
+                                         const std::array<index_t, NumDTensor>& NRaws,
+                                         const std::array<index_t, NumDTensor>& DsStride)
+    {
+        return generate_tuple(
+            [&](auto i) {
+                using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
+
+                return DeviceOp::MakeEGridDescriptor_M_N<DLayout>(MRaws[i], NRaws[i], DsStride[i]);
+            },
+            Number<NumDTensor>{});
+    }
+#endif
+    using ComputeDataType = EDataType;
+
+    // GridwiseGemm
+    using GridwiseGemm = GridwiseGemmMultipleABD_xdl_cshuffle<
+        AsDataType,
+        BsDataType,
+        ComputeDataType,
+        AccDataType,
+        CShuffleDataType,
+        DsDataType,
+        EDataType,
+        AElementwiseOperation,
+        BElementwiseOperation,
+        CDEElementwiseOperation,
+        InMemoryDataOperationEnum::Set,
+        NumGemmKPrefetchStage,
+        BlockSize,
+        MPerBlock,
+        NPerBlock,
+        KPerBlock,
+        AK1,
+        BK1,
+        MPerXDL,
+        NPerXDL,
+        MXdlPerWave,
+        NXdlPerWave,
+        ABlockTransferThreadClusterLengths_AK0_M_AK1,
+        ABlockTransferThreadClusterArrangeOrder,
+        ABlockTransferSrcAccessOrder,
+        ABlockTransferSrcVectorDim,
+        ABlockTransferSrcScalarPerVector,
+        ABlockTransferDstScalarPerVector_AK1,
+        false,
+        ABlockLdsExtraM,
+        BBlockTransferThreadClusterLengths_BK0_N_BK1,
+        BBlockTransferThreadClusterArrangeOrder,
+        BBlockTransferSrcAccessOrder,
+        BBlockTransferSrcVectorDim,
+        BBlockTransferSrcScalarPerVector,
+        BBlockTransferDstScalarPerVector_BK1,
+        false,
+        BBlockLdsExtraN,
+        CShuffleMXdlPerWavePerShuffle,
+        CShuffleNXdlPerWavePerShuffle,
+        CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+        CDEBlockTransferScalarPerVector_NPerBlock,
+        LoopSched,
+        PipelineVer>;
+
+    // desc for problem definition
+    using AsGridDesc_M_K =
+        remove_cvref_t<decltype(GridwiseGemm::template MakeAsGridDescriptor_M_K<AsLayout, GemmSpec>(
+            {}, {}, {}))>;
+    using BsGridDesc_N_K =
+        remove_cvref_t<decltype(GridwiseGemm::template MakeBsGridDescriptor_N_K<BsLayout, GemmSpec>(
+            {}, {}, {}))>;
+    using DsGridDesc_M_N =
+        remove_cvref_t<decltype(GridwiseGemm::template MakeDsGridDescriptor_M_N<DsLayout, GemmSpec>(
+            {}, {}, {}))>;
+    using EGridDesc_M_N =
+        decltype(GridwiseGemm::template MakeEGridDescriptor_M_N<ELayout, GemmSpec>(1, 1, 1));
+
+    // desc for blockwise copy
+    using AsGridDesc_AK0_M_AK1 =
+        remove_cvref_t<decltype(GridwiseGemm::MakeAsGridDescriptor_AK0_M_AK1(AsGridDesc_M_K{}))>;
+    using BsGridDesc_BK0_N_BK1 =
+        remove_cvref_t<decltype(GridwiseGemm::MakeBsGridDescriptor_BK0_N_BK1(BsGridDesc_N_K{}))>;
+    using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<
+        decltype(GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+            DsGridDesc_M_N{}))>;
+    using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
+        remove_cvref_t<decltype(GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+            EGridDesc_M_N{}))>;
+
+    // block-to-e-tile map
+    using Block2ETileMap =
+        remove_cvref_t<decltype(GridwiseGemm::MakeBlock2ETileMap(EGridDesc_M_N{}))>;
+
+    // Argument
+    struct Argument : public BaseArgument
+    {
+        Argument(std::array<const void*, NumATensor> p_as_grid,
+                 std::array<const void*, NumBTensor> p_bs_grid,
+                 std::array<const void*, NumDTensor> p_ds_grid,
+                 void* p_e_grid,
+                 index_t MRaw,
+                 index_t NRaw,
+                 index_t KRaw,
+                 std::array<index_t, NumATensor> StrideAs,
+                 std::array<index_t, NumBTensor> StrideBs,
+                 std::array<index_t, NumDTensor> StrideDs,
+                 index_t StrideE,
+                 AElementwiseOperation a_element_op,
+                 BElementwiseOperation b_element_op,
+                 CDEElementwiseOperation cde_element_op)
+            : p_as_grid_{},
+              p_bs_grid_{},
+              p_ds_grid_{},
+              p_e_grid_{static_cast<EDataType*>(p_e_grid)},
+              as_grid_desc_m_k_{},
+              bs_grid_desc_n_k_{},
+              ds_grid_desc_m_n_{},
+              e_grid_desc_m_n_{GridwiseGemm::template MakeEGridDescriptor_M_N<ELayout, GemmSpec>(
+                  MRaw, NRaw, StrideE)},
+              as_grid_desc_ak0_m_ak1_{},
+              bs_grid_desc_bk0_n_bk1_{},
+              ds_grid_desc_mblock_mperblock_nblock_nperblock_{},
+              e_grid_desc_mblock_mperblock_nblock_nperblock_{},
+              block_2_etile_map_{GridwiseGemm::MakeBlock2ETileMap(e_grid_desc_m_n_)},
+              a_element_op_{a_element_op},
+              b_element_op_{b_element_op},
+              cde_element_op_{cde_element_op},
+              MRaw_{MRaw},
+              NRaw_{NRaw},
+              KRaw_{KRaw}
+        {
+            // populate pointer, desc for As
+            static_for<0, NumATensor, 1>{}([&](auto i) {
+                using ALayout   = remove_cvref_t<tuple_element_t<i.value, AsLayout>>;
+                using ADataType = remove_cvref_t<tuple_element_t<i.value, AsDataType>>;
+
+                // A pointer
+                p_as_grid_(i) = static_cast<const ADataType*>(p_as_grid[i]);
+
+                // A desc
+                as_grid_desc_m_k_(i) =
+                    GridwiseGemm::template MakeAGridDescriptor_M_K<ALayout, GemmSpec>(
+                        MRaw, KRaw, StrideAs[i]);
+            });
+
+            // populate pointer, desc for Bs
+            static_for<0, NumBTensor, 1>{}([&](auto i) {
+                using BLayout   = remove_cvref_t<tuple_element_t<i.value, BsLayout>>;
+                using BDataType = remove_cvref_t<tuple_element_t<i.value, BsDataType>>;
+
+                // B pointer
+                p_bs_grid_(i) = static_cast<const BDataType*>(p_bs_grid[i]);
+
+                // B desc
+                bs_grid_desc_n_k_(i) =
+                    GridwiseGemm::template MakeBGridDescriptor_N_K<BLayout, GemmSpec>(
+                        KRaw, NRaw, StrideBs[i]);
+            });
+
+            // populate pointer, desc for Ds
+            static_for<0, NumDTensor, 1>{}([&](auto i) {
+                using DLayout   = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
+                using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
+
+                // D pointer
+                p_ds_grid_(i) = static_cast<const DDataType*>(p_ds_grid[i]);
+
+                // D desc
+                ds_grid_desc_m_n_(i) =
+                    GridwiseGemm::template MakeEGridDescriptor_M_N<DLayout, GemmSpec>(
+                        MRaw, NRaw, StrideDs[i]);
+            });
+
+            // populate desc for Ds/E
+            if(GridwiseGemm::CheckValidity(as_grid_desc_m_k_,
+                                           bs_grid_desc_n_k_,
+                                           ds_grid_desc_m_n_,
+                                           e_grid_desc_m_n_,
+                                           block_2_etile_map_))
+            {
+                as_grid_desc_ak0_m_ak1_ =
+                    GridwiseGemm::MakeAsGridDescriptor_AK0_M_AK1(as_grid_desc_m_k_);
+
+                bs_grid_desc_bk0_n_bk1_ =
+                    GridwiseGemm::MakeBsGridDescriptor_BK0_N_BK1(bs_grid_desc_n_k_);
+
+                ds_grid_desc_mblock_mperblock_nblock_nperblock_ =
+                    GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                        ds_grid_desc_m_n_);
+
+                e_grid_desc_mblock_mperblock_nblock_nperblock_ =
+                    GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                        e_grid_desc_m_n_);
+            }
+        }
+
+        void Print() const
+        {
+            // std::cout << "A[M, K]: " << as_grid_desc_m_k_ << std::endl;
+            // std::cout << "B[N, K]: " << bs_grid_desc_n_k_ << std::endl;
+            // static_for<0, NumDTensor, 1>{}(
+            //[&](auto i) { std::cout << "Ds[M, N]: " << ds_grid_desc_m_n_[i] << std::endl; });
+            // std::cout << "E[M, N]: " << e_grid_desc_m_n_ << std::endl;
+        }
+
+        //  private:
+        // pointers
+        typename GridwiseGemm::AsGridPointer p_as_grid_;
+        typename GridwiseGemm::BsGridPointer p_bs_grid_;
+        typename GridwiseGemm::DsGridPointer p_ds_grid_;
+        EDataType* p_e_grid_;
+
+        // tensor descriptors for problem definiton
+        AsGridDesc_M_K as_grid_desc_m_k_;
+        BsGridDesc_N_K bs_grid_desc_n_k_;
+        DsGridDesc_M_N ds_grid_desc_m_n_;
+        EGridDesc_M_N e_grid_desc_m_n_;
+
+        // tensor descriptors for block/thread-wise copy
+        AsGridDesc_AK0_M_AK1 as_grid_desc_ak0_m_ak1_;
+        BsGridDesc_BK0_N_BK1 bs_grid_desc_bk0_n_bk1_;
+        DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
+            ds_grid_desc_mblock_mperblock_nblock_nperblock_;
+        EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock_;
+
+        // block-to-e-tile map
+        Block2ETileMap block_2_etile_map_;
+
+        // element-wise op
+        AElementwiseOperation a_element_op_;
+        BElementwiseOperation b_element_op_;
+        CDEElementwiseOperation cde_element_op_;
+
+        // for checking vector load/store
+        index_t MRaw_;
+        index_t NRaw_;
+        index_t KRaw_;
+    };
+
+    // Invoker
+    struct Invoker : public BaseInvoker
+    {
+        using Argument = DeviceOp::Argument;
+
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            if(!GridwiseGemm::CheckValidity(arg.as_grid_desc_m_k_,
+                                            arg.bs_grid_desc_n_k_,
+                                            arg.ds_grid_desc_m_n_,
+                                            arg.e_grid_desc_m_n_,
+                                            arg.block_2_etile_map_))
+            {
+                throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
+            }
+
+            const index_t grid_size =
+                arg.block_2_etile_map_.CalculateGridSize(arg.e_grid_desc_m_n_);
+
+            auto launch_kernel = [&](auto has_main_k_block_loop) {
+                constexpr bool has_main_loop = has_main_k_block_loop.value;
+
+                const auto kernel = kernel_gemm_multiple_abd_xdl_cshuffle<
+                    GridwiseGemm,
+                    typename GridwiseGemm::AsGridPointer,
+                    typename GridwiseGemm::BsGridPointer,
+                    typename GridwiseGemm::DsGridPointer,
+                    EDataType,
+                    AElementwiseOperation,
+                    BElementwiseOperation,
+                    CDEElementwiseOperation,
+                    DeviceOp::AsGridDesc_AK0_M_AK1,
+                    DeviceOp::BsGridDesc_BK0_N_BK1,
+                    DeviceOp::DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                    DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                    DeviceOp::Block2ETileMap,
+                    has_main_loop>;
+
+                return launch_and_time_kernel(stream_config,
+                                              kernel,
+                                              dim3(grid_size),
+                                              dim3(BlockSize),
+                                              0,
+                                              arg.p_as_grid_,
+                                              arg.p_bs_grid_,
+                                              arg.p_ds_grid_,
+                                              arg.p_e_grid_,
+                                              arg.a_element_op_,
+                                              arg.b_element_op_,
+                                              arg.cde_element_op_,
+                                              arg.as_grid_desc_ak0_m_ak1_,
+                                              arg.bs_grid_desc_bk0_n_bk1_,
+                                              arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                              arg.e_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                              arg.block_2_etile_map_);
+            };
+
+            const auto K = arg.as_grid_desc_m_k_[I0].GetLength(I1);
+
+            if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
+            {
+                return launch_kernel(integral_constant<bool, true>{});
+            }
+            else
+            {
+                return launch_kernel(integral_constant<bool, false>{});
+            }
+        }
+
+        // polymorphic
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        }
+    };
+
+    static bool IsSupportedArgument(const Argument& arg)
+    {
+        if(!ck::is_xdl_supported())
+        {
+            return false;
+        }
+
+        // check vector load/store
+        {
+            using Row = ck::tensor_layout::gemm::RowMajor;
+            using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+            bool all_valid = true;
+
+            static_for<0, NumATensor, 1>{}([&](auto i) {
+                using ALayout = remove_cvref_t<tuple_element_t<i.value, AsLayout>>;
+                // check vector load of A
+                if constexpr(is_same_v<ALayout, Row> && ABlockTransferSrcVectorDim == 2)
+                {
+                    if(arg.KRaw_ % ABlockTransferSrcScalarPerVector != 0)
+                    {
+                        all_valid = false;
+                    }
+                }
+                else if constexpr(is_same_v<ALayout, Col> && ABlockTransferSrcVectorDim == 1)
+                {
+                    // FIXME: not rigorous
+                    if(arg.MRaw_ % ABlockTransferSrcScalarPerVector != 0)
+                    {
+                        all_valid = false;
+                    }
+                }
+                else
+                {
+                    all_valid = false;
+                }
+            });
+
+            static_for<0, NumBTensor, 1>{}([&](auto i) {
+                using BLayout = remove_cvref_t<tuple_element_t<i.value, BsLayout>>;
+                // check vector laod of B
+                if constexpr(is_same_v<BLayout, Col> && BBlockTransferSrcVectorDim == 2)
+                {
+                    if(arg.KRaw_ % BBlockTransferSrcScalarPerVector != 0)
+                    {
+                        all_valid = false;
+                    }
+                }
+                else if constexpr(is_same_v<BLayout, Row> && BBlockTransferSrcVectorDim == 1)
+                {
+                    // FIXME: not rigorous
+                    if(arg.NRaw_ % BBlockTransferSrcScalarPerVector != 0)
+                    {
+                        all_valid = false;
+                    }
+                }
+                else
+                {
+                    all_valid = false;
+                }
+            });
+
+            // check vector load of Ds
+            // only support RowMajor for now
+
+            static_for<0, NumDTensor, 1>{}([&](auto i) {
+                using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
+
+                if constexpr(!is_same_v<DLayout, Row>)
+                {
+                    all_valid = false;
+                }
+            });
+
+            if(!all_valid)
+            {
+                return false;
+            }
+
+            // check vector store of E
+            // only support RowMajor for now
+            if constexpr(is_same_v<ELayout, Row>)
+            {
+                if(arg.NRaw_ % CDEBlockTransferScalarPerVector_NPerBlock != 0)
+                {
+                    return false;
+                }
+            }
+            else
+            {
+                return false;
+            }
+        }
+
+        return GridwiseGemm::CheckValidity(arg.as_grid_desc_m_k_,
+                                           arg.bs_grid_desc_n_k_,
+                                           arg.ds_grid_desc_m_n_,
+                                           arg.e_grid_desc_m_n_,
+                                           arg.block_2_etile_map_);
+    }
+
+    // polymorphic
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
+    }
+
+    static auto MakeArgument(std::array<const void*, NumATensor> p_as,
+                             std::array<const void*, NumBTensor> p_bs,
+                             std::array<const void*, NumDTensor> p_ds,
+                             void* p_e,
+                             index_t MRaw,
+                             index_t NRaw,
+                             index_t KRaw,
+                             std::array<index_t, NumATensor> StrideAs,
+                             std::array<index_t, NumBTensor> StrideBs,
+                             std::array<index_t, NumDTensor> StrideDs,
+                             index_t StrideE,
+                             AElementwiseOperation a_element_op,
+                             BElementwiseOperation b_element_op,
+                             CDEElementwiseOperation cde_element_op)
+    {
+        return Argument{p_as,
+                        p_bs,
+                        p_ds,
+                        p_e,
+                        MRaw,
+                        NRaw,
+                        KRaw,
+                        StrideAs,
+                        StrideBs,
+                        StrideDs,
+                        StrideE,
+                        a_element_op,
+                        b_element_op,
+                        cde_element_op};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    // polymorphic
+    std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(std::array<const void*, NumATensor> p_as,
+                        std::array<const void*, NumBTensor> p_bs,
+                        std::array<const void*, NumDTensor> p_ds,
+                        void* p_e,
+                        index_t MRaw,
+                        index_t NRaw,
+                        index_t KRaw,
+                        std::array<ck::index_t, NumATensor> StrideAs,
+                        std::array<ck::index_t, NumBTensor> StrideBs,
+                        std::array<ck::index_t, NumDTensor> StrideDs,
+                        index_t StrideE,
+                        AElementwiseOperation a_element_op,
+                        BElementwiseOperation b_element_op,
+                        CDEElementwiseOperation cde_element_op) override
+    {
+        return std::make_unique<Argument>(p_as,
+                                          p_bs,
+                                          p_ds,
+                                          p_e,
+                                          MRaw,
+                                          NRaw,
+                                          KRaw,
+                                          StrideAs,
+                                          StrideBs,
+                                          StrideDs,
+                                          StrideE,
+                                          a_element_op,
+                                          b_element_op,
+                                          cde_element_op);
+    }
+
+    // polymorphic
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+
+    // polymorphic
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        std::map<LoopScheduler, std::string> LoopSchedToString{
+            {LoopScheduler::Default, "Default"}, {LoopScheduler::Interwave, "Interwave"}};
+
+        std::map<PipelineVersion, std::string> PipelineVersionToString{{PipelineVersion::v1, "v1"},
+                                                                       {PipelineVersion::v2, "v2"}};
+
+        // clang-format off
+        str << "DeviceGemmMultipleABD_Xdl_CShuffle"
+            << "<"
+            << BlockSize << ", "
+            << MPerBlock << ", "
+            << NPerBlock << ", "
+            << KPerBlock << ", "
+            << AK1 << ", "
+            << BK1 << ", "
+            << MPerXDL << ", "
+            << NPerXDL << ", "
+            << MXdlPerWave << ", "
+            << NXdlPerWave << ", "
+            << ABlockTransferSrcScalarPerVector << ", "
+            << BBlockTransferSrcScalarPerVector << ", "
+            << CShuffleMXdlPerWavePerShuffle << ", "
+            << CShuffleNXdlPerWavePerShuffle << ", "
+            << getGemmSpecializationString(GemmSpec)
+            << ">"
+            << " LoopScheduler: "
+            << LoopSchedToString[LoopSched] << ", "
+            << "PipelineVersion: "
+            << PipelineVersionToString[PipelineVer];
+        // clang-format on
+
+        return str.str();
+    }
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp
@@ -66,7 +66,8 @@ template <typename ALayout,
          index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
          LoopScheduler LoopSched     = make_default_loop_scheduler(),
          PipelineVersion PipelineVer = PipelineVersion::v1,
-          typename ComputeType        = CDataType>
+          typename ComputeTypeA       = CDataType,
+          typename ComputeTypeB       = ComputeTypeA>
 struct DeviceGemm_Xdl_CShuffle : public DeviceGemm<ALayout,
                                                   BLayout,
                                                   CLayout,
@@ -131,7 +132,8 @@ struct DeviceGemm_Xdl_CShuffle : public DeviceGemm<ALayout,
        CShuffleBlockTransferScalarPerVector_NPerBlock,
        LoopSched,
        PipelineVer,
-        ComputeType>;
+        ComputeTypeA,
+        ComputeTypeB>;

    using Argument = typename GridwiseGemm::Argument;


--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
@@ -168,7 +168,7 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                                                     stream_config.stream_id_));

                ave_time = launch_and_time_kernel(
-                    stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, karg, b2c_map);
+                    stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, karg);
            };

            if(has_main_k0_block_loop)

--- a/include/ck/tensor_operation/gpu/device/impl/device_gemv_splitk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemv_splitk.hpp
@@ -303,7 +303,7 @@ struct deviceGemvDl : public DeviceGemv<ALayout,
                        StrideC,
                        GridwiseGemv::CalculateMPadded(M),
                        GridwiseGemv::CalculateNPadded(N),
-                        GridwiseGemv::CalculateKPadded(K, KBatch),
+                        K,
                        GridwiseGemv::CalculateK0(K, KBatch),
                        KBatch}; // //
    }
@@ -336,7 +336,7 @@ struct deviceGemvDl : public DeviceGemv<ALayout,
                                          StrideC,
                                          GridwiseGemv::CalculateMPadded(M),
                                          GridwiseGemv::CalculateNPadded(N),
-                                          GridwiseGemv::CalculateKPadded(K, KBatch),
+                                          K,
                                          GridwiseGemv::CalculateK0(K, KBatch),
                                          KBatch); // //
    }

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp"
+#include "ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+#include "ck/host_utility/io.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+// Conv backward data multiple D:
+//   input : output image A: [G, N, K, Ho, Wo]
+//   input : weight B: [G, K, C, Y, X],
+//   input : D0, D1, ... : [G, N, K, Ho, Wo]
+//   output : input image E: [G, N, C, Hi, Wi]
+//   C = a_op(A) * b_op(B)
+//   E = cde_op(C, D0, D1, ...)
+template <index_t NDimSpatial,
+          typename ALayout,   // output image
+          typename BLayout,   // weight
+          typename DsLayout,  // bias
+          typename ELayout,   // input image
+          typename ADataType, // output image
+          typename BDataType, // weight
+          typename AccDataType,
+          typename CShuffleDataType,
+          typename DsDataType,       // bias
+          typename EDataType,        // input image
+          typename AElementwiseOp,   // output image
+          typename BElementwiseOp,   // weight
+          typename CDEElementwiseOp, // C, bias, and input image
+          ConvolutionBackwardDataSpecialization ConvBackwardDataSpecialization,
+          ck::index_t BlockSize,
+          ck::index_t MPerBlock,
+          ck::index_t NPerBlock,
+          ck::index_t K0PerBlock,
+          ck::index_t K1,
+          ck::index_t MPerWMMA,
+          ck::index_t NPerWMMA,
+          ck::index_t MRepeat,
+          ck::index_t NRepeat,
+          typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          index_t ABlockTransferSrcVectorDim,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t ABlockTransferDstScalarPerVector_AK1,
+          bool ABlockLdsExtraM,
+          typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          index_t BBlockTransferSrcVectorDim,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t BBlockTransferDstScalarPerVector_BK1,
+          bool BBlockLdsExtraN,
+          index_t CShuffleMRepeatPerShuffle,
+          index_t CShuffleNRepeatPerShuffle,
+          typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          index_t CDEShuffleBlockTransferScalarPerVector_NPerBlock,
+          index_t NumGemmKPrefetchStage   = 1,
+          LoopScheduler LoopSched         = make_default_loop_scheduler(),
+          ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
+struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
+    : public DeviceGroupedConvBwdDataMultipleD<NDimSpatial,
+                                               ALayout,    // output image
+                                               BLayout,    // weight
+                                               DsLayout,   // bias
+                                               ELayout,    // input image
+                                               ADataType,  // output image
+                                               BDataType,  // weight
+                                               DsDataType, // bias
+                                               EDataType,  // input image
+                                               AElementwiseOp,
+                                               BElementwiseOp,
+                                               CDEElementwiseOp>
+{
+    // TODO: Extend support for more spatial dimensions.
+    static_assert(NDimSpatial == 2 || NDimSpatial == 3,
+                  "wrong! only implemented for 2D and 3D now");
+
+    using DeviceOp = DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle;
+
+    static constexpr index_t NumDTensor = DsDataType::Size();
+
+    // TODO: Add support for different A and B data types.
+    using ABDataType = ADataType;
+
+    static constexpr auto I0           = Number<0>{};
+    static constexpr auto I1           = Number<1>{};
+    static constexpr auto I2           = Number<2>{};
+    static constexpr auto I3           = Number<3>{};
+    static constexpr index_t KPerBlock = K0PerBlock * K1;
+
+    static constexpr auto transform_conv_to_gemm =
+        TransformConvBwdDataToGemm_v1<NDimSpatial,
+                                      ConvBackwardDataSpecialization,
+                                      K1,
+                                      K1,
+                                      MPerBlock,
+                                      NPerBlock,
+                                      KPerBlock,
+                                      true /* DoPadGemmM */,
+                                      true /* DoPadGemmN */>{};
+
+    static auto GetDummyABDsEGridDescriptor()
+    {
+        const std::array<index_t, NDimSpatial + 3> dummy_tensor_lengths = {1};
+        const std::array<index_t, NDimSpatial + 3> dummy_tensor_strides = {1};
+        const std::array<index_t, NDimSpatial> dummy_spatial_lengths    = {1};
+
+        const auto a_grid_desc_ak0_m_ak1 =
+            transform_conv_to_gemm.template MakeADescriptor_AK0_M_AK1<ALayout>(
+                dummy_tensor_lengths,
+                dummy_tensor_strides,
+                dummy_tensor_lengths,
+                dummy_tensor_strides,
+                dummy_tensor_lengths,
+                dummy_tensor_strides,
+                dummy_spatial_lengths,
+                dummy_spatial_lengths,
+                dummy_spatial_lengths,
+                dummy_spatial_lengths,
+                dummy_spatial_lengths);
+
+        const auto b_grid_desc_bk0_n_bk1 =
+            transform_conv_to_gemm.template MakeBDescriptor_BK0_N_BK1<BLayout>(
+                dummy_tensor_lengths,
+                dummy_tensor_strides,
+                dummy_tensor_lengths,
+                dummy_tensor_strides,
+                dummy_tensor_lengths,
+                dummy_tensor_strides,
+                dummy_spatial_lengths,
+                dummy_spatial_lengths,
+                dummy_spatial_lengths,
+                dummy_spatial_lengths,
+                dummy_spatial_lengths);
+
+        const auto ds_grid_desc_m_n = generate_tuple(
+            [&](auto i) {
+                using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
+
+                return transform_conv_to_gemm.template MakeCDescriptor_M_N<DLayout>(
+                    dummy_tensor_lengths,
+                    dummy_tensor_strides,
+                    dummy_tensor_lengths,
+                    dummy_tensor_strides,
+                    dummy_tensor_lengths,
+                    dummy_tensor_strides,
+                    dummy_spatial_lengths,
+                    dummy_spatial_lengths,
+                    dummy_spatial_lengths,
+                    dummy_spatial_lengths,
+                    dummy_spatial_lengths);
+            },
+            Number<NumDTensor>{});
+
+        const auto e_grid_desc_m_n =
+            transform_conv_to_gemm.template MakeCDescriptor_M_N<ELayout>(dummy_tensor_lengths,
+                                                                         dummy_tensor_strides,
+                                                                         dummy_tensor_lengths,
+                                                                         dummy_tensor_strides,
+                                                                         dummy_tensor_lengths,
+                                                                         dummy_tensor_strides,
+                                                                         dummy_spatial_lengths,
+                                                                         dummy_spatial_lengths,
+                                                                         dummy_spatial_lengths,
+                                                                         dummy_spatial_lengths,
+                                                                         dummy_spatial_lengths);
+
+        return make_tuple(
+            a_grid_desc_ak0_m_ak1, b_grid_desc_bk0_n_bk1, ds_grid_desc_m_n, e_grid_desc_m_n);
+    }
+
+    // desc
+    using ABDsEGridDesc = decltype(GetDummyABDsEGridDescriptor());
+
+    using AGridDesc_AK0_M_AK1 = remove_cvref_t<tuple_element_t<0, ABDsEGridDesc>>;
+    using BGridDesc_BK0_N_BK1 = remove_cvref_t<tuple_element_t<1, ABDsEGridDesc>>;
+    using DsGridDesc_M_N      = remove_cvref_t<tuple_element_t<2, ABDsEGridDesc>>;
+    using EGridDesc_M_N       = remove_cvref_t<tuple_element_t<3, ABDsEGridDesc>>;
+
+    // GridwiseGemm
+    using GridwiseGemm = GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle<
+        // DataType Family
+        ADataType,
+        BDataType,
+        AccDataType,
+        CShuffleDataType,
+        DsDataType,
+        EDataType,
+        // InMemory Data Descriptor
+        AGridDesc_AK0_M_AK1,
+        BGridDesc_BK0_N_BK1,
+        DsGridDesc_M_N,
+        EGridDesc_M_N,
+        // ElementwiseOp Family
+        AElementwiseOp,
+        BElementwiseOp,
+        CDEElementwiseOp,
+        InMemoryDataOperationEnum::Set,
+        // Tiling Family
+        MPerBlock,
+        NPerBlock,
+        K0PerBlock,
+        MPerWMMA,
+        NPerWMMA,
+        K1,
+        MRepeat,
+        NRepeat,
+        // ThreadCluster Family
+        BlockSize,
+        ABlockTransferThreadClusterLengths_AK0_M_AK1,
+        ABlockTransferThreadClusterArrangeOrder,
+        ABlockTransferSrcAccessOrder,
+        ABlockTransferSrcVectorDim,
+        ABlockTransferSrcScalarPerVector,
+        ABlockTransferDstScalarPerVector_AK1,
+        false,
+        ABlockLdsExtraM,
+        BBlockTransferThreadClusterLengths_BK0_N_BK1,
+        BBlockTransferThreadClusterArrangeOrder,
+        BBlockTransferSrcAccessOrder,
+        BBlockTransferSrcVectorDim,
+        BBlockTransferSrcScalarPerVector,
+        BBlockTransferDstScalarPerVector_BK1,
+        false,
+        BBlockLdsExtraN,
+        CShuffleMRepeatPerShuffle,
+        CShuffleNRepeatPerShuffle,
+        CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+        CDEShuffleBlockTransferScalarPerVector_NPerBlock,
+        NumGemmKPrefetchStage,
+        LoopSched,
+        PipelineVer>;
+
+    using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
+        decltype(GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+            DsGridDesc_M_N{}));
+    using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
+        decltype(GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+            EGridDesc_M_N{}));
+
+    // Argument
+    struct Argument : public BaseArgument
+    {
+        Argument(const void* p_a,                                 // output image
+                 const void* p_b,                                 // weight
+                 const std::array<const void*, NumDTensor>& p_ds, // bias
+                 void* p_e,                                       // input image
+                 const std::array<index_t, NDimSpatial + 3>& a_g_n_k_wos_lengths,
+                 const std::array<index_t, NDimSpatial + 3>& a_g_n_k_wos_strides,
+                 const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
+                 const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,
+                 const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>&
+                     ds_g_n_c_wis_lengths,
+                 const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>&
+                     ds_g_n_c_wis_strides,
+                 const std::array<index_t, NDimSpatial + 3>& e_g_n_c_wis_lengths,
+                 const std::array<index_t, NDimSpatial + 3>& e_g_n_c_wis_strides,
+                 const std::array<index_t, NDimSpatial>& conv_filter_strides,
+                 const std::array<index_t, NDimSpatial>& conv_filter_dilations,
+                 const std::array<index_t, NDimSpatial>& input_left_pads,
+                 const std::array<index_t, NDimSpatial>& input_right_pads,
+                 const AElementwiseOp& a_element_op,
+                 const BElementwiseOp& b_element_op,
+                 const CDEElementwiseOp& cde_element_op)
+            : p_a_grid_{static_cast<const ADataType*>(p_a)},
+              p_b_grid_{static_cast<const BDataType*>(p_b)},
+              p_ds_grid_{},
+              p_e_grid_{static_cast<EDataType*>(p_e)},
+              num_group_{a_g_n_k_wos_lengths[0]},
+              a_element_op_{a_element_op},
+              b_element_op_{b_element_op},
+              cde_element_op_{cde_element_op},
+              a_g_n_k_wos_lengths_{a_g_n_k_wos_lengths},
+              a_g_n_k_wos_strides_{a_g_n_k_wos_strides},
+              b_g_k_c_xs_lengths_{b_g_k_c_xs_lengths},
+              b_g_k_c_xs_strides_{b_g_k_c_xs_strides},
+              ds_g_n_c_wis_lengths_{ds_g_n_c_wis_lengths},
+              ds_g_n_c_wis_strides_{ds_g_n_c_wis_strides},
+              e_g_n_c_wis_lengths_{e_g_n_c_wis_lengths},
+              e_g_n_c_wis_strides_{e_g_n_c_wis_strides},
+              conv_filter_strides_{conv_filter_strides},
+              conv_filter_dilations_{conv_filter_dilations},
+              input_left_pads_{input_left_pads},
+              input_right_pads_{input_right_pads}
+        {
+            // populate Ds pointer
+            static_for<0, NumDTensor, 1>{}([&](auto i) {
+                using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
+
+                p_ds_grid_(i) = static_cast<const DDataType*>(p_ds[i]);
+            });
+
+            // A/B/Ds/E Batch Stride
+            compute_ptr_offset_of_batch_.BatchStrideA_ = a_g_n_k_wos_strides[0];
+            compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_k_c_xs_strides[0];
+            compute_ptr_offset_of_batch_.BatchStrideE_ = e_g_n_c_wis_strides[0];
+
+            static_for<0, NumDTensor, 1>{}([&](auto i) {
+                compute_ptr_offset_of_batch_.BatchStrideDs_(i) = ds_g_n_c_wis_strides[i][0];
+            });
+
+            static constexpr auto NonSpatialDimsNum = Number<3>{};
+
+            static constexpr auto DIdx = Number<NonSpatialDimsNum>{};
+            static constexpr auto HIdx =
+                NDimSpatial == 2 ? Number<NonSpatialDimsNum>{} : Number<NonSpatialDimsNum + 1>{};
+            static constexpr auto WIdx = NDimSpatial == 2 ? Number<NonSpatialDimsNum + 1>{}
+                                                          : Number<NonSpatialDimsNum + 2>{};
+
+            static constexpr auto ZIdx = Number<NonSpatialDimsNum>{};
+            static constexpr auto YIdx =
+                NDimSpatial == 2 ? Number<NonSpatialDimsNum>{} : Number<NonSpatialDimsNum + 1>{};
+            static constexpr auto XIdx = NDimSpatial == 2 ? Number<NonSpatialDimsNum + 1>{}
+                                                          : Number<NonSpatialDimsNum + 2>{};
+
+            // problem definition
+            const index_t Z = b_g_k_c_xs_lengths[ZIdx];
+            const index_t Y = b_g_k_c_xs_lengths[YIdx];
+            const index_t X = b_g_k_c_xs_lengths[XIdx];
+
+            const index_t ConvStrideD = conv_filter_strides[DIdx - NonSpatialDimsNum];
+            const index_t ConvStrideH = conv_filter_strides[HIdx - NonSpatialDimsNum];
+            const index_t ConvStrideW = conv_filter_strides[WIdx - NonSpatialDimsNum];
+
+            const index_t ConvDilationD = conv_filter_dilations[DIdx - NonSpatialDimsNum];
+            const index_t ConvDilationH = conv_filter_dilations[HIdx - NonSpatialDimsNum];
+            const index_t ConvDilationW = conv_filter_dilations[WIdx - NonSpatialDimsNum];
+
+            const auto GcdStrideDilationD = math::gcd(ConvStrideD, ConvDilationD);
+            const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH);
+            const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW);
+
+            const auto ZTilde = NDimSpatial == 3 ? ConvStrideD / GcdStrideDilationD : 1;
+            const auto YTilde = ConvStrideH / GcdStrideDilationH;
+            const auto XTilde = ConvStrideW / GcdStrideDilationW;
+
+            for(index_t i_ztilde = 0; i_ztilde < ZTilde; ++i_ztilde)
+            {
+
+                for(index_t i_ytilde = 0; i_ytilde < YTilde; ++i_ytilde)
+                {
+                    for(index_t i_xtilde = 0; i_xtilde < XTilde; ++i_xtilde)
+                    {
+                        // check slice is valid
+                        const auto ZDotSlice =
+                            NDimSpatial == 3 ? math::integer_divide_ceil(Z - i_ztilde, ZTilde) : 1;
+                        const auto YDotSlice = math::integer_divide_ceil(Y - i_ytilde, YTilde);
+                        const auto XDotSlice = math::integer_divide_ceil(X - i_xtilde, XTilde);
+
+                        if(YDotSlice * XDotSlice * ZDotSlice <= 0)
+                        {
+                            continue;
+                        }
+
+                        std::array<index_t, NDimSpatial> tildes;
+                        if constexpr(NDimSpatial == 2)
+                        {
+                            tildes = {i_ytilde, i_xtilde};
+                        }
+                        else if constexpr(NDimSpatial == 3)
+                        {
+                            tildes = {i_ztilde, i_ytilde, i_xtilde};
+                        }
+                        else
+                        {
+                            throw std::runtime_error("wrong! only implemented for 2D and 3D now");
+                        }
+
+                        const auto a_grid_desc_ak0_m_ak1 =
+                            transform_conv_to_gemm.template MakeADescriptor_AK0_M_AK1<ALayout>(
+                                a_g_n_k_wos_lengths,
+                                a_g_n_k_wos_strides,
+                                b_g_k_c_xs_lengths,
+                                b_g_k_c_xs_strides,
+                                e_g_n_c_wis_lengths,
+                                e_g_n_c_wis_strides,
+                                conv_filter_strides,
+                                conv_filter_dilations,
+                                input_left_pads,
+                                input_right_pads,
+                                tildes);
+
+                        const auto b_grid_desc_bk0_n_bk1 =
+                            transform_conv_to_gemm.template MakeBDescriptor_BK0_N_BK1<BLayout>(
+                                a_g_n_k_wos_lengths,
+                                a_g_n_k_wos_strides,
+                                b_g_k_c_xs_lengths,
+                                b_g_k_c_xs_strides,
+                                e_g_n_c_wis_lengths,
+                                e_g_n_c_wis_strides,
+                                conv_filter_strides,
+                                conv_filter_dilations,
+                                input_left_pads,
+                                input_right_pads,
+                                tildes);
+
+                        DsGridDesc_M_N ds_grid_desc_m_n;
+
+                        // populate Ds desc
+                        static_for<0, NumDTensor, 1>{}([&](auto i) {
+                            using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
+
+                            ds_grid_desc_m_n(i) =
+                                transform_conv_to_gemm.template MakeCDescriptor_M_N<DLayout>(
+                                    a_g_n_k_wos_lengths,
+                                    a_g_n_k_wos_strides,
+                                    b_g_k_c_xs_lengths,
+                                    b_g_k_c_xs_strides,
+                                    ds_g_n_c_wis_lengths[i],
+                                    ds_g_n_c_wis_strides[i],
+                                    conv_filter_strides,
+                                    conv_filter_dilations,
+                                    input_left_pads,
+                                    input_right_pads,
+                                    tildes);
+                        });
+
+                        const auto e_grid_desc_m_n =
+                            transform_conv_to_gemm.template MakeCDescriptor_M_N<ELayout>(
+                                a_g_n_k_wos_lengths,
+                                a_g_n_k_wos_strides,
+                                b_g_k_c_xs_lengths,
+                                b_g_k_c_xs_strides,
+                                e_g_n_c_wis_lengths,
+                                e_g_n_c_wis_strides,
+                                conv_filter_strides,
+                                conv_filter_dilations,
+                                input_left_pads,
+                                input_right_pads,
+                                tildes);
+
+                        // for check validity
+                        ds_grid_desc_m_n_container_.push_back(ds_grid_desc_m_n);
+                        e_grid_desc_m_n_container_.push_back(e_grid_desc_m_n);
+
+                        // desc for blockwise copy
+                        a_grid_desc_ak0_m_ak1_container_.push_back(a_grid_desc_ak0_m_ak1);
+                        b_grid_desc_bk0_n_bk1_container_.push_back(b_grid_desc_bk0_n_bk1);
+
+                        // block-to-e-tile-map
+                        auto block_2_ctile_map = GridwiseGemm::MakeDefaultBlock2CTileMap(
+                            e_grid_desc_m_n, 1 /* M01 */, 1 /* N01 */);
+
+                        block_2_ctile_map_container_.push_back(block_2_ctile_map);
+
+                        ds_grid_desc_mblock_mperblock_nblock_nperblock_container_.push_back(
+                            GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                                ds_grid_desc_m_n));
+                        e_grid_desc_mblock_mperblock_nblock_nperblock_container_.push_back(
+                            GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                                e_grid_desc_m_n));
+                    }
+                }
+            }
+        }
+
+        void Print() const
+        {
+            for(std::size_t i = 0; i < a_grid_desc_ak0_m_ak1_container_.size(); i++)
+            {
+                std::cout << "a_grid_desc_ak0_m_ak1_container_"
+                          << a_grid_desc_ak0_m_ak1_container_[i] << std::endl;
+
+                std::cout << "b_grid_desc_bk0_n_bk1_container_"
+                          << b_grid_desc_bk0_n_bk1_container_[i] << std::endl;
+
+                static_for<0, NumDTensor, 1>{}([&](auto j) {
+                    std::cout << "ds_grid_desc_mblock_mperblock_nblock_nperblock_container_"
+                              << ds_grid_desc_mblock_mperblock_nblock_nperblock_container_[i][j]
+                              << std::endl;
+                });
+
+                std::cout << "e_grid_desc_mblock_mperblock_nblock_nperblock_container_"
+                          << e_grid_desc_mblock_mperblock_nblock_nperblock_container_[i]
+                          << std::endl;
+            }
+        }
+
+        // pointers
+        const ADataType* p_a_grid_;
+        const BDataType* p_b_grid_;
+        typename GridwiseGemm::DsGridPointer p_ds_grid_;
+        EDataType* p_e_grid_;
+
+        // tensor descriptor for problem definition
+        index_t num_group_;
+        std::vector<DsGridDesc_M_N> ds_grid_desc_m_n_container_;
+        std::vector<EGridDesc_M_N> e_grid_desc_m_n_container_;
+
+        // tensor descriptor for block-wise copy
+        std::vector<AGridDesc_AK0_M_AK1> a_grid_desc_ak0_m_ak1_container_;
+        std::vector<BGridDesc_BK0_N_BK1> b_grid_desc_bk0_n_bk1_container_;
+        std::vector<DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>
+            ds_grid_desc_mblock_mperblock_nblock_nperblock_container_;
+        std::vector<EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>
+            e_grid_desc_mblock_mperblock_nblock_nperblock_container_;
+
+        // block-to-e-tile map
+        std::vector<typename GridwiseGemm::DefaultBlock2CTileMap> block_2_ctile_map_container_;
+
+        // for computing batch offset
+        ComputePtrOffsetOfStridedBatch<NumDTensor> compute_ptr_offset_of_batch_;
+
+        // element-wise op
+        AElementwiseOp a_element_op_;
+        BElementwiseOp b_element_op_;
+        CDEElementwiseOp cde_element_op_;
+
+        // for checking IsSupportedArgument()
+        std::array<index_t, NDimSpatial + 3> a_g_n_k_wos_lengths_;
+        std::array<index_t, NDimSpatial + 3> a_g_n_k_wos_strides_;
+        std::array<index_t, NDimSpatial + 3> b_g_k_c_xs_lengths_;
+        std::array<index_t, NDimSpatial + 3> b_g_k_c_xs_strides_;
+        std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor> ds_g_n_c_wis_lengths_;
+        std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor> ds_g_n_c_wis_strides_;
+        std::array<index_t, NDimSpatial + 3> e_g_n_c_wis_lengths_;
+        std::array<index_t, NDimSpatial + 3> e_g_n_c_wis_strides_;
+        std::array<index_t, NDimSpatial> conv_filter_strides_;
+        std::array<index_t, NDimSpatial> conv_filter_dilations_;
+        std::array<index_t, NDimSpatial> input_left_pads_;
+        std::array<index_t, NDimSpatial> input_right_pads_;
+    };
+
+    // Invoker
+    struct Invoker : public BaseInvoker
+    {
+        using Argument = DeviceOp::Argument;
+
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            if(stream_config.log_level_ > 0)
+            {
+                arg.Print();
+            }
+
+            float ave_time = 0;
+
+            for(std::size_t i = 0; i < arg.a_grid_desc_ak0_m_ak1_container_.size(); i++)
+            {
+                const index_t grid_size = arg.block_2_ctile_map_container_[i].CalculateGridSize(
+                                              arg.e_grid_desc_m_n_container_[i]) *
+                                          arg.num_group_;
+
+                const auto GemmK = arg.a_grid_desc_ak0_m_ak1_container_[i].GetLength(I0) *
+                                   arg.a_grid_desc_ak0_m_ak1_container_[i].GetLength(I2);
+
+                auto launch_kernel = [&](auto has_main_k_block_loop) {
+                    constexpr bool has_main_loop = has_main_k_block_loop.value;
+
+                    const auto kernel = kernel_grouped_conv_fwd_multiple_d_wmma_cshuffle<
+                        GridwiseGemm,
+                        ADataType,
+                        BDataType,
+                        typename GridwiseGemm::DsGridPointer,
+                        EDataType,
+                        AElementwiseOp,
+                        BElementwiseOp,
+                        CDEElementwiseOp,
+                        DeviceOp::AGridDesc_AK0_M_AK1,
+                        DeviceOp::BGridDesc_BK0_N_BK1,
+                        typename GridwiseGemm::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
+                        typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
+                        remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
+                        ComputePtrOffsetOfStridedBatch<NumDTensor>,
+                        has_main_loop>;
+
+                    return launch_and_time_kernel(
+                        stream_config,
+                        kernel,
+                        dim3(grid_size),
+                        dim3(BlockSize),
+                        0,
+                        arg.p_a_grid_,
+                        arg.p_b_grid_,
+                        arg.p_ds_grid_,
+                        arg.p_e_grid_,
+                        arg.a_element_op_,
+                        arg.b_element_op_,
+                        arg.cde_element_op_,
+                        arg.a_g_n_k_wos_lengths_[0], // Group count
+                        arg.a_grid_desc_ak0_m_ak1_container_[i],
+                        arg.b_grid_desc_bk0_n_bk1_container_[i],
+                        arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_container_[i],
+                        arg.e_grid_desc_mblock_mperblock_nblock_nperblock_container_[i],
+                        arg.block_2_ctile_map_container_[i],
+                        arg.compute_ptr_offset_of_batch_);
+                };
+
+                if(GridwiseGemm::CalculateHasMainKBlockLoop(GemmK))
+                {
+                    ave_time += launch_kernel(integral_constant<bool, true>{});
+                }
+                else
+                {
+                    ave_time += launch_kernel(integral_constant<bool, false>{});
+                }
+            }
+
+            return ave_time;
+        }
+
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        }
+    };
+
+    static bool IsSupportedArgument(const Argument& arg)
+    {
+        // check device
+        if(get_device_name() == "gfx1100" || get_device_name() == "gfx1101" ||
+           ck::get_device_name() == "gfx1102")
+        {
+            if constexpr(!(is_same_v<AccDataType, float> || is_same_v<AccDataType, int32_t>))
+            {
+                return false;
+            }
+        }
+        else
+        {
+            return false;
+        }
+
+        const index_t ConvK = arg.b_g_k_c_xs_lengths_[1];
+        const index_t ConvC = arg.b_g_k_c_xs_lengths_[2];
+
+        // Specialization
+        if constexpr(ConvBackwardDataSpecialization ==
+                     ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0)
+        {
+            // check if it's a 1x1 convolution with stride=1 and no padding
+            for(int i = 0; i < NDimSpatial; i++)
+            {
+                if(!(arg.b_g_k_c_xs_lengths_[3 + i] == 1 && arg.conv_filter_strides_[i] == 1 &&
+                     arg.input_left_pads_[i] == 0 && arg.input_right_pads_[i] == 0))
+                {
+                    return false;
+                }
+            }
+        }
+
+        // vector load for A matrix from global memory to LDS
+        if constexpr(is_same_v<ALayout, tensor_layout::convolution::GNHWK> ||
+                     is_same_v<ALayout, tensor_layout::convolution::GNDHWK> ||
+                     is_same_v<ALayout, tensor_layout::convolution::NHWGK> ||
+                     is_same_v<ALayout, tensor_layout::convolution::NDHWGK>)
+        {
+            if(!(ABlockTransferSrcVectorDim == 2 && ConvK % ABlockTransferSrcScalarPerVector == 0))
+            {
+                return false;
+            }
+        }
+        else
+        {
+            return false;
+        }
+
+        // vector load for B matrix from global memory to LDS
+        if constexpr(is_same_v<BLayout, tensor_layout::convolution::GKYXC> ||
+                     is_same_v<BLayout, tensor_layout::convolution::GKZYXC>)
+        {
+            if(!(BBlockTransferSrcVectorDim == 1 && ConvC % BBlockTransferSrcScalarPerVector == 0))
+            {
+                return false;
+            }
+        }
+        else
+        {
+            return false;
+        }
+
+        // vector store for Ds
+        bool ds_valid = true;
+
+        static_for<0, NumDTensor, 1>{}([&](auto i) {
+            using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
+
+            if constexpr(is_same_v<DLayout, tensor_layout::convolution::GNHWC> ||
+                         is_same_v<DLayout, tensor_layout::convolution::GNDHWC> ||
+                         is_same_v<DLayout, tensor_layout::convolution::NHWGC> ||
+                         is_same_v<DLayout, tensor_layout::convolution::NDHWGC> ||
+                         is_same_v<DLayout, tensor_layout::convolution::G_NHW_C> ||
+                         is_same_v<DLayout, tensor_layout::convolution::GC> ||
+                         is_same_v<DLayout, tensor_layout::convolution::G_C>)
+            {
+                // vector load D matrix from global memory
+                if(!(ConvC % CDEShuffleBlockTransferScalarPerVector_NPerBlock == 0))
+                {
+                    ds_valid = false;
+                }
+            }
+            else
+            {
+                ds_valid = false;
+            }
+        });
+
+        if(!ds_valid)
+        {
+            return false;
+        }
+
+        // vector store for E
+        if constexpr(is_same_v<ELayout, tensor_layout::convolution::GNHWC> ||
+                     is_same_v<ELayout, tensor_layout::convolution::GNDHWC> ||
+                     is_same_v<ELayout, tensor_layout::convolution::NHWGC> ||
+                     is_same_v<ELayout, tensor_layout::convolution::NDHWGC>)
+        {
+            // vector store C matrix into global memory
+            if(!(ConvC % CDEShuffleBlockTransferScalarPerVector_NPerBlock == 0))
+            {
+                return false;
+            }
+        }
+        else
+        {
+            return false;
+        }
+
+        // Gridwise GEMM size
+        for(std::size_t i = 0; i < arg.a_grid_desc_ak0_m_ak1_container_.size(); i++)
+        {
+            if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_container_[i],
+                                            arg.b_grid_desc_bk0_n_bk1_container_[i],
+                                            arg.ds_grid_desc_m_n_container_[i],
+                                            arg.e_grid_desc_m_n_container_[i],
+                                            arg.block_2_ctile_map_container_[i]))
+            {
+                return false;
+            }
+        }
+
+        return true;
+    }
+
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
+    }
+
+    static auto
+    MakeArgument(const void* p_a,                                                 // output image
+                 const void* p_b,                                                 // weight
+                 const std::array<const void*, NumDTensor>& p_ds,                 // bias
+                 void* p_e,                                                       // input image
+                 const std::array<index_t, NDimSpatial + 3>& a_g_n_k_wos_lengths, // output image
+                 const std::array<index_t, NDimSpatial + 3>& a_g_n_k_wos_strides, // output image
+                 const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,  // weight
+                 const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,  // weight
+                 const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>&
+                     ds_g_n_c_wis_lengths, // bias
+                 const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>&
+                     ds_g_n_c_wis_strides,                                        // bias
+                 const std::array<index_t, NDimSpatial + 3>& e_g_n_c_wis_lengths, // input image
+                 const std::array<index_t, NDimSpatial + 3>& e_g_n_c_wis_strides, // input image
+                 const std::array<index_t, NDimSpatial>& conv_filter_strides,
+                 const std::array<index_t, NDimSpatial>& conv_filter_dilations,
+                 const std::array<index_t, NDimSpatial>& input_left_pads,
+                 const std::array<index_t, NDimSpatial>& input_right_pads,
+                 const AElementwiseOp& a_element_op,
+                 const BElementwiseOp& b_element_op,
+                 const CDEElementwiseOp& cde_element_op)
+    {
+        return Argument{p_a,
+                        p_b,
+                        p_ds,
+                        p_e,
+                        a_g_n_k_wos_lengths,
+                        a_g_n_k_wos_strides,
+                        b_g_k_c_xs_lengths,
+                        b_g_k_c_xs_strides,
+                        ds_g_n_c_wis_lengths,
+                        ds_g_n_c_wis_strides,
+                        e_g_n_c_wis_lengths,
+                        e_g_n_c_wis_strides,
+                        conv_filter_strides,
+                        conv_filter_dilations,
+                        input_left_pads,
+                        input_right_pads,
+                        a_element_op,
+                        b_element_op,
+                        cde_element_op};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    std::unique_ptr<BaseArgument> MakeArgumentPointer(
+        const void* p_a,                                                 // output image
+        const void* p_b,                                                 // weight
+        const std::array<const void*, NumDTensor>& p_ds,                 // bias
+        void* p_e,                                                       // input image
+        const std::array<index_t, NDimSpatial + 3>& a_g_n_k_wos_lengths, // output image
+        const std::array<index_t, NDimSpatial + 3>& a_g_n_k_wos_strides, // output image
+        const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,  // weight
+        const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,  // weight
+        const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>&
+            ds_g_n_c_wis_lengths, // bias
+        const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>&
+            ds_g_n_c_wis_strides,                                        // bias
+        const std::array<index_t, NDimSpatial + 3>& e_g_n_c_wis_lengths, // input image
+        const std::array<index_t, NDimSpatial + 3>& e_g_n_c_wis_strides, // input image
+        const std::array<index_t, NDimSpatial>& conv_filter_strides,
+        const std::array<index_t, NDimSpatial>& conv_filter_dilations,
+        const std::array<index_t, NDimSpatial>& input_left_pads,
+        const std::array<index_t, NDimSpatial>& input_right_pads,
+        const AElementwiseOp& a_element_op,
+        const BElementwiseOp& b_element_op,
+        const CDEElementwiseOp& cde_element_op) override
+    {
+        return std::make_unique<Argument>(p_a,
+                                          p_b,
+                                          p_ds,
+                                          p_e,
+                                          a_g_n_k_wos_lengths,
+                                          a_g_n_k_wos_strides,
+                                          b_g_k_c_xs_lengths,
+                                          b_g_k_c_xs_strides,
+                                          ds_g_n_c_wis_lengths,
+                                          ds_g_n_c_wis_strides,
+                                          e_g_n_c_wis_lengths,
+                                          e_g_n_c_wis_strides,
+                                          conv_filter_strides,
+                                          conv_filter_dilations,
+                                          input_left_pads,
+                                          input_right_pads,
+                                          a_element_op,
+                                          b_element_op,
+                                          cde_element_op);
+    }
+
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        // clang-format off
+        str << "DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle"
+            << "<"
+            << BlockSize << ", "
+            << MPerBlock << ", "
+            << NPerBlock << ", "
+            << KPerBlock << ", "
+            << getConvBackwardDataSpecializationString(ConvBackwardDataSpecialization) << ", "
+            << K1 << ", "
+            << ABlockTransferSrcScalarPerVector << ", "
+            << BBlockTransferSrcScalarPerVector
+            << ">";
+        // clang-format on
+
+        return str.str();
+    }
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
@@ -14,6 +14,7 @@
 #include "ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp"
 #include "ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp"
 #include "ck/host_utility/device_prop.hpp"
 #include "ck/host_utility/kernel_launch.hpp"
 #include "ck/host_utility/io.hpp"
@@ -24,51 +25,6 @@ namespace device {

 namespace {

-template <index_t NumDTensor>
-struct ComputePtrOffsetOfStridedBatch
-{
-    ComputePtrOffsetOfStridedBatch() = default;
-
-    ComputePtrOffsetOfStridedBatch(index_t BatchStrideA,
-                                   index_t BatchStrideB,
-                                   Array<ck::index_t, NumDTensor> BatchStrideDs,
-                                   index_t BatchStrideE)
-        : BatchStrideA_(BatchStrideA),
-          BatchStrideB_(BatchStrideB),
-          BatchStrideDs_(BatchStrideDs),
-          BatchStrideE_(BatchStrideE)
-    {
-    }
-
-    __host__ __device__ constexpr long_index_t GetAPtrOffset(index_t g_idx) const
-    {
-        return g_idx * static_cast<long_index_t>(BatchStrideA_);
-    }
-
-    __host__ __device__ constexpr long_index_t GetBPtrOffset(index_t g_idx) const
-    {
-        return g_idx * static_cast<long_index_t>(BatchStrideB_);
-    }
-
-    __host__ __device__ constexpr auto GetDsPtrOffset(index_t g_idx) const
-    {
-        Array<long_index_t, NumDTensor> ds_offset;
-        static_for<0, NumDTensor, 1>{}(
-            [&](auto i) { ds_offset(i) = g_idx * static_cast<long_index_t>(BatchStrideDs_[i]); });
-        return ds_offset;
-    }
-
-    __host__ __device__ constexpr long_index_t GetEPtrOffset(index_t g_idx) const
-    {
-        return g_idx * static_cast<long_index_t>(BatchStrideE_);
-    }
-
-    index_t BatchStrideA_;
-    index_t BatchStrideB_;
-    Array<ck::index_t, NumDTensor> BatchStrideDs_;
-    index_t BatchStrideE_;
-};
-
 /*
 * \brief Wrapper function of GridwiseGemm::Run to realize BatchedGEMM.
 *
@@ -242,7 +198,9 @@ template <index_t NDimSpatial,
          index_t CShuffleNXdlPerWavePerShuffle,
          typename CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
          index_t CDEBlockTransferScalarPerVector_NPerBlock,
-          LoopScheduler LoopSched = make_default_loop_scheduler()>
+          LoopScheduler LoopSched = make_default_loop_scheduler(),
+          typename AComputeType   = ADataType,
+          typename BComputeType   = AComputeType>
 struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
    : public DeviceGroupedConvBwdDataMultipleD<NDimSpatial,
                                               ALayout,    // output image
@@ -255,9 +213,11 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
                                               EDataType,  // input image
                                               AElementwiseOp,
                                               BElementwiseOp,
-                                               CDEElementwiseOp>
+                                               CDEElementwiseOp,
+                                               AComputeType,
+                                               BComputeType>
 {
-    // FIXME
+    // TODO: Extend support for more spatial dimensions.
    static_assert(NDimSpatial == 2 || NDimSpatial == 3,
                  "wrong! only implemented for 2D and 3D now");

@@ -265,7 +225,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1

    static constexpr index_t NumDTensor = DsDataType::Size();

-    // TODO make A/B datatype different
+    // TODO: Add support for different A and B data types.
    using ABDataType = ADataType;

    static constexpr auto I0 = Number<0>{};
@@ -356,9 +316,9 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1

    // GridwiseGemm
    using GridwiseGemm = GridwiseGemmMultipleD_xdl_cshuffle<
-        ABDataType, // TODO: distinguish A/B datatype
-        ABDataType, // TODO: distinguish A/B datatype
-        ABDataType, // TODO: distinguish A/B datatype
+        ABDataType,
+        ABDataType,
+        AComputeType,
        AccDataType,
        CShuffleDataType,
        DsDataType,
@@ -398,7 +358,9 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
        CShuffleNXdlPerWavePerShuffle,
        CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
        CDEBlockTransferScalarPerVector_NPerBlock,
-        LoopSched>;
+        LoopSched,
+        PipelineVersion::v1,
+        BComputeType>;

    template <typename Desc_K0_M_K1>
    static auto transform_k0_m_k1_to_m_k(const Desc_K0_M_K1& desc_k0_m_k1)