Commit 648f1f13 authored by Adam Osewski

Merge remote-tracking branch 'origin/develop' into aosewski/gemm_tile_loop

parents 4e5190f5 cb538740
......@@ -10,6 +10,7 @@
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/library/utility/algorithm.hpp"
......@@ -20,6 +21,7 @@
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_column_to_image.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
......@@ -32,7 +34,7 @@ struct ExecutionConfig final
{
bool do_verification = true;
int init_method = 1;
bool time_kernel = true;
bool time_kernel = false;
};
#define DefaultConvParams \
......
......@@ -6,15 +6,16 @@
using InDataType = FP32;
using OutDataType = FP32;
using InLayout = ck::tensor_layout::convolution::GNHWC;
using ImLayout = ck::tensor_layout::convolution::GNHWC;
using ImageToColumnOp = ck::conv_tensor_rearrange_op::ImageToColumn;
// clang-format off
using DeviceImgToColInstance = ck::tensor_operation::device::DeviceImageToColumnImpl
//#####################| Num| InLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar|
//#####################| Num| ImLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar|
//#####################| Dim| | | | Size| Block| Block| Cluster| Per|
//#####################| Spatial| | | | | | | Lengths| Vector|
//#####################| | | | | | | | | |
< NDimSpatial, InLayout, InDataType, OutDataType, 256, 128, 128, S<16, 16>, 1>;
< NDimSpatial, ImLayout, InDataType, OutDataType, 256, 128, 128, S<16, 16>, 1>;
// clang-format on
bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::ConvParam& conv_params)
......@@ -31,14 +32,14 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv
conv_params.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
const auto in_desc =
ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_params);
ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<ImLayout>(conv_params);
const auto out_desc = HostTensorDescriptor({NDoHoWo, CZYX});
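// Note (added for clarity): the column form is an (N * Do * Ho * Wo) x (C * Z * Y * X)
// matrix, i.e. the GEMM M and K dimensions of the implicit-GEMM convolution, which is
// why the descriptor above is built from NDoHoWo and CZYX.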
std::array<ck::index_t, NDimSpatial> input_spatial_lengths{};
std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{};
std::array<ck::index_t, NDimSpatial> output_spatial_lengths{};
std::array<ck::index_t, NDimSpatial + 3> input_g_n_c_wis_strides{};
std::array<ck::index_t, 2> output_m_k_strides{};
std::array<ck::index_t, NDimSpatial + 3> image_g_n_c_wis_strides{};
std::array<ck::index_t, 2> gemm_m_k_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
std::array<ck::index_t, NDimSpatial> input_left_pads{};
......@@ -49,8 +50,8 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv
copy(conv_params.input_spatial_lengths_, input_spatial_lengths);
copy(conv_params.filter_spatial_lengths_, filter_spatial_lengths);
copy(conv_params.output_spatial_lengths_, output_spatial_lengths);
copy(in_desc.GetStrides(), input_g_n_c_wis_strides);
copy(out_desc.GetStrides(), output_m_k_strides);
copy(in_desc.GetStrides(), image_g_n_c_wis_strides);
copy(out_desc.GetStrides(), gemm_m_k_strides);
copy(conv_params.conv_filter_strides_, conv_filter_strides);
copy(conv_params.conv_filter_dilations_, conv_filter_dilations);
copy(conv_params.input_left_pads_, input_left_pads);
......@@ -90,8 +91,8 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
input_g_n_c_wis_strides,
output_m_k_strides,
image_g_n_c_wis_strides,
gemm_m_k_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
......@@ -114,7 +115,7 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv
if(config.do_verification)
{
auto ref_image_to_column = ck::tensor_operation::host::
ReferenceImageToColumn<NDimSpatial, InLayout, InDataType, OutDataType>();
ReferenceImageToColumn<NDimSpatial, ImLayout, InDataType, OutDataType>();
auto ref_invoker = ref_image_to_column.MakeInvoker();
......
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
list(APPEND gpu_list2 gfx908 gfx90a gfx940 gfx941 gfx942)
set(target 0)
foreach(gpu IN LISTS GPU_TARGETS)
if(gpu IN_LIST gpu_list2 AND target EQUAL 0)
add_example_executable(example_gemm_multiABD_xdl_fp16 gemm_multiABD_xdl_fp16.cpp)
set(target 1)
endif()
endforeach()
endif()
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ADataType = F16;
using BDataType = F16;
using AccDataType = F32;
using CShuffleDataType = F32;
using DDataType = F16;
using EDataType = F16;
using ALayout = Row;
using BLayout = Col;
using DLayout = Row;
using ELayout = Row;
struct AddScale
{
static constexpr auto I0 = ck::Number<0>{};
static constexpr auto I1 = ck::Number<1>{};
static constexpr auto I2 = ck::Number<2>{};
static constexpr auto I3 = ck::Number<3>{};
__host__ __device__ constexpr void
operator()(ck::half4_t& a, const ck::half4_t& a0, const ck::half4_t& a1) const
{
const auto a0_v_t = ck::vector_type<ck::half_t, 4>{a0};
const auto a1_v_t = ck::vector_type<ck::half_t, 4>{a1};
auto r_v_t = ck::vector_type<ck::half_t, 4>{};
r_v_t.AsType<ck::half_t>()(I0) =
scale * (a0_v_t.AsType<ck::half_t>()[I0] + a1_v_t.AsType<ck::half_t>()[I0]);
r_v_t.AsType<ck::half_t>()(I1) =
scale * (a0_v_t.AsType<ck::half_t>()[I1] + a1_v_t.AsType<ck::half_t>()[I1]);
r_v_t.AsType<ck::half_t>()(I2) =
scale * (a0_v_t.AsType<ck::half_t>()[I2] + a1_v_t.AsType<ck::half_t>()[I2]);
r_v_t.AsType<ck::half_t>()(I3) =
scale * (a0_v_t.AsType<ck::half_t>()[I3] + a1_v_t.AsType<ck::half_t>()[I3]);
a = r_v_t.AsType<ck::half4_t>()[I0];
}
__host__ __device__ constexpr void
operator()(ck::half_t& a, const ck::half_t& a0, const ck::half_t& a1) const
{
a = scale * (a0 + a1);
}
static constexpr ck::index_t vec_len = 4;
float scale = 1.0;
};
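// Worked example (illustrative): with scale = 0.2, the value passed to AElementOp in
// main() below, inputs a0 = 1.0 and a1 = 3.0 give a = 0.2 * (1.0 + 3.0) = 0.8; the
// half4_t overload applies the same formula lane by lane to four packed half values.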
struct AlphaBetaAdd
{
AlphaBetaAdd(float alpha, float beta) : alpha_(alpha), beta_(beta){};
template <typename E, typename C, typename D>
__host__ __device__ constexpr void operator()(E& e, const C& c, const D& d) const;
template <>
__host__ __device__ constexpr void operator()<ck::half_t, float, ck::half_t>(
ck::half_t& e, const float& c, const ck::half_t& d) const
{
e = ck::type_convert<ck::half_t>(alpha_ * c + beta_ * ck::type_convert<float>(d));
};
float alpha_;
float beta_;
};
using AElementOp = AddScale;
using BElementOp = PassThrough;
using CDEElementOp = AlphaBetaAdd;
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleABD_Xdl_CShuffle<
ck::Tuple<ALayout, ALayout>,
ck::Tuple<BLayout>,
ck::Tuple<DLayout>,
ELayout,
ck::Tuple<ADataType, ADataType>,
ck::Tuple<BDataType>,
AccDataType,
CShuffleDataType,
ck::Tuple<DDataType>,
EDataType,
AElementOp,
BElementOp,
CDEElementOp,
GemmSpec,
1,
256,
256,
128,
32,
8,
8,
32,
32,
4,
2,
S<4, 64, 1>,
S<1, 0, 2>,
S<1, 0, 2>,
2,
8,
8,
1,
S<4, 64, 1>,
S<1, 0, 2>,
S<1, 0, 2>,
2,
8,
8,
1,
1,
1,
S<1, 32, 1, 8>,
8>;
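// Note (an assumption based on similar DeviceGemmMultipleABD_Xdl_CShuffle instances, not
// re-derived from the header): the unnamed arguments above are the usual CK XDL/CShuffle
// tuning knobs, roughly NumGemmKPrefetchStage = 1, BlockSize = 256, M/N/KPerBlock =
// 256/128/32, AK1 = BK1 = 8, M/NPerXDL = 32, M/NXdlPerWave = 4/2, the A and B block
// transfer cluster lengths/orders and scalar-per-vector settings, the CShuffle per-wave
// shuffle counts, and the CDE block transfer cluster lengths S<1, 32, 1, 8> with
// ScalarPerVector = 8.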
int main(int argc, char* argv[])
{
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
// GEMM shape
ck::index_t M = 3840;
ck::index_t N = 4096;
ck::index_t K = 4096;
ck::index_t StrideA = 4096;
ck::index_t StrideB = 4096;
ck::index_t StrideD = 4096;
ck::index_t StrideE = 4096;
float alpha = 1.0f;
float beta = 1.0f;
if(argc == 1)
{
// use default case
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else if(argc == 6)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
alpha = std::stof(argv[4]);
beta = std::stof(argv[5]);
}
else if(argc == 13)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
K = std::stoi(argv[6]);
StrideA = std::stoi(argv[7]);
StrideB = std::stoi(argv[8]);
StrideD = std::stoi(argv[9]);
StrideE = std::stoi(argv[10]);
alpha = std::stof(argv[11]);
beta = std::stof(argv[12]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=no, 1=yes)\n");
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD, StrideE, alpha, "
"beta\n");
exit(0);
}
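// Illustrative invocations (a sketch; the executable name is taken from the CMake target
// example_gemm_multiABD_xdl_fp16 added in this change):
//   ./example_gemm_multiABD_xdl_fp16                  // defaults: M=3840, N=K=4096, alpha=beta=1
//   ./example_gemm_multiABD_xdl_fp16 1 1 1            // verify, integer init, time the kernel
//   ./example_gemm_multiABD_xdl_fp16 1 1 1 3840 4096 4096 4096 4096 4096 4096 1.0 1.0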
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<ADataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<ADataType> a1_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
Tensor<DDataType> d_m_n(f_host_tensor_descriptor(M, N, StrideD, DLayout{}));
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
std::cout << "a0_m_k: " << a0_m_k.mDesc << std::endl;
std::cout << "a1_m_k: " << a1_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "d_m_n: " << d_m_n.mDesc << std::endl;
std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
a0_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
a1_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
d_m_n.GenerateTensorValue(GeneratorTensor_2<DDataType>{-5, 5});
break;
default:
a0_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
a1_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
d_m_n.GenerateTensorValue(GeneratorTensor_3<DDataType>{-0.5, 0.5});
}
DeviceMem a0_device_buf(sizeof(ADataType) * a0_m_k.mDesc.GetElementSpaceSize());
DeviceMem a1_device_buf(sizeof(ADataType) * a1_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem d_device_buf(sizeof(DDataType) * d_m_n.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
a0_device_buf.ToDevice(a0_m_k.mData.data());
a1_device_buf.ToDevice(a1_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data());
d_device_buf.ToDevice(d_m_n.mData.data());
e_device_buf.ToDevice(e_m_n_device_result.mData.data());
auto a_element_op = AElementOp{0.2};
auto b_element_op = BElementOp{};
auto cde_element_op = CDEElementOp{alpha, beta};
// do GEMM
auto device_op = DeviceOpInstance{};
auto invoker = device_op.MakeInvoker();
auto argument =
device_op.MakeArgument(std::array<const void*, 2>{a0_device_buf.GetDeviceBuffer(),
a1_device_buf.GetDeviceBuffer()},
std::array<const void*, 1>{b_device_buf.GetDeviceBuffer()},
std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
e_device_buf.GetDeviceBuffer(),
M,
N,
K,
std::array<ck::index_t, 2>{StrideA, StrideA},
std::array<ck::index_t, 1>{StrideB},
std::array<ck::index_t, 1>{StrideD},
StrideE,
a_element_op,
b_element_op,
cde_element_op);
if(!device_op.IsSupportedArgument(argument))
{
throw std::runtime_error(
"wrong! device_gemm with the specified compilation parameters does "
"not support this GEMM problem");
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
<< std::endl;
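// Sanity check on the reported numbers (illustrative): with the default M = 3840 and
// N = K = 4096, flop = 2 * M * N * K is roughly 1.29e11, so an ave_time of 1 ms would
// print about 128.8 TFlops; flop / 1e9 yields GFLOP, and GFLOP per millisecond equals
// TFLOP per second.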
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
if(do_verification)
{
Tensor<CShuffleDataType> c_m_n({M, N});
Tensor<ADataType> a_m_k({M, K});
for(int m = 0; m < M; ++m)
{
for(int k = 0; k < K; ++k)
{
a_element_op(a_m_k(m, k), a0_m_k(m, k), a1_m_k(m, k));
}
}
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
CShuffleDataType,
AccDataType,
PassThrough,
BElementOp,
PassThrough>;
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument =
ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, PassThrough{}, b_element_op, PassThrough{});
ref_invoker.Run(ref_argument);
for(int m = 0; m < M; ++m)
{
for(int n = 0; n < N; ++n)
{
cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d_m_n(m, n));
}
}
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1;
}
return 0;
}
......@@ -7,20 +7,114 @@ add_custom_target(examples)
function(add_example_executable EXAMPLE_NAME FILE_NAME)
message("adding example ${EXAMPLE_NAME}")
add_executable(${EXAMPLE_NAME} ${FILE_NAME})
target_link_libraries(${EXAMPLE_NAME} PRIVATE utility)
add_test(NAME ${EXAMPLE_NAME} COMMAND $<TARGET_FILE:${EXAMPLE_NAME}> ${ARGN})
add_dependencies(examples ${EXAMPLE_NAME})
add_dependencies(check ${EXAMPLE_NAME})
rocm_install(TARGETS ${EXAMPLE_NAME} COMPONENT examples)
set(result 1)
if(DEFINED DTYPES)
foreach(source IN LISTS FILE_NAME)
set(test 0)
foreach(type IN LISTS DTYPES)
if(type MATCHES "fp16")
set(type1 "_f16")
elseif(type MATCHES "fp32")
set(type1 "_f32")
elseif(type MATCHES "fp8")
set(type1 "_f8")
elseif(type MATCHES "bf16")
set(type1 "_b16")
elseif(type MATCHES "fp64")
set(type1 "_f64")
elseif(type MATCHES "int8")
set(type1 "_i8")
endif()
if("${source}" MATCHES "${type}" OR "${source}" MATCHES "${type1}")
#if filename matches any selected type, exit the type loop and do not exclude the file from the list
set(test 0)
break()
elseif((source MATCHES "fp8" OR source MATCHES "fp32" OR source MATCHES "fp64" OR source MATCHES "bf16" OR source MATCHES "int8" OR source MATCHES "fp16" OR
source MATCHES "_f8" OR source MATCHES "_f32" OR source MATCHES "_f64" OR source MATCHES "_i8" OR source MATCHES "_f16" OR source MATCHES "_b16") AND
NOT(source MATCHES type OR source MATCHES type1))
#if filename contains a type which doesn't match any selected type, mark it for removal
set(test 1)
endif()
endforeach()
if(test EQUAL 1)
message("removing example source file ${source} ")
list(REMOVE_ITEM FILE_NAME "${source}")
endif()
endforeach()
endif()
foreach(source IN LISTS FILE_NAME)
if(NOT DEFINED DL_KERNELS AND source MATCHES "_dl")
message("removing dl example ${source} ")
list(REMOVE_ITEM FILE_NAME "${source}")
endif()
endforeach()
#only continue if there are some source files left on the list
if(FILE_NAME)
add_executable(${EXAMPLE_NAME} ${FILE_NAME})
target_link_libraries(${EXAMPLE_NAME} PRIVATE utility)
add_test(NAME ${EXAMPLE_NAME} COMMAND $<TARGET_FILE:${EXAMPLE_NAME}> ${ARGN})
add_dependencies(examples ${EXAMPLE_NAME})
add_dependencies(check ${EXAMPLE_NAME})
rocm_install(TARGETS ${EXAMPLE_NAME} COMPONENT examples)
set(result 0)
endif()
#message("add_example returns ${result}")
return(PROPAGATE result)
endfunction(add_example_executable EXAMPLE_NAME)
function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME)
message("adding example ${EXAMPLE_NAME}")
add_executable(${EXAMPLE_NAME} ${FILE_NAME})
target_link_libraries(${EXAMPLE_NAME} PRIVATE utility)
add_dependencies(examples ${EXAMPLE_NAME})
rocm_install(TARGETS ${EXAMPLE_NAME} COMPONENT examples)
set(result 1)
if(DEFINED DTYPES)
foreach(source IN LISTS FILE_NAME)
set(test 0)
foreach(type IN LISTS DTYPES)
if(type MATCHES "fp16")
set(type1 "_f16")
elseif(type MATCHES "fp32")
set(type1 "_f32")
elseif(type MATCHES "fp8")
set(type1 "_f8")
elseif(type MATCHES "bf16")
set(type1 "_b16")
elseif(type MATCHES "fp64")
set(type1 "_f64")
elseif(type MATCHES "int8")
set(type1 "_i8")
endif()
if("${source}" MATCHES "${type}" OR "${source}" MATCHES "${type1}")
#if filename matches any selected type, exit the type loop and do not exclude the file from the list
set(test 0)
break()
elseif((source MATCHES "fp8" OR source MATCHES "fp32" OR source MATCHES "fp64" OR source MATCHES "bf16" OR source MATCHES "int8" OR source MATCHES "fp16" OR
source MATCHES "_f8" OR source MATCHES "_f32" OR source MATCHES "_f64" OR source MATCHES "_i8" OR source MATCHES "_f16" OR source MATCHES "_b16") AND
NOT(source MATCHES type OR source MATCHES type1))
#if filename contains a type which doesn't match any selected type, mark it for removal
set(test 1)
endif()
endforeach()
if(test EQUAL 1)
message("removing example ${source} ")
list(REMOVE_ITEM FILE_NAME "${source}")
endif()
endforeach()
endif()
foreach(source IN LISTS FILE_NAME)
if(NOT DEFINED DL_KERNELS AND source MATCHES "_dl")
message("removing dl example ${source} ")
list(REMOVE_ITEM FILE_NAME "${source}")
endif()
endforeach()
#only continue if there are some source files left on the list
if(FILE_NAME)
add_executable(${EXAMPLE_NAME} ${FILE_NAME})
target_link_libraries(${EXAMPLE_NAME} PRIVATE utility)
add_dependencies(examples ${EXAMPLE_NAME})
rocm_install(TARGETS ${EXAMPLE_NAME} COMPONENT examples)
set(result 0)
endif()
#message("add_example returns ${result}")
return(PROPAGATE result)
endfunction(add_example_executable_no_testing EXAMPLE_NAME)
# add all example subdir
......
......@@ -34,6 +34,7 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
#endif
// warm up
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
hip_check_error(hipGetLastError());
const int nrepeat = 10;
#if DEBUG_LOG
......@@ -50,6 +51,7 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
for(int i = 0; i < nrepeat; ++i)
{
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
hip_check_error(hipGetLastError());
}
hip_check_error(hipEventRecord(stop, stream_config.stream_id_));
......@@ -64,11 +66,13 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
else
{
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
hip_check_error(hipGetLastError());
return 0;
}
#else
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
hip_check_error(hipGetLastError());
return 0;
#endif
......@@ -101,6 +105,7 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
// warm up
preprocess();
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
hip_check_error(hipGetLastError());
const int nrepeat = 10;
#if DEBUG_LOG
......@@ -118,6 +123,7 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
{
preprocess();
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
hip_check_error(hipGetLastError());
}
hip_check_error(hipEventRecord(stop, stream_config.stream_id_));
......@@ -133,11 +139,13 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
{
preprocess();
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
hip_check_error(hipGetLastError());
return 0;
}
#else
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
hip_check_error(hipGetLastError());
return 0;
#endif
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r2.hpp"
#include "ck/utility/is_detected.hpp"
namespace ck {
// Thread-group level multi-source, multi-destination tensor slice data movement
// Assume:
// 1. All sources and destinations are DynamicBuffer
// 2. Same VectorDim and ScalarPerVector for all sources and destinations
// 3. DstInMemOps are per destination tensor
// 4. ThreadTransferSrcResetCoordinateAfterRunFlags are per source tensor
// 5. ThreadTransferDstResetCoordinateAfterRunFlags are per destination tensor
//
// Does the following to avoid the scratch memory issue:
// 1. Passes tensor descriptors by reference (or tuple of references)
// 2. Does not keep references to tensor descriptors
// 3. Does not construct new tensor coordinates when Run() is called
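// For example (illustrative): with two source tensors and one destination tensor,
// SrcDatas, SrcDescs and ThreadTransferSrcResetCoordinateAfterRunFlags all have
// Size() == 2 while the corresponding Dst* parameters have Size() == 1, which is
// exactly what the static_asserts in the constructor below enforce.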
template <typename ThreadGroup,
typename SrcDatas,
typename DstDatas,
typename SrcDescs,
typename DstDescs,
typename ElementwiseOperation,
typename DstInMemOps, // Sequence<InMemoryDataOperationEnum ...>
typename SliceLengths,
typename ThreadClusterLengths,
typename ThreadClusterArrangeOrder,
typename SrcDimAccessOrder,
typename DstDimAccessOrder,
index_t SrcVectorDim,
index_t DstVectorDim,
index_t SrcScalarPerVector,
index_t DstScalarPerVector,
typename ThreadTransferSrcResetCoordinateAfterRunFlags,
typename ThreadTransferDstResetCoordinateAfterRunFlags>
struct ThreadGroupTensorSliceTransfer_v7r2
{
static constexpr index_t nDim =
remove_cvref_t<tuple_element_t<0, SrcDescs>>::GetNumOfDimension();
static constexpr index_t nSrc = remove_cvref_t<SrcDescs>::Size();
static constexpr index_t nDst = remove_cvref_t<DstDescs>::Size();
using Index = MultiIndex<nDim>;
static constexpr auto thread_slice_lengths = SliceLengths{} / ThreadClusterLengths{};
__device__ constexpr ThreadGroupTensorSliceTransfer_v7r2(
const SrcDescs& src_descs,
const StaticallyIndexedArray<Index, nSrc>& src_block_slice_origins,
const DstDescs& dst_descs,
const StaticallyIndexedArray<Index, nDst>& dst_block_slice_origins,
const ElementwiseOperation& element_op)
: threadwise_transfer_(src_descs,
StaticallyIndexedArray<Index, nSrc>{},
dst_descs,
StaticallyIndexedArray<Index, nDst>{},
element_op)
{
static_assert(nSrc == SrcDatas::Size() && nSrc == SrcDescs::Size() &&
nSrc == ThreadTransferSrcResetCoordinateAfterRunFlags::Size() &&
nDst == DstDatas::Size() && nDst == DstDescs::Size() &&
nDst == ThreadTransferDstResetCoordinateAfterRunFlags::Size(),
"wrong!");
static_for<0, nSrc, 1>{}([&](auto i) {
static_assert(
nDim == remove_cvref_t<tuple_element_t<i.value, SrcDescs>>::GetNumOfDimension(),
"wrong!");
});
static_for<0, nDst, 1>{}([&](auto i) {
static_assert(
nDim == remove_cvref_t<tuple_element_t<i.value, DstDescs>>::GetNumOfDimension(),
"wrong!");
});
static_assert(nDim == ThreadClusterLengths::Size() &&
nDim == ThreadClusterArrangeOrder::Size() &&
nDim == SrcDimAccessOrder::Size() && nDim == DstDimAccessOrder::Size(),
"wrong! nDim not consistent");
static_assert(
is_same<SliceLengths, decltype(thread_slice_lengths * ThreadClusterLengths{})>{},
"wrong! threads should be mapped to cover entire slicing window");
static_assert(ThreadGroup::GetNumOfThread() >= thread_cluster_desc_.GetElementSize(),
"wrong! ThreadGroup::GetNumOfThread() too small");
if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
{
const auto thread_cluster_idx = thread_cluster_desc_.CalculateBottomIndex(
make_multi_index(get_thread_local_1d_id()));
const auto thread_data_idx_begin = thread_cluster_idx * thread_slice_lengths;
const auto src_thread_slice_origins = generate_tuple(
[&](auto i) { return src_block_slice_origins[i] + thread_data_idx_begin; },
Number<nSrc>{});
const auto dst_thread_slice_origins = generate_tuple(
[&](auto i) { return dst_block_slice_origins[i] + thread_data_idx_begin; },
Number<nDst>{});
threadwise_transfer_.SetSrcSliceOrigins(src_descs, src_thread_slice_origins);
threadwise_transfer_.SetDstSliceOrigins(dst_descs, dst_thread_slice_origins);
}
}
template <typename SrcBuffers>
__device__ void RunRead(const SrcDescs& src_descs, const SrcBuffers& src_bufs)
{
if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
{
threadwise_transfer_.RunRead(src_descs, src_bufs);
}
}
template <typename T>
using is_tuple = decltype(std::declval<T&>().IsTuple());
template <typename DstBuffers>
__device__ void RunWrite(const DstDescs& dst_descs, DstBuffers dst_bufs)
{
if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
{
if constexpr(is_detected<is_tuple, decltype(dst_bufs)>::value)
threadwise_transfer_.RunWrite(dst_descs, dst_bufs);
else
threadwise_transfer_.RunWrite(dst_descs, tie(dst_bufs));
}
}
template <typename SrcBuffers, typename DstBuffers>
__device__ void Run(const SrcDescs& src_descs,
const SrcBuffers& src_bufs,
const DstDescs& dst_descs,
DstBuffers dst_bufs)
{
RunRead(src_descs, src_bufs);
RunWrite(dst_descs, dst_bufs);
}
template <index_t ISrc>
__device__ void
MoveSrcSliceWindow(const SrcDescs& src_descs, Number<ISrc> iSrc, const Index& step)
{
if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
{
threadwise_transfer_.MoveSrcSliceWindow(src_descs, iSrc, step);
}
}
__device__ void MoveSrcSliceWindow(const SrcDescs& src_descs, const Index& step)
{
static_for<0, SrcDescs::Size(), 1>{}(
[&](auto i) { MoveSrcSliceWindow(src_descs, i, step); });
}
template <index_t IDst>
__device__ void
MoveDstSliceWindow(const DstDescs& dst_descs, Number<IDst> iDst, const Index& step)
{
if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
{
threadwise_transfer_.MoveDstSliceWindow(dst_descs, iDst, step);
}
}
__device__ void MoveDstSliceWindow(const DstDescs& dst_descs, const Index& step)
{
static_for<0, DstDescs::Size(), 1>{}(
[&](auto i) { MoveDstSliceWindow(dst_descs, i, step); });
}
private:
static constexpr auto thread_cluster_desc_ =
make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
using ThreadwiseTransfer =
ThreadwiseTensorSliceTransfer_v7r2<SrcDatas,
DstDatas,
SrcDescs,
DstDescs,
ElementwiseOperation,
DstInMemOps,
decltype(thread_slice_lengths),
SrcDimAccessOrder,
DstDimAccessOrder,
SrcVectorDim,
DstVectorDim,
SrcScalarPerVector,
DstScalarPerVector,
ThreadTransferSrcResetCoordinateAfterRunFlags,
ThreadTransferDstResetCoordinateAfterRunFlags>;
ThreadwiseTransfer threadwise_transfer_;
};
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
namespace ck {
namespace conv_tensor_rearrange_op {
struct BaseConvTensorRearrangeOp
{
};
struct ImageToColumn : public BaseConvTensorRearrangeOp
{
static constexpr const char* name = "Image to Column";
};
struct ColumnToImage : public BaseConvTensorRearrangeOp
{
static constexpr const char* name = "Column to Image";
};
template <typename Op,
typename std::enable_if<std::is_base_of<BaseConvTensorRearrangeOp, Op>::value,
bool>::type = false>
std::ostream& operator<<(std::ostream& os, const BaseConvTensorRearrangeOp&)
{
os << Op::name;
return os;
}
} // namespace conv_tensor_rearrange_op
} // namespace ck
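// Usage sketch (illustrative, not part of this header): the tag types select the
// rearrange direction at compile time, and their static name can be printed directly:
//   using Op = ck::conv_tensor_rearrange_op::ColumnToImage;
//   std::cout << Op::name << std::endl; // prints "Column to Image"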
......@@ -12,21 +12,26 @@ namespace tensor_operation {
namespace device {
/**
* \brief Image to column.
* \brief Convolution Tensor Rearrange.
*
* This Device operator converts image ([G, N, Di, Hi, Wi, C]) to the gemm
* problem([N * Do * Ho * Wo, Z * Y * X * C]). G must be equal to 1.
* This device operator supports converting an image ([G, N, Di, Hi, Wi, C]) to
* the gemm form ([N * Do * Ho * Wo, Z * Y * X * C]) (Image to Column) and
* converting the gemm form back to the image (Column to Image).
*
* Note that G must be equal to 1.
*
* \tparam NDimSpatial Number of spatial dimensions.
* \tparam InputLayout Input Layout.
* \tparam ImageLayout Image Layout.
* \tparam InputDataType Input Data Type.
* \tparam OutputDataType Output Data Type.
* \tparam ConvTensorRearrangeOp Operation type: ImageToColumn, ColumnToImage.
*/
template <index_t NDimSpatial,
typename InputLayout,
typename ImageLayout,
typename InputDataType,
typename OutputDataType>
struct DeviceImageToColumn : public BaseOperator
typename OutputDataType,
typename ConvTensorRearrangeOp>
struct DeviceConvTensorRearrange : public BaseOperator
{
/**
......@@ -39,8 +44,8 @@ struct DeviceImageToColumn : public BaseOperator
* \param input_spatial_lengths Input spatial lengths.
* \param filter_spatial_lengths Filter spatial lengths.
* \param output_spatial_lengths Output spatial lengths.
* \param input_g_n_c_wis_strides Input strides in order [G, N, C, D, H, W].
* \param output_m_k_strides Output strides.
* \param image_g_n_c_wis_strides Image strides in order [G, N, C, D, H, W].
* \param gemm_m_k_strides Gemm form strides.
* \param conv_filter_strides Convolution filter strides.
* \param conv_filter_dilations Convolution filter dilations.
* \param input_left_pads Convolution left pads.
......@@ -55,8 +60,8 @@ struct DeviceImageToColumn : public BaseOperator
const std::array<index_t, NDimSpatial>& input_spatial_lengths,
const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
const std::array<index_t, NDimSpatial>& output_spatial_lengths,
const std::array<index_t, NDimSpatial + 3>& input_g_n_c_wis_strides,
const std::array<index_t, 2>& output_m_k_strides,
const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
const std::array<index_t, 2>& gemm_m_k_strides,
const std::array<index_t, NDimSpatial>& conv_filter_strides,
const std::array<index_t, NDimSpatial>& conv_filter_dilations,
const std::array<index_t, NDimSpatial>& input_left_pads,
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <array>
#include "ck/tensor_operation/gpu/device/device_base.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
// GEMM:
// input : A0[M, K], B0[K, N],
// input : D0[M, N], D1[M, N], ...
// output : E[M, N]
// C = a_op(A) * b_op(B)
// E = cde_op(C, D0, D1, ...)
// Assume:
// D0, D1, ... and E have the same layout
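// Spelled out for the two-A / one-D configuration used in the example above (a sketch,
// not part of this interface):
//   C[m, n] = sum_k a_op(A0[m, k], A1[m, k]) * b_op(B[k, n])
//   E[m, n] = cde_op(C[m, n], D0[m, n])   e.g. alpha * C[m, n] + beta * D0[m, n]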
template <typename AsLayout,
typename BsLayout,
typename DsLayout,
typename ELayout,
typename AsDataType,
typename BsDataType,
typename DsDataType,
typename EDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CDEElementwiseOperation>
struct DeviceGemmMultipleABD : public BaseOperator
{
static constexpr index_t NumATensor = AsDataType::Size();
static constexpr index_t NumBTensor = BsDataType::Size();
static constexpr index_t NumDTensor = DsDataType::Size();
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(std::array<const void*, NumATensor> p_as,
std::array<const void*, NumBTensor> p_bs,
std::array<const void*, NumDTensor> p_ds,
void* p_e,
ck::index_t M,
ck::index_t N,
ck::index_t K,
std::array<ck::index_t, NumATensor> StrideAs,
std::array<ck::index_t, NumBTensor> StrideBs,
std::array<ck::index_t, NumDTensor> StrideDs,
ck::index_t StrideE,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
......@@ -273,6 +273,9 @@ struct DeviceGemmDl : public DeviceGemm<ALayout,
block_2_ctile_map_{},
M01_{M01},
N01_{N01},
M_raw_{M},
N_raw_{N},
K_raw_{K},
a_element_op_{a_element_op},
b_element_op_{b_element_op},
c_element_op_{c_element_op}
......@@ -314,6 +317,10 @@ struct DeviceGemmDl : public DeviceGemm<ALayout,
index_t M01_;
index_t N01_;
index_t M_raw_;
index_t N_raw_;
index_t K_raw_;
// TODO: unused since gridwise_gemm_dl_v1r3 does NOT support prologue for the time being.
AElementwiseOperation a_element_op_;
BElementwiseOperation b_element_op_;
......@@ -485,6 +492,50 @@ struct DeviceGemmDl : public DeviceGemm<ALayout,
static bool IsSupportedArgument(const Argument& arg)
{
// Make sure that the M, N, K dimensions before padding are divisible by respective vector
// lengths.
if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
{
constexpr auto A_K_vec_length =
ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1::At(I0) *
ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1::At(I3);
if(arg.K_raw_ % A_K_vec_length != 0)
{
return false;
}
}
else
{
constexpr auto A_M_vec_length =
ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1::At(I1) *
ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1::At(I2);
if(arg.M_raw_ % A_M_vec_length != 0)
{
return false;
}
}
if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
{
constexpr auto B_N_vec_length =
BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1::At(I1) *
BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1::At(I2);
if(arg.N_raw_ % B_N_vec_length != 0)
{
return false;
}
}
else
{
constexpr auto B_K_vec_length =
BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1::At(I0) *
BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1::At(I3);
if(arg.K_raw_ % B_K_vec_length != 0)
{
return false;
}
}
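// For example (illustrative): if the K0 and K1 entries of the A source vector tensor
// lengths multiply to 4, a row-major A requires K to be divisible by 4, while a
// column-major A instead requires M to be divisible by 4 (via the M0 * M1 product);
// the B operand is checked analogously above.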
if(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx1030" ||
ck::get_device_name() == "gfx1100" || ck::get_device_name() == "gfx1101" ||
ck::get_device_name() == "gfx1102")
......
......@@ -14,6 +14,7 @@
#include "ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp"
#include "ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/io.hpp"
......@@ -24,51 +25,6 @@ namespace device {
namespace {
template <index_t NumDTensor>
struct ComputePtrOffsetOfStridedBatch
{
ComputePtrOffsetOfStridedBatch() = default;
ComputePtrOffsetOfStridedBatch(index_t BatchStrideA,
index_t BatchStrideB,
Array<ck::index_t, NumDTensor> BatchStrideDs,
index_t BatchStrideE)
: BatchStrideA_(BatchStrideA),
BatchStrideB_(BatchStrideB),
BatchStrideDs_(BatchStrideDs),
BatchStrideE_(BatchStrideE)
{
}
__host__ __device__ constexpr long_index_t GetAPtrOffset(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideA_);
}
__host__ __device__ constexpr long_index_t GetBPtrOffset(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideB_);
}
__host__ __device__ constexpr auto GetDsPtrOffset(index_t g_idx) const
{
Array<long_index_t, NumDTensor> ds_offset;
static_for<0, NumDTensor, 1>{}(
[&](auto i) { ds_offset(i) = g_idx * static_cast<long_index_t>(BatchStrideDs_[i]); });
return ds_offset;
}
__host__ __device__ constexpr long_index_t GetEPtrOffset(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideE_);
}
index_t BatchStrideA_;
index_t BatchStrideB_;
Array<ck::index_t, NumDTensor> BatchStrideDs_;
index_t BatchStrideE_;
};
/*
* \brief Wrapper function of GridwiseGemm::Run to realize BatchedGEMM.
*
......@@ -257,7 +213,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
BElementwiseOp,
CDEElementwiseOp>
{
// FIXME
// TODO: Extend support for more spatial dimensions.
static_assert(NDimSpatial == 2 || NDimSpatial == 3,
"wrong! only implemented for 2D and 3D now");
......@@ -265,7 +221,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
static constexpr index_t NumDTensor = DsDataType::Size();
// TODO make A/B datatype different
// TODO: Add support for different A and B data types.
using ABDataType = ADataType;
static constexpr auto I0 = Number<0>{};
......
......@@ -19,6 +19,7 @@
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_dl_multiple_d.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/io.hpp"
......@@ -29,51 +30,6 @@ namespace device {
namespace {
template <index_t NumDTensor>
struct ComputePtrOffsetOfStridedBatch
{
ComputePtrOffsetOfStridedBatch() = default;
ComputePtrOffsetOfStridedBatch(index_t BatchStrideA,
index_t BatchStrideB,
Array<ck::index_t, NumDTensor> BatchStrideDs,
index_t BatchStrideE)
: BatchStrideA_(BatchStrideA),
BatchStrideB_(BatchStrideB),
BatchStrideDs_(BatchStrideDs),
BatchStrideE_(BatchStrideE)
{
}
__host__ __device__ constexpr long_index_t GetAPtrOffset(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideA_);
}
__host__ __device__ constexpr long_index_t GetBPtrOffset(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideB_);
}
__host__ __device__ constexpr auto GetDsPtrOffset(index_t g_idx) const
{
Array<long_index_t, NumDTensor> ds_offset;
static_for<0, NumDTensor, 1>{}(
[&](auto i) { ds_offset(i) = g_idx * static_cast<long_index_t>(BatchStrideDs_[i]); });
return ds_offset;
}
__host__ __device__ constexpr long_index_t GetEPtrOffset(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideE_);
}
index_t BatchStrideA_;
index_t BatchStrideB_;
Array<ck::index_t, NumDTensor> BatchStrideDs_;
index_t BatchStrideE_;
};
/*
* \brief Wrapper function of GridwiseGemm::Run to realize BatchedGEMM.
*
......
......@@ -19,6 +19,7 @@
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/io.hpp"
......@@ -27,72 +28,23 @@ namespace ck {
namespace tensor_operation {
namespace device {
namespace {
template <index_t NumDTensor>
struct ComputePtrOffsetOfStridedBatch
{
ComputePtrOffsetOfStridedBatch() = default;
ComputePtrOffsetOfStridedBatch(index_t BatchStrideA,
index_t BatchStrideB,
Array<ck::index_t, NumDTensor> BatchStrideDs,
index_t BatchStrideE)
: BatchStrideA_(BatchStrideA),
BatchStrideB_(BatchStrideB),
BatchStrideDs_(BatchStrideDs),
BatchStrideE_(BatchStrideE)
{
}
__host__ __device__ constexpr long_index_t GetAPtrOffset(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideA_);
}
__host__ __device__ constexpr long_index_t GetBPtrOffset(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideB_);
}
__host__ __device__ constexpr auto GetDsPtrOffset(index_t g_idx) const
{
Array<long_index_t, NumDTensor> ds_offset;
static_for<0, NumDTensor, 1>{}(
[&](auto i) { ds_offset(i) = g_idx * static_cast<long_index_t>(BatchStrideDs_[i]); });
return ds_offset;
}
__host__ __device__ constexpr long_index_t GetEPtrOffset(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideE_);
}
index_t BatchStrideA_;
index_t BatchStrideB_;
Array<ck::index_t, NumDTensor> BatchStrideDs_;
index_t BatchStrideE_;
};
} // namespace
///
/// @brief Device Convolution operation.
///
/// Supports:
/// @li Forward convolution with up to 3 spatial dimensions
/// @li Input tensor in GNWC data format
/// @li Weight tensor in GKXC data format
/// @li Output tensor in GNWK data format
///
/// 1D:
/// out[N, Wo, K] = in[N, Wi, C] * wei[K, X, C]
/// 2D:
/// out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C]
/// 3D:
/// out[N, Do, Ho, Wo, K] = in[N, Di, Hi, Wi, C] * wei[K, Z, Y, X, C]
/// Assume:
/// AK1 == BK1
//
// @brief Device Convolution operation.
//
// Supports:
// @li Forward convolution with up to 3 spatial dimensions
// @li Input tensor in GNWC data format
// @li Weight tensor in GKXC data format
// @li Output tensor in GNWK data format
//
// 1D:
// out[N, Wo, K] = in[N, Wi, C] * wei[K, X, C]
// 2D:
// out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C]
// 3D:
// out[N, Do, Ho, Wo, K] = in[N, Di, Hi, Wi, C] * wei[K, Z, Y, X, C]
// Assume:
// AK1 == BK1
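// For example (added for clarity), in the 2D case the implicit GEMM dimensions are
//   M_gemm = N * Ho * Wo, N_gemm = K, K_gemm = Y * X * C,
// consistent with the [N * Do * Ho * Wo, Z * Y * X * C] column form used elsewhere in
// this change.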
template <index_t NDimSpatial,
typename ALayout,
typename BLayout,
......
......@@ -19,6 +19,7 @@
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/io.hpp"
......@@ -29,51 +30,6 @@ namespace device {
namespace {
template <index_t NumDTensor>
struct ComputePtrOffsetOfStridedBatch
{
ComputePtrOffsetOfStridedBatch() = default;
ComputePtrOffsetOfStridedBatch(index_t BatchStrideA,
index_t BatchStrideB,
Array<ck::index_t, NumDTensor> BatchStrideDs,
index_t BatchStrideE)
: BatchStrideA_(BatchStrideA),
BatchStrideB_(BatchStrideB),
BatchStrideDs_(BatchStrideDs),
BatchStrideE_(BatchStrideE)
{
}
__host__ __device__ constexpr long_index_t GetAPtrOffset(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideA_);
}
__host__ __device__ constexpr long_index_t GetBPtrOffset(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideB_);
}
__host__ __device__ constexpr auto GetDsPtrOffset(index_t g_idx) const
{
Array<long_index_t, NumDTensor> ds_offset;
static_for<0, NumDTensor, 1>{}(
[&](auto i) { ds_offset(i) = g_idx * static_cast<long_index_t>(BatchStrideDs_[i]); });
return ds_offset;
}
__host__ __device__ constexpr long_index_t GetEPtrOffset(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideE_);
}
index_t BatchStrideA_;
index_t BatchStrideB_;
Array<ck::index_t, NumDTensor> BatchStrideDs_;
index_t BatchStrideE_;
};
/*
* \brief Wrapper function of GridwiseGemm::Run to realize BatchedGEMM.
*
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/common_header.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <index_t NumDTensor>
struct ComputePtrOffsetOfStridedBatch
{
ComputePtrOffsetOfStridedBatch() = default;
ComputePtrOffsetOfStridedBatch(index_t BatchStrideA,
index_t BatchStrideB,
Array<ck::index_t, NumDTensor> BatchStrideDs,
index_t BatchStrideE)
: BatchStrideA_(BatchStrideA),
BatchStrideB_(BatchStrideB),
BatchStrideDs_(BatchStrideDs),
BatchStrideE_(BatchStrideE)
{
}
__host__ __device__ constexpr long_index_t GetAPtrOffset(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideA_);
}
__host__ __device__ constexpr long_index_t GetBPtrOffset(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideB_);
}
__host__ __device__ constexpr auto GetDsPtrOffset(index_t g_idx) const
{
Array<long_index_t, NumDTensor> ds_offset;
static_for<0, NumDTensor, 1>{}(
[&](auto i) { ds_offset(i) = g_idx * static_cast<long_index_t>(BatchStrideDs_[i]); });
return ds_offset;
}
__host__ __device__ constexpr long_index_t GetEPtrOffset(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideE_);
}
index_t BatchStrideA_;
index_t BatchStrideB_;
Array<ck::index_t, NumDTensor> BatchStrideDs_;
index_t BatchStrideE_;
};
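// Usage sketch (illustrative only): the grouped-conv device ops that now include this
// header construct the helper with the G-dimension strides and query per-group pointer
// offsets, e.g. with two D tensors:
//   ComputePtrOffsetOfStridedBatch<2> offsets{
//       batch_stride_a, batch_stride_b,
//       Array<ck::index_t, 2>{batch_stride_d0, batch_stride_d1}, batch_stride_e};
//   const long_index_t a_offset = offsets.GetAPtrOffset(g_idx); // g_idx * batch_stride_a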
} // namespace device
} // namespace tensor_operation
} // namespace ck