Commit 930b2872 authored by Harisankar Sadasivan's avatar Harisankar Sadasivan
Browse files

Best-performing kernel for the GEMV codex problem with M=1 and an inverted B matrix

parents a1e17d18 a4f72a31
...@@ -2,25 +2,28 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942) ...@@ -2,25 +2,28 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
set(target 0) set(target 0)
foreach(gpu IN LISTS GPU_TARGETS) foreach(gpu IN LISTS GPU_TARGETS)
if(gpu IN_LIST gpu_list AND target EQUAL 0) if(gpu IN_LIST gpu_list AND target EQUAL 0)
if(DTYPES MATCHES "fp16" OR DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES) add_custom_target(test_batched_gemm_softmax_gemm_permute)
add_custom_target(test_batched_gemm_softmax_gemm_permute) add_gtest_executable(test_batched_gemm_softmax_gemm_permute_fp16 test_batched_gemm_softmax_gemm_permute_fp16.cpp)
endif() if(result EQUAL 0)
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) target_link_libraries(test_batched_gemm_softmax_gemm_permute_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
add_gtest_executable(test_batched_gemm_softmax_gemm_permute_fp16 test_batched_gemm_softmax_gemm_permute_fp16.cpp) add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_fp16)
add_gtest_executable(test_batched_gemm_bias_softmax_gemm_permute_fp16 test_batched_gemm_bias_softmax_gemm_permute_fp16.cpp) endif()
target_link_libraries(test_batched_gemm_softmax_gemm_permute_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance) add_gtest_executable(test_batched_gemm_bias_softmax_gemm_permute_fp16 test_batched_gemm_bias_softmax_gemm_permute_fp16.cpp)
target_link_libraries(test_batched_gemm_bias_softmax_gemm_permute_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance) if(result EQUAL 0)
add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_fp16) target_link_libraries(test_batched_gemm_bias_softmax_gemm_permute_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_bias_softmax_gemm_permute_fp16) add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_bias_softmax_gemm_permute_fp16)
endif() endif()
if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
add_gtest_executable(test_batched_gemm_softmax_gemm_permute_bf16 test_batched_gemm_softmax_gemm_permute_bf16.cpp) add_gtest_executable(test_batched_gemm_softmax_gemm_permute_bf16 test_batched_gemm_softmax_gemm_permute_bf16.cpp)
add_gtest_executable(test_batched_gemm_bias_softmax_gemm_permute_bf16 test_batched_gemm_bias_softmax_gemm_permute_bf16.cpp) if(result EQUAL 0)
target_link_libraries(test_batched_gemm_softmax_gemm_permute_bf16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance) target_link_libraries(test_batched_gemm_softmax_gemm_permute_bf16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
target_link_libraries(test_batched_gemm_bias_softmax_gemm_permute_bf16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance) add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_bf16)
add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_bf16) endif()
add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_bias_softmax_gemm_permute_bf16) add_gtest_executable(test_batched_gemm_bias_softmax_gemm_permute_bf16 test_batched_gemm_bias_softmax_gemm_permute_bf16.cpp)
endif() if(result EQUAL 0)
target_link_libraries(test_batched_gemm_bias_softmax_gemm_permute_bf16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_bias_softmax_gemm_permute_bf16)
endif()
set(target 1) set(target 1)
endif() endif()
endforeach() endforeach()
\ No newline at end of file
...@@ -70,10 +70,23 @@ class TestBatchNormBwdRank4 : public ::testing::Test ...@@ -70,10 +70,23 @@ class TestBatchNormBwdRank4 : public ::testing::Test
} }
}; };
using KernelTypes = ::testing::Types<std::tuple<F16, F32, F32, F32, F16, F32, F32>, using KernelTypes = ::testing::Types<
std::tuple<F32, F32, F32, F32, F32, F32, F32>, #ifdef CK_ENABLE_FP16
std::tuple<BF16, F32, F32, F32, BF16, F32, F32>, std::tuple<F16, F32, F32, F32, F16, F32, F32>
std::tuple<F64, F64, F64, F64, F64, F64, F64>>; #endif
#ifdef CK_ENABLE_FP32
,
std::tuple<F32, F32, F32, F32, F32, F32, F32>
#endif
#ifdef CK_ENABLE_BF16
,
std::tuple<BF16, F32, F32, F32, BF16, F32, F32>
#endif
#ifdef CK_ENABLE_FP64
,
std::tuple<F64, F64, F64, F64, F64, F64, F64>
#endif
>;
TYPED_TEST_SUITE(TestBatchNormBwdRank4, KernelTypes); TYPED_TEST_SUITE(TestBatchNormBwdRank4, KernelTypes);
......
...@@ -87,10 +87,23 @@ class TestBatchNormFwdRank4 : public ::testing::Test ...@@ -87,10 +87,23 @@ class TestBatchNormFwdRank4 : public ::testing::Test
} }
}; };
using KernelTypes = ::testing::Types<std::tuple<F16, F16, F32, F16, F16, F32>, using KernelTypes = ::testing::Types<
std::tuple<F32, F32, F32, F32, F32, F32>, #ifdef CK_ENABLE_FP16
std::tuple<BF16, BF16, F32, BF16, BF16, F32>, std::tuple<F16, F16, F32, F16, F16, F32>
std::tuple<F64, F64, F64, F64, F64, F64>>; #endif
#ifdef CK_ENABLE_FP32
,
std::tuple<F32, F32, F32, F32, F32, F32>
#endif
#ifdef CK_ENABLE_BF16
,
std::tuple<BF16, BF16, F32, BF16, BF16, F32>
#endif
#ifdef CK_ENABLE_FP64
,
std::tuple<F64, F64, F64, F64, F64, F64>
#endif
>;
TYPED_TEST_SUITE(TestBatchNormFwdRank4, KernelTypes); TYPED_TEST_SUITE(TestBatchNormFwdRank4, KernelTypes);
......
...@@ -67,10 +67,23 @@ class TestBatchNormInferRank4 : public ::testing::Test ...@@ -67,10 +67,23 @@ class TestBatchNormInferRank4 : public ::testing::Test
} }
}; };
using KernelTypes = ::testing::Types<std::tuple<F16, F16, F32, F16, F16, F32>, using KernelTypes = ::testing::Types<
std::tuple<F32, F32, F32, F32, F32, F32>, #ifdef CK_ENABLE_FP16
std::tuple<BF16, BF16, F32, BF16, BF16, F32>, std::tuple<F16, F16, F32, F16, F16, F32>
std::tuple<F64, F64, F64, F64, F64, F64>>; #endif
#ifdef CK_ENABLE_FP32
,
std::tuple<F32, F32, F32, F32, F32, F32>
#endif
#ifdef CK_ENABLE_BF16
,
std::tuple<BF16, BF16, F32, BF16, BF16, F32>
#endif
#ifdef CK_ENABLE_FP64
,
std::tuple<F64, F64, F64, F64, F64, F64>
#endif
>;
TYPED_TEST_SUITE(TestBatchNormInferRank4, KernelTypes); TYPED_TEST_SUITE(TestBatchNormInferRank4, KernelTypes);
......
# Functional test for image-to-column / column-to-image tensor rearrange;
# links the device instance libraries it exercises.
add_gtest_executable(test_conv_tensor_rearrange test_conv_tensor_rearrange.cpp)
target_link_libraries(test_conv_tensor_rearrange PRIVATE utility device_image_to_column_instance device_column_to_image_instance)
# Interface-only test (IsSupportedArgument checks) — no instance libraries needed.
add_gtest_executable(test_conv_tensor_rearrange_interface test_conv_tensor_rearrange_interface.cpp)
target_link_libraries(test_conv_tensor_rearrange_interface PRIVATE utility)
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <tuple>
#include <vector>
#include <gtest/gtest.h>
#include "profiler/profile_conv_tensor_rearrange_impl.hpp"
// Typed fixture for the conv tensor-rearrange tests. The tuple parameter
// supplies the image layout and the rearrange direction under test.
template <typename Tuple>
class TestConvTensorRearrange : public ::testing::Test
{
    protected:
    using ImLayout              = std::tuple_element_t<0, Tuple>;
    using ConvTensorRearrangeOp = std::tuple_element_t<1, Tuple>;

    // Convolution problem descriptions queued by each test body before Run().
    std::vector<ck::utils::conv::ConvParam> conv_params;

    // Profiles every queued parameter set with verification enabled and fails
    // the test if any case does not pass. Note the short-circuit: once one
    // case fails, the remaining cases are skipped.
    template <ck::index_t NDimSpatial, typename InDataType, typename OutDataType>
    void Run()
    {
        EXPECT_FALSE(conv_params.empty());

        bool all_ok = true;
        for(auto& conv_param : conv_params)
        {
            all_ok = all_ok &&
                     ck::profiler::profile_conv_tensor_rearrange_impl<NDimSpatial,
                                                                      ImLayout,
                                                                      InDataType,
                                                                      OutDataType,
                                                                      ConvTensorRearrangeOp>(
                         true,  // do_verification
                         1,     // init_method: integer value
                         false, // do_log
                         false, // time_kernel
                         conv_param);
        }
        EXPECT_TRUE(all_ok);
    }
};
using namespace ck::tensor_layout::convolution;
using namespace ck::conv_tensor_rearrange_op;
// Each tuple pairs an image layout with a rearrange direction; every layout is
// tested both image->column and column->image.
using KernelTypes1d =
::testing::Types<std::tuple<GNWC, ImageToColumn>, std::tuple<GNWC, ColumnToImage>>;
using KernelTypes2d =
::testing::Types<std::tuple<GNHWC, ImageToColumn>, std::tuple<GNHWC, ColumnToImage>>;
using KernelTypes3d =
::testing::Types<std::tuple<GNDHWC, ImageToColumn>, std::tuple<GNDHWC, ColumnToImage>>;
// Thin per-dimensionality fixtures so each spatial rank gets its own suite.
template <typename Tuple>
class TestConvTensorRearrange1d : public TestConvTensorRearrange<Tuple>
{
};
template <typename Tuple>
class TestConvTensorRearrange2d : public TestConvTensorRearrange<Tuple>
{
};
template <typename Tuple>
class TestConvTensorRearrange3d : public TestConvTensorRearrange<Tuple>
{
};
TYPED_TEST_SUITE(TestConvTensorRearrange1d, KernelTypes1d);
TYPED_TEST_SUITE(TestConvTensorRearrange2d, KernelTypes2d);
TYPED_TEST_SUITE(TestConvTensorRearrange3d, KernelTypes3d);
// 1D coverage: filter/stride/dilation/padding variants plus a small-C case,
// run once per enabled data type.
TYPED_TEST(TestConvTensorRearrange1d, Test1D)
{
this->conv_params.clear();
// Each initializer is a ck::utils::conv::ConvParam; presumably
// {ndim, G, N, K, C, {filter}, {input}, {stride}, {dilation}, {lpad}, {rpad}}
// — TODO confirm field order against ConvParam's constructor.
this->conv_params.push_back({1, 1, 4, 1, 192, {3}, {28}, {1}, {1}, {1}, {1}});
this->conv_params.push_back({1, 1, 64, 1, 64, {3}, {14}, {1}, {1}, {1}, {1}});
this->conv_params.push_back({1, 1, 64, 1, 64, {1}, {7}, {3}, {1}, {0}, {0}});
this->conv_params.push_back({1, 1, 64, 1, 64, {1}, {3}, {1}, {1}, {0}, {0}});
// ScalarPerVector should be 1
this->conv_params.push_back({1, 1, 4, 1, 1, {3}, {28}, {1}, {1}, {1}, {1}});
// stride != 1
this->conv_params.push_back({1, 1, 1, 1, 4, {3}, {28}, {2}, {1}, {1}, {1}});
// dilation != 1
this->conv_params.push_back({1, 1, 1, 1, 4, {3}, {28}, {1}, {2}, {1}, {1}});
// Run only the data types compiled into this build.
#ifdef CK_ENABLE_FP32
this->template Run<1, float, float>();
#endif
#ifdef CK_ENABLE_BF16
this->template Run<1, ck::bhalf_t, ck::bhalf_t>();
#endif
#ifdef CK_ENABLE_FP16
this->template Run<1, ck::half_t, ck::half_t>();
#endif
#ifdef CK_ENABLE_INT8
this->template Run<1, int8_t, int8_t>();
#endif
}
// 2D coverage: mirrors the 1D cases (various filter/stride/dilation/padding),
// run once per enabled data type.
TYPED_TEST(TestConvTensorRearrange2d, Test2D)
{
this->conv_params.clear();
this->conv_params.push_back(
{2, 1, 4, 1, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
this->conv_params.push_back(
{2, 1, 64, 1, 64, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
this->conv_params.push_back({2, 1, 64, 1, 64, {1, 1}, {7, 7}, {3, 3}, {1, 1}, {0, 0}, {0, 0}});
this->conv_params.push_back({2, 1, 64, 1, 64, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}});
// stride != 1 and dilation != 1 combined
this->conv_params.push_back(
{2, 1, 64, 1, 64, {3, 3}, {28, 28}, {2, 2}, {2, 2}, {1, 1}, {1, 1}});
#ifdef CK_ENABLE_FP32
this->template Run<2, float, float>();
#endif
#ifdef CK_ENABLE_BF16
this->template Run<2, ck::bhalf_t, ck::bhalf_t>();
#endif
#ifdef CK_ENABLE_FP16
this->template Run<2, ck::half_t, ck::half_t>();
#endif
#ifdef CK_ENABLE_INT8
this->template Run<2, int8_t, int8_t>();
#endif
}
// 3D coverage: strided/dilated and pointwise-filter cases, run once per
// enabled data type.
TYPED_TEST(TestConvTensorRearrange3d, Test3D)
{
this->conv_params.clear();
this->conv_params.push_back(
{3, 1, 16, 1, 64, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {3, 3, 3}, {0, 0, 0}, {0, 0, 0}});
this->conv_params.push_back(
{3, 1, 2, 1, 64, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
this->conv_params.push_back(
{3, 1, 32, 1, 64, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
this->conv_params.push_back(
{3, 1, 64, 1, 64, {3, 3, 3}, {14, 14, 14}, {2, 2, 2}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}});
#ifdef CK_ENABLE_FP32
this->template Run<3, float, float>();
#endif
#ifdef CK_ENABLE_BF16
this->template Run<3, ck::bhalf_t, ck::bhalf_t>();
#endif
#ifdef CK_ENABLE_FP16
this->template Run<3, ck::half_t, ck::half_t>();
#endif
#ifdef CK_ENABLE_INT8
this->template Run<3, int8_t, int8_t>();
#endif
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <tuple>
#include <vector>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp"
#include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include <gtest/gtest.h>
// Element type both device instances below are built with.
using DataType = float;
// 1D grouped image layout used throughout this interface test.
using ImLayout = ck::tensor_layout::convolution::GNWC;
// Shorthand for compile-time integer sequences (thread cluster lengths).
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
// Fixture that checks which rearrange configurations the device instances
// report as supported (IsSupportedArgument only — no kernel is launched).
//
// ScalarPerVector - vector load/store width the instances are compiled with.
// IsCPacked       - when false, Run() reports a halved "fake" C so the
//                   innermost dimension no longer matches the packed strides.
template <ck::index_t ScalarPerVector, bool IsCPacked>
class TestConvTensorRearrangeInterface : public ::testing::Test
{
    protected:
    static constexpr ck::index_t NDimSpatial = 1;

    // clang-format off
    using DeviceImgToColInstance = ck::tensor_operation::device::DeviceImageToColumnImpl
        // Num| ImLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar|
        // Dim|         |           |            |  Size| Block| Block| Cluster|    Per|
        // Spatial|     |           |            |      |      |      | Lengths| Vector|
        //        |     |           |            |      |      |      |        |       |
        < NDimSpatial, ImLayout, DataType, DataType, 256, 128, 128, S<16, 16>,ScalarPerVector>;
    using DeviceColToimgInstance = ck::tensor_operation::device::DeviceColumnToImageImpl
        // Num| ImLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar|
        // Dim|         |           |            |  Size| Block| Block| Cluster|    Per|
        // Spatial|     |           |            |      |      |      | Lengths| Vector|
        //        |     |           |            |      |      |      |        |       |
        < NDimSpatial, ImLayout, DataType, DataType, 256, 128, 128, S<16, 16>,ScalarPerVector>;
    // clang-format on

    // Problem description set by each test body before calling Run().
    ck::utils::conv::ConvParam conv_param;

    // Builds MakeArgument inputs from conv_param and returns whether the
    // selected device instance accepts them. Data pointers are null on
    // purpose: only the argument check runs, never the kernel.
    template <typename ConvTensorRearrangeOp>
    bool Run()
    {
        const auto N = conv_param.N_;
        const auto C = conv_param.C_;
        // Fake C to simulate the behavior that C is not packed.
        const auto FakeC = conv_param.C_ / 2;
        // GEMM M dimension: N * product(output spatial lengths).
        const ck::index_t NDoHoWo =
            N *
            ck::accumulate_n<ck::index_t>(
                conv_param.output_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
        // GEMM K dimension: C * product(filter spatial lengths).
        const ck::index_t CZYX =
            C *
            ck::accumulate_n<ck::index_t>(
                conv_param.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
        const auto image_desc =
            ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<ImLayout>(
                conv_param);
        const auto gemm_desc = HostTensorDescriptor({NDoHoWo, CZYX});

        // MakeArgument takes fixed-size arrays; copy the dynamic ConvParam
        // vectors/descriptor strides over.
        std::array<ck::index_t, NDimSpatial> input_spatial_lengths{};
        std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{};
        std::array<ck::index_t, NDimSpatial> output_spatial_lengths{};
        std::array<ck::index_t, NDimSpatial + 3> input_g_n_c_wis_strides{};
        std::array<ck::index_t, 2> output_m_k_strides{};
        std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
        std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
        std::array<ck::index_t, NDimSpatial> input_left_pads{};
        std::array<ck::index_t, NDimSpatial> input_right_pads{};
        auto copy = [](const auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
        copy(conv_param.input_spatial_lengths_, input_spatial_lengths);
        copy(conv_param.filter_spatial_lengths_, filter_spatial_lengths);
        copy(conv_param.output_spatial_lengths_, output_spatial_lengths);
        copy(image_desc.GetStrides(), input_g_n_c_wis_strides);
        copy(gemm_desc.GetStrides(), output_m_k_strides);
        copy(conv_param.conv_filter_strides_, conv_filter_strides);
        copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
        copy(conv_param.input_left_pads_, input_left_pads);
        copy(conv_param.input_right_pads_, input_right_pads);

        if constexpr(std::is_same_v<ConvTensorRearrangeOp, ImageToColumn>)
        {
            auto img2col = DeviceImgToColInstance{};
            auto argument = img2col.MakeArgument(nullptr,
                                                 nullptr,
                                                 N,
                                                 IsCPacked ? C : FakeC,
                                                 input_spatial_lengths,
                                                 filter_spatial_lengths,
                                                 output_spatial_lengths,
                                                 input_g_n_c_wis_strides,
                                                 output_m_k_strides,
                                                 conv_filter_strides,
                                                 conv_filter_dilations,
                                                 input_left_pads,
                                                 input_right_pads);
            return img2col.IsSupportedArgument(argument);
        }
        else if constexpr(std::is_same_v<ConvTensorRearrangeOp, ColumnToImage>)
        {
            auto col2img = DeviceColToimgInstance{};
            auto argument = col2img.MakeArgument(nullptr,
                                                 nullptr,
                                                 N,
                                                 IsCPacked ? C : FakeC,
                                                 input_spatial_lengths,
                                                 filter_spatial_lengths,
                                                 output_spatial_lengths,
                                                 input_g_n_c_wis_strides,
                                                 output_m_k_strides,
                                                 conv_filter_strides,
                                                 conv_filter_dilations,
                                                 input_left_pads,
                                                 input_right_pads);
            return col2img.IsSupportedArgument(argument);
        }
        // Previously control could flow off the end of this non-void function
        // (undefined behavior) when ConvTensorRearrangeOp matched neither
        // branch; treat any other op as unsupported instead.
        return false;
    }
};
// Width-1 vector access: every configuration below should be supported.
class TestConvTensorRearrangeInterface1ScalarPerVector
: public TestConvTensorRearrangeInterface<1, true>
{
};
// Width-4 vector access with packed C: alignment of C (and padded spans) matters.
class TestConvTensorRearrangeInterface4ScalarPerVector
: public TestConvTensorRearrangeInterface<4, true>
{
};
// Width-4 vector access with a "fake" (halved) C simulating non-packed C.
class TestConvTensorRearrangeInterface4ScalarPerVectorFakeC
: public TestConvTensorRearrangeInterface<4, false>
{
};
// With ScalarPerVector = 1 every alignment constraint is trivially met, so
// all of these configurations must be reported as supported.
TEST_F(TestConvTensorRearrangeInterface1ScalarPerVector, X1ScalarPerVector)
{
// vector load C * X % ScalarPerVector
this->conv_param = {1, 1, 1, 1, 1, {3}, {3}, {1}, {1}, {0}, {0}};
bool is_supported = this->template Run<ImageToColumn>();
EXPECT_TRUE(is_supported);
is_supported = this->template Run<ColumnToImage>();
EXPECT_TRUE(is_supported);
// vector load C * left_pad_x % ScalarPerVector
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {1}, {3}, {0}};
is_supported = this->template Run<ImageToColumn>();
EXPECT_TRUE(is_supported);
is_supported = this->template Run<ColumnToImage>();
EXPECT_TRUE(is_supported);
// vector load C * right_pad_x % ScalarPerVector
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {1}, {0}, {3}};
is_supported = this->template Run<ImageToColumn>();
EXPECT_TRUE(is_supported);
is_supported = this->template Run<ColumnToImage>();
EXPECT_TRUE(is_supported);
// vector load C % ScalarPerVector, right_pad and stride
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {2}, {1}, {0}, {3}};
is_supported = this->template Run<ImageToColumn>();
EXPECT_TRUE(is_supported);
is_supported = this->template Run<ColumnToImage>();
EXPECT_TRUE(is_supported);
// vector load C % ScalarPerVector, left_pad and stride
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {2}, {1}, {3}, {0}};
is_supported = this->template Run<ImageToColumn>();
EXPECT_TRUE(is_supported);
is_supported = this->template Run<ColumnToImage>();
EXPECT_TRUE(is_supported);
// vector load C % ScalarPerVector, dilation
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {2}, {0}, {0}};
is_supported = this->template Run<ImageToColumn>();
EXPECT_TRUE(is_supported);
is_supported = this->template Run<ColumnToImage>();
EXPECT_TRUE(is_supported);
// C = 4
this->conv_param = {1, 1, 1, 1, 4, {3}, {3}, {1}, {1}, {3}, {3}};
is_supported = this->template Run<ImageToColumn>();
EXPECT_TRUE(is_supported);
is_supported = this->template Run<ColumnToImage>();
EXPECT_TRUE(is_supported);
}
// With ScalarPerVector = 4, C = 1 cases violate the vector-width divisibility
// requirement (expected unsupported); only the final C = 4 case aligns.
TEST_F(TestConvTensorRearrangeInterface4ScalarPerVector, X4ScalarPerVector)
{
// vector load C * X % ScalarPerVector
this->conv_param = {1, 1, 1, 1, 1, {3}, {3}, {1}, {1}, {0}, {0}};
bool is_supported = this->template Run<ImageToColumn>();
EXPECT_FALSE(is_supported);
is_supported = this->template Run<ColumnToImage>();
EXPECT_FALSE(is_supported);
// vector load C * left_pad_x % ScalarPerVector
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {1}, {3}, {0}};
is_supported = this->template Run<ImageToColumn>();
EXPECT_FALSE(is_supported);
is_supported = this->template Run<ColumnToImage>();
EXPECT_FALSE(is_supported);
// vector load C * right_pad_x % ScalarPerVector
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {1}, {0}, {3}};
is_supported = this->template Run<ImageToColumn>();
EXPECT_FALSE(is_supported);
is_supported = this->template Run<ColumnToImage>();
EXPECT_FALSE(is_supported);
// vector load C % ScalarPerVector, right_pad and stride
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {2}, {1}, {0}, {3}};
is_supported = this->template Run<ImageToColumn>();
EXPECT_FALSE(is_supported);
is_supported = this->template Run<ColumnToImage>();
EXPECT_FALSE(is_supported);
// vector load C % ScalarPerVector, left_pad and stride
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {2}, {1}, {3}, {0}};
is_supported = this->template Run<ImageToColumn>();
EXPECT_FALSE(is_supported);
is_supported = this->template Run<ColumnToImage>();
EXPECT_FALSE(is_supported);
// vector load C % ScalarPerVector, dilation
this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {2}, {0}, {0}};
is_supported = this->template Run<ImageToColumn>();
EXPECT_FALSE(is_supported);
is_supported = this->template Run<ColumnToImage>();
EXPECT_FALSE(is_supported);
// C = 4 — divisible by ScalarPerVector, so both directions are supported
this->conv_param = {1, 1, 1, 1, 4, {3}, {3}, {1}, {1}, {3}, {3}};
is_supported = this->template Run<ImageToColumn>();
EXPECT_TRUE(is_supported);
is_supported = this->template Run<ColumnToImage>();
EXPECT_TRUE(is_supported);
}
// IsCPacked = false: Run() passes FakeC = C/2 to MakeArgument, so support
// hinges on whether C/2 is divisible by ScalarPerVector (= 4).
TEST_F(TestConvTensorRearrangeInterface4ScalarPerVectorFakeC, X4ScalarPerVectorFakeC)
{
// C = 3 -> fake C = 1, not divisible by 4
this->conv_param = {1, 1, 1, 1, 3, {4}, {3}, {1}, {1}, {0}, {0}};
bool is_supported = this->template Run<ImageToColumn>();
EXPECT_FALSE(is_supported);
is_supported = this->template Run<ColumnToImage>();
EXPECT_FALSE(is_supported);
// C = 8 -> fake C = 4, divisible by 4
this->conv_param = {1, 1, 1, 1, 8, {4}, {3}, {1}, {1}, {0}, {0}};
is_supported = this->template Run<ImageToColumn>();
EXPECT_TRUE(is_supported);
is_supported = this->template Run<ColumnToImage>();
EXPECT_TRUE(is_supported);
}
if (USE_BITINT_EXTENSION_INT4) if (USE_BITINT_EXTENSION_INT4)
add_gtest_executable(test_int4 int4.cpp) add_gtest_executable(test_int4 int4.cpp)
target_link_libraries(test_int4 PRIVATE utility) if(result EQUAL 0)
target_link_libraries(test_int4 PRIVATE utility)
endif()
endif() endif()
if(DTYPES MATCHES "fp8" OR NOT DEFINED DTYPES) add_gtest_executable(test_fp8 fp8.cpp)
add_gtest_executable(test_f8 f8.cpp) if(result EQUAL 0)
target_link_libraries(test_f8 PRIVATE utility) target_link_libraries(test_fp8 PRIVATE utility)
endif() endif()
add_gtest_executable(test_bf8 bf8.cpp)
if(DTYPES MATCHES "bf8" OR NOT DEFINED DTYPES) if(result EQUAL 0)
add_gtest_executable(test_bf8 bf8.cpp)
target_link_libraries(test_bf8 PRIVATE utility) target_link_libraries(test_bf8 PRIVATE utility)
endif() endif()
add_gtest_executable(test_type_convert_const type_convert_const.cpp)
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "ck/utility/data_type.hpp"
#include "ck/utility/type_convert.hpp"
using ck::bhalf_t;
using ck::type_convert;
// Verifies type_convert accepts const-qualified *destination* types and that
// the numeric result matches the plain (non-const) conversion within bf16
// rounding error.
TEST(TypeConvertConst, ConvertToConst)
{
// bf16 keeps 8 significand bits, so 2^-7 = 0.0078125 is its relative epsilon.
constexpr float bf16_epsilon = 0.0078125;
constexpr float rel_tol = 2 * bf16_epsilon;
const std::vector<float> cases = {0.0, -123.f, 3.981323f, 0.2429f};
for(float x : cases)
{
// Tolerance scales with |x|; for x == 0 it degenerates to an exact check.
const float abs_tol = std::abs(rel_tol * x);
{
bhalf_t y = type_convert<bhalf_t>(x);
// Test non-const bhalf to const float.
const float y_float = type_convert<const float>(y);
ASSERT_NEAR(y_float, x, abs_tol);
}
{
// Test non-const float to const bhalf.
const bhalf_t y = type_convert<const bhalf_t>(x);
// Remove the constness manually to not rely on const casts anymore since the
// possible issue could hide after two casts.
bhalf_t& y_nonconst = const_cast<bhalf_t&>(y);
float y_float = type_convert<float>(y_nonconst);
ASSERT_NEAR(y_float, x, abs_tol);
}
}
}
// Verifies type_convert accepts const-qualified *source* types — both deduced
// and fully-specified — producing the same value as the non-const path.
TEST(TypeConvertConst, ConvertFromConst)
{
// bf16 keeps 8 significand bits, so 2^-7 = 0.0078125 is its relative epsilon.
constexpr float bf16_epsilon = 0.0078125;
constexpr float rel_tol = 2 * bf16_epsilon;
const std::vector<float> cases = {0.0, -123.f, 3.981323f, 0.2429f};
for(const float x : cases)
{
// Tolerance scales with |x|; for x == 0 it degenerates to an exact check.
const float abs_tol = std::abs(rel_tol * x);
{
// Test const float to const bhalf_t.
const bhalf_t y = type_convert<const bhalf_t>(x);
// Remove the constness manually to not rely on const casts anymore since the
// possible issue could hide after two casts.
bhalf_t& y_nonconst = const_cast<bhalf_t&>(y);
float y_float = type_convert<float>(y_nonconst);
ASSERT_NEAR(y_float, x, abs_tol);
}
{
// Test const float to non-const bhalf.
bhalf_t y = type_convert<bhalf_t>(x);
float y_float = type_convert<float>(y);
ASSERT_NEAR(y_float, x, abs_tol);
}
{
const bhalf_t y = type_convert<const bhalf_t>(x);
// Test const bhalf to non-const float.
float y_float = type_convert<float>(y);
ASSERT_NEAR(y_float, x, abs_tol);
}
// Tests with full type specializations for X (no template-argument deduction).
{
// Test const float to const bhalf_t.
const bhalf_t y = type_convert<const bhalf_t, const float>(x);
// Remove the constness manually to not rely on const casts anymore since the
// possible issue could hide after two casts.
bhalf_t& y_nonconst = const_cast<bhalf_t&>(y);
float y_float = type_convert<float>(y_nonconst);
ASSERT_NEAR(y_float, x, abs_tol);
}
{
// Test const float to non-const bhalf.
bhalf_t y = type_convert<bhalf_t, const float>(x);
float y_float = type_convert<float>(y);
ASSERT_NEAR(y_float, x, abs_tol);
}
{
const bhalf_t y = type_convert<const bhalf_t, const float>(x);
// Test const bhalf to non-const float.
float y_float = type_convert<float, const bhalf_t>(y);
ASSERT_NEAR(y_float, x, abs_tol);
}
}
}
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) add_custom_target(test_elementwise_normalization)
add_custom_target(test_elementwise_normalization) add_gtest_executable(test_elementwise_layernorm_fp16 test_elementwise_layernorm_fp16.cpp)
add_gtest_executable(test_elementwise_layernorm_fp16 test_elementwise_layernorm_fp16.cpp) if(result EQUAL 0)
target_link_libraries(test_elementwise_layernorm_fp16 PRIVATE utility device_elementwise_normalization_instance) target_link_libraries(test_elementwise_layernorm_fp16 PRIVATE utility device_elementwise_normalization_instance)
add_dependencies(test_elementwise_normalization test_elementwise_layernorm_fp16) add_dependencies(test_elementwise_normalization test_elementwise_layernorm_fp16)
endif() endif()
\ No newline at end of file
if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
add_test_executable(test_gemm_fp32 gemm_fp32.cpp) add_test_executable(test_gemm_fp32 gemm_fp32.cpp)
target_link_libraries(test_gemm_fp32 PRIVATE utility) if(result EQUAL 0)
target_link_libraries(test_gemm_fp32 PRIVATE device_gemm_instance) target_link_libraries(test_gemm_fp32 PRIVATE utility device_gemm_instance)
endif() endif()
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
add_test_executable(test_gemm_fp16 gemm_fp16.cpp) add_test_executable(test_gemm_fp16 gemm_fp16.cpp)
target_link_libraries(test_gemm_fp16 PRIVATE utility) if(result EQUAL 0)
target_link_libraries(test_gemm_fp16 PRIVATE device_gemm_instance) target_link_libraries(test_gemm_fp16 PRIVATE utility device_gemm_instance)
add_library(gemm_standalone_xdl_fp16_instances STATIC add_library(gemm_standalone_xdl_fp16_instances STATIC
instance/gemm_f16_nn_instance.cpp instance/gemm_f16_nn_instance.cpp
instance/gemm_f16_nt_instance.cpp instance/gemm_f16_nt_instance.cpp
instance/gemm_f16_tn_instance.cpp instance/gemm_f16_tn_instance.cpp
instance/gemm_wavelet_f16_tn_instance.cpp instance/gemm_wavelet_f16_tn_instance.cpp
instance/gemm_f16_tt_instance.cpp instance/gemm_f16_tt_instance.cpp
) )
endif()
add_test_executable(test_gemm_standalone_xdl_fp16 gemm_standalone_xdl_fp16.cpp) add_test_executable(test_gemm_standalone_xdl_fp16 gemm_standalone_xdl_fp16.cpp)
target_link_libraries(test_gemm_standalone_xdl_fp16 PRIVATE gemm_standalone_xdl_fp16_instances utility) if(result EQUAL 0)
target_include_directories(test_gemm_standalone_xdl_fp16 PRIVATE instance/) target_link_libraries(test_gemm_standalone_xdl_fp16 PRIVATE gemm_standalone_xdl_fp16_instances utility)
target_include_directories(test_gemm_standalone_xdl_fp16 PRIVATE instance/)
endif() endif()
if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
add_test_executable(test_gemm_bf16 gemm_bf16.cpp) add_test_executable(test_gemm_bf16 gemm_bf16.cpp)
target_link_libraries(test_gemm_bf16 PRIVATE utility) if(result EQUAL 0)
target_link_libraries(test_gemm_bf16 PRIVATE device_gemm_instance) target_link_libraries(test_gemm_bf16 PRIVATE utility device_gemm_instance)
endif() endif()
if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
add_test_executable(test_gemm_int8 gemm_int8.cpp) add_test_executable(test_gemm_int8 gemm_int8.cpp)
target_link_libraries(test_gemm_int8 PRIVATE utility) if(result EQUAL 0)
target_link_libraries(test_gemm_int8 PRIVATE device_gemm_instance) target_link_libraries(test_gemm_int8 PRIVATE utility device_gemm_instance)
endif() endif()
\ No newline at end of file
...@@ -2,12 +2,12 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942) ...@@ -2,12 +2,12 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
set(target 0) set(target 0)
foreach(gpu IN LISTS GPU_TARGETS) foreach(gpu IN LISTS GPU_TARGETS)
if(gpu IN_LIST gpu_list AND target EQUAL 0) if(gpu IN_LIST gpu_list AND target EQUAL 0)
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
add_custom_target(test_gemm_layernorm) add_custom_target(test_gemm_layernorm)
add_gtest_executable(test_gemm_add_relu_add_layernorm_fp16 test_gemm_add_relu_add_layernorm_fp16.cpp) add_gtest_executable(test_gemm_add_relu_add_layernorm_fp16 test_gemm_add_relu_add_layernorm_fp16.cpp)
target_link_libraries(test_gemm_add_relu_add_layernorm_fp16 PRIVATE utility device_gemm_add_relu_add_layernorm_instance) if(result EQUAL 0)
add_dependencies(test_gemm_layernorm test_gemm_add_relu_add_layernorm_fp16) target_link_libraries(test_gemm_add_relu_add_layernorm_fp16 PRIVATE utility device_gemm_add_relu_add_layernorm_instance)
set(target 1) add_dependencies(test_gemm_layernorm test_gemm_add_relu_add_layernorm_fp16)
endif() set(target 1)
endif()
endif() endif()
endforeach() endforeach()
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) add_test_executable(test_gemm_reduce_fp16 gemm_reduce_fp16.cpp)
add_test_executable(test_gemm_reduce_fp16 gemm_reduce_fp16.cpp) if(result EQUAL 0)
target_link_libraries(test_gemm_reduce_fp16 PRIVATE utility) target_link_libraries(test_gemm_reduce_fp16 PRIVATE utility device_gemm_reduce_instance)
target_link_libraries(test_gemm_reduce_fp16 PRIVATE device_gemm_reduce_instance)
endif() endif()
\ No newline at end of file
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
TYPED_TEST(TestGemmSplitK_MK_KN, SmallM) TYPED_TEST(TestGemmSplitK_MK_KN, SmallM)
{ {
std::vector<int> Ms{0, 1, 2, 3, 4, 5, 6}; std::vector<int> Ms{1, 2, 3, 4, 5, 6};
constexpr int N = 512; constexpr int N = 512;
constexpr int K = 320; constexpr int K = 320;
...@@ -16,7 +16,7 @@ TYPED_TEST(TestGemmSplitK_MK_KN, SmallM) ...@@ -16,7 +16,7 @@ TYPED_TEST(TestGemmSplitK_MK_KN, SmallM)
TYPED_TEST(TestGemmSplitK_MK_NK, SmallM) TYPED_TEST(TestGemmSplitK_MK_NK, SmallM)
{ {
std::vector<int> Ms{0, 1, 2, 3, 4, 5, 6}; std::vector<int> Ms{1, 2, 3, 4, 5, 6};
constexpr int N = 512; constexpr int N = 512;
constexpr int K = 320; constexpr int K = 320;
...@@ -30,7 +30,7 @@ TYPED_TEST(TestGemmSplitK_MK_NK, SmallM) ...@@ -30,7 +30,7 @@ TYPED_TEST(TestGemmSplitK_MK_NK, SmallM)
TYPED_TEST(TestGemmSplitK_KM_KN, SmallM) TYPED_TEST(TestGemmSplitK_KM_KN, SmallM)
{ {
std::vector<int> Ms{0, 1, 2, 3, 4, 5, 6}; std::vector<int> Ms{1, 2, 3, 4, 5, 6};
constexpr int N = 512; constexpr int N = 512;
constexpr int K = 320; constexpr int K = 320;
...@@ -43,7 +43,7 @@ TYPED_TEST(TestGemmSplitK_KM_KN, SmallM) ...@@ -43,7 +43,7 @@ TYPED_TEST(TestGemmSplitK_KM_KN, SmallM)
TYPED_TEST(TestGemmSplitK_KM_NK, SmallM) TYPED_TEST(TestGemmSplitK_KM_NK, SmallM)
{ {
std::vector<int> Ms{0, 1, 2, 3, 4, 5, 6}; std::vector<int> Ms{1, 2, 3, 4, 5, 6};
constexpr int N = 512; constexpr int N = 512;
constexpr int K = 320; constexpr int K = 320;
......
if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940") list(APPEND gpu_list_xdl gfx908 gfx90a gfx940)
add_gtest_executable(test_grouped_convnd_bwd_data test_grouped_convnd_bwd_data.cpp) list(APPEND gpu_list_wmma gfx1100 gfx1101 gfx1102)
target_link_libraries(test_grouped_convnd_bwd_data PRIVATE utility device_grouped_conv2d_bwd_data_instance device_grouped_conv3d_bwd_data_instance) set(target 0)
add_gtest_executable(test_grouped_convnd_bwd_data_interface test_grouped_convnd_bwd_data_interface.cpp) foreach(gpu IN LISTS GPU_TARGETS)
target_link_libraries(test_grouped_convnd_bwd_data_interface PRIVATE utility device_grouped_conv2d_bwd_data_instance) if(gpu IN_LIST gpu_list_xdl AND target EQUAL 0)
endif() add_gtest_executable(test_grouped_convnd_bwd_data test_grouped_convnd_bwd_data.cpp)
\ No newline at end of file target_link_libraries(test_grouped_convnd_bwd_data PRIVATE utility device_grouped_conv2d_bwd_data_instance device_grouped_conv3d_bwd_data_instance)
add_gtest_executable(test_grouped_convnd_bwd_data_interface test_grouped_convnd_bwd_data_interface_xdl.cpp)
target_link_libraries(test_grouped_convnd_bwd_data_interface PRIVATE utility device_grouped_conv2d_bwd_data_instance)
set(target 1)
endif()
if(gpu IN_LIST gpu_list_wmma AND target EQUAL 0)
add_gtest_executable(test_grouped_convnd_bwd_data test_grouped_convnd_bwd_data.cpp)
target_link_libraries(test_grouped_convnd_bwd_data PRIVATE utility device_grouped_conv2d_bwd_data_instance device_grouped_conv3d_bwd_data_instance)
add_gtest_executable(test_grouped_convnd_bwd_data_interface test_grouped_convnd_bwd_data_interface_wmma.cpp)
target_link_libraries(test_grouped_convnd_bwd_data_interface PRIVATE utility device_grouped_conv2d_bwd_data_instance)
set(target 1)
endif()
endforeach()
\ No newline at end of file
...@@ -51,16 +51,20 @@ using namespace ck::tensor_layout::convolution; ...@@ -51,16 +51,20 @@ using namespace ck::tensor_layout::convolution;
using KernelTypes2d = ::testing::Types<std::tuple<float, GNHWK, GKYXC, GNHWC>, using KernelTypes2d = ::testing::Types<std::tuple<float, GNHWK, GKYXC, GNHWC>,
std::tuple<ck::half_t, GNHWK, GKYXC, GNHWC>, std::tuple<ck::half_t, GNHWK, GKYXC, GNHWC>,
std::tuple<ck::bhalf_t, GNHWK, GKYXC, GNHWC>, std::tuple<ck::bhalf_t, GNHWK, GKYXC, GNHWC>,
std::tuple<int8_t, GNHWK, GKYXC, GNHWC>,
std::tuple<float, NHWGK, GKYXC, NHWGC>, std::tuple<float, NHWGK, GKYXC, NHWGC>,
std::tuple<ck::half_t, NHWGK, GKYXC, NHWGC>, std::tuple<ck::half_t, NHWGK, GKYXC, NHWGC>,
std::tuple<ck::bhalf_t, NHWGK, GKYXC, NHWGC>>; std::tuple<ck::bhalf_t, NHWGK, GKYXC, NHWGC>,
std::tuple<int8_t, NHWGK, GKYXC, NHWGC>>;
using KernelTypes3d = ::testing::Types<std::tuple<float, GNDHWK, GKZYXC, GNDHWC>, using KernelTypes3d = ::testing::Types<std::tuple<float, GNDHWK, GKZYXC, GNDHWC>,
std::tuple<ck::half_t, GNDHWK, GKZYXC, GNDHWC>, std::tuple<ck::half_t, GNDHWK, GKZYXC, GNDHWC>,
std::tuple<ck::bhalf_t, GNDHWK, GKZYXC, GNDHWC>, std::tuple<ck::bhalf_t, GNDHWK, GKZYXC, GNDHWC>,
std::tuple<int8_t, GNDHWK, GKZYXC, GNDHWC>,
std::tuple<float, NDHWGK, GKZYXC, NDHWGC>, std::tuple<float, NDHWGK, GKZYXC, NDHWGC>,
std::tuple<ck::half_t, NDHWGK, GKZYXC, NDHWGC>, std::tuple<ck::half_t, NDHWGK, GKZYXC, NDHWGC>,
std::tuple<ck::bhalf_t, NDHWGK, GKZYXC, NDHWGC>>; std::tuple<ck::bhalf_t, NDHWGK, GKZYXC, NDHWGC>,
std::tuple<int8_t, NDHWGK, GKZYXC, NDHWGC>>;
template <typename Tuple> template <typename Tuple>
class TestGroupedConvndBwdData2d : public TestGroupedConvndBwdData<Tuple> class TestGroupedConvndBwdData2d : public TestGroupedConvndBwdData<Tuple>
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <tuple>
#include <vector>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include <gtest/gtest.h>
// Tensor element type exercised by this interface test (half precision).
using DataType = ck::half_t;
// Accumulator type used inside the device instance.
using AccDataType = float;
// Identity element-wise operation applied to A, B and CDE.
using Pass = ck::tensor_operation::element_wise::PassThrough;
// Shorthand for a compile-time integer sequence.
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using ConvBackwardDataSpecialization =
ck::tensor_operation::device::ConvolutionBackwardDataSpecialization;
// The two specializations under test: the generic path and the
// 1x1-filter / unit-stride / zero-pad fast path.
static constexpr auto ConvBwdDataDefault = ConvBackwardDataSpecialization::Default;
static constexpr auto Filter1x1Stride1Pad0 = ConvBackwardDataSpecialization::Filter1x1Stride1Pad0;
// Fixture that checks whether the WMMA grouped conv bwd-data device instance
// accepts a given problem description for one (Out, Wei, In) layout tuple and
// one convolution specialization. No kernel is launched: Run() only builds an
// argument from `conv_param` and queries IsSupportedArgument().
template <typename Tuple, ConvBackwardDataSpecialization ConvSpec>
class TestGroupedConvndBwdData : public ::testing::Test
{
protected:
// Every case in this file is a 2D convolution.
static constexpr ck::index_t NDimSpatial = 2;
// Tuple element order is fixed: <0> output, <1> weight, <2> input layout.
using OutLayout = std::tuple_element_t<0, Tuple>;
using WeiLayout = std::tuple_element_t<1, Tuple>;
using InLayout = std::tuple_element_t<2, Tuple>;
// The single device instance whose support surface is probed. The numeric
// columns are kernel tuning parameters; keep them aligned with the header
// comment rows below — do not reorder.
// clang-format off
using GroupedConvBwdDataDeviceInstance = ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
//| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| ConvForward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//| | | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< NDimSpatial,OutLayout, WeiLayout, ck::Tuple<>, InLayout, DataType, DataType, AccDataType, DataType, ck::Tuple<>, DataType, Pass, Pass, Pass, ConvSpec, 64, 32, 64, 8, 8, 16, 16, 1, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 32, 1, 2>, 8>;
// clang-format on
// Problem description; each TYPED_TEST assigns this before calling Run().
ck::utils::conv::ConvParam conv_param;
// Builds a pointer-free argument from conv_param and reports whether the
// device instance accepts it. NOTE(review): this template parameter shadows
// the class-level NDimSpatial constant; both are 2 for every caller here.
template <ck::index_t NDimSpatial>
bool Run()
{
// Host tensor descriptors derived from the convolution problem.
const auto out_g_n_k_wos_desc =
ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(
conv_param);
const auto wei_g_k_c_xs_desc =
ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(
conv_param);
const auto in_g_n_c_wis_desc =
ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(
conv_param);
// Flatten descriptor lengths/strides into the fixed-size arrays the
// device API expects (G + N + C plus NDimSpatial spatial dims).
std::array<ck::index_t, NDimSpatial + 3> out_lengths{};
std::array<ck::index_t, NDimSpatial + 3> out_strides{};
std::array<ck::index_t, NDimSpatial + 3> wei_lengths{};
std::array<ck::index_t, NDimSpatial + 3> wei_strides{};
std::array<ck::index_t, NDimSpatial + 3> in_lengths{};
std::array<ck::index_t, NDimSpatial + 3> in_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
std::array<ck::index_t, NDimSpatial> input_left_pads{};
std::array<ck::index_t, NDimSpatial> input_right_pads{};
auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
copy(out_g_n_k_wos_desc.GetLengths(), out_lengths);
copy(out_g_n_k_wos_desc.GetStrides(), out_strides);
copy(wei_g_k_c_xs_desc.GetLengths(), wei_lengths);
copy(wei_g_k_c_xs_desc.GetStrides(), wei_strides);
copy(in_g_n_c_wis_desc.GetLengths(), in_lengths);
copy(in_g_n_c_wis_desc.GetStrides(), in_strides);
copy(conv_param.conv_filter_strides_, conv_filter_strides);
copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
copy(conv_param.input_left_pads_, input_left_pads);
copy(conv_param.input_right_pads_, input_right_pads);
// Null data pointers are deliberate: the argument is only handed to
// IsSupportedArgument() below and never launched.
auto conv = GroupedConvBwdDataDeviceInstance{};
auto argument = conv.MakeArgument(nullptr,
nullptr,
std::array<const void*, 0>{},
nullptr,
out_lengths,
out_strides,
wei_lengths,
wei_strides,
{},
{},
in_lengths,
in_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
Pass{},
Pass{},
Pass{});
return conv.IsSupportedArgument(argument);
}
};
// Convolution tensor layouts used to build the test type list.
using GNHWC = ck::tensor_layout::convolution::GNHWC;
using NHWGC = ck::tensor_layout::convolution::NHWGC;
using GKYXC = ck::tensor_layout::convolution::GKYXC;
using GNHWK = ck::tensor_layout::convolution::GNHWK;
using NHWGK = ck::tensor_layout::convolution::NHWGK;
// Each tuple is (OutLayout, WeiLayout, InLayout), matching the
// tuple_element indices consumed by the fixture.
using KernelTypes =
::testing::Types<std::tuple<GNHWK, GKYXC, GNHWC>, std::tuple<NHWGK, GKYXC, NHWGC>>;
// Fixture bound to the generic (Default) specialization.
template <typename Tuple>
class TestGroupedConvndBwdDataDefault : public TestGroupedConvndBwdData<Tuple, ConvBwdDataDefault>
{
};
// Fixture bound to the 1x1-filter / unit-stride / zero-pad specialization.
template <typename Tuple>
class TestGroupedConvndBwdDataFilter1x1
: public TestGroupedConvndBwdData<Tuple, Filter1x1Stride1Pad0>
{
};
// Instantiate both fixtures over every layout combination.
TYPED_TEST_SUITE(TestGroupedConvndBwdDataDefault, KernelTypes);
TYPED_TEST_SUITE(TestGroupedConvndBwdDataFilter1x1, KernelTypes);
// Verifies that the Filter1x1Stride1Pad0 instance rejects every problem that
// violates the specialization, and accepts one that satisfies it.
TYPED_TEST(TestGroupedConvndBwdDataFilter1x1, SpecializationCheck)
{
    // All cases in this suite probe the 2D path.
    const auto probe = [this] { return this->template Run<2>(); };

    // A 3x3 filter breaks the 1x1 requirement.
    this->conv_param = {2, 2, 4, 192, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {0, 0}, {0, 0}};
    bool is_supported = probe();
    EXPECT_FALSE(is_supported);

    // Strides of 2 break the unit-stride requirement.
    this->conv_param = {2, 2, 4, 192, 192, {1, 1}, {28, 28}, {2, 2}, {1, 1}, {0, 0}, {0, 0}};
    is_supported = probe();
    EXPECT_FALSE(is_supported);

    // Non-zero padding breaks the zero-pad requirement.
    this->conv_param = {2, 2, 4, 192, 192, {1, 1}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}};
    is_supported = probe();
    EXPECT_FALSE(is_supported);

    // 1x1 filter, unit stride, no padding: must be accepted.
    this->conv_param = {2, 2, 4, 192, 192, {1, 1}, {28, 28}, {1, 1}, {1, 1}, {0, 0}, {0, 0}};
    is_supported = probe();
    EXPECT_TRUE(is_supported);
}
// Verifies that channel counts which break the configured vector-load widths
// are rejected by the Default-specialization instance.
TYPED_TEST(TestGroupedConvndBwdDataDefault, VectorLoadCheck)
{
    const auto probe = [this] { return this->template Run<2>(); };

    // K=129 breaks the A-tensor vector load.
    this->conv_param = {2, 2, 128, 129, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}};
    bool is_supported = probe();
    EXPECT_FALSE(is_supported);

    // C=257 breaks the vector load for B, E and Ds.
    this->conv_param = {2, 2, 128, 128, 257, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}};
    is_supported = probe();
    EXPECT_FALSE(is_supported);
}
add_gtest_executable(test_grouped_convnd_fwd grouped_convnd_fwd.cpp) add_gtest_executable(test_grouped_convnd_fwd test_grouped_convnd_fwd.cpp)
target_link_libraries(test_grouped_convnd_fwd PRIVATE utility device_grouped_conv1d_fwd_instance device_grouped_conv2d_fwd_instance device_grouped_conv3d_fwd_instance) target_link_libraries(test_grouped_convnd_fwd PRIVATE utility device_grouped_conv1d_fwd_instance device_grouped_conv2d_fwd_instance device_grouped_conv3d_fwd_instance)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment