Apply cshuffle to bwd_weight_cshuffle operator

ebb5522c · Mateusz Ozga · root · fdfe2102 · ebb5522c · ebb5522c
Commit ebb5522c authored Nov 12, 2024 by Mateusz Ozga Committed by root Dec 16, 2024
5 changed files
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev5_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev5_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for
+// in[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = out[n, do, ho, wo, g, k]
+// Hands the f32 XDL C-shuffle instance list (3 spatial dims, NDHWGC / GKZYXC / NDHWGK layouts,
+// ConvBwdWeightDefault, intrawave scheduler, pipeline v5) to add_device_operation_instances
+// together with the caller's `instances` vector.
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    using InstanceList = device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_instances<
+        3,
+        NDHWGC,
+        GKZYXC,
+        NDHWGK,
+        ConvBwdWeightDefault,
+        BlockGemmPipelineScheduler::Intrawave,
+        BlockGemmPipelineVersion::v5>;
+
+    add_device_operation_instances(instances, InstanceList{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
@@ -10,7 +10,7 @@ namespace device {
 namespace instance {

 // Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev2_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
                                                           NDHWGC,
                                                           GKZYXC,
@@ -22,22 +22,15 @@ void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances
                                                           PassThrough,
                                                           PassThrough>>>& instances)
 {
-    // 1. Default
-    add_device_operation_instances(
-        instances,
-        device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_instances<3,
-                                                                   NDHWGC,
-                                                                   GKZYXC,
-                                                                   NDHWGK,
-                                                                   ConvBwdWeightDefault>{});
-    // 2. Filter1x1Stride1Pad0
    add_device_operation_instances(instances,
                                   device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_instances<
                                       3,
                                       NDHWGC,
                                       GKZYXC,
                                       NDHWGK,
-                                       ConvBwdWeightFilter1x1Stride1Pad0>{});
+                                       ConvBwdWeightFilter1x1Stride1Pad0,
+                                       BlockGemmPipelineScheduler::Intrawave,
+                                       BlockGemmPipelineVersion::v2>{});
 }

 } // namespace instance

--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev5_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev5_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for
+// in[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = out[n, do, ho, wo, g, k]
+// Hands the f32 XDL C-shuffle instance list (3 spatial dims, NDHWGC / GKZYXC / NDHWGK layouts,
+// ConvBwdWeightFilter1x1Stride1Pad0, intrawave scheduler, pipeline v5) to
+// add_device_operation_instances together with the caller's `instances` vector.
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    using InstanceList = device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_instances<
+        3,
+        NDHWGC,
+        GKZYXC,
+        NDHWGK,
+        ConvBwdWeightFilter1x1Stride1Pad0,
+        BlockGemmPipelineScheduler::Intrawave,
+        BlockGemmPipelineVersion::v5>;
+
+    add_device_operation_instances(instances, InstanceList{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/test/grouped_convnd_bwd_weight/CMakeLists.txt
+++ b/test/grouped_convnd_bwd_weight/CMakeLists.txt
@@ -7,7 +7,7 @@ if(GPU_TARGETS MATCHES "gfx9" OR DL_KERNELS)
 endif()
 add_gtest_executable(test_grouped_convnd_bwd_weight_interface_xdl test_grouped_convnd_bwd_weight_interface_xdl.cpp)
 if(result EQUAL 0)
-   target_link_libraries(test_grouped_convnd_bwd_weight_interface_xdl PRIVATE utility)
+   target_link_libraries(test_grouped_convnd_bwd_weight_interface_xdl PRIVATE utility device_grouped_conv1d_bwd_weight_instance device_grouped_conv2d_bwd_weight_instance device_grouped_conv3d_bwd_weight_instance)
 endif()
 add_gtest_executable(test_grouped_convnd_bwd_weight_interface_wmma test_grouped_convnd_bwd_weight_interface_wmma.cpp)
 if(result EQUAL 0)

--- a/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_interface_xdl.cpp
+++ b/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_interface_xdl.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include <cstdlib>
 #include <iostream>
@@ -12,69 +12,143 @@
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp"
-
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp"
+#include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/device_memory.hpp"

 #include <gtest/gtest.h>

-using F16         = ck::half_t;
-using F32         = float;
+namespace ctl = ck::tensor_layout::convolution;
+
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;

+// Element types shared by the host reference and the device operation used in the tests.
+using InDataType  = ck::bhalf_t;
+using WeiDataType = float;
+using OutDataType = ck::bhalf_t;
+using AccDataType = float;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
-using ConvolutionBackwardWeightSpecialization =
-    ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization;

-static constexpr auto ConvBwdWeightDefault = ConvolutionBackwardWeightSpecialization::Default;
-static constexpr auto Filter1x1Stride1Pad0 =
-    ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0;
+static constexpr auto ConvBwdWeightDefault =
+    ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default;

-template <typename Tuple, ConvolutionBackwardWeightSpecialization ConvSpec>
-class TestGroupedConvndBwdWeight : public ::testing::Test
+// Bundles one input / weight / output layout triple under the member names InputLayout,
+// WeightLayout and OutputLayout.
+template <typename InputLay, typename WeightLay, typename OutputLay>
+struct CommonLayoutSetting
 {
-    protected:
-    static constexpr ck::index_t NDimSpatial = 2;
+    using InputLayout  = InputLay;
+    using WeightLayout = WeightLay;
+    using OutputLayout = OutputLay;
+};
+
+// Selects the layout triple for a given number of spatial dimensions: NDimSpatial - 1 indexes
+// each three-entry tuple, so 1 -> GNWC/GKXC/GNWK, 2 -> GNHWC/GKYXC/GNHWK and
+// 3 -> GNDHWC/GKZYXC/GNDHWK.
+template <ck::index_t NDimSpatial>
+struct CommonLayoutSettingSelector
+    : CommonLayoutSetting<
+          ck::tuple_element_t<NDimSpatial - 1, ck::Tuple<ctl::GNWC, ctl::GNHWC, ctl::GNDHWC>>,
+          ck::tuple_element_t<NDimSpatial - 1, ck::Tuple<ctl::GKXC, ctl::GKYXC, ctl::GKZYXC>>,
+          ck::tuple_element_t<NDimSpatial - 1, ck::Tuple<ctl::GNWK, ctl::GNHWK, ctl::GNDHWK>>>
+{
+};
+
+// Input tensor layout that CommonLayoutSettingSelector yields for NDimSpatial dimensions.
+template <ck::index_t NDimSpatial>
+using InputLayout = typename CommonLayoutSettingSelector<NDimSpatial>::InputLayout;

-    using InLayout  = std::tuple_element_t<2, Tuple>;
-    using WeiLayout = std::tuple_element_t<1, Tuple>;
-    using OutLayout = std::tuple_element_t<0, Tuple>;
+// Weight tensor layout that CommonLayoutSettingSelector yields for NDimSpatial dimensions.
+template <ck::index_t NDimSpatial>
+using WeightLayout = typename CommonLayoutSettingSelector<NDimSpatial>::WeightLayout;

-    // clang-format off
-    using GroupedConvBwdWeightDeviceInstance = ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Xdl_CShuffle
-        //##########|     Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|                      ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer|
-        //##########|     Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                            Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|
-        //##########| Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|                    Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|
-        //##########|        |         |          |          |       |        |        |        |            |            |            |                                  |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |
-        <         NDimSpatial,  InLayout, WeiLayout,OutLayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,    32,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8>;
-    // clang-format on
+// Output tensor layout that CommonLayoutSettingSelector yields for NDimSpatial dimensions.
+template <ck::index_t NDimSpatial>
+using OutputLayout = typename CommonLayoutSettingSelector<NDimSpatial>::OutputLayout;

+class TestGroupedConvndBwdWeight : public ::testing::Test
+{
+    protected:
    ck::utils::conv::ConvParam conv_param;
-    ck::index_t split_k{2};

    template <ck::index_t NDimSpatial>
-    bool Run()
+    void RunReference(Tensor<InDataType>& in,
+                      Tensor<WeiDataType>& wei_host_result,
+                      Tensor<OutDataType>& out)
+    {
+        auto ref_conv     = ck::tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,
+                                                                           InDataType,
+                                                                           WeiDataType,
+                                                                           OutDataType,
+                                                                           PassThrough,
+                                                                           PassThrough,
+                                                                           PassThrough>{};
+        auto ref_invoker  = ref_conv.MakeInvoker();
+        auto ref_argument = ref_conv.MakeArgument(in,
+                                                  wei_host_result,
+                                                  out,
+                                                  conv_param.conv_filter_strides_,
+                                                  conv_param.conv_filter_dilations_,
+                                                  conv_param.input_left_pads_,
+                                                  conv_param.input_right_pads_,
+                                                  PassThrough{},
+                                                  PassThrough{},
+                                                  PassThrough{},
+                                                  {},
+                                                  {},
+                                                  {});
+
+        ref_invoker.Run(ref_argument);
+    }
+
+    // Runs the device backward-weight convolution for the current conv_param with the given
+    // split_k and compares the downloaded weights against the host reference.
+    //
+    // Returns the result of ck::utils::check_err when the device operation accepts the argument.
+    // When IsSupportedArgument() rejects it, nothing is launched or compared and the function
+    // returns true after printing a notice.
+    template <ck::index_t NDimSpatial>
+    bool PerformConvWeight(ck::index_t split_k)
     {
+        bool passed{true};

         const auto in_g_n_c_wis_desc =
-            ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(
-                conv_param);
+            ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<
+                InputLayout<NDimSpatial>>(conv_param);

         const auto wei_g_k_c_xs_desc =
-            ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(
-                conv_param);
+            ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<
+                WeightLayout<NDimSpatial>>(conv_param);

         const auto out_g_n_k_wos_desc =
-            ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(
-                conv_param);
+            ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<
+                OutputLayout<NDimSpatial>>(conv_param);
+
+        Tensor<InDataType> in(in_g_n_c_wis_desc);
+        Tensor<WeiDataType> wei_host_result(wei_g_k_c_xs_desc);
+        Tensor<WeiDataType> wei_device_result(wei_g_k_c_xs_desc);
+        Tensor<OutDataType> out(out_g_n_k_wos_desc);
+
+        in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
+        out.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
+
+        DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
+        DeviceMem wei_device_buf(sizeof(WeiDataType) *
+                                 wei_device_result.mDesc.GetElementSpaceSize());
+        DeviceMem out_device_buf(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize());
+
+        in_device_buf.ToDevice(in.mData.data());
+        out_device_buf.ToDevice(out.mData.data());
+
+        // init to 0
+        wei_device_buf.SetZero();

         std::array<ck::index_t, NDimSpatial + 3> input_lengths{};
-        std::array<ck::index_t, NDimSpatial + 3> filter_lengths{};
-        std::array<ck::index_t, NDimSpatial + 3> output_lengths{};
         std::array<ck::index_t, NDimSpatial + 3> input_strides{};
+        std::array<ck::index_t, NDimSpatial + 3> filter_lengths{};
         std::array<ck::index_t, NDimSpatial + 3> weights_strides{};
+        std::array<ck::index_t, NDimSpatial + 3> output_lengths{};
         std::array<ck::index_t, NDimSpatial + 3> output_strides{};
         std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
         std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
@@ -94,86 +168,261 @@ class TestGroupedConvndBwdWeight : public ::testing::Test
         range_copy(conv_param.input_left_pads_, begin(input_left_pads));
         range_copy(conv_param.input_right_pads_, begin(input_right_pads));

-        auto conv = GroupedConvBwdWeightDeviceInstance{};
-
-        auto argument = conv.MakeArgument(nullptr,
-                                          nullptr,
-                                          nullptr,
-                                          input_lengths,
-                                          input_strides,
-                                          filter_lengths,
-                                          weights_strides,
-                                          output_lengths,
-                                          output_strides,
-                                          conv_filter_strides,
-                                          conv_filter_dilations,
-                                          input_left_pads,
-                                          input_right_pads,
-                                          PassThrough{},
-                                          PassThrough{},
-                                          PassThrough{},
-                                          split_k);
-        return conv.IsSupportedArgument(argument);
+        RunReference<NDimSpatial>(in, wei_host_result, out);
+
+        // The layouts come from the same selector that built the host descriptors above, so the
+        // device operation and the host tensors cannot drift apart.
+        using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Xdl_CShuffle<
+            NDimSpatial,
+            InputLayout<NDimSpatial>,  // InLayout
+            WeightLayout<NDimSpatial>, // WeiLayout
+            OutputLayout<NDimSpatial>, // OutLayout
+            InDataType,                // InDataType
+            WeiDataType,               // WeiDataType
+            OutDataType,               // OutDataType
+            AccDataType,               // AccDataType
+            PassThrough,               // InElementwiseOperation
+            PassThrough,               // WeiElementwiseOperation
+            PassThrough,               // OutElementwiseOperation
+            ConvBwdWeightDefault,      // ConvolutionBackwardWeightSpecialization
+            64,                        // BlockSize
+            16,                        // MPerBlock
+            16,                        // NPerBlock
+            32,                        // K0PerBlock
+            8,                         // K1
+            16,                        // MPerXdl
+            16,                        // NPerXdl
+            1,                         // MXdlPerWave
+            1,                         // NXdlPerWave
+            S<4, 16, 1>,               // ABlockTransferThreadClusterLengths_K0_M_K1
+            S<2, 0, 1>,                // ABlockTransferThreadClusterArrangeOrder
+            S<1, 0, 2>,                // ABlockTransferSrcAccessOrder
+            1,                         // ABlockTransferSrcVectorDim
+            1,                         // ABlockTransferSrcScalarPerVector
+            4,                         // ABlockTransferDstScalarPerVector_K1
+            false,                     // ABlockLdsAddExtraM
+            S<4, 16, 1>,               // BBlockTransferThreadClusterLengths_K0_N_K1
+            S<2, 0, 1>,                // BBlockTransferThreadClusterArrangeOrder
+            S<1, 0, 2>,                // BBlockTransferSrcAccessOrder
+            1,                         // BBlockTransferSrcVectorDim
+            1,                         // BBlockTransferSrcScalarPerVector
+            4,                         // BBlockTransferDstScalarPerVector_K1
+            false,                     // BBlockLdsAddExtraN
+            1,                         // CShuffleMXdlPerWavePerShuffle
+            1,                         // CShuffleNXdlPerWavePerShuffle
+            S<1, 8, 1, 8>, // CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+            1>;            // CBlockTransferScalarPerVector_NWaveNPerXdl
+
+        auto conv = DeviceOp{};
+        auto argument =
+            conv.MakeArgument(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
+                              static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
+                              static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+                              input_lengths,
+                              input_strides,
+                              filter_lengths,
+                              weights_strides,
+                              output_lengths,
+                              output_strides,
+                              conv_filter_strides,
+                              conv_filter_dilations,
+                              input_left_pads,
+                              input_right_pads,
+                              PassThrough{},
+                              PassThrough{},
+                              PassThrough{},
+                              split_k);
+
+        auto invoker = conv.MakeInvoker();
+
+        if(!conv.IsSupportedArgument(argument))
+        {
+            // Nothing was launched or compared; say so instead of passing without a trace.
+            std::cout << "Skipped: argument not supported by the device operation, "
+                      << "split_k " << split_k << std::endl;
+            return passed;
+        }
+
+        const float avg_time = invoker.Run(argument, StreamConfig{nullptr, false});
+        wei_device_buf.FromDevice(wei_device_result.mData.data());
+        passed = ck::utils::check_err(
+            wei_device_result.mData, wei_host_result.mData, "Error: incorrect results!");
+
+        // NOTE(review): the `false` in StreamConfig presumably disables kernel timing, in which
+        // case avg_time would be 0 - confirm. The throughput figures divide by avg_time, so they
+        // are only reported when a positive time came back.
+        if(avg_time > 0)
+        {
+            std::size_t flop = conv_param.GetFlops() +
+                               3 * conv_param.GetOutputByte<WeiDataType>() / sizeof(WeiDataType);
+            std::size_t num_bytes = conv_param.GetByte<InDataType, WeiDataType, OutDataType>() +
+                                    conv_param.GetOutputByte<WeiDataType>();
+
+            float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+            float gb_per_sec = num_bytes / 1.E6 / avg_time;
+
+            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, "
+                      << "split_k " << split_k << std::endl;
+        }
+        return passed;
+    }
+
+    // Runs PerformConvWeight for split_k = 1 and split_k = 2 and records a gtest failure for
+    // each value whose run returns false.
+    template <ck::index_t NDimSpatial>
+    void Run()
+    {
+        // Each split_k is checked on its own: a failure for one value neither skips the other
+        // run nor hides which value failed.
+        for(const auto split_k : {1, 2})
+        {
+            EXPECT_TRUE(PerformConvWeight<NDimSpatial>(split_k)) << "split_k = " << split_k;
+        }
     }
 };

-using GNHWC = ck::tensor_layout::convolution::GNHWC;
-using NHWGC = ck::tensor_layout::convolution::NHWGC;
+// 1D case. Each spatial list has one entry to match the leading dimension count of 1; the
+// fixture copies these lists into arrays of NDimSpatial elements.
+TEST_F(TestGroupedConvndBwdWeight, TestGroupedConvndBwdWeight_NDimSpatial_1_Filter_1x1)
+{
+    this->conv_param = {1, 2, 4, 192, 192, {1}, {28}, {1}, {1}, {0}, {0}};
+    this->template Run<1>();
+}

-using GKYXC = ck::tensor_layout::convolution::GKYXC;
+// 1D case with a filter length of 3. Each spatial list has one entry to match the leading
+// dimension count of 1; the fixture copies these lists into arrays of NDimSpatial elements.
+TEST_F(TestGroupedConvndBwdWeight, TestGroupedConvndBwdWeight_NDimSpatial_1_Filter_3x3)
+{
+    this->conv_param = {1, 2, 4, 192, 192, {3}, {28}, {1}, {1}, {0}, {0}};
+    this->template Run<1>();
+}

-using GNHWK = ck::tensor_layout::convolution::GNHWK;
-using NHWGK = ck::tensor_layout::convolution::NHWGK;
+// 2D case. Each spatial list has two entries to match the leading dimension count of 2; the
+// fixture copies these lists into arrays of NDimSpatial elements.
+TEST_F(TestGroupedConvndBwdWeight, TestGroupedConvndBwdWeight_NDimSpatial_2_Filter_1x1)
+{
+    this->conv_param = {2, 2, 4, 192, 192, {1, 1}, {28, 28}, {1, 1}, {1, 1}, {0, 0}, {0, 0}};
+    this->template Run<2>();
+}

-using KernelTypes =
-    ::testing::Types<std::tuple<GNHWK, GKYXC, GNHWC>, std::tuple<NHWGK, GKYXC, NHWGC>>;
+// 2D case with a 3x3 filter. Each spatial list has two entries to match the leading dimension
+// count of 2; the fixture copies these lists into arrays of NDimSpatial elements.
+TEST_F(TestGroupedConvndBwdWeight, TestGroupedConvndBwdWeight_NDimSpatial_2_Filter_3x3)
+{
+    this->conv_param = {2, 2, 4, 192, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {0, 0}, {0, 0}};
+    this->template Run<2>();
+}

-template <typename Tuple>
-class TestGroupedConvndBwdWeightDefault
-    : public TestGroupedConvndBwdWeight<Tuple, ConvBwdWeightDefault>
+// 3D case with a 1x1x1 filter, unit strides and no padding.
+TEST_F(TestGroupedConvndBwdWeight, TestGroupedConvndBwdWeight_NDimSpatial_3_Filter_1x1)
 {
-};
+    conv_param = ck::utils::conv::ConvParam{
+        3, 2, 4, 192, 192, {1, 1, 1}, {28, 28, 28}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}};
+    Run<3>();
+}

-template <typename Tuple>
-class TestGroupedConvndBwdWeightFilter1x1
-    : public TestGroupedConvndBwdWeight<Tuple, Filter1x1Stride1Pad0>
+// 3D case with a 3x3x3 filter, unit strides and no padding.
+TEST_F(TestGroupedConvndBwdWeight, TestGroupedConvndBwdWeight_NDimSpatial_3_Filter_3x3)
 {
-};
+    conv_param = ck::utils::conv::ConvParam{
+        3, 2, 4, 192, 192, {3, 3, 3}, {28, 28, 28}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}};
+    Run<3>();
+}

-TYPED_TEST_SUITE(TestGroupedConvndBwdWeightDefault, KernelTypes);
-TYPED_TEST_SUITE(TestGroupedConvndBwdWeightFilter1x1, KernelTypes);
+// 1D case with unit stride. Each spatial list has one entry to match the leading dimension
+// count of 1; the fixture copies these lists into arrays of NDimSpatial elements.
+TEST_F(TestGroupedConvndBwdWeight, TestGroupedConvndBwdWeight_NDimSpatial_1_Stride_1x1)
+{
+    this->conv_param = {1, 2, 4, 192, 192, {1}, {28}, {1}, {1}, {0}, {0}};
+    this->template Run<1>();
+}

-TYPED_TEST(TestGroupedConvndBwdWeightFilter1x1, SpecializationCheck)
+// 1D case with a stride of 2. Each spatial list has one entry to match the leading dimension
+// count of 1; the fixture copies these lists into arrays of NDimSpatial elements.
+TEST_F(TestGroupedConvndBwdWeight, TestGroupedConvndBwdWeight_NDimSpatial_1_Stride_2x2)
 {
-    // Check filter 3,3 instead of 1,1
-    this->conv_param  = {2, 2, 4, 192, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {0, 0}, {0, 0}};
-    bool is_supported = this->template Run<2>();
-    EXPECT_FALSE(is_supported);
+    this->conv_param = {1, 2, 4, 192, 192, {1}, {28}, {2}, {1}, {0}, {0}};
+    this->template Run<1>();
+}

-    // Check strides 2,2 instead of 1,1
-    this->conv_param = {2, 2, 4, 192, 192, {1, 1}, {28, 28}, {2, 2}, {1, 1}, {0, 0}, {0, 0}};
-    is_supported     = this->template Run<2>();
-    EXPECT_FALSE(is_supported);
+// 2D case with unit strides. Each spatial list has two entries to match the leading dimension
+// count of 2; the fixture copies these lists into arrays of NDimSpatial elements.
+TEST_F(TestGroupedConvndBwdWeight, TestGroupedConvndBwdWeight_NDimSpatial_2_Stride_1x1)
+{
+    this->conv_param = {2, 2, 4, 192, 192, {1, 1}, {28, 28}, {1, 1}, {1, 1}, {0, 0}, {0, 0}};
+    this->template Run<2>();
+}

-    // Check with pad
-    this->conv_param = {2, 2, 4, 192, 192, {1, 1}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}};
-    is_supported     = this->template Run<2>();
-    EXPECT_FALSE(is_supported);
+// 2D case with 2x2 strides. Each spatial list has two entries to match the leading dimension
+// count of 2; the fixture copies these lists into arrays of NDimSpatial elements.
+TEST_F(TestGroupedConvndBwdWeight, TestGroupedConvndBwdWeight_NDimSpatial_2_Stride_2x2)
+{
+    this->conv_param = {2, 2, 4, 192, 192, {1, 1}, {28, 28}, {2, 2}, {1, 1}, {0, 0}, {0, 0}};
+    this->template Run<2>();
+}

-    // Supported version
-    this->conv_param = {2, 2, 128, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}};
-    is_supported     = this->template Run<2>();
-    EXPECT_TRUE(is_supported);
+// 3D case with unit strides, a 1x1x1 filter and no padding.
+TEST_F(TestGroupedConvndBwdWeight, TestGroupedConvndBwdWeight_NDimSpatial_3_Stride_1x1)
+{
+    conv_param = ck::utils::conv::ConvParam{
+        3, 2, 4, 192, 192, {1, 1, 1}, {28, 28, 28}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}};
+    Run<3>();
+}
+
+// 3D case with 2x2x2 strides, a 1x1x1 filter and no padding.
+TEST_F(TestGroupedConvndBwdWeight, TestGroupedConvndBwdWeight_NDimSpatial_3_Stride_2x2)
+{
+    conv_param = ck::utils::conv::ConvParam{
+        3, 2, 4, 192, 192, {1, 1, 1}, {28, 28, 28}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}};
+    Run<3>();
+}
+
+// 1D case with left and right padding of 1. Each spatial list has one entry to match the
+// leading dimension count of 1; the fixture copies these lists into arrays of NDimSpatial
+// elements.
+TEST_F(TestGroupedConvndBwdWeight, TestGroupedConvndBwdWeight_NDimSpatial_1_WithPadding)
+{
+    this->conv_param = {1, 2, 4, 192, 192, {1}, {28}, {1}, {1}, {1}, {1}};
+    this->template Run<1>();
+}
+
+// 2D case with left and right padding of 1. Each spatial list has two entries to match the
+// leading dimension count of 2; the fixture copies these lists into arrays of NDimSpatial
+// elements.
+TEST_F(TestGroupedConvndBwdWeight, TestGroupedConvndBwdWeight_NDimSpatial_2_WithPadding)
+{
+    this->conv_param = {2, 2, 4, 192, 192, {1, 1}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}};
+    this->template Run<2>();
+}
+
+// 3D case with left and right padding of 1 in every spatial dimension.
+TEST_F(TestGroupedConvndBwdWeight, TestGroupedConvndBwdWeight_NDimSpatial_3_WithPadding)
+{
+    conv_param = ck::utils::conv::ConvParam{
+        3, 2, 4, 192, 192, {1, 1, 1}, {28, 28, 28}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}};
+    Run<3>();
+}
+
+// 1D case. Each spatial list has one entry to match the leading dimension count of 1; the
+// fixture copies these lists into arrays of NDimSpatial elements.
+// NOTE(review): these values equal the 1D WithPadding configuration - confirm that a distinct
+// "supported" configuration was not intended.
+TEST_F(TestGroupedConvndBwdWeight, TestGroupedConvndBwdWeight_NDimSpatial_1_SupportedVersion)
+{
+    this->conv_param = {1, 2, 4, 192, 192, {1}, {28}, {1}, {1}, {1}, {1}};
+    this->template Run<1>();
+}
+
+// 2D case. Each spatial list has two entries to match the leading dimension count of 2; the
+// fixture copies these lists into arrays of NDimSpatial elements.
+// NOTE(review): these values equal the 2D WithPadding configuration - confirm that a distinct
+// "supported" configuration was not intended.
+TEST_F(TestGroupedConvndBwdWeight, TestGroupedConvndBwdWeight_NDimSpatial_2_SupportedVersion)
+{
+    this->conv_param = {2, 2, 4, 192, 192, {1, 1}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}};
+    this->template Run<2>();
+}
+
+// 3D case with a 1x1x1 filter, unit strides and padding of 1 on both sides.
+TEST_F(TestGroupedConvndBwdWeight, TestGroupedConvndBwdWeight_NDimSpatial_3_SupportedVersion)
+{
+    conv_param = ck::utils::conv::ConvParam{
+        3, 2, 4, 192, 192, {1, 1, 1}, {28, 28, 28}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}};
+    Run<3>();
+}
+
+// 1D variant of the "vector load for A" configuration. Each spatial list has one entry to match
+// the leading dimension count of 1; the fixture copies these lists into arrays of NDimSpatial
+// elements.
+TEST_F(TestGroupedConvndBwdWeight, TestGroupedConvndBwdWeight_NDimSpatial_1_VectorLoadForA)
+{
+    this->conv_param = {1, 2, 128, 129, 256, {1}, {7}, {2}, {1}, {0}, {0}};
+    this->template Run<1>();
+}
+
+// 2D "vector load for A" configuration carried over from the previous VectorLoadCheck test.
+TEST_F(TestGroupedConvndBwdWeight, TestGroupedConvndBwdWeight_NDimSpatial_2_VectorLoadForA)
+{
+    conv_param = ck::utils::conv::ConvParam{
+        2, 2, 128, 129, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}};
+    Run<2>();
+}
+
+// 1D variant of the "vector load for B, E, Ds" configuration. Each spatial list has one entry to
+// match the leading dimension count of 1; the fixture copies these lists into arrays of
+// NDimSpatial elements.
+TEST_F(TestGroupedConvndBwdWeight, TestGroupedConvndBwdWeight_NDimSpatial_1_VectorLoadForB_E_DS)
+{
+    this->conv_param = {1, 2, 128, 128, 257, {1}, {7}, {2}, {1}, {0}, {0}};
+    this->template Run<1>();
 }

-TYPED_TEST(TestGroupedConvndBwdWeightDefault, VectorLoadCheck)
+// 2D "vector load for B, E, Ds" configuration, kept from the previous VectorLoadCheck test. The
+// earlier version expected this argument to be reported as unsupported; this version only calls
+// Run<2>().
+TEST_F(TestGroupedConvndBwdWeight, TestGroupedConvndBwdWeight_NDimSpatial_2_VectorLoadForB_E_DS)
 {
-    // vector load for A
-    this->conv_param  = {2, 2, 128, 129, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}};
-    bool is_supported = this->template Run<2>();
-    EXPECT_FALSE(is_supported);
-    // vector load for B, E, Ds
     this->conv_param = {2, 2, 128, 128, 257, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}};
-    is_supported     = this->template Run<2>();
-    EXPECT_FALSE(is_supported);
+    this->template Run<2>();
 }