Add client example of grouped conv2d backward data (data type: fp16) (#481)

* Improve example reusability * Remove no-longer used file * Rename folder of grouped_conv_bwd_data example * Add normal grouped conv bwd example * Add interface 'DeviceGroupedConvBwdData' * Prettify comment of device op type arguments * Add grouped conv2d/conv3d backward data fp16 instances * Fix wrong template argument * Add grouped_conv2d_bwd_data client example * Use simpler expression to calculate memory size * Fix formating * Remove grouped_conv3d_bw_data instances Underlying device operator is not ready to handle 3D input * Remove no-longer necessary include directive * Add missing include directive * Use more realistic conv param in example

Add client example of grouped conv2d backward data (data type: fp16) (#481)
* Improve example reusability * Remove no-longer used file * Rename folder of grouped_conv_bwd_data example * Add normal grouped conv bwd example * Add interface 'DeviceGroupedConvBwdData' * Prettify comment of device op type arguments * Add grouped conv2d/conv3d backward data fp16 instances * Fix wrong template argument * Add grouped_conv2d_bwd_data client example * Use simpler expression to calculate memory size * Fix formating * Remove grouped_conv3d_bw_data instances Underlying device operator is not ready to handle 3D input * Remove no-longer necessary include directive * Add missing include directive * Use more realistic conv param in example
9e57a290 · Po Yen Chen · GitHub · 1a0b0e7b · 9e57a290 · 9e57a290
Unverified Commit 9e57a290 authored Nov 03, 2022 by Po Yen Chen Committed by GitHub Nov 02, 2022
15 changed files
--- a/client_example/09_grouped_conv2d_bwd_data/CMakeLists.txt
+++ b/client_example/09_grouped_conv2d_bwd_data/CMakeLists.txt
+add_executable(client_grouped_conv2d_bwd_data grouped_conv2d_bwd_data.cpp)
+target_link_libraries(client_grouped_conv2d_bwd_data PRIVATE composable_kernel::device_operations)
--- a/client_example/09_grouped_conv2d_bwd_data/grouped_conv2d_bwd_data.cpp
+++ b/client_example/09_grouped_conv2d_bwd_data/grouped_conv2d_bwd_data.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <cstdlib>
+#include <iomanip>
+#include <iostream>
+#include <iterator>
+#include <numeric>
+#include <vector>
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+using InDataType  = ck::half_t;
+using WeiDataType = ck::half_t;
+using OutDataType = ck::half_t;
+using InLayout    = ck::tensor_layout::convolution::GNHWC;
+using WeiLayout   = ck::tensor_layout::convolution::GKYXC;
+using OutLayout   = ck::tensor_layout::convolution::GNHWK;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+static constexpr ck::index_t NumDimSpatial = 2;
+static constexpr ck::index_t G             = 32;
+static constexpr ck::index_t N             = 256;
+static constexpr ck::index_t K             = 192;
+static constexpr ck::index_t C             = 192;
+static constexpr ck::index_t Y             = 3;
+static constexpr ck::index_t X             = 3;
+static constexpr ck::index_t Hi            = 28;
+static constexpr ck::index_t Wi            = 28;
+static constexpr ck::index_t Ho            = 28;
+static constexpr ck::index_t Wo            = 28;
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+    void* GetDeviceBuffer() { return p_mem_; }
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+    void* p_mem_;
+};
+int main()
+{
+    std::array<ck::index_t, NumDimSpatial + 3> in_lengths{G, N, Hi, Wi, C};
+    std::array<ck::index_t, NumDimSpatial + 3> in_strides{0, 0, 0, 0, 1};
+    std::array<ck::index_t, NumDimSpatial + 3> wei_lengths{G, K, Y, X, C};
+    std::array<ck::index_t, NumDimSpatial + 3> wei_strides{0, 0, 0, 0, 1};
+    std::array<ck::index_t, NumDimSpatial + 3> out_lengths{G, N, Ho, Wo, K};
+    std::array<ck::index_t, NumDimSpatial + 3> out_strides{0, 0, 0, 0, 1};
+    std::partial_sum(rbegin(in_lengths),
+                     std::prev(rend(in_lengths)),
+                     std::next(rbegin(in_strides)),
+                     std::multiplies<>{});
+    std::partial_sum(rbegin(wei_lengths),
+                     std::prev(rend(wei_lengths)),
+                     std::next(rbegin(wei_strides)),
+                     std::multiplies<>{});
+    std::partial_sum(rbegin(out_lengths),
+                     std::prev(rend(out_lengths)),
+                     std::next(rbegin(out_strides)),
+                     std::multiplies<>{});
+    // transpose GNHWC/GKYXC/GNHWK to GNCHW/GKCYX/GNCHW
+    std::rotate(
+        rbegin(in_lengths), std::next(rbegin(in_lengths)), std::next(rbegin(in_lengths), 3));
+    std::rotate(
+        rbegin(in_strides), std::next(rbegin(in_strides)), std::next(rbegin(in_strides), 3));
+    std::rotate(
+        rbegin(wei_lengths), std::next(rbegin(wei_lengths)), std::next(rbegin(wei_lengths), 3));
+    std::rotate(
+        rbegin(wei_strides), std::next(rbegin(wei_strides)), std::next(rbegin(wei_strides), 3));
+    std::rotate(
+        rbegin(out_lengths), std::next(rbegin(out_lengths)), std::next(rbegin(out_lengths), 3));
+    std::rotate(
+        rbegin(out_strides), std::next(rbegin(out_strides)), std::next(rbegin(out_strides), 3));
+    std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1};
+    std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1};
+    std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
+    std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
+    SimpleDeviceMem in(sizeof(InDataType) * G * N * Hi * Wi * C);
+    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
+    SimpleDeviceMem out(sizeof(OutDataType) * G * N * Ho * Wo * K);
+    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdData<NumDimSpatial,
+                                                                            InLayout,
+                                                                            WeiLayout,
+                                                                            OutLayout,
+                                                                            InDataType,
+                                                                            WeiDataType,
+                                                                            OutDataType,
+                                                                            PassThrough,
+                                                                            PassThrough,
+                                                                            PassThrough>;
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+    std::string best_op_name;
+    int best_op_id        = -1;
+    float best_avg_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+    float best_tflops     = 0;
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+    for(int i = 0; i < op_ptrs.size(); ++i)
+    {
+        auto& op_ptr        = op_ptrs[i];
+        auto argument_ptr   = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
+                                                        wei.GetDeviceBuffer(),
+                                                        out.GetDeviceBuffer(),
+                                                        in_lengths,
+                                                        in_strides,
+                                                        wei_lengths,
+                                                        wei_strides,
+                                                        out_lengths,
+                                                        out_strides,
+                                                        filter_strides,
+                                                        filter_dilations,
+                                                        input_left_pads,
+                                                        input_right_pads,
+                                                        PassThrough{},
+                                                        PassThrough{},
+                                                        PassThrough{});
+        auto invoker_ptr    = op_ptr->MakeInvokerPointer();
+        std::string op_name = op_ptr->GetTypeString();
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+            std::size_t flop      = std::size_t(2) * G * N * K * C * Ho * Wo * Y * X;
+            std::size_t num_bytes = sizeof(InDataType) * G * N * Hi * Wi * C +
+                                    sizeof(WeiDataType) * G * K * Y * X * C +
+                                    sizeof(OutDataType) * G * N * Ho * Wo * K;
+            float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+            float gb_per_sec = num_bytes / 1.E6 / avg_time;
+            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+            if(tflops > best_tflops)
+            {
+                best_op_id      = i;
+                best_op_name    = op_name;
+                best_avg_time   = avg_time;
+                best_gb_per_sec = gb_per_sec;
+                best_tflops     = tflops;
+            }
+        }
+        else
+        {
+            std::cerr << op_name << " does not support this problem" << std::endl;
+        }
+    }
+    if(best_op_id < 0)
+    {
+        std::cerr << "no suitable instance" << std::endl;
+        return EXIT_FAILURE;
+    }
+    std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
+              << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+    // run the best intance
+    {
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+        auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
+                                                        wei.GetDeviceBuffer(),
+                                                        out.GetDeviceBuffer(),
+                                                        in_lengths,
+                                                        in_strides,
+                                                        wei_lengths,
+                                                        wei_strides,
+                                                        out_lengths,
+                                                        out_strides,
+                                                        filter_strides,
+                                                        filter_dilations,
+                                                        input_left_pads,
+                                                        input_right_pads,
+                                                        PassThrough{},
+                                                        PassThrough{},
+                                                        PassThrough{});
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+        std::cout << "Done" << std::endl;
+    }
+}
--- a/example/38_grouped_conv_bwd_data_bias_relu/CMakeLists.txt
+++ b/example/38_grouped_conv_bwd_data_bias_relu/CMakeLists.txt
-add_example_executable(example_grouped_conv_bwd_data_bias_relu_fp16 grouped_conv_bwd_data_bias_relu_fp16.cpp)
--- a/example/38_grouped_conv_bwd_data_bias_relu/grouped_conv_bwd_data_bias_relu_fp16.cpp
+++ b/example/38_grouped_conv_bwd_data_bias_relu/grouped_conv_bwd_data_bias_relu_fp16.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "grouped_conv_bwd_data_bias_relu_common.hpp"
-#include "ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp"
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-using OutDataType      = ck::half_t;
-using WeiDataType      = ck::half_t;
-using AccDataType      = float;
-using CShuffleDataType = ck::half_t;
-using BiasDataType     = ck::half_t; // bias
-using InDataType       = ck::half_t;
-using OutLayout  = ck::tensor_layout::convolution::GNHWK;
-using WeiLayout  = ck::tensor_layout::convolution::GKYXC;
-using BiasLayout = ck::tensor_layout::convolution::G_C;
-using InLayout   = ck::tensor_layout::convolution::GNHWC;
-using OutElementOp     = ck::tensor_operation::element_wise::PassThrough;
-using WeiElementOp     = ck::tensor_operation::element_wise::PassThrough;
-using CBiasInElementOp = ck::tensor_operation::element_wise::AddRelu;
-static constexpr auto ConvBwdDataDefault =
-    ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default;
-template <ck::index_t NDimSpatial>
-using DeviceConvNdBwdDataInstance =
-    ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<
-        NDimSpatial,
-        OutLayout,
-        WeiLayout,
-        ck::Tuple<BiasLayout>,
-        InLayout,
-        OutDataType,
-        WeiDataType,
-        AccDataType,
-        CShuffleDataType,
-        ck::Tuple<BiasDataType>,
-        InDataType,
-        OutElementOp,
-        WeiElementOp,
-        CBiasInElementOp,
-        ConvBwdDataDefault,
-        true, // DoPadGemmM
-        true, // DoPadGemmN
-        1,
-        256,
-        128,
-        256,
-        32,
-        8,
-        2,
-        32,
-        32,
-        2,
-        4,
-        S<4, 64, 1>,
-        S<1, 0, 2>,
-        S<1, 0, 2>,
-        2,
-        8,
-        8,
-        1,
-        S<4, 64, 1>,
-        S<0, 2, 1>,
-        S<0, 2, 1>,
-        1,
-        4,
-        2,
-        0,
-        1,
-        1,
-        S<1, 32, 1, 8>,
-        8>;
-int main(int argc, char* argv[])
-{
-    namespace ctc = ck::tensor_layout::convolution;
-    print_helper_msg();
-    bool do_verification = true;
-    int init_method      = 1;
-    bool time_kernel     = false;
-    ck::utils::conv::ConvParam conv_param{
-        2, 2, 128, 256, 256, {3, 3}, {14, 14}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
-    if(argc == 1)
-    {
-        // use default
-    }
-    else if(argc == 4)
-    {
-        do_verification = std::stoi(argv[1]);
-        init_method     = std::stoi(argv[2]);
-        time_kernel     = std::stoi(argv[3]);
-    }
-    else
-    {
-        do_verification                   = std::stoi(argv[1]);
-        init_method                       = std::stoi(argv[2]);
-        time_kernel                       = std::stoi(argv[3]);
-        const ck::index_t num_dim_spatial = std::stoi(argv[4]);
-        conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
-    }
-    const auto in_element_op  = CBiasInElementOp{};
-    const auto wei_element_op = WeiElementOp{};
-    const auto out_element_op = OutElementOp{};
-    if(conv_param.num_dim_spatial_ == 2)
-    {
-        // output image: GNHWK
-        const auto out_g_n_k_wos_desc =
-            ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(
-                conv_param);
-        // weight: GKYXC
-        const auto wei_g_k_c_xs_desc =
-            ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(
-                conv_param);
-        // input image bias: G_C
-        const auto bias_g_n_c_wis_desc =
-            HostTensorDescriptor({conv_param.G_,
-                                  conv_param.N_,
-                                  conv_param.C_,
-                                  conv_param.input_spatial_lengths_[0],
-                                  conv_param.input_spatial_lengths_[1]},
-                                 {
-                                     conv_param.C_, // g
-                                     0,             // n
-                                     1,             // c
-                                     0,             // hi
-                                     0              // wi
-                                 });
-        // input image: GNHWC
-        const auto in_g_n_c_wis_desc =
-            ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(
-                conv_param);
-        using DeviceInstance = DeviceConvNdBwdDataInstance<2>;
-        run_conv_bwd_data_bias_relu<2,
-                                    OutDataType,
-                                    WeiDataType,
-                                    BiasDataType,
-                                    InDataType,
-                                    OutElementOp,
-                                    WeiElementOp,
-                                    CBiasInElementOp,
-                                    DeviceInstance>(do_verification,
-                                                    init_method,
-                                                    time_kernel,
-                                                    conv_param,
-                                                    out_g_n_k_wos_desc,
-                                                    wei_g_k_c_xs_desc,
-                                                    bias_g_n_c_wis_desc,
-                                                    in_g_n_c_wis_desc,
-                                                    wei_element_op,
-                                                    out_element_op,
-                                                    in_element_op);
-    }
-    return 0;
-}
--- a/example/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt
+++ b/example/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt
+add_custom_target(example_grouped_conv_bwd_data)
+add_example_executable(example_grouped_conv_bwd_data_fp16 grouped_conv_bwd_data_fp16.cpp)
+add_example_executable(example_grouped_conv_bwd_data_bias_relu_fp16 grouped_conv_bwd_data_bias_relu_fp16.cpp)
+add_dependencies(example_grouped_conv_bwd_data example_grouped_conv_bwd_data_fp16)
+add_dependencies(example_grouped_conv_bwd_data example_grouped_conv_bwd_data_bias_relu_fp16)
--- a/example/38_grouped_conv_bwd_data_multiple_d/common.hpp
+++ b/example/38_grouped_conv_bwd_data_multiple_d/common.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include <cstdlib>
+#include <initializer_list>
+#include <iostream>
+#include <numeric>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+static inline constexpr ck::index_t NDimSpatial = 2;
+static constexpr auto ConvBwdDataDefault =
+    ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default;
+using FP16 = ck::half_t;
+using FP32 = float;
+struct ExecutionConfig final
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = true;
+};
+#define DefaultConvParams                                                                \
+    ck::utils::conv::ConvParam                                                           \
+    {                                                                                    \
+        NDimSpatial, 32, 4, 192, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, { 1, 1 } \
+    }
+inline void print_help_msg()
+{
+    std::cerr << "arg1: verification (0=no, 1=yes)\n"
+              << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
+              << "arg3: time kernel (0=no, 1=yes)\n"
+              << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
+}
+inline bool parse_cmd_args(int argc,
+                           char* argv[],
+                           ExecutionConfig& config,
+                           ck::utils::conv::ConvParam& conv_params)
+{
+    constexpr int num_execution_config_args =
+        3; // arguments for do_verification, init_method, time_kernel
+    constexpr int num_conv_param_leading_args = 5; // arguments for num_dim_spatial_, G_, N_, K_, C_
+    constexpr int threshold_to_catch_partial_args = 1 + num_execution_config_args;
+    constexpr int threshold_to_catch_all_args =
+        threshold_to_catch_partial_args + num_conv_param_leading_args;
+    if(argc == 1)
+    {
+        // use default
+        config = ExecutionConfig{};
+    }
+    // catch only ExecutionConfig arguments
+    else if(argc == threshold_to_catch_partial_args)
+    {
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);
+    }
+    // catch both ExecutionConfig & ConvParam arguments
+    else if(threshold_to_catch_all_args < argc && ((argc - threshold_to_catch_all_args) % 3 == 0))
+    {
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);
+        const ck::index_t num_dim_spatial = std::stoi(argv[4]);
+        conv_params                       = ck::utils::conv::parse_conv_param(
+            num_dim_spatial, threshold_to_catch_partial_args, argv);
+    }
+    else
+    {
+        print_help_msg();
+        return false;
+    }
+    return true;
+}
--- a/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_bias_relu_fp16.cpp
+++ b/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_bias_relu_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include "common.hpp"
+using OutDataType      = FP16;
+using WeiDataType      = FP16;
+using AccDataType      = FP32;
+using CShuffleDataType = FP16;
+using BiasDataType     = FP16; // bias
+using InDataType       = FP16;
+using OutLayout  = ck::tensor_layout::convolution::GNHWK;
+using WeiLayout  = ck::tensor_layout::convolution::GKYXC;
+using BiasLayout = ck::Tuple<ck::tensor_layout::convolution::G_C>;
+using InLayout   = ck::tensor_layout::convolution::GNHWC;
+using OutElementOp = PassThrough;
+using WeiElementOp = PassThrough;
+using InElementOp  = ck::tensor_operation::element_wise::AddRelu;
+// clang-format off
+using DeviceConvInstance = ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
+// ######| NDimSpatial|   ALayout|   BLayout|   DsLayout|  ELayout|       AData|       BData|     AccData|         CShuffle|                  DsData|      EData| AElementwise| BElementwise| CDEElementwise| ConvolutionBackward| DoPad| DoPad|      NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer|    MXdl|    NXdl|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffleMXdl| CShuffleNXdl|   CDEBlockTransfer| CDEBlockTransfer|
+// ######|            |          |          |           |         |        Type|        Type|        Type|         DataType|                    Type|       Type|    Operation|    Operation|      Operation|  DataSpecialization| GemmM| GemmN| PrefetchStage|  Size| Block| Block| Block|    |    |  XDL|  XDL| PerWave| PerWave|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|      PerWave|      PerWave|  _MBlock_MPerBlock|  ScalarPerVector|
+// ######|            |          |          |           |         |            |            |            |                 |                        |           |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          |   PerShuffle|   PerShuffle|  _NBlock_NPerBlock|       _NPerBlock|
+// ######|            |          |          |           |         |            |            |            |                 |                        |           |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |             |             |                   |                 |
+         < NDimSpatial, OutLayout, WeiLayout, BiasLayout, InLayout, OutDataType, WeiDataType, AccDataType, CShuffleDataType, ck::Tuple<BiasDataType>, InDataType, OutElementOp, WeiElementOp,    InElementOp,  ConvBwdDataDefault,  true,  true,             1,   256,   128,   256,    32,   8,   2,   32,   32,       2,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,            1,            1,     S<1, 32, 1, 8>,                8>;
+// clang-format on
+#include "run_grouped_conv_bwd_data_bias_relu_example.inc"
+int main(int argc, char* argv[]) { return run_grouped_conv_bwd_data_bias_relu_example(argc, argv); }
--- a/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_fp16.cpp
+++ b/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include "common.hpp"
+using OutDataType      = FP16;
+using WeiDataType      = FP16;
+using AccDataType      = FP32;
+using CShuffleDataType = FP16;
+using DsDataType       = ck::Tuple<>;
+using InDataType       = FP16;
+using OutLayout = ck::tensor_layout::convolution::GNHWK;
+using WeiLayout = ck::tensor_layout::convolution::GKYXC;
+using DsLayout  = ck::Tuple<>;
+using InLayout  = ck::tensor_layout::convolution::GNHWC;
+using OutElementOp = PassThrough;
+using WeiElementOp = PassThrough;
+using InElementOp  = PassThrough;
+// clang-format off
+using DeviceConvInstance = ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
+// ######| NDimSpatial|   ALayout|   BLayout|   DsLayout|  ELayout|       AData|       BData|     AccData|         CShuffle|       DsData|      EData| AElementwise| BElementwise| CDEElementwise| ConvolutionBackward| DoPad| DoPad|      NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer|    MXdl|    NXdl|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffleMXdl| CShuffleNXdl|   CDEBlockTransfer| CDEBlockTransfer|
+// ######|            |          |          |           |         |        Type|        Type|        Type|         DataType|         Type|       Type|    Operation|    Operation|      Operation|  DataSpecialization| GemmM| GemmN| PrefetchStage|  Size| Block| Block| Block|    |    |  XDL|  XDL| PerWave| PerWave|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|      PerWave|      PerWave|  _MBlock_MPerBlock|  ScalarPerVector|
+// ######|            |          |          |           |         |            |            |            |                 |             |           |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          |   PerShuffle|   PerShuffle|  _NBlock_NPerBlock|       _NPerBlock|
+// ######|            |          |          |           |         |            |            |            |                 |             |           |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |             |             |                   |                 |
+         < NDimSpatial, OutLayout, WeiLayout,   DsLayout, InLayout, OutDataType, WeiDataType, AccDataType, CShuffleDataType,   DsDataType, InDataType, OutElementOp, WeiElementOp,    InElementOp,  ConvBwdDataDefault,  true,  true,             1,   256,   128,   256,    32,   8,   2,   32,   32,       2,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,            1,            1,     S<1, 32, 1, 8>,                8>;
+// clang-format on
+#include "run_grouped_conv_bwd_data_example.inc"
+int main(int argc, char* argv[]) { return run_grouped_conv_bwd_data_example(argc, argv); }
--- a/example/38_grouped_conv_bwd_data_bias_relu/grouped_conv_bwd_data_bias_relu_common.hpp
+++ b/example/38_grouped_conv_bwd_data_bias_relu/grouped_conv_bwd_data_bias_relu_common.hpp
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include <iostream>
+bool run_conv_bwd_data_bias_relu(const ExecutionConfig& config,
-#include <numeric>
+                                 const ck::utils::conv::ConvParam& conv_params,
-#include <initializer_list>
+                                 const HostTensorDescriptor& out_g_n_k_wos_desc,
-#include <cstdlib>
+                                 const HostTensorDescriptor& wei_g_k_c_xs_desc,
+                                 const HostTensorDescriptor& bias_g_n_c_wis_desc,
-#include "ck/ck.hpp"
+                                 const HostTensorDescriptor& in_g_n_c_wis_desc,
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+                                 const OutElementOp& out_element_op,
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+                                 const WeiElementOp& wei_element_op,
+                                 const InElementOp& in_element_op)
-#include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/device_memory.hpp"
-#include "ck/library/utility/host_tensor.hpp"
-#include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/utility/convolution_parameter.hpp"
-#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp"
-void print_helper_msg()
-{
-    std::cout << "arg1: verification (0=no, 1=yes)\n"
-              << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
-              << "arg3: time kernel (0=no, 1=yes)\n"
-              << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
-}
-template <ck::index_t NDimSpatial,
-          typename OutDataType,
-          typename WeiDataType,
-          typename BiasDataType,
-          typename InDataType,
-          typename OutElementOp,
-          typename WeiElementOp,
-          typename InElementOp,
-          typename DeviceInstance>
-int run_conv_bwd_data_bias_relu(bool do_verification,
-                                int init_method,
-                                bool time_kernel,
-                                const ck::utils::conv::ConvParam& conv_param,
-                                const HostTensorDescriptor& out_g_n_k_wos_desc,
-                                const HostTensorDescriptor& wei_g_k_c_xs_desc,
-                                const HostTensorDescriptor& bias_g_n_c_wis_desc,
-                                const HostTensorDescriptor& in_g_n_c_wis_desc,
-                                const OutElementOp& out_element_op,
-                                const WeiElementOp& wei_element_op,
-                                const InElementOp& in_element_op)
 {
    Tensor<OutDataType> out(out_g_n_k_wos_desc);
    Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
@@ -58,7 +22,7 @@ int run_conv_bwd_data_bias_relu(bool do_verification,
    std::cout << "bias: " << bias.mDesc << std::endl;
    std::cout << "in: " << in_host.mDesc << std::endl;
-    switch(init_method)
+    switch(config.init_method)
    {
    case 0: break;
    case 1:
@@ -107,13 +71,15 @@ int run_conv_bwd_data_bias_relu(bool do_verification,
    copy(bias_g_n_c_wis_desc.GetStrides(), d0_g_n_c_wis_strides);
    copy(in_g_n_c_wis_desc.GetLengths(), e_g_n_c_wis_lengths);
    copy(in_g_n_c_wis_desc.GetStrides(), e_g_n_c_wis_strides);
-    copy(conv_param.conv_filter_strides_, conv_filter_strides);
+    copy(conv_params.conv_filter_strides_, conv_filter_strides);
-    copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
+    copy(conv_params.conv_filter_dilations_, conv_filter_dilations);
-    copy(conv_param.input_left_pads_, input_left_pads);
+    copy(conv_params.input_left_pads_, input_left_pads);
-    copy(conv_param.input_right_pads_, input_right_pads);
+    copy(conv_params.input_right_pads_, input_right_pads);
+    static_assert(std::is_default_constructible_v<DeviceConvInstance>);
    // do conv
-    auto conv     = DeviceInstance{};
+    auto conv     = DeviceConvInstance{};
    auto invoker  = conv.MakeInvoker();
    auto argument = conv.MakeArgument(
        out_device_buf.GetDeviceBuffer(),
@@ -138,16 +104,17 @@ int run_conv_bwd_data_bias_relu(bool do_verification,
    if(!conv.IsSupportedArgument(argument))
    {
-        printf("wrong! device_conv with the specified compilation parameters does "
+        std::cerr << "wrong! device_conv with the specified compilation parameters does "
-               "not support this Conv problem\n");
+                     "not support this Conv problem"
+                  << std::endl;
-        return 1;
+        return false;
    }
-    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
-    std::size_t flop      = conv_param.GetFlops();
+    std::size_t flop      = conv_params.GetFlops();
-    std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
+    std::size_t num_btype = conv_params.GetByte<InDataType, WeiDataType, OutDataType>();
    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
@@ -156,10 +123,8 @@ int run_conv_bwd_data_bias_relu(bool do_verification,
    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
              << std::endl;
-    if(do_verification)
+    if(config.do_verification)
    {
-        using PassThrough = ck::tensor_operation::element_wise::PassThrough;
        // c doesn't physically exist, any layout is fine
        Tensor<float> c_host(in_g_n_c_wis_desc);
@@ -176,10 +141,10 @@ int run_conv_bwd_data_bias_relu(bool do_verification,
        auto ref_argument = ref_conv.MakeArgument(c_host,
                                                  wei,
                                                  out,
-                                                  conv_param.conv_filter_strides_,
+                                                  conv_params.conv_filter_strides_,
-                                                  conv_param.conv_filter_dilations_,
+                                                  conv_params.conv_filter_dilations_,
-                                                  conv_param.input_left_pads_,
+                                                  conv_params.input_left_pads_,
-                                                  conv_param.input_right_pads_,
+                                                  conv_params.input_right_pads_,
                                                  PassThrough{},
                                                  wei_element_op,
                                                  out_element_op);
@@ -192,8 +157,68 @@ int run_conv_bwd_data_bias_relu(bool do_verification,
        in_device_buf.FromDevice(in_device.mData.data());
-        return ck::utils::check_err(in_device.mData, in_host.mData) ? 0 : 1;
+        return ck::utils::check_err(in_device.mData, in_host.mData);
+    }
+    return true;
+}
+int run_grouped_conv_bwd_data_bias_relu_example(int argc, char* argv[])
+{
+    namespace ctc = ck::tensor_layout::convolution;
+    ExecutionConfig config;
+    ck::utils::conv::ConvParam conv_params = DefaultConvParams;
+    if(!parse_cmd_args(argc, argv, config, conv_params))
+    {
+        return EXIT_FAILURE;
+    }
+    const auto in_element_op  = InElementOp{};
+    const auto wei_element_op = WeiElementOp{};
+    const auto out_element_op = OutElementOp{};
+    if(conv_params.num_dim_spatial_ != NDimSpatial)
+    {
+        std::cerr << "unsupported # of spatials dimensions" << std::endl;
+        return EXIT_FAILURE;
    }
-    return 0;
+    // output image: GNHWK
+    const auto out_g_n_k_wos_desc =
+        ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(
+            conv_params);
+    // weight: GKYXC
+    const auto wei_g_k_c_xs_desc =
+        ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_params);
+    // input image bias: G_C
+    const auto bias_g_n_c_wis_desc = HostTensorDescriptor({conv_params.G_,
+                                                           conv_params.N_,
+                                                           conv_params.C_,
+                                                           conv_params.input_spatial_lengths_[0],
+                                                           conv_params.input_spatial_lengths_[1]},
+                                                          {
+                                                              conv_params.C_, // g
+                                                              0,              // n
+                                                              1,              // c
+                                                              0,              // hi
+                                                              0               // wi
+                                                          });
+    // input image: GNHWC
+    const auto in_g_n_c_wis_desc =
+        ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_params);
+    return !run_conv_bwd_data_bias_relu(config,
+                                        conv_params,
+                                        out_g_n_k_wos_desc,
+                                        wei_g_k_c_xs_desc,
+                                        bias_g_n_c_wis_desc,
+                                        in_g_n_c_wis_desc,
+                                        wei_element_op,
+                                        out_element_op,
+                                        in_element_op);
 }
--- a/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_example.inc
+++ b/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_example.inc
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+bool run_conv_bwd_data(const ExecutionConfig& config,
+                       const ck::utils::conv::ConvParam& conv_params,
+                       const HostTensorDescriptor& out_g_n_k_wos_desc,
+                       const HostTensorDescriptor& wei_g_k_c_xs_desc,
+                       const HostTensorDescriptor& in_g_n_c_wis_desc,
+                       const OutElementOp& out_element_op,
+                       const WeiElementOp& wei_element_op,
+                       const InElementOp& in_element_op)
+{
+    Tensor<OutDataType> out(out_g_n_k_wos_desc);
+    Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
+    Tensor<InDataType> in_host(in_g_n_c_wis_desc);
+    Tensor<InDataType> in_device(in_g_n_c_wis_desc);
+    std::cout << "out: " << out.mDesc << std::endl;
+    std::cout << "wei: " << wei.mDesc << std::endl;
+    std::cout << "in: " << in_host.mDesc << std::endl;
+    switch(config.init_method)
+    {
+    case 0: break;
+    case 1:
+        out.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
+        wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
+        break;
+    default:
+        out.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
+        wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
+    }
+    DeviceMem out_device_buf(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize());
+    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
+    DeviceMem in_device_buf(sizeof(InDataType) * in_device.mDesc.GetElementSpaceSize());
+    out_device_buf.ToDevice(out.mData.data());
+    wei_device_buf.ToDevice(wei.mData.data());
+    // reset input to zero
+    in_device_buf.SetZero();
+    std::array<ck::index_t, NDimSpatial + 3> a_g_n_k_wos_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> a_g_n_k_wos_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> e_g_n_c_wis_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> e_g_n_c_wis_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
+    std::array<ck::index_t, NDimSpatial> input_left_pads{};
+    std::array<ck::index_t, NDimSpatial> input_right_pads{};
+    auto copy = [](auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
+    copy(out_g_n_k_wos_desc.GetLengths(), a_g_n_k_wos_lengths);
+    copy(out_g_n_k_wos_desc.GetStrides(), a_g_n_k_wos_strides);
+    copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
+    copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
+    copy(in_g_n_c_wis_desc.GetLengths(), e_g_n_c_wis_lengths);
+    copy(in_g_n_c_wis_desc.GetStrides(), e_g_n_c_wis_strides);
+    copy(conv_params.conv_filter_strides_, conv_filter_strides);
+    copy(conv_params.conv_filter_dilations_, conv_filter_dilations);
+    copy(conv_params.input_left_pads_, input_left_pads);
+    copy(conv_params.input_right_pads_, input_right_pads);
+    static_assert(std::is_default_constructible_v<DeviceConvInstance>);
+    // do conv
+    auto conv     = DeviceConvInstance{};
+    auto invoker  = conv.MakeInvoker();
+    auto argument = conv.MakeArgument(out_device_buf.GetDeviceBuffer(),
+                                      wei_device_buf.GetDeviceBuffer(),
+                                      std::array<const void*, 0>{},
+                                      in_device_buf.GetDeviceBuffer(),
+                                      a_g_n_k_wos_lengths,
+                                      a_g_n_k_wos_strides,
+                                      b_g_k_c_xs_lengths,
+                                      b_g_k_c_xs_strides,
+                                      std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{},
+                                      std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{},
+                                      e_g_n_c_wis_lengths,
+                                      e_g_n_c_wis_strides,
+                                      conv_filter_strides,
+                                      conv_filter_dilations,
+                                      input_left_pads,
+                                      input_right_pads,
+                                      out_element_op,
+                                      wei_element_op,
+                                      in_element_op);
+    if(!conv.IsSupportedArgument(argument))
+    {
+        std::cerr << "wrong! device_conv with the specified compilation parameters does "
+                     "not support this Conv problem"
+                  << std::endl;
+        return false;
+    }
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
+    std::size_t flop      = conv_params.GetFlops();
+    std::size_t num_btype = conv_params.GetByte<InDataType, WeiDataType, OutDataType>();
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
+              << std::endl;
+    if(config.do_verification)
+    {
+        auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdData<NDimSpatial,
+                                                                         InDataType,
+                                                                         WeiDataType,
+                                                                         OutDataType,
+                                                                         PassThrough,
+                                                                         WeiElementOp,
+                                                                         OutElementOp>();
+        auto ref_invoker = ref_conv.MakeInvoker();
+        auto ref_argument = ref_conv.MakeArgument(in_host,
+                                                  wei,
+                                                  out,
+                                                  conv_params.conv_filter_strides_,
+                                                  conv_params.conv_filter_dilations_,
+                                                  conv_params.input_left_pads_,
+                                                  conv_params.input_right_pads_,
+                                                  PassThrough{},
+                                                  wei_element_op,
+                                                  out_element_op);
+        ref_invoker.Run(ref_argument);
+        in_device_buf.FromDevice(in_device.mData.data());
+        return ck::utils::check_err(in_device.mData, in_host.mData);
+    }
+    return true;
+}
+int run_grouped_conv_bwd_data_example(int argc, char* argv[])
+{
+    namespace ctc = ck::tensor_layout::convolution;
+    ExecutionConfig config;
+    ck::utils::conv::ConvParam conv_params = DefaultConvParams;
+    if(!parse_cmd_args(argc, argv, config, conv_params))
+    {
+        return EXIT_FAILURE;
+    }
+    const auto in_element_op  = InElementOp{};
+    const auto wei_element_op = WeiElementOp{};
+    const auto out_element_op = OutElementOp{};
+    if(conv_params.num_dim_spatial_ != NDimSpatial)
+    {
+        std::cerr << "unsupported # of spatials dimensions" << std::endl;
+        return EXIT_FAILURE;
+    }
+    // output image: GNHWK
+    const auto out_g_n_k_wos_desc =
+        ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(
+            conv_params);
+    // weight: GKYXC
+    const auto wei_g_k_c_xs_desc =
+        ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_params);
+    // input image: GNHWC
+    const auto in_g_n_c_wis_desc =
+        ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_params);
+    return !run_conv_bwd_data(config,
+                              conv_params,
+                              out_g_n_k_wos_desc,
+                              wei_g_k_c_xs_desc,
+                              in_g_n_c_wis_desc,
+                              wei_element_op,
+                              out_element_op,
+                              in_element_op);
+}
--- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include <array>
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+template <ck::index_t NDimSpatial,
+          typename InputLayout,
+          typename WeightLayout,
+          typename OutputLayout,
+          typename InputDataType,
+          typename WeightDataType,
+          typename OutputDataType,
+          typename InputElementwiseOperation,
+          typename WeightElementwiseOperation,
+          typename OutputElementwiseOperation>
+struct DeviceGroupedConvBwdData : public BaseOperator
+{
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(void* p_input,
+                        const void* p_weight,
+                        const void* p_output,
+                        const std::array<index_t, NDimSpatial + 3>& input_g_n_c_wis_lengths,
+                        const std::array<index_t, NDimSpatial + 3>& input_g_n_c_wis_strides,
+                        const std::array<index_t, NDimSpatial + 3>& weight_g_k_c_xs_lengths,
+                        const std::array<index_t, NDimSpatial + 3>& weight_g_k_c_xs_strides,
+                        const std::array<index_t, NDimSpatial + 3>& output_g_n_k_wos_lengths,
+                        const std::array<index_t, NDimSpatial + 3>& output_g_n_k_wos_strides,
+                        const std::array<index_t, NDimSpatial>& conv_filter_strides,
+                        const std::array<index_t, NDimSpatial>& conv_filter_dilations,
+                        const std::array<index_t, NDimSpatial>& input_left_pads,
+                        const std::array<index_t, NDimSpatial>& input_right_pads,
+                        const InputElementwiseOperation& input_element_op,
+                        const WeightElementwiseOperation& weight_element_op,
+                        const OutputElementwiseOperation& output_element_op) = 0;
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp
@@ -6,6 +6,7 @@
 #include <vector>
 #include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -62,6 +63,100 @@ struct DeviceGroupedConvBwdDataMultipleD : public BaseOperator
    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
 };
+template <ck::index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename ELayout,
+          typename ADataType,
+          typename BDataType,
+          typename EDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation>
+struct DeviceGroupedConvBwdDataMultipleD<NDimSpatial,
+                                         ALayout,
+                                         BLayout,
+                                         Tuple<>,
+                                         ELayout,
+                                         ADataType,
+                                         BDataType,
+                                         Tuple<>,
+                                         EDataType,
+                                         AElementwiseOperation,
+                                         BElementwiseOperation,
+                                         CDEElementwiseOperation>
+    : public DeviceGroupedConvBwdData<NDimSpatial,
+                                      ELayout,
+                                      BLayout,
+                                      ALayout,
+                                      EDataType,
+                                      BDataType,
+                                      ADataType,
+                                      CDEElementwiseOperation,
+                                      BElementwiseOperation,
+                                      AElementwiseOperation>
+{
+    virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(
+        const void* p_a,                                                 // output image
+        const void* p_b,                                                 // weight
+        const std::array<const void*, 0>&,                               // bias
+        void* p_e,                                                       // input image
+        const std::array<index_t, NDimSpatial + 3>& a_g_n_k_wos_lengths, // output image
+        const std::array<index_t, NDimSpatial + 3>& a_g_n_k_wos_strides, // output image
+        const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,  // weight
+        const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,  // weight
+        const std::array<std::array<index_t, NDimSpatial + 3>, 0>&,      // bias
+        const std::array<std::array<index_t, NDimSpatial + 3>, 0>&,      // bias
+        const std::array<index_t, NDimSpatial + 3>& e_g_n_c_wis_lengths, // input image
+        const std::array<index_t, NDimSpatial + 3>& e_g_n_c_wis_strides, // input image
+        const std::array<index_t, NDimSpatial>& conv_filter_strides,
+        const std::array<index_t, NDimSpatial>& conv_filter_dilations,
+        const std::array<index_t, NDimSpatial>& input_left_pads,
+        const std::array<index_t, NDimSpatial>& input_right_pads,
+        const AElementwiseOperation& a_element_op,
+        const BElementwiseOperation& b_element_op,
+        const CDEElementwiseOperation& cde_element_op) = 0;
+    std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(void* p_input,
+                        const void* p_weight,
+                        const void* p_output,
+                        const std::array<index_t, NDimSpatial + 3>& input_g_n_c_wis_lengths,
+                        const std::array<index_t, NDimSpatial + 3>& input_g_n_c_wis_strides,
+                        const std::array<index_t, NDimSpatial + 3>& weight_g_k_c_xs_lengths,
+                        const std::array<index_t, NDimSpatial + 3>& weight_g_k_c_xs_strides,
+                        const std::array<index_t, NDimSpatial + 3>& output_g_n_k_wos_lengths,
+                        const std::array<index_t, NDimSpatial + 3>& output_g_n_k_wos_strides,
+                        const std::array<index_t, NDimSpatial>& conv_filter_strides,
+                        const std::array<index_t, NDimSpatial>& conv_filter_dilations,
+                        const std::array<index_t, NDimSpatial>& input_left_pads,
+                        const std::array<index_t, NDimSpatial>& input_right_pads,
+                        const CDEElementwiseOperation& input_element_op,
+                        const BElementwiseOperation& weight_element_op,
+                        const AElementwiseOperation& output_element_op) override final
+    {
+        return MakeArgumentPointer(p_output,
+                                   p_weight,
+                                   std::array<const void*, 0>{},
+                                   p_input,
+                                   output_g_n_k_wos_lengths,
+                                   output_g_n_k_wos_strides,
+                                   weight_g_k_c_xs_lengths,
+                                   weight_g_k_c_xs_strides,
+                                   std::array<std::array<index_t, NDimSpatial + 3>, 0>{},
+                                   std::array<std::array<index_t, NDimSpatial + 3>, 0>{},
+                                   input_g_n_c_wis_lengths,
+                                   input_g_n_c_wis_strides,
+                                   conv_filter_strides,
+                                   conv_filter_dilations,
+                                   input_left_pads,
+                                   input_right_pads,
+                                   output_element_op,
+                                   weight_element_op,
+                                   input_element_op);
+    }
+};
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// conv2d backward data
+void add_device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdData<2,
+                                                         GNHWC,
+                                                         GKYXC,
+                                                         GNHWK,
+                                                         F16,
+                                                         F16,
+                                                         F16,
+                                                         PassThrough,
+                                                         PassThrough,
+                                                         PassThrough>>>& instances);
+template <ck::index_t NumDimSpatial,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout,
+          typename InDataType,
+          typename WeiDataType,
+          typename OutDataType>
+struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupedConvBwdData<
+    NumDimSpatial,
+    InLayout,
+    WeiLayout,
+    OutLayout,
+    InDataType,
+    WeiDataType,
+    OutDataType,
+    ck::tensor_operation::element_wise::PassThrough,
+    ck::tensor_operation::element_wise::PassThrough,
+    ck::tensor_operation::element_wise::PassThrough>>
+{
+    using DeviceOp = DeviceGroupedConvBwdData<NumDimSpatial,
+                                              InLayout,
+                                              WeiLayout,
+                                              OutLayout,
+                                              InDataType,
+                                              WeiDataType,
+                                              OutDataType,
+                                              ck::tensor_operation::element_wise::PassThrough,
+                                              ck::tensor_operation::element_wise::PassThrough,
+                                              ck::tensor_operation::element_wise::PassThrough>;
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, GNHWC> &&
+                     is_same_v<WeiLayout, GKYXC> && is_same_v<OutLayout, GNHWK>)
+        {
+            if constexpr(is_same_v<InDataType, F16> && is_same_v<WeiDataType, F16> &&
+                         is_same_v<OutDataType, F16>)
+            {
+                add_device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instances(op_ptrs);
+            }
+        }
+        return op_ptrs;
+    }
+};
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/CMakeLists.txt
+add_instance_library(device_grouped_conv2d_bwd_data_instance
+   device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
+)
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+using F16 = ck::half_t;
+using F32 = float;
+using EmptyTuple = ck::Tuple<>;
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+using GNHWC = ck::tensor_layout::convolution::GNHWC;
+using GKYXC = ck::tensor_layout::convolution::GKYXC;
+using GNHWK = ck::tensor_layout::convolution::GNHWK;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+static constexpr auto ConvBwdDataDefault =
+    ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default;
+static constexpr auto ConvBwdDataFilter1x1Stride1Pad0 =
+    ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0;
+using device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instances = std::tuple<
+    // clang-format off
+        // 1. Default
+        // ##############################################|    NDim| ALayout| BLayout|   DsLayout| ELayout| AData| BData| AccData| CShuffle|     DsData| EData| AElementwise| BElementwise| CDEElementwise| ConvolutionBackward| DoPad| DoPad|      NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer|    MXdl|    NXdl|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffleMXdl| CShuffleNXdl|   CDEBlockTransfer| CDEBlockTransfer|
+        // ##############################################| Spatial|        |        |           |        |  Type|  Type|    Type| DataType|       Type|  Type|    Operation|    Operation|      Operation|  DataSpecialization| GemmM| GemmN| PrefetchStage|  Size| Block| Block| Block|    |    |  XDL|  XDL| PerWave| PerWave|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|      PerWave|      PerWave|  _MBlock_MPerBlock|  ScalarPerVector|
+        // ##############################################|        |        |        |           |        |      |      |        |         |           |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          |   PerShuffle|   PerShuffle|  _NBlock_NPerBlock|       _NPerBlock|
+        // ##############################################|        |        |        |           |        |      |      |        |         |           |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |             |             |                   |                 |
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, EmptyTuple,   GNHWC,   F16,   F16,     F32,      F16, EmptyTuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataDefault,  true,  true,             1,   256,   256,   128,    32,   8,   8,   32,   32,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, EmptyTuple,   GNHWC,   F16,   F16,     F32,      F16, EmptyTuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataDefault,  true,  true,             1,   256,   128,   256,    32,   8,   8,   32,   32,       2,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, EmptyTuple,   GNHWC,   F16,   F16,     F32,      F16, EmptyTuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataDefault,  true,  true,             1,   128,   128,   128,    32,   8,   8,   32,   32,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 8>,                8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, EmptyTuple,   GNHWC,   F16,   F16,     F32,      F16, EmptyTuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataDefault,  true,  true,             1,   256,   128,   128,    32,   8,   8,   32,   32,       2,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, EmptyTuple,   GNHWC,   F16,   F16,     F32,      F16, EmptyTuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataDefault,  true,  true,             1,   128,   128,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 4>,                8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, EmptyTuple,   GNHWC,   F16,   F16,     F32,      F16, EmptyTuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataDefault,  true,  true,             1,   128,    64,   128,    32,   8,   8,   32,   32,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 8>,                8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, EmptyTuple,   GNHWC,   F16,   F16,     F32,      F16, EmptyTuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataDefault,  true,  true,             1,    64,    64,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, EmptyTuple,   GNHWC,   F16,   F16,     F32,      F16, EmptyTuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataDefault,  true,  true,             1,   256,   128,    64,    32,   8,   8,   32,   32,       2,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, EmptyTuple,   GNHWC,   F16,   F16,     F32,      F16, EmptyTuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataDefault,  true,  true,             1,   256,    64,   128,    32,   8,   8,   32,   32,       1,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, EmptyTuple,   GNHWC,   F16,   F16,     F32,      F16, EmptyTuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataDefault,  true,  true,             1,   128,   128,    32,    32,   8,   8,   32,   32,       2,       1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 4>,                8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, EmptyTuple,   GNHWC,   F16,   F16,     F32,      F16, EmptyTuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataDefault,  true,  true,             1,   128,    32,   128,    32,   8,   8,   32,   32,       1,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 8>,                8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, EmptyTuple,   GNHWC,   F16,   F16,     F32,      F16, EmptyTuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataDefault,  true,  true,             1,    64,    64,    32,    32,   8,   8,   32,   32,       2,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, EmptyTuple,   GNHWC,   F16,   F16,     F32,      F16, EmptyTuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataDefault,  true,  true,             1,    64,    32,    64,    32,   8,   8,   32,   32,       1,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                8>,
+        // 2. Filter1x1Stride1Pad0
+        // ##############################################|    NDim| ALayout| BLayout|   DsLayout| ELayout| AData| BData| AccData| CShuffle|     DsData| EData| AElementwise| BElementwise| CDEElementwise|              ConvolutionBackward| DoPad| DoPad|      NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer|    MXdl|    NXdl|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffleMXdl| CShuffleNXdl|   CDEBlockTransfer| CDEBlockTransfer|
+        // ##############################################| Spatial|        |        |           |        |  Type|  Type|    Type| DataType|       Type|  Type|    Operation|    Operation|      Operation|               DataSpecialization| GemmM| GemmN| PrefetchStage|  Size| Block| Block| Block|    |    |  XDL|  XDL| PerWave| PerWave|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|      PerWave|      PerWave|  _MBlock_MPerBlock|  ScalarPerVector|
+        // ##############################################|        |        |        |           |        |      |      |        |         |           |      |             |             |               |                                 |      |      |              |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          |   PerShuffle|   PerShuffle|  _NBlock_NPerBlock|       _NPerBlock|
+        // ##############################################|        |        |        |           |        |      |      |        |         |           |      |             |             |               |                                 |      |      |              |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |             |             |                   |                 |
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, EmptyTuple,   GNHWC,   F16,   F16,     F32,      F16, EmptyTuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,  true,  true,             1,   256,   256,   128,    32,   8,   8,   32,   32,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, EmptyTuple,   GNHWC,   F16,   F16,     F32,      F16, EmptyTuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,  true,  true,             1,   256,   128,   256,    32,   8,   8,   32,   32,       2,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, EmptyTuple,   GNHWC,   F16,   F16,     F32,      F16, EmptyTuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,  true,  true,             1,   128,   128,   128,    32,   8,   8,   32,   32,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 8>,                8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, EmptyTuple,   GNHWC,   F16,   F16,     F32,      F16, EmptyTuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,  true,  true,             1,   256,   128,   128,    32,   8,   8,   32,   32,       2,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, EmptyTuple,   GNHWC,   F16,   F16,     F32,      F16, EmptyTuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,  true,  true,             1,   128,   128,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 4>,                8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, EmptyTuple,   GNHWC,   F16,   F16,     F32,      F16, EmptyTuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,  true,  true,             1,   128,    64,   128,    32,   8,   8,   32,   32,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 8>,                8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, EmptyTuple,   GNHWC,   F16,   F16,     F32,      F16, EmptyTuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,  true,  true,             1,    64,    64,    64,    32,   8,   8,   32,   32,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, EmptyTuple,   GNHWC,   F16,   F16,     F32,      F16, EmptyTuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,  true,  true,             1,   256,   128,    64,    32,   8,   8,   32,   32,       2,       1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, EmptyTuple,   GNHWC,   F16,   F16,     F32,      F16, EmptyTuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,  true,  true,             1,   256,    64,   128,    32,   8,   8,   32,   32,       1,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, EmptyTuple,   GNHWC,   F16,   F16,     F32,      F16, EmptyTuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,  true,  true,             1,   128,   128,    32,    32,   8,   8,   32,   32,       2,       1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 4>,                8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, EmptyTuple,   GNHWC,   F16,   F16,     F32,      F16, EmptyTuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,  true,  true,             1,   128,    32,   128,    32,   8,   8,   32,   32,       1,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 8>,                8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, EmptyTuple,   GNHWC,   F16,   F16,     F32,      F16, EmptyTuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,  true,  true,             1,    64,    64,    32,    32,   8,   8,   32,   32,       2,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                8>,
+        DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<       2,   GNHWK,   GKYXC, EmptyTuple,   GNHWC,   F16,   F16,     F32,      F16, EmptyTuple,   F16,  PassThrough,  PassThrough,    PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,  true,  true,             1,    64,    32,    64,    32,   8,   8,   32,   32,       1,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              1,              8,              8,         1,            1,            1,     S<1, 16, 1, 4>,                8>
+    // clang-format on
+    >;
+void add_device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdData<2,
+                                                         GNHWC,
+                                                         GKYXC,
+                                                         GNHWK,
+                                                         F16,
+                                                         F16,
+                                                         F16,
+                                                         PassThrough,
+                                                         PassThrough,
+                                                         PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances, device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instances{});
+}
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck