merge develop to attn-train-develop-qloop

76f2b6cd · danyao12 · 9b4c780a · 1ee99dca · 76f2b6cd · 76f2b6cd
Commit 76f2b6cd authored Jul 14, 2023 by danyao12
20 changed files
--- a/client_example/09_quantization/conv2d_fwd_bias_tanh_perlayer_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_bias_tanh_perlayer_quantization.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iomanip>
+#include <iostream>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+using InDataType   = int8_t;
+using WeiDataType  = int8_t;
+using BiasDataType = int32_t;
+using OutDataType  = int8_t;
+
+using InLayout     = ck::tensor_layout::convolution::NHWGC;
+using WeiLayout    = ck::tensor_layout::convolution::GKYXC;
+using BiasLayout   = ck::tensor_layout::convolution::G_K;
+using OutLayout    = ck::tensor_layout::convolution::NHWGK;
+using PassThrough  = ck::tensor_operation::element_wise::PassThrough;
+using ActivationOp = ck::tensor_operation::element_wise::TanH;
+using OutElementOp = ck::tensor_operation::element_wise::Add_Mul_Activation_Mul_Clamp<ActivationOp>;
+
+static constexpr ck::index_t NumDimSpatial = 2;
+static constexpr ck::index_t G             = 4;
+static constexpr ck::index_t N             = 4;    // batch size
+static constexpr ck::index_t K             = 32;   // output channel
+static constexpr ck::index_t C             = 64;   // input channel (per group)
+static constexpr ck::index_t Y             = 3;    // filter H
+static constexpr ck::index_t X             = 3;    // filter W
+static constexpr ck::index_t Hi            = 71;   // input H
+static constexpr ck::index_t Wi            = 71;   // input W
+static constexpr ck::index_t Ho            = 36;   // output H
+static constexpr ck::index_t Wo            = 36;   // output W
+static constexpr float sacc                = 0.5f; //  scale of acc
+static constexpr float sz_inv              = 0.5f; // inverse of scale_z
+
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+
+    void* GetDeviceBuffer() { return p_mem_; }
+
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+};
+
+int main(int argc, char* argv[])
+{
+    // We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
+    // However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
+    // Hence, we need to adjust the order of stride
+    std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
+    std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
+    std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
+    std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
+    std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
+    std::array<ck::index_t, 5> bias_strides{K, 0, 1, 0, 0};
+    std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
+    std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
+
+    std::array<ck::index_t, 2> in_left_pad{1, 1};
+    std::array<ck::index_t, 2> in_right_pad{1, 1};
+    std::array<ck::index_t, 2> conv_strides{2, 2};
+    std::array<ck::index_t, 2> conv_dilations{1, 1};
+
+    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
+    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
+    SimpleDeviceMem bias(sizeof(BiasDataType) * G * K);
+    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
+
+    using DeviceOp =
+        ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
+                                                                    InLayout,
+                                                                    WeiLayout,
+                                                                    ck::Tuple<BiasLayout>,
+                                                                    OutLayout,
+                                                                    InDataType,
+                                                                    WeiDataType,
+                                                                    ck::Tuple<BiasDataType>,
+                                                                    OutDataType,
+                                                                    PassThrough,
+                                                                    PassThrough,
+                                                                    OutElementOp>;
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    std::string best_op_name;
+    int best_op_id        = -1;
+    float best_avg_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+    float best_tflops     = 0;
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    for(int i = 0; i < op_ptrs.size(); ++i)
+    {
+        auto& op_ptr      = op_ptrs[i];
+        auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
+                                                        wei.GetDeviceBuffer(),
+                                                        {bias.GetDeviceBuffer()},
+                                                        out.GetDeviceBuffer(),
+                                                        in_lengths,
+                                                        in_strides,
+                                                        weight_lengths,
+                                                        weight_strides,
+                                                        {bias_lengths},
+                                                        {bias_strides},
+                                                        out_lengths,
+                                                        out_strides,
+                                                        conv_strides,
+                                                        conv_dilations,
+                                                        in_left_pad,
+                                                        in_right_pad,
+                                                        PassThrough{},
+                                                        PassThrough{},
+                                                        OutElementOp{sacc, sz_inv, ActivationOp{}});
+
+        auto invoker_ptr    = op_ptr->MakeInvokerPointer();
+        std::string op_name = op_ptr->GetTypeString();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+            std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X;
+            std::size_t num_bytes =
+                G * sizeof(InDataType) * N * Hi * Wi * C + G * sizeof(WeiDataType) * K * Y * X * C +
+                G * sizeof(BiasDataType) * K + G * sizeof(OutDataType) * N * Ho * Wo * K;
+
+            float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+            float gb_per_sec = num_bytes / 1.E6 / avg_time;
+
+            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+            if(tflops > best_tflops)
+            {
+                best_op_id      = i;
+                best_op_name    = op_name;
+                best_avg_time   = avg_time;
+                best_gb_per_sec = gb_per_sec;
+                best_tflops     = tflops;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+
+    // run the best intance
+    if(best_op_id != -1)
+    {
+        std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
+                  << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+        auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
+                                                        wei.GetDeviceBuffer(),
+                                                        {bias.GetDeviceBuffer()},
+                                                        out.GetDeviceBuffer(),
+                                                        in_lengths,
+                                                        in_strides,
+                                                        weight_lengths,
+                                                        weight_strides,
+                                                        {bias_lengths},
+                                                        {bias_strides},
+                                                        out_lengths,
+                                                        out_strides,
+                                                        conv_strides,
+                                                        conv_dilations,
+                                                        in_left_pad,
+                                                        in_right_pad,
+                                                        PassThrough{},
+                                                        PassThrough{},
+                                                        OutElementOp{sacc, sz_inv, ActivationOp{}});
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+
+        std::cout << "Done" << std::endl;
+    }
+
+    return 0;
+}
\ No newline at end of file
--- a/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include <iomanip>
 #include <iostream>
@@ -16,25 +16,25 @@ using WeiDataType          = int8_t;
 using RequantScaleDataType = float;
 using OutDataType          = int8_t;

-using InLayout           = ck::tensor_layout::convolution::GNHWC;
+using InLayout           = ck::tensor_layout::convolution::NHWGC;
 using WeiLayout          = ck::tensor_layout::convolution::GKYXC;
 using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
-using OutLayout          = ck::tensor_layout::convolution::GNHWK;
+using OutLayout          = ck::tensor_layout::convolution::NHWGK;
 using PassThrough        = ck::tensor_operation::element_wise::PassThrough;
 using ActivationOp       = PassThrough;
 using OutElementOp       = ck::tensor_operation::element_wise::Activation_Mul2_Clamp<ActivationOp>;

 static constexpr ck::index_t NumDimSpatial = 2;
-static constexpr ck::index_t G             = 1;
-static constexpr ck::index_t N             = 4;
-static constexpr ck::index_t K             = 64;
-static constexpr ck::index_t C             = 32;
-static constexpr ck::index_t Y             = 3;
-static constexpr ck::index_t X             = 3;
-static constexpr ck::index_t Hi            = 71;
-static constexpr ck::index_t Wi            = 71;
-static constexpr ck::index_t Ho            = 36;
-static constexpr ck::index_t Wo            = 36;
+static constexpr ck::index_t G             = 4;
+static constexpr ck::index_t N             = 4;  // batch size
+static constexpr ck::index_t K             = 32; // output channel
+static constexpr ck::index_t C             = 64; // input channel (per group)
+static constexpr ck::index_t Y             = 3;  // filter H
+static constexpr ck::index_t X             = 3;  // filter W
+static constexpr ck::index_t Hi            = 71; // input H
+static constexpr ck::index_t Wi            = 71; // input W
+static constexpr ck::index_t Ho            = 36; // output H
+static constexpr ck::index_t Wo            = 36; // output W

 struct SimpleDeviceMem
 {
@@ -54,23 +54,27 @@ struct SimpleDeviceMem

 int main(int argc, char* argv[])
 {
+    // We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
+    // However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
+    // Hence, we need to adjust the order of stride
    std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
-    std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
+    std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
    std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
    std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
    std::array<ck::index_t, 5> requant_scale_lengths{G, N, K, Ho, Wo};
    std::array<ck::index_t, 5> requant_scale_strides{K, 0, 1, 0, 0};
-    std::array<ck::index_t, 5> out_lengths{G, N, C, Ho, Wo};
-    std::array<ck::index_t, 5> out_strides{N * Ho * Wo * C, Ho * Wo * C, 1, Wo * C, C};
+    std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
+    std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
+
    std::array<ck::index_t, 2> in_left_pad{1, 1};
    std::array<ck::index_t, 2> in_right_pad{1, 1};
    std::array<ck::index_t, 2> conv_strides{2, 2};
    std::array<ck::index_t, 2> conv_dilations{1, 1};

-    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
-    SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
-    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K * Y * X * C);
-    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
+    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
+    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
+    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * G * K);
+    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);

    using DeviceOp =
        ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
@@ -131,9 +135,9 @@ int main(int argc, char* argv[])
            float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});

            std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X;
-            std::size_t num_bytes = G * sizeof(InDataType) * N * Hi * Wi * C +
-                                    G * sizeof(WeiDataType) * K * Y * X * C +
-                                    G * sizeof(OutDataType) * N * Ho * Wo * K;
+            std::size_t num_bytes =
+                G * sizeof(InDataType) * N * Hi * Wi * C + G * sizeof(WeiDataType) * K * Y * X * C +
+                G * sizeof(RequantScaleDataType) * K + G * sizeof(OutDataType) * N * Ho * Wo * K;

            float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
            float gb_per_sec = num_bytes / 1.E6 / avg_time;
@@ -156,11 +160,12 @@ int main(int argc, char* argv[])
        }
    }

+    // run the best intance
+    if(best_op_id != -1)
+    {
        std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
                  << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;

-    // run the best intance
-    {
        auto& op_ptr = op_ptrs[best_op_id];
        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
                  << std::endl;

--- a/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include <iomanip>
 #include <iostream>
@@ -15,24 +15,25 @@ using InDataType  = int8_t;
 using WeiDataType = int8_t;
 using OutDataType = int8_t;

-using InLayout     = ck::tensor_layout::convolution::GNHWC;
+using InLayout     = ck::tensor_layout::convolution::NHWGC;
 using WeiLayout    = ck::tensor_layout::convolution::GKYXC;
-using OutLayout    = ck::tensor_layout::convolution::GNHWK;
+using OutLayout    = ck::tensor_layout::convolution::NHWGK;
 using PassThrough  = ck::tensor_operation::element_wise::PassThrough;
 using ActivationOp = PassThrough;
 using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<ActivationOp>;

 static constexpr ck::index_t NumDimSpatial = 2;
-static constexpr ck::index_t G             = 1;
-static constexpr ck::index_t N             = 4;
-static constexpr ck::index_t K             = 64;
-static constexpr ck::index_t C             = 32;
-static constexpr ck::index_t Y             = 3;
-static constexpr ck::index_t X             = 3;
-static constexpr ck::index_t Hi            = 71;
-static constexpr ck::index_t Wi            = 71;
-static constexpr ck::index_t Ho            = 36;
-static constexpr ck::index_t Wo            = 36;
+static constexpr ck::index_t G             = 4;
+static constexpr ck::index_t N             = 4;    // batch size
+static constexpr ck::index_t K             = 32;   // output channel
+static constexpr ck::index_t C             = 64;   // input channel (per group)
+static constexpr ck::index_t Y             = 3;    // filter H
+static constexpr ck::index_t X             = 3;    // filter W
+static constexpr ck::index_t Hi            = 71;   // input H
+static constexpr ck::index_t Wi            = 71;   // input W
+static constexpr ck::index_t Ho            = 36;   // output H
+static constexpr ck::index_t Wo            = 36;   // output W
+static constexpr float requant_scale       = 0.5f; // requantize qAcc to qY

 struct SimpleDeviceMem
 {
@@ -52,20 +53,24 @@ struct SimpleDeviceMem

 int main(int argc, char* argv[])
 {
+    // We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
+    // However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
+    // Hence, we need to adjust the order of stride
    std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
-    std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
+    std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
    std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
    std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
-    std::array<ck::index_t, 5> out_lengths{G, N, C, Ho, Wo};
-    std::array<ck::index_t, 5> out_strides{N * Ho * Wo * C, Ho * Wo * C, 1, Wo * C, C};
+    std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
+    std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
+
    std::array<ck::index_t, 2> in_left_pad{1, 1};
    std::array<ck::index_t, 2> in_right_pad{1, 1};
    std::array<ck::index_t, 2> conv_strides{2, 2};
    std::array<ck::index_t, 2> conv_dilations{1, 1};

-    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
-    SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
-    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
+    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
+    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
+    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);

    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
                                                                                 InLayout,
@@ -97,7 +102,8 @@ int main(int argc, char* argv[])
    for(int i = 0; i < op_ptrs.size(); ++i)
    {
        auto& op_ptr = op_ptrs[i];
-        auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
+        auto argument_ptr =
+            op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
                                        wei.GetDeviceBuffer(),
                                        {},
                                        out.GetDeviceBuffer(),
@@ -115,7 +121,7 @@ int main(int argc, char* argv[])
                                        in_right_pad,
                                        PassThrough{},
                                        PassThrough{},
-                                                        OutElementOp{0.5f, ActivationOp{}});
+                                        OutElementOp{requant_scale, ActivationOp{}});

        auto invoker_ptr    = op_ptr->MakeInvokerPointer();
        std::string op_name = op_ptr->GetTypeString();
@@ -150,15 +156,16 @@ int main(int argc, char* argv[])
        }
    }

+    if(best_op_id != -1)
+    {
        std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
                  << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;

-    // run the best intance
-    {
        auto& op_ptr = op_ptrs[best_op_id];
        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
                  << std::endl;
-        auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
+        auto argument_ptr =
+            op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
                                        wei.GetDeviceBuffer(),
                                        {},
                                        out.GetDeviceBuffer(),
@@ -176,7 +183,7 @@ int main(int argc, char* argv[])
                                        in_right_pad,
                                        PassThrough{},
                                        PassThrough{},
-                                                        OutElementOp{0.5f, ActivationOp{}});
+                                        OutElementOp{requant_scale, ActivationOp{}});

        auto invoker_ptr = op_ptr->MakeInvokerPointer();


--- a/client_example/09_quantization/gemm_quantization.cpp
+++ b/client_example/09_quantization/gemm_quantization.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iomanip>
+#include <iostream>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/quantization/gemm_quantization.hpp"
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough  = ck::tensor_operation::element_wise::PassThrough;
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using ActivationOp = PassThrough;
+using CDEElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<ActivationOp>;
+
+using ADataType = int8_t;
+using BDataType = int8_t;
+using EDataType = int8_t;
+
+using ALayout = Row;
+using BLayout = Col;
+using ELayout = Row;
+
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+
+    void* GetDeviceBuffer() { return p_mem_; }
+
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+};
+
+int main(int argc, char* argv[])
+{
+    ck::index_t M = 1024;
+    ck::index_t N = 1024;
+    ck::index_t K = 1024;
+
+    ck::index_t StrideA = 1024;
+    ck::index_t StrideB = 1024;
+    ck::index_t StrideE = 1024;
+
+    float requant_scale = 0.03;
+
+    auto f_matrix_space_size =
+        [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
+            using Layout = decltype(layout);
+
+            if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return (nRow - 1) * stride + nCol;
+            }
+            else
+            {
+                return (nCol - 1) * stride + nRow;
+            }
+        };
+
+    SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
+    SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
+    SimpleDeviceMem e_device_buf(sizeof(EDataType) * f_matrix_space_size(M, N, StrideE, ELayout{}));
+
+    using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<ALayout,
+                                                                       BLayout,
+                                                                       ck::Tuple<>,
+                                                                       ELayout,
+                                                                       ADataType,
+                                                                       BDataType,
+                                                                       ck::Tuple<>,
+                                                                       EDataType,
+                                                                       AElementOp,
+                                                                       BElementOp,
+                                                                       CDEElementOp>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    const auto a_element_op   = AElementOp{};
+    const auto b_element_op   = BElementOp{};
+    const auto cde_element_op = CDEElementOp{requant_scale, ActivationOp{}};
+
+    std::string best_op_name;
+    int best_op_id        = -1;
+    float best_avg_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+    float best_tflops     = 0;
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    for(int i = 0; i < op_ptrs.size(); ++i)
+    {
+        auto& op_ptr      = op_ptrs[i];
+        auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+                                                        b_device_buf.GetDeviceBuffer(),
+                                                        {},
+                                                        e_device_buf.GetDeviceBuffer(),
+                                                        M,
+                                                        N,
+                                                        K,
+                                                        StrideA,
+                                                        StrideB,
+                                                        {},
+                                                        StrideE,
+                                                        a_element_op,
+                                                        b_element_op,
+                                                        cde_element_op);
+
+        auto invoker_ptr    = op_ptr->MakeInvokerPointer();
+        std::string op_name = op_ptr->GetTypeString();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+            std::size_t flop = std::size_t(2) * M * N * K;
+            std::size_t num_bytes =
+                sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
+
+            float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+            float gb_per_sec = num_bytes / 1.E6 / avg_time;
+
+            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+            if(tflops > best_tflops)
+            {
+                best_op_id      = i;
+                best_op_name    = op_name;
+                best_avg_time   = avg_time;
+                best_gb_per_sec = gb_per_sec;
+                best_tflops     = tflops;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+
+    if(best_op_id != -1)
+    {
+        std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
+                  << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+        auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+                                                        b_device_buf.GetDeviceBuffer(),
+                                                        {},
+                                                        e_device_buf.GetDeviceBuffer(),
+                                                        M,
+                                                        N,
+                                                        K,
+                                                        StrideA,
+                                                        StrideB,
+                                                        {},
+                                                        StrideE,
+                                                        a_element_op,
+                                                        b_element_op,
+                                                        cde_element_op);
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+
+        std::cout << "Done" << std::endl;
+    }
+
+    return 0;
+}
\ No newline at end of file
--- a/client_example/10_grouped_conv2d_bwd_data/grouped_conv2d_bwd_data.cpp
+++ b/client_example/10_grouped_conv2d_bwd_data/grouped_conv2d_bwd_data.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include <cstdlib>
 #include <iomanip>

--- a/client_example/11_grouped_conv_bwd_weight/CMakeLists.txt
+++ b/client_example/11_grouped_conv_bwd_weight/CMakeLists.txt
-add_executable(client_grouped_conv2d_bwd_weight grouped_conv2d_bwd_weight.cpp)
-target_link_libraries(client_grouped_conv2d_bwd_weight PRIVATE composable_kernel::device_operations)
+add_executable(client_grouped_conv1d_bwd_weight_fp16 grouped_conv1d_bwd_weight_fp16.cpp)
+add_executable(client_grouped_conv2d_bwd_weight_fp16 grouped_conv2d_bwd_weight_fp16.cpp)
+add_executable(client_grouped_conv3d_bwd_weight_fp16 grouped_conv3d_bwd_weight_fp16.cpp)
+add_executable(client_grouped_conv3d_bwd_weight_fp32 grouped_conv3d_bwd_weight_fp32.cpp)
+
+target_link_libraries(client_grouped_conv1d_bwd_weight_fp16 PRIVATE composable_kernel::device_operations)
+target_link_libraries(client_grouped_conv2d_bwd_weight_fp16 PRIVATE composable_kernel::device_operations)
+target_link_libraries(client_grouped_conv3d_bwd_weight_fp16 PRIVATE composable_kernel::device_operations)
+target_link_libraries(client_grouped_conv3d_bwd_weight_fp32 PRIVATE composable_kernel::device_operations)
--- a/client_example/11_grouped_conv_bwd_weight/grouped_conv2d_bwd_weight.cpp
+++ b/client_example/11_grouped_conv_bwd_weight/grouped_conv2d_bwd_weight.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include <cstdlib>
 #include <iomanip>
@@ -13,27 +13,8 @@
 #include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

-using InDataType  = ck::half_t;
-using WeiDataType = ck::half_t;
-using OutDataType = ck::half_t;
-
-using InLayout    = ck::tensor_layout::convolution::GNHWC;
-using WeiLayout   = ck::tensor_layout::convolution::GKYXC;
-using OutLayout   = ck::tensor_layout::convolution::GNHWK;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;

-static constexpr ck::index_t NumDimSpatial = 2;
-static constexpr ck::index_t G             = 32;
-static constexpr ck::index_t N             = 256;
-static constexpr ck::index_t K             = 192;
-static constexpr ck::index_t C             = 192;
-static constexpr ck::index_t Y             = 3;
-static constexpr ck::index_t X             = 3;
-static constexpr ck::index_t Hi            = 28;
-static constexpr ck::index_t Wi            = 28;
-static constexpr ck::index_t Ho            = 28;
-static constexpr ck::index_t Wo            = 28;
-
 struct SimpleDeviceMem
 {
    SimpleDeviceMem() = delete;
@@ -50,22 +31,95 @@ struct SimpleDeviceMem
    void* p_mem_;
 };

-int main()
+template <ck::index_t NumDimSpatial>
+std::size_t GetFlops(ck::index_t G,
+                     ck::index_t N,
+                     ck::index_t K,
+                     ck::index_t C,
+                     const std::array<ck::index_t, NumDimSpatial>& output_spatial_lengths,
+                     const std::array<ck::index_t, NumDimSpatial>& filter_spatial_lengths)
 {
-    std::array<ck::index_t, NumDimSpatial> input_spatial_lengths{Hi, Wi};
-    std::array<ck::index_t, NumDimSpatial> filter_spatial_lengths{Y, X};
-    std::array<ck::index_t, NumDimSpatial> output_spatial_lengths{Ho, Wo};
+    // 2 * G * N * K * C * <output spatial lengths product> * <filter spatial lengths product>
+    return static_cast<std::size_t>(2) * G * N * K * C *
+           std::accumulate(std::begin(output_spatial_lengths),
+                           std::end(output_spatial_lengths),
+                           static_cast<std::size_t>(1),
+                           std::multiplies<>()) *
+           std::accumulate(std::begin(filter_spatial_lengths),
+                           std::end(filter_spatial_lengths),
+                           static_cast<std::size_t>(1),
+                           std::multiplies<>());
+}

-    std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1, 1};
-    std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1, 1};
-    std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
-    std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
+template <typename InDataType, ck::index_t NumDimSpatial>
+std::size_t GetInputByte(ck::index_t G,
+                         ck::index_t N,
+                         ck::index_t C,
+                         const std::array<ck::index_t, NumDimSpatial>& input_spatial_lengths)
+{
+    // sizeof(InDataType) * (G * N * C * <input spatial lengths product>) +
+    return sizeof(InDataType) * (G * N * C *
+                                 std::accumulate(std::begin(input_spatial_lengths),
+                                                 std::end(input_spatial_lengths),
+                                                 static_cast<std::size_t>(1),
+                                                 std::multiplies<>()));
+}

-    ck::index_t split_k = 2;
+template <typename WeiDataType, ck::index_t NumDimSpatial>
+std::size_t GetWeightByte(ck::index_t G,
+                          ck::index_t K,
+                          ck::index_t C,
+                          const std::array<ck::index_t, NumDimSpatial>& filter_spatial_lengths)
+{
+    // sizeof(WeiDataType) * (G * K * C * <filter spatial lengths product>) +
+    return sizeof(WeiDataType) * (G * K * C *
+                                  std::accumulate(std::begin(filter_spatial_lengths),
+                                                  std::end(filter_spatial_lengths),
+                                                  static_cast<std::size_t>(1),
+                                                  std::multiplies<>()));
+}

-    SimpleDeviceMem in(sizeof(InDataType) * G * N * Hi * Wi * C);
-    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
-    SimpleDeviceMem out(sizeof(OutDataType) * G * N * Ho * Wo * K);
+template <typename OutDataType, ck::index_t NumDimSpatial>
+std::size_t GetOutputByte(ck::index_t G,
+                          ck::index_t N,
+                          ck::index_t K,
+                          const std::array<ck::index_t, NumDimSpatial>& output_spatial_lengths)
+{
+    // sizeof(OutDataType) * (G * N * K * <output spatial lengths product>);
+    return sizeof(OutDataType) * (G * N * K *
+                                  std::accumulate(std::begin(output_spatial_lengths),
+                                                  std::end(output_spatial_lengths),
+                                                  static_cast<std::size_t>(1),
+                                                  std::multiplies<std::size_t>()));
+}
+
+template <ck::index_t NumDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout>
+bool run_grouped_conv_bwd_weight(
+    const ck::index_t G,
+    const ck::index_t N,
+    const ck::index_t K,
+    const ck::index_t C,
+    const std::array<ck::index_t, NumDimSpatial>& input_spatial_lengths,
+    const std::array<ck::index_t, NumDimSpatial>& filter_spatial_lengths,
+    const std::array<ck::index_t, NumDimSpatial>& output_spatial_lengths,
+    const std::array<ck::index_t, NumDimSpatial + 3>& input_strides,
+    const std::array<ck::index_t, NumDimSpatial + 3>& output_strides,
+    const std::array<ck::index_t, NumDimSpatial>& conv_filter_strides,
+    const std::array<ck::index_t, NumDimSpatial>& conv_filter_dilations,
+    const std::array<ck::index_t, NumDimSpatial>& input_left_pads,
+    const std::array<ck::index_t, NumDimSpatial>& input_right_pads)
+{
+
+    ck::index_t split_k = 2;
+    SimpleDeviceMem in(GetInputByte<InDataType, NumDimSpatial>(G, N, C, input_spatial_lengths));
+    SimpleDeviceMem wei(GetWeightByte<WeiDataType, NumDimSpatial>(G, K, C, filter_spatial_lengths));
+    SimpleDeviceMem out(GetOutputByte<OutDataType, NumDimSpatial>(G, N, K, output_spatial_lengths));

    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdWeight<NumDimSpatial,
                                                                              InLayout,
@@ -105,6 +159,8 @@ int main()
                                                        input_spatial_lengths,
                                                        filter_spatial_lengths,
                                                        output_spatial_lengths,
+                                                        input_strides,
+                                                        output_strides,
                                                        conv_filter_strides,
                                                        conv_filter_dilations,
                                                        input_left_pads,
@@ -120,10 +176,12 @@ int main()
        {
            float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});

-            std::size_t flop      = std::size_t(2) * G * N * K * C * Ho * Wo * Y * X;
-            std::size_t num_bytes = sizeof(InDataType) * G * N * Hi * Wi * C +
-                                    sizeof(WeiDataType) * G * K * Y * X * C +
-                                    sizeof(OutDataType) * G * N * Ho * Wo * K;
+            std::size_t flop =
+                GetFlops<NumDimSpatial>(G, N, K, C, output_spatial_lengths, filter_spatial_lengths);
+            std::size_t num_bytes =
+                GetInputByte<InDataType, NumDimSpatial>(G, N, C, input_spatial_lengths) +
+                GetWeightByte<WeiDataType, NumDimSpatial>(G, K, C, filter_spatial_lengths) +
+                GetOutputByte<OutDataType, NumDimSpatial>(G, N, K, output_spatial_lengths);

            float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
            float gb_per_sec = num_bytes / 1.E6 / avg_time;
@@ -149,7 +207,7 @@ int main()
    if(best_op_id < 0)
    {
        std::cerr << "no suitable instance" << std::endl;
-        return EXIT_FAILURE;
+        return false;
    }

    std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
@@ -170,6 +228,8 @@ int main()
                                                        input_spatial_lengths,
                                                        filter_spatial_lengths,
                                                        output_spatial_lengths,
+                                                        input_strides,
+                                                        output_strides,
                                                        conv_filter_strides,
                                                        conv_filter_dilations,
                                                        input_left_pads,
@@ -187,4 +247,6 @@ int main()

        std::cout << "Done" << std::endl;
    }
+
+    return true;
 }
--- a/client_example/11_grouped_conv_bwd_weight/grouped_conv1d_bwd_weight_fp16.cpp
+++ b/client_example/11_grouped_conv_bwd_weight/grouped_conv1d_bwd_weight_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+
+using InDataType  = ck::half_t;
+using WeiDataType = ck::half_t;
+using OutDataType = ck::half_t;
+
+using InLayout  = ck::tensor_layout::convolution::GNWC;
+using WeiLayout = ck::tensor_layout::convolution::GKXC;
+using OutLayout = ck::tensor_layout::convolution::GNWK;
+
+static constexpr ck::index_t NumDimSpatial = 1;
+static constexpr ck::index_t G             = 32;
+static constexpr ck::index_t N             = 256;
+static constexpr ck::index_t K             = 192;
+static constexpr ck::index_t C             = 192;
+static constexpr ck::index_t X             = 3;
+static constexpr ck::index_t Wi            = 28;
+static constexpr ck::index_t Wo            = 28;
+static constexpr std::array<ck::index_t, NumDimSpatial> input_spatial_lengths{Wi};
+static constexpr std::array<ck::index_t, NumDimSpatial> filter_spatial_lengths{X};
+static constexpr std::array<ck::index_t, NumDimSpatial> output_spatial_lengths{Wo};
+static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_strides{N * Wi * C, Wi* C, C, 1};
+static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_strides{N * Wo * K, Wo* K, K, 1};
+static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1};
+static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1};
+static constexpr std::array<ck::index_t, NumDimSpatial> input_left_pads{1};
+static constexpr std::array<ck::index_t, NumDimSpatial> input_right_pads{1};
+
+int main()
+{
+    return run_grouped_conv_bwd_weight<NumDimSpatial,
+                                       InDataType,
+                                       WeiDataType,
+                                       OutDataType,
+                                       InLayout,
+                                       WeiLayout,
+                                       OutLayout>(G,
+                                                  N,
+                                                  K,
+                                                  C,
+                                                  input_spatial_lengths,
+                                                  filter_spatial_lengths,
+                                                  output_spatial_lengths,
+                                                  input_strides,
+                                                  output_strides,
+                                                  conv_filter_strides,
+                                                  conv_filter_dilations,
+                                                  input_left_pads,
+                                                  input_right_pads)
+               ? EXIT_SUCCESS
+               : EXIT_FAILURE;
+}
--- a/client_example/11_grouped_conv_bwd_weight/grouped_conv2d_bwd_weight_fp16.cpp
+++ b/client_example/11_grouped_conv_bwd_weight/grouped_conv2d_bwd_weight_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+
+using InDataType  = ck::half_t;
+using WeiDataType = ck::half_t;
+using OutDataType = ck::half_t;
+
+using InLayout  = ck::tensor_layout::convolution::GNHWC;
+using WeiLayout = ck::tensor_layout::convolution::GKYXC;
+using OutLayout = ck::tensor_layout::convolution::GNHWK;
+
+static constexpr ck::index_t NumDimSpatial = 2;
+static constexpr ck::index_t G             = 32;
+static constexpr ck::index_t N             = 256;
+static constexpr ck::index_t K             = 192;
+static constexpr ck::index_t C             = 192;
+static constexpr ck::index_t Y             = 3;
+static constexpr ck::index_t X             = 3;
+static constexpr ck::index_t Hi            = 28;
+static constexpr ck::index_t Wi            = 28;
+static constexpr ck::index_t Ho            = 28;
+static constexpr ck::index_t Wo            = 28;
+static constexpr std::array<ck::index_t, NumDimSpatial> input_spatial_lengths{Hi, Wi};
+static constexpr std::array<ck::index_t, NumDimSpatial> filter_spatial_lengths{Y, X};
+static constexpr std::array<ck::index_t, NumDimSpatial> output_spatial_lengths{Ho, Wo};
+static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_strides{
+    N * Hi * Wi * C, Hi* Wi* C, Wi* C, C, 1};
+static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_strides{
+    N * Ho * Wo * K, Ho* Wo* K, Wo* K, K, 1};
+static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1, 1};
+static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1, 1};
+static constexpr std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
+static constexpr std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
+
+int main()
+{
+    return run_grouped_conv_bwd_weight<NumDimSpatial,
+                                       InDataType,
+                                       WeiDataType,
+                                       OutDataType,
+                                       InLayout,
+                                       WeiLayout,
+                                       OutLayout>(G,
+                                                  N,
+                                                  K,
+                                                  C,
+                                                  input_spatial_lengths,
+                                                  filter_spatial_lengths,
+                                                  output_spatial_lengths,
+                                                  input_strides,
+                                                  output_strides,
+                                                  conv_filter_strides,
+                                                  conv_filter_dilations,
+                                                  input_left_pads,
+                                                  input_right_pads)
+               ? EXIT_SUCCESS
+               : EXIT_FAILURE;
+}
--- a/client_example/11_grouped_conv_bwd_weight/grouped_conv3d_bwd_weight_fp16.cpp
+++ b/client_example/11_grouped_conv_bwd_weight/grouped_conv3d_bwd_weight_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+
+using InDataType  = ck::half_t;
+using WeiDataType = ck::half_t;
+using OutDataType = ck::half_t;
+
+using InLayout  = ck::tensor_layout::convolution::GNDHWC;
+using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
+using OutLayout = ck::tensor_layout::convolution::GNDHWK;
+
+static constexpr ck::index_t NumDimSpatial = 3;
+static constexpr ck::index_t G             = 8;
+static constexpr ck::index_t N             = 64;
+static constexpr ck::index_t K             = 128;
+static constexpr ck::index_t C             = 128;
+static constexpr ck::index_t Z             = 3;
+static constexpr ck::index_t Y             = 3;
+static constexpr ck::index_t X             = 3;
+static constexpr ck::index_t Di            = 28;
+static constexpr ck::index_t Hi            = 28;
+static constexpr ck::index_t Wi            = 3;
+static constexpr ck::index_t Do            = 28;
+static constexpr ck::index_t Ho            = 28;
+static constexpr ck::index_t Wo            = 3;
+static constexpr std::array<ck::index_t, NumDimSpatial> input_spatial_lengths{Di, Hi, Wi};
+static constexpr std::array<ck::index_t, NumDimSpatial> filter_spatial_lengths{Z, Y, X};
+static constexpr std::array<ck::index_t, NumDimSpatial> output_spatial_lengths{Do, Ho, Wo};
+static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_strides{
+    N * Di * Hi * Wi * C, Di* Hi* Wi* C, Hi* Wi* C, Wi* C, C, 1};
+static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_strides{
+    N * Do * Ho * Wo * K, Do* Ho* Wo* K, Ho* Wo* K, Wo* K, K, 1};
+static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1, 1, 1};
+static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1, 1, 1};
+static constexpr std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
+static constexpr std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1, 1};
+
+int main()
+{
+    return run_grouped_conv_bwd_weight<NumDimSpatial,
+                                       InDataType,
+                                       WeiDataType,
+                                       OutDataType,
+                                       InLayout,
+                                       WeiLayout,
+                                       OutLayout>(G,
+                                                  N,
+                                                  K,
+                                                  C,
+                                                  input_spatial_lengths,
+                                                  filter_spatial_lengths,
+                                                  output_spatial_lengths,
+                                                  input_strides,
+                                                  output_strides,
+                                                  conv_filter_strides,
+                                                  conv_filter_dilations,
+                                                  input_left_pads,
+                                                  input_right_pads)
+               ? EXIT_SUCCESS
+               : EXIT_FAILURE;
+}
--- a/client_example/11_grouped_conv_bwd_weight/grouped_conv3d_bwd_weight_fp32.cpp
+++ b/client_example/11_grouped_conv_bwd_weight/grouped_conv3d_bwd_weight_fp32.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+
+using InDataType  = float;
+using WeiDataType = float;
+using OutDataType = float;
+
+using InLayout  = ck::tensor_layout::convolution::GNDHWC;
+using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
+using OutLayout = ck::tensor_layout::convolution::GNDHWK;
+
+static constexpr ck::index_t NumDimSpatial = 3;
+static constexpr ck::index_t G             = 8;
+static constexpr ck::index_t N             = 64;
+static constexpr ck::index_t K             = 128;
+static constexpr ck::index_t C             = 128;
+static constexpr ck::index_t Z             = 3;
+static constexpr ck::index_t Y             = 3;
+static constexpr ck::index_t X             = 3;
+static constexpr ck::index_t Di            = 28;
+static constexpr ck::index_t Hi            = 28;
+static constexpr ck::index_t Wi            = 3;
+static constexpr ck::index_t Do            = 28;
+static constexpr ck::index_t Ho            = 28;
+static constexpr ck::index_t Wo            = 3;
+static constexpr std::array<ck::index_t, NumDimSpatial> input_spatial_lengths{Di, Hi, Wi};
+static constexpr std::array<ck::index_t, NumDimSpatial> filter_spatial_lengths{Z, Y, X};
+static constexpr std::array<ck::index_t, NumDimSpatial> output_spatial_lengths{Do, Ho, Wo};
+static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_strides{
+    N * Di * Hi * Wi * C, Di* Hi* Wi* C, Hi* Wi* C, Wi* C, C, 1};
+static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_strides{
+    N * Do * Ho * Wo * K, Do* Ho* Wo* K, Ho* Wo* K, Wo* K, K, 1};
+static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1, 1, 1};
+static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1, 1, 1};
+static constexpr std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
+static constexpr std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1, 1};
+
+int main()
+{
+    return run_grouped_conv_bwd_weight<NumDimSpatial,
+                                       InDataType,
+                                       WeiDataType,
+                                       OutDataType,
+                                       InLayout,
+                                       WeiLayout,
+                                       OutLayout>(
+               G,
+               N,
+               K,
+               C,
+               {Di, Hi, Wi},
+               {Z, Y, X},
+               {Do, Ho, Wo},
+               {N * Di * Hi * Wi * C, Di * Hi * Wi * C, Hi * Wi * C, Wi * C, C, 1},
+               {N * Do * Ho * Wo * K, Do * Ho * Wo * K, Ho * Wo * K, Wo * K, K, 1},
+               {1, 1, 1},
+               {1, 1, 1},
+               {1, 1, 1},
+               {1, 1, 1})
+               ? EXIT_SUCCESS
+               : EXIT_FAILURE;
+}
--- a/client_example/12_elementwise_normalization/elementwise_layernorm2d.cpp
+++ b/client_example/12_elementwise_normalization/elementwise_layernorm2d.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include <iomanip>
 #include <vector>

--- a/client_example/13_batchnorm/CMakeLists.txt
+++ b/client_example/13_batchnorm/CMakeLists.txt
 add_executable(client_batchnorm_fwd_nhwc batchnorm_fwd_nhwc.cpp)
 add_executable(client_batchnorm_bwd_nhwc batchnorm_bwd_nhwc.cpp)
+add_executable(client_batchnorm_infer_nhwc batchnorm_infer_nhwc.cpp)
 target_link_libraries(client_batchnorm_fwd_nhwc PRIVATE composable_kernel::device_operations)
 target_link_libraries(client_batchnorm_bwd_nhwc PRIVATE composable_kernel::device_operations)
+target_link_libraries(client_batchnorm_infer_nhwc PRIVATE composable_kernel::device_operations)
--- a/client_example/13_batchnorm/batchnorm_bwd_nhwc.cpp
+++ b/client_example/13_batchnorm/batchnorm_bwd_nhwc.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include <functional>
 #include <numeric>

--- a/client_example/13_batchnorm/batchnorm_fwd_nhwc.cpp
+++ b/client_example/13_batchnorm/batchnorm_fwd_nhwc.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include <functional>
 #include <numeric>

--- a/client_example/13_batchnorm/batchnorm_infer_nhwc.cpp
+++ b/client_example/13_batchnorm/batchnorm_infer_nhwc.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <functional>
+#include <numeric>
+#include <iomanip>
+#include <iostream>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/utility/tuple.hpp"
+#include "ck/library/tensor_operation_instance/gpu/batchnorm_infer.hpp"
+
+using XDataType       = float;
+using YDataType       = float;
+using ScaleDataType   = float;
+using BiasDataType    = float;
+using MeanVarDataType = float;
+
+constexpr int Rank                  = 4;
+constexpr int NumBatchNormReduceDim = 3;
+
+using Normalize = ck::tensor_operation::element_wise::NormalizeInInfer;
+
+const double epsilon = std::numeric_limits<float>::epsilon();
+
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+
+    void* GetDeviceBuffer() { return p_mem_; }
+
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+};
+
+int main(int argc, char* argv[])
+{
+    std::array<ck::index_t, Rank> xyLengths{16, 8, 128, 256};
+    std::array<ck::index_t, Rank> xyStrides{8 * 128 * 256, 128 * 256, 256, 1};
+    std::array<ck::index_t, Rank - NumBatchNormReduceDim> scaleBiasMeanVarLengths{256};
+    std::array<ck::index_t, Rank - NumBatchNormReduceDim> scaleBiasMeanVarStrides{1};
+    std::array<int, NumBatchNormReduceDim> reduceDims{0, 1, 2};
+    std::array<int, Rank - NumBatchNormReduceDim> invariantDims{3};
+
+    ck::index_t numXYElement =
+        std::accumulate(xyLengths.begin(), xyLengths.end(), 1, std::multiplies<ck::index_t>());
+
+    ck::index_t numScaleBiasMeanVarElement = std::accumulate(scaleBiasMeanVarLengths.begin(),
+                                                             scaleBiasMeanVarLengths.end(),
+                                                             1,
+                                                             std::multiplies<ck::index_t>());
+
+    SimpleDeviceMem x(sizeof(XDataType) * numXYElement);
+    SimpleDeviceMem y(sizeof(YDataType) * numXYElement);
+    SimpleDeviceMem scale(sizeof(ScaleDataType) * numScaleBiasMeanVarElement);
+    SimpleDeviceMem bias(sizeof(BiasDataType) * numScaleBiasMeanVarElement);
+    SimpleDeviceMem mean(sizeof(MeanVarDataType) * numScaleBiasMeanVarElement);
+    SimpleDeviceMem variance(sizeof(MeanVarDataType) * numScaleBiasMeanVarElement);
+
+    // values in variance need be non-negative
+    (void)hipMemset(
+        variance.GetDeviceBuffer(), 0, sizeof(MeanVarDataType) * numScaleBiasMeanVarElement);
+
+    std::array<ck::index_t, Rank> aligned_scaleBiasMeanVarStrides{0};
+
+    int i = 0;
+    for(auto dim : invariantDims)
+    {
+        assert(xyLengths[dim] == scaleBiasMeanVarLengths[i]);
+
+        aligned_scaleBiasMeanVarStrides[dim] = scaleBiasMeanVarStrides[i];
+        i++;
+    };
+
+    using DeviceOp = ck::tensor_operation::device::DeviceElementwise<
+        ck::Tuple<XDataType, MeanVarDataType, MeanVarDataType, ScaleDataType, BiasDataType>,
+        ck::Tuple<YDataType>,
+        Normalize,
+        Rank>;
+
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    std::string best_op_name;
+    bool found            = false;
+    int best_op_id        = -1;
+    float best_ave_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    for(int i = 0; i < op_ptrs.size(); ++i)
+    {
+        auto& op_ptr = op_ptrs[i];
+
+        auto argument_ptr = op_ptr->MakeArgumentPointer(xyLengths,
+                                                        {xyStrides,
+                                                         aligned_scaleBiasMeanVarStrides,
+                                                         aligned_scaleBiasMeanVarStrides,
+                                                         aligned_scaleBiasMeanVarStrides,
+                                                         aligned_scaleBiasMeanVarStrides},
+                                                        {xyStrides},
+                                                        {x.GetDeviceBuffer(),
+                                                         mean.GetDeviceBuffer(),
+                                                         variance.GetDeviceBuffer(),
+                                                         scale.GetDeviceBuffer(),
+                                                         bias.GetDeviceBuffer()},
+                                                        {y.GetDeviceBuffer()},
+                                                        Normalize{epsilon});
+
+        auto invoker_ptr    = op_ptr->MakeInvokerPointer();
+        std::string op_name = op_ptr->GetTypeString();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+            std::size_t num_bytes =
+                numXYElement * (sizeof(XDataType) + sizeof(YDataType)) +
+                numScaleBiasMeanVarElement * (sizeof(ScaleDataType) + sizeof(BiasDataType) +
+                                              sizeof(MeanVarDataType) + sizeof(MeanVarDataType));
+
+            float gb_per_sec = num_bytes / 1.E6 / ave_time;
+
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
+                      << op_name << std::endl;
+
+            if(ave_time < best_ave_time)
+            {
+                found           = true;
+                best_op_id      = i;
+                best_op_name    = op_name;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+
+    if(found)
+    {
+        std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
+                  << best_op_name << std::endl;
+
+        // run the best intance
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+
+        auto argument_ptr = op_ptr->MakeArgumentPointer(xyLengths,
+                                                        {xyStrides,
+                                                         aligned_scaleBiasMeanVarStrides,
+                                                         aligned_scaleBiasMeanVarStrides,
+                                                         aligned_scaleBiasMeanVarStrides,
+                                                         aligned_scaleBiasMeanVarStrides},
+                                                        {xyStrides},
+                                                        {x.GetDeviceBuffer(),
+                                                         mean.GetDeviceBuffer(),
+                                                         variance.GetDeviceBuffer(),
+                                                         scale.GetDeviceBuffer(),
+                                                         bias.GetDeviceBuffer()},
+                                                        {y.GetDeviceBuffer()},
+                                                        Normalize{epsilon});
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+
+        std::cout << "Done" << std::endl;
+    }
+
+    return 0;
+}
--- a/client_example/14_instance_id/batchnorm_fwd_instance_id.cpp
+++ b/client_example/14_instance_id/batchnorm_fwd_instance_id.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include <functional>
 #include <numeric>

--- a/client_example/15_convnd_bwd_data/CMakeLists.txt
+++ b/client_example/15_convnd_bwd_data/CMakeLists.txt
+add_executable(client_conv3d_bwd_data_fp16 conv3d_bwd_data_fp16.cpp)
+add_executable(client_conv3d_bwd_data_fp32 conv3d_bwd_data_fp32.cpp)
+
+target_link_libraries(client_conv3d_bwd_data_fp16 PRIVATE composable_kernel::device_operations)
+target_link_libraries(client_conv3d_bwd_data_fp32 PRIVATE composable_kernel::device_operations)
--- a/client_example/15_convnd_bwd_data/common.hpp
+++ b/client_example/15_convnd_bwd_data/common.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+#include <iomanip>
+#include <iostream>
+#include <iterator>
+#include <numeric>
+#include <string>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/gpu/convolution_backward_data.hpp"
+#include "ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+
+    void* GetDeviceBuffer() { return p_mem_; }
+
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+};
+
+std::size_t GetFlops(ck::index_t N,
+                     ck::index_t K,
+                     ck::index_t C,
+                     const std::vector<ck::index_t>& output_spatial_lengths,
+                     const std::vector<ck::index_t>& weights_spatial_lengths)
+{
+    // 2 * N * K * C * <output spatial lengths product> * <filter spatial lengths product>
+
+    return static_cast<std::size_t>(2) * N * K * C *
+           std::accumulate(std::begin(output_spatial_lengths),
+                           std::end(output_spatial_lengths),
+                           static_cast<std::size_t>(1),
+                           std::multiplies<>()) *
+           std::accumulate(std::begin(weights_spatial_lengths),
+                           std::end(weights_spatial_lengths),
+                           static_cast<std::size_t>(1),
+                           std::multiplies<>());
+}
+
+template <typename InDataType>
+std::size_t
+GetInputByte(ck::index_t N, ck::index_t C, const std::vector<ck::index_t>& input_spatial_lengths)
+{
+    // sizeof(InDataType) * (N * C * <input spatial lengths product>) +
+    return sizeof(InDataType) * N * C *
+           std::accumulate(std::begin(input_spatial_lengths),
+                           std::end(input_spatial_lengths),
+                           static_cast<std::size_t>(1),
+                           std::multiplies<>());
+}
+
+template <typename WeiDataType>
+std::size_t
+GetWeightByte(ck::index_t K, ck::index_t C, const std::vector<ck::index_t>& weights_spatial_lengths)
+{
+    // sizeof(WeiDataType) * (K * C * <filter spatial lengths product>) +
+    return sizeof(WeiDataType) * K * C *
+           std::accumulate(std::begin(weights_spatial_lengths),
+                           std::end(weights_spatial_lengths),
+                           static_cast<std::size_t>(1),
+                           std::multiplies<>());
+}
+
+template <typename OutDataType>
+std::size_t
+GetOutputByte(ck::index_t N, ck::index_t K, const std::vector<ck::index_t>& output_spatial_lengths)
+{
+    // sizeof(OutDataType) * (N * K * <output spatial lengths product>);
+    return sizeof(OutDataType) * N * K *
+           std::accumulate(std::begin(output_spatial_lengths),
+                           std::end(output_spatial_lengths),
+                           static_cast<std::size_t>(1),
+                           std::multiplies<std::size_t>());
+}
+
+template <ck::index_t NumDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout>
+bool run_conv_bwd_data(ck::index_t N,
+                       ck::index_t K,
+                       ck::index_t C,
+                       const std::vector<ck::index_t>& in_spatial_lengths,
+                       const std::vector<ck::index_t>& wei_spatial_lengths,
+                       const std::vector<ck::index_t>& out_spatial_lengths)
+{
+    std::size_t in_mem_size  = GetInputByte<InDataType>(N, C, in_spatial_lengths);
+    std::size_t wei_mem_size = GetWeightByte<WeiDataType>(K, C, wei_spatial_lengths);
+    std::size_t out_mem_size = GetOutputByte<OutDataType>(N, K, out_spatial_lengths);
+
+    SimpleDeviceMem in(in_mem_size);
+    SimpleDeviceMem wei(wei_mem_size);
+    SimpleDeviceMem out(out_mem_size);
+
+    std::vector<ck::index_t> filter_strides(NumDimSpatial, 1);
+    std::vector<ck::index_t> filter_dilations(NumDimSpatial, 1);
+    std::vector<ck::index_t> input_left_pads(NumDimSpatial, 1);
+    std::vector<ck::index_t> input_right_pads(NumDimSpatial, 1);
+
+    using DeviceOp = ck::tensor_operation::device::DeviceConvBwdData<NumDimSpatial,
+                                                                     InLayout,
+                                                                     WeiLayout,
+                                                                     OutLayout,
+                                                                     InDataType,
+                                                                     WeiDataType,
+                                                                     OutDataType,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     PassThrough>;
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    std::string best_op_name;
+    int best_op_id        = -1;
+    float best_avg_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+    float best_tflops     = 0;
+
+    std::size_t flop      = GetFlops(N, K, C, out_spatial_lengths, wei_spatial_lengths);
+    std::size_t num_bytes = in_mem_size + wei_mem_size + out_mem_size;
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    for(int i = 0; i < op_ptrs.size(); ++i)
+    {
+        auto& op_ptr        = op_ptrs[i];
+        auto argument_ptr   = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
+                                                        wei.GetDeviceBuffer(),
+                                                        out.GetDeviceBuffer(),
+                                                        N,
+                                                        K,
+                                                        C,
+                                                        in_spatial_lengths,
+                                                        wei_spatial_lengths,
+                                                        out_spatial_lengths,
+                                                        filter_strides,
+                                                        filter_dilations,
+                                                        input_left_pads,
+                                                        input_right_pads,
+                                                        PassThrough{},
+                                                        PassThrough{},
+                                                        PassThrough{});
+        auto invoker_ptr    = op_ptr->MakeInvokerPointer();
+        std::string op_name = op_ptr->GetTypeString();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+            float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+            float gb_per_sec = num_bytes / 1.E6 / avg_time;
+
+            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+            if(tflops > best_tflops)
+            {
+                best_op_id      = i;
+                best_op_name    = op_name;
+                best_avg_time   = avg_time;
+                best_gb_per_sec = gb_per_sec;
+                best_tflops     = tflops;
+            }
+        }
+        else
+        {
+            std::cerr << op_name << " does not support this problem" << std::endl;
+        }
+    }
+
+    if(best_op_id < 0)
+    {
+        std::cerr << "no suitable instance" << std::endl;
+        return false;
+    }
+
+    std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
+              << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+    // run the best intance
+    {
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+        auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
+                                                        wei.GetDeviceBuffer(),
+                                                        out.GetDeviceBuffer(),
+                                                        N,
+                                                        K,
+                                                        C,
+                                                        in_spatial_lengths,
+                                                        wei_spatial_lengths,
+                                                        out_spatial_lengths,
+                                                        filter_strides,
+                                                        filter_dilations,
+                                                        input_left_pads,
+                                                        input_right_pads,
+                                                        PassThrough{},
+                                                        PassThrough{},
+                                                        PassThrough{});
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+
+        std::cout << "Done" << std::endl;
+    }
+    return true;
+}
--- a/client_example/15_convnd_bwd_data/conv3d_bwd_data_fp16.cpp
+++ b/client_example/15_convnd_bwd_data/conv3d_bwd_data_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+
+using InDataType  = ck::half_t;
+using WeiDataType = ck::half_t;
+using OutDataType = ck::half_t;
+
+using InLayout  = ck::tensor_layout::convolution::NDHWC;
+using WeiLayout = ck::tensor_layout::convolution::KZYXC;
+using OutLayout = ck::tensor_layout::convolution::NDHWK;
+
+static constexpr ck::index_t NumDimSpatial = 3;
+static constexpr ck::index_t N             = 64;
+static constexpr ck::index_t K             = 128;
+static constexpr ck::index_t C             = 64;
+static constexpr ck::index_t Z             = 3;
+static constexpr ck::index_t Y             = 3;
+static constexpr ck::index_t X             = 3;
+static constexpr ck::index_t Di            = 28;
+static constexpr ck::index_t Hi            = 28;
+static constexpr ck::index_t Wi            = 28;
+static constexpr ck::index_t Do            = 28;
+static constexpr ck::index_t Ho            = 28;
+static constexpr ck::index_t Wo            = 28;
+
+int main()
+{
+    return run_conv_bwd_data<NumDimSpatial,
+                             InDataType,
+                             WeiDataType,
+                             OutDataType,
+                             InLayout,
+                             WeiLayout,
+                             OutLayout>(N, K, C, {Di, Hi, Wi}, {Z, Y, X}, {Do, Ho, Wo})
+               ? EXIT_SUCCESS
+               : EXIT_FAILURE;
+}