Add bilinear conv fwd and bwd data instances (#1164)

bf98b476 · Bartłomiej Kocot · GitHub · a78be3f6 · a78be3f6 · a78be3f6
Unverified Commit bf98b476 authored Feb 13, 2024 by Bartłomiej Kocot Committed by GitHub Feb 13, 2024
20 changed files
--- a/example/62_conv_fwd_activ/convnd_fwd_xdl_relu_fp16.cpp
+++ b/example/62_conv_fwd_activ/convnd_fwd_xdl_relu_fp16.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "convnd_fwd_activ_common.hpp"
-
-using OutElementOp = ck::tensor_operation::element_wise::Relu;
-
-using DeviceGroupedConvNDFwdActivInstance = DeviceGroupedConvNDFwdInstance<OutElementOp>;
-#include "run_convnd_fwd_activ_example.inc"
-
-int main(int argc, char* argv[]) { return !run_convnd_fwd_example(argc, argv); }
--- a/example/62_conv_fwd_activ/convnd_fwd_xdl_sigmoid_fp16.cpp
+++ b/example/62_conv_fwd_activ/convnd_fwd_xdl_sigmoid_fp16.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "convnd_fwd_activ_common.hpp"
-
-using OutElementOp = ck::tensor_operation::element_wise::Sigmoid;
-
-using DeviceGroupedConvNDFwdActivInstance = DeviceGroupedConvNDFwdInstance<OutElementOp>;
-#include "run_convnd_fwd_activ_example.inc"
-
-int main(int argc, char* argv[]) { return !run_convnd_fwd_example(argc, argv); }
--- a/example/62_conv_fwd_activ/convnd_fwd_xdl_softrelu_fp16.cpp
+++ b/example/62_conv_fwd_activ/convnd_fwd_xdl_softrelu_fp16.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "convnd_fwd_activ_common.hpp"
-
-using OutElementOp = ck::tensor_operation::element_wise::SoftRelu;
-
-using DeviceGroupedConvNDFwdActivInstance = DeviceGroupedConvNDFwdInstance<OutElementOp>;
-#include "run_convnd_fwd_activ_example.inc"
-
-int main(int argc, char* argv[]) { return !run_convnd_fwd_example(argc, argv); }
--- a/example/62_conv_fwd_activ/convnd_fwd_xdl_tanh_fp16.cpp
+++ b/example/62_conv_fwd_activ/convnd_fwd_xdl_tanh_fp16.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "convnd_fwd_activ_common.hpp"
-
-using OutElementOp = ck::tensor_operation::element_wise::TanH;
-
-using DeviceGroupedConvNDFwdActivInstance = DeviceGroupedConvNDFwdInstance<OutElementOp>;
-#include "run_convnd_fwd_activ_example.inc"
-
-int main(int argc, char* argv[]) { return !run_convnd_fwd_example(argc, argv); }
--- a/example/62_convnd_activ/CMakeLists.txt
+++ b/example/62_convnd_activ/CMakeLists.txt
+add_subdirectory(binary)
+add_subdirectory(multi_AB)
+add_subdirectory(unary)
+
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
+      add_custom_target(example_convnd_activ_xdl)
+      # ScaleAdd ScaleAdd Relu
+      add_example_executable(example_convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16 convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16.cpp)
+      add_example_dependencies(example_convnd_activ_xdl example_convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16)
+      add_example_executable(example_convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16 convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16.cpp)
+      add_example_dependencies(example_convnd_activ_xdl example_convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16)
+   set(target 1)
+ endif()
+endforeach()
--- a/example/62_convnd_activ/binary/CMakeLists.txt
+++ b/example/62_convnd_activ/binary/CMakeLists.txt
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
+      add_custom_target(example_convnd_activ_binary_xdl)
+      # Bilinear residual
+      add_example_executable(example_convnd_fwd_xdl_bilinear_residual_fp16 convnd_fwd_xdl_bilinear_residual_fp16.cpp)
+      add_example_dependencies(example_convnd_activ_binary_xdl example_convnd_fwd_xdl_bilinear_residual_fp16)
+      add_example_executable(example_convnd_bwd_data_xdl_bilinear_residual_fp16 convnd_bwd_data_xdl_bilinear_residual_fp16.cpp)
+      add_example_dependencies(example_convnd_activ_binary_xdl example_convnd_bwd_data_xdl_bilinear_residual_fp16)
+      set(target 1)
+ endif()
+endforeach()
--- a/example/62_convnd_activ/binary/convnd_bwd_data_xdl_bilinear_residual_fp16.cpp
+++ b/example/62_convnd_activ/binary/convnd_bwd_data_xdl_bilinear_residual_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+#include <iostream>
+#include <numeric>
+#include <type_traits>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp"
+#include "ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp"
+
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+
+constexpr ck::index_t NDimSpatial = 3;
+using InDataType                  = ck::half_t;
+using WeiDataType                 = ck::half_t;
+using AccDataType                 = float;
+using CShuffleDataType            = ck::half_t;
+using OutDataType                 = ck::half_t;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using InLayout  = ck::tensor_layout::convolution::GNDHWC;
+using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
+using OutLayout = ck::tensor_layout::convolution::GNDHWK;
+
+using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
+using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
+using InElementOp  = ck::tensor_operation::element_wise::Bilinear;
+
+static constexpr auto ConvSpec =
+    ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default;
+
+template <typename OutElementOp>
+using DeviceGroupedConvNDBwdDataInstance =
+    ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<
+        NDimSpatial,
+        OutLayout,
+        WeiLayout,
+        ck::Tuple<InLayout>,
+        InLayout,
+        OutDataType,
+        WeiDataType,
+        AccDataType,
+        CShuffleDataType,
+        ck::Tuple<InDataType>,
+        InDataType,
+        OutElementOp,
+        WeiElementOp,
+        InElementOp,
+        ConvSpec, // ConvForwardSpecialization
+        true,
+        true,
+        1,           //
+        256,         // BlockSize
+        128,         // MPerBlock
+        256,         // NPerBlock
+        32,          // KPerBlock
+        8,           // AK1
+        2,           // BK1
+        32,          // MPerXdl
+        32,          // NPerXdl
+        2,           // MXdlPerWave
+        4,           // NXdlPerWave
+        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
+        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
+        2,           // ABlockTransferSrcVectorDim
+        8,           // ABlockTransferSrcScalarPerVector
+        8,           // ABlockTransferDstScalarPerVector_AK1
+        1,           // ABlockLdsExtraM
+        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
+        S<0, 2, 1>,  // BBlockTransferThreadClusterArrangeOrder
+        S<0, 2, 1>,  // BBlockTransferSrcAccessOrder
+        1,           // BBlockTransferSrcVectorDim
+        4,           // BBlockTransferSrcScalarPerVector
+        2,           // BBlockTransferDstScalarPerVector_BK1
+        0,           // BBlockLdsExtraN
+        1,
+        1,
+        S<1, 32, 1, 8>,
+        8>;
+
+using DeviceGroupedConvNDActivInstance = DeviceGroupedConvNDBwdDataInstance<OutElementOp>;
+
+namespace {
+// Use custom implementation to pass two more tensors for post op
+template <ck::index_t NDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename InElementOp,
+          typename WeiElementOp,
+          typename OutElementOp,
+          typename DeviceConvNDInstance>
+bool run_grouped_conv(bool do_verification,
+                      int init_method,
+                      bool time_kernel,
+                      const ck::utils::conv::ConvParam& conv_param,
+                      const HostTensorDescriptor& in_g_n_c_wis_desc,
+                      const HostTensorDescriptor& wei_g_k_c_xs_desc,
+                      const HostTensorDescriptor& out_g_n_k_wos_desc,
+                      const InElementOp& in_element_op,
+                      const WeiElementOp& wei_element_op,
+                      const OutElementOp& out_element_op)
+{
+    constexpr ck::index_t NumDs = 1;
+    Tensor<OutDataType> out(out_g_n_k_wos_desc);
+    Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
+    Tensor<InDataType> in_host(in_g_n_c_wis_desc);
+
+    std::cout << "out: " << out.mDesc << std::endl;
+    std::cout << "wei: " << wei.mDesc << std::endl;
+    std::cout << "in: " << in_host.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        out.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
+        wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
+        in_host.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
+        break;
+    default:
+        out.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
+        wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
+        in_host.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
+    }
+
+    // Initialize based on out_host
+    Tensor<InDataType> in_device(in_host);
+
+    DeviceMem out_device_buf(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize());
+    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
+    DeviceMem in_device_buf(sizeof(InDataType) * in_device.mDesc.GetElementSpaceSize());
+
+    out_device_buf.ToDevice(out.mData.data());
+    wei_device_buf.ToDevice(wei.mData.data());
+    in_device_buf.ToDevice(in_device.mData.data());
+
+    std::array<ck::index_t, NDimSpatial + 3> a_g_n_k_wos_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> a_g_n_k_wos_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> e_g_n_c_wis_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> e_g_n_c_wis_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
+    std::array<ck::index_t, NDimSpatial> input_left_pads{};
+    std::array<ck::index_t, NDimSpatial> input_right_pads{};
+
+    auto copy = [](auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
+
+    copy(out_g_n_k_wos_desc.GetLengths(), a_g_n_k_wos_lengths);
+    copy(out_g_n_k_wos_desc.GetStrides(), a_g_n_k_wos_strides);
+    copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
+    copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
+    copy(in_g_n_c_wis_desc.GetLengths(), e_g_n_c_wis_lengths);
+    copy(in_g_n_c_wis_desc.GetStrides(), e_g_n_c_wis_strides);
+    copy(conv_param.conv_filter_strides_, conv_filter_strides);
+    copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
+    copy(conv_param.input_left_pads_, input_left_pads);
+    copy(conv_param.input_right_pads_, input_right_pads);
+
+    // Use output as D
+    const std::array<const void*, NumDs> ds = {in_device_buf.GetDeviceBuffer()};
+
+    auto conv     = DeviceConvNDInstance{};
+    auto invoker  = conv.MakeInvoker();
+    auto argument = conv.MakeArgument(
+        out_device_buf.GetDeviceBuffer(),
+        wei_device_buf.GetDeviceBuffer(),
+        ds,
+        in_device_buf.GetDeviceBuffer(),
+        a_g_n_k_wos_lengths,
+        a_g_n_k_wos_strides,
+        b_g_k_c_xs_lengths,
+        b_g_k_c_xs_strides,
+        std::array<std::array<ck::index_t, NDimSpatial + 3>, NumDs>{e_g_n_c_wis_lengths},
+        std::array<std::array<ck::index_t, NDimSpatial + 3>, NumDs>{e_g_n_c_wis_strides},
+        e_g_n_c_wis_lengths,
+        e_g_n_c_wis_strides,
+        conv_filter_strides,
+        conv_filter_dilations,
+        input_left_pads,
+        input_right_pads,
+        out_element_op,
+        wei_element_op,
+        in_element_op);
+
+    if(!conv.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error("The device op with the specified compilation parameters does "
+                                 "not support this convolution problem.");
+    }
+
+    float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+    std::size_t flop =
+        conv_param.GetFlops() + 3 * conv_param.GetInputByte<InDataType>() / sizeof(InDataType);
+    std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>() +
+                            conv_param.GetOutputByte<InDataType>();
+
+    float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+    float gb_per_sec = num_btype / 1.E6 / avg_time;
+    std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << conv.GetTypeString() << std::endl;
+
+    if(do_verification)
+    {
+        std::array<Tensor<OutDataType>, NumDs> d_tensors = {in_host};
+        auto ref_conv =
+            ck::tensor_operation::host::ReferenceConvBwdData<NDimSpatial,
+                                                             InDataType,
+                                                             WeiDataType,
+                                                             OutDataType,
+                                                             InElementOp,
+                                                             WeiElementOp,
+                                                             OutElementOp,
+                                                             0, /*Num A Elementwise Tensors*/
+                                                             0, /*Num B Elementwise Tensors*/
+                                                             NumDs>();
+
+        auto ref_invoker = ref_conv.MakeInvoker();
+
+        auto ref_argument = ref_conv.MakeArgument(in_host,
+                                                  wei,
+                                                  out,
+                                                  conv_param.conv_filter_strides_,
+                                                  conv_param.conv_filter_dilations_,
+                                                  conv_param.input_left_pads_,
+                                                  conv_param.input_right_pads_,
+                                                  in_element_op,
+                                                  wei_element_op,
+                                                  out_element_op,
+                                                  {},
+                                                  {},
+                                                  d_tensors);
+
+        ref_invoker.Run(ref_argument);
+
+        in_device_buf.FromDevice(in_device.mData.data());
+
+        return ck::utils::check_err(in_device.mData, in_host.mData);
+    }
+
+    return true;
+}
+
+} // namespace
+
+#include "../run_convnd_activ_example.inc"
+
+int main(int argc, char* argv[]) { return !run_convnd_example(argc, argv); }
--- a/example/62_convnd_activ/binary/convnd_fwd_xdl_bilinear_residual_fp16.cpp
+++ b/example/62_convnd_activ/binary/convnd_fwd_xdl_bilinear_residual_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+#include <iostream>
+#include <numeric>
+#include <type_traits>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
+
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+
+constexpr ck::index_t NDimSpatial = 3;
+using InDataType                  = ck::half_t;
+using WeiDataType                 = ck::half_t;
+using AccDataType                 = float;
+using CShuffleDataType            = ck::half_t;
+using OutDataType                 = ck::half_t;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using InLayout  = ck::tensor_layout::convolution::GNDHWC;
+using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
+using OutLayout = ck::tensor_layout::convolution::GNDHWK;
+
+using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
+using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
+
+using OutElementOp = ck::tensor_operation::element_wise::Bilinear;
+
+static constexpr auto ConvSpec =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+template <typename OutElementOp>
+using DeviceGroupedConvNDFwdInstance =
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
+        NDimSpatial,
+        InLayout,
+        WeiLayout,
+        ck::Tuple<OutLayout>,
+        OutLayout,
+        InDataType,
+        WeiDataType,
+        AccDataType,
+        CShuffleDataType,
+        ck::Tuple<OutDataType>,
+        OutDataType,
+        InElementOp,
+        WeiElementOp,
+        OutElementOp,
+        ConvSpec,    // ConvForwardSpecialization
+        GemmSpec,    // GemmSpecialization
+        1,           //
+        256,         // BlockSize
+        128,         // MPerBlock
+        256,         // NPerBlock
+        32,          // KPerBlock
+        8,           // AK1
+        8,           // BK1
+        32,          // MPerXdl
+        32,          // NPerXdl
+        2,           // MXdlPerWave
+        4,           // NXdlPerWave
+        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
+        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
+        2,           // ABlockTransferSrcVectorDim
+        8,           // ABlockTransferSrcScalarPerVector
+        8,           // ABlockTransferDstScalarPerVector_AK1
+        1,           // ABlockLdsExtraM
+        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
+        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
+        2,           // BBlockTransferSrcVectorDim
+        8,           // BBlockTransferSrcScalarPerVector
+        8,           // BBlockTransferDstScalarPerVector_BK1
+        1,           // BBlockLdsExtraN
+        1,
+        1,
+        S<1, 32, 1, 8>,
+        8>;
+
+using DeviceGroupedConvNDActivInstance = DeviceGroupedConvNDFwdInstance<OutElementOp>;
+
+namespace {
+// Use custom implementation to pass two more tensors for post op
+template <ck::index_t NDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename InElementOp,
+          typename WeiElementOp,
+          typename OutElementOp,
+          typename DeviceConvNDFwdInstance>
+bool run_grouped_conv(bool do_verification,
+                      int init_method,
+                      bool time_kernel,
+                      const ck::utils::conv::ConvParam& conv_param,
+                      const HostTensorDescriptor& in_g_n_c_wis_desc,
+                      const HostTensorDescriptor& wei_g_k_c_xs_desc,
+                      const HostTensorDescriptor& out_g_n_k_wos_desc,
+                      const InElementOp& in_element_op,
+                      const WeiElementOp& wei_element_op,
+                      const OutElementOp& out_element_op)
+{
+    constexpr ck::index_t NumDs = 1;
+    Tensor<InDataType> in(in_g_n_c_wis_desc);
+    Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
+    Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
+
+    std::cout << "in: " << in.mDesc << std::endl;
+    std::cout << "wei: " << wei.mDesc << std::endl;
+    std::cout << "out: " << out_host.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-2, 2});
+        wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-2, 2});
+        out_host.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-2, 2});
+        break;
+    default:
+        in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-1.0, 1.0});
+        wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-2, 2});
+        out_host.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.05, 0.05});
+    }
+
+    // Initialize based on out_host
+    Tensor<OutDataType> out_device(out_host);
+
+    DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
+    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
+    DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
+
+    in_device_buf.ToDevice(in.mData.data());
+    wei_device_buf.ToDevice(wei.mData.data());
+    out_device_buf.ToDevice(out_device.mData.data());
+
+    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
+    std::array<ck::index_t, NDimSpatial> input_left_pads{};
+    std::array<ck::index_t, NDimSpatial> input_right_pads{};
+
+    auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
+
+    copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
+    copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
+    copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
+    copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
+    copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
+    copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
+    copy(conv_param.conv_filter_strides_, conv_filter_strides);
+    copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
+    copy(conv_param.input_left_pads_, input_left_pads);
+    copy(conv_param.input_right_pads_, input_right_pads);
+
+    // Use output as D
+    const std::array<const void*, NumDs> ds = {out_device_buf.GetDeviceBuffer()};
+
+    auto conv     = DeviceConvNDFwdInstance{};
+    auto invoker  = conv.MakeInvoker();
+    auto argument = conv.MakeArgument(
+        in_device_buf.GetDeviceBuffer(),
+        wei_device_buf.GetDeviceBuffer(),
+        ds,
+        out_device_buf.GetDeviceBuffer(),
+        a_g_n_c_wis_lengths,
+        a_g_n_c_wis_strides,
+        b_g_k_c_xs_lengths,
+        b_g_k_c_xs_strides,
+        std::array<std::array<ck::index_t, NDimSpatial + 3>, NumDs>{e_g_n_k_wos_lengths},
+        std::array<std::array<ck::index_t, NDimSpatial + 3>, NumDs>{e_g_n_k_wos_strides},
+        e_g_n_k_wos_lengths,
+        e_g_n_k_wos_strides,
+        conv_filter_strides,
+        conv_filter_dilations,
+        input_left_pads,
+        input_right_pads,
+        in_element_op,
+        wei_element_op,
+        out_element_op);
+
+    if(!conv.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error("The device op with the specified compilation parameters does "
+                                 "not support this convolution problem.");
+    }
+
+    float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+    std::size_t flop =
+        conv_param.GetFlops() + 3 * conv_param.GetOutputByte<OutDataType>() / sizeof(OutDataType);
+    std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>() +
+                            conv_param.GetOutputByte<OutDataType>();
+
+    float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+    float gb_per_sec = num_btype / 1.E6 / avg_time;
+    std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << conv.GetTypeString() << std::endl;
+
+    if(do_verification)
+    {
+        std::array<Tensor<OutDataType>, NumDs> d_tensors = {out_host};
+        auto ref_conv =
+            ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
+                                                         InDataType,
+                                                         WeiDataType,
+                                                         OutDataType,
+                                                         InElementOp,
+                                                         WeiElementOp,
+                                                         OutElementOp,
+                                                         0, /*Num A Elementwise Tensors*/
+                                                         0, /*Num B Elementwise Tensors*/
+                                                         NumDs>();
+
+        auto ref_invoker  = ref_conv.MakeInvoker();
+        auto ref_argument = ref_conv.MakeArgument(in,
+                                                  wei,
+                                                  out_host,
+                                                  conv_param.conv_filter_strides_,
+                                                  conv_param.conv_filter_dilations_,
+                                                  conv_param.input_left_pads_,
+                                                  conv_param.input_right_pads_,
+                                                  in_element_op,
+                                                  wei_element_op,
+                                                  out_element_op,
+                                                  {},
+                                                  {},
+                                                  d_tensors);
+
+        ref_invoker.Run(ref_argument);
+
+        out_device_buf.FromDevice(out_device.mData.data());
+
+        return ck::utils::check_err(out_device, out_host, "Error: incorrect results!");
+    }
+
+    return true;
+}
+
+} // namespace
+
+#include "../run_convnd_activ_example.inc"
+
+int main(int argc, char* argv[]) { return !run_convnd_example(argc, argv); }
--- a/example/62_conv_fwd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16.cpp
+++ b/example/62_conv_fwd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include <algorithm>
 #include <cstdlib>
@@ -97,7 +97,7 @@ using DeviceGroupedConvNDFwdInstance =
        S<1, 32, 1, 8>,
        8>;

-using DeviceGroupedConvNDFwdActivInstance = DeviceGroupedConvNDFwdInstance<OutElementOp>;
+using DeviceGroupedConvNDActivInstance = DeviceGroupedConvNDFwdInstance<OutElementOp>;

 namespace {
 // Use custom implementation to pass two more tensors for post op
@@ -109,16 +109,16 @@ template <ck::index_t NDimSpatial,
          typename WeiElementOp,
          typename OutElementOp,
          typename DeviceConvNDFwdInstance>
-bool run_grouped_conv_fwd(bool do_verification,
-                          int init_method,
-                          bool time_kernel,
-                          const ck::utils::conv::ConvParam& conv_param,
-                          const HostTensorDescriptor& in_g_n_c_wis_desc,
-                          const HostTensorDescriptor& wei_g_k_c_xs_desc,
-                          const HostTensorDescriptor& out_g_n_k_wos_desc,
-                          const InElementOp& in_element_op,
-                          const WeiElementOp& wei_element_op,
-                          const OutElementOp& out_element_op)
+bool run_grouped_conv(bool do_verification,
+                      int init_method,
+                      bool time_kernel,
+                      const ck::utils::conv::ConvParam& conv_param,
+                      const HostTensorDescriptor& in_g_n_c_wis_desc,
+                      const HostTensorDescriptor& wei_g_k_c_xs_desc,
+                      const HostTensorDescriptor& out_g_n_k_wos_desc,
+                      const InElementOp& in_element_op,
+                      const WeiElementOp& wei_element_op,
+                      const OutElementOp& out_element_op)
 {
    constexpr ck::index_t NumDs = 2;
    const ck::index_t G         = out_g_n_k_wos_desc.GetLengths()[0];
@@ -289,6 +289,6 @@ bool run_grouped_conv_fwd(bool do_verification,

 } // namespace

-#include "run_convnd_fwd_activ_example.inc"
+#include "run_convnd_activ_example.inc"

-int main(int argc, char* argv[]) { return !run_convnd_fwd_example(argc, argv); }
+int main(int argc, char* argv[]) { return !run_convnd_example(argc, argv); }
--- a/example/62_conv_fwd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16.cpp
+++ b/example/62_conv_fwd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include <cstdlib>
 #include <iostream>
@@ -94,7 +94,7 @@ using DeviceGroupedConvNDFwdInstance =
        S<1, 32, 1, 8>,
        8>;

-using DeviceGroupedConvNDFwdActivInstance = DeviceGroupedConvNDFwdInstance<OutElementOp>;
+using DeviceGroupedConvNDActivInstance = DeviceGroupedConvNDFwdInstance<OutElementOp>;

 namespace {
 // Use custom implementation to pass two more tensors for post op
@@ -106,16 +106,16 @@ template <ck::index_t NDimSpatial,
          typename WeiElementOp,
          typename OutElementOp,
          typename DeviceConvNDFwdInstance>
-bool run_grouped_conv_fwd(bool do_verification,
-                          int init_method,
-                          bool time_kernel,
-                          const ck::utils::conv::ConvParam& conv_param,
-                          const HostTensorDescriptor& in_g_n_c_wis_desc,
-                          const HostTensorDescriptor& wei_g_k_c_xs_desc,
-                          const HostTensorDescriptor& out_g_n_k_wos_desc,
-                          const InElementOp& in_element_op,
-                          const WeiElementOp& wei_element_op,
-                          const OutElementOp& out_element_op)
+bool run_grouped_conv(bool do_verification,
+                      int init_method,
+                      bool time_kernel,
+                      const ck::utils::conv::ConvParam& conv_param,
+                      const HostTensorDescriptor& in_g_n_c_wis_desc,
+                      const HostTensorDescriptor& wei_g_k_c_xs_desc,
+                      const HostTensorDescriptor& out_g_n_k_wos_desc,
+                      const InElementOp& in_element_op,
+                      const WeiElementOp& wei_element_op,
+                      const OutElementOp& out_element_op)
 {
    constexpr ck::index_t NumDs = 2;
    Tensor<InDataType> in(in_g_n_c_wis_desc);
@@ -265,6 +265,6 @@ bool run_grouped_conv_fwd(bool do_verification,

 } // namespace

-#include "run_convnd_fwd_activ_example.inc"
+#include "run_convnd_activ_example.inc"

-int main(int argc, char* argv[]) { return !run_convnd_fwd_example(argc, argv); }
+int main(int argc, char* argv[]) { return !run_convnd_example(argc, argv); }
--- a/example/62_convnd_activ/multi_AB/CMakeLists.txt
+++ b/example/62_convnd_activ/multi_AB/CMakeLists.txt
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
+      add_custom_target(example_convnd_activ_multi_ab_xdl)
+      # ScaleAdd on A and B
+      add_example_executable(example_conv_fwd_xdl_scaleadd_ab_fp16 conv_fwd_xdl_scaleadd_ab_fp16.cpp)
+      add_example_dependencies(example_convnd_activ_multi_ab_xdl example_conv_fwd_xdl_scaleadd_ab_fp16)
+      add_example_executable(example_conv_fwd_xdl_scaleadd_ab_fp32 conv_fwd_xdl_scaleadd_ab_fp32.cpp)
+      add_example_dependencies(example_convnd_activ_multi_ab_xdl example_conv_fwd_xdl_scaleadd_ab_fp32)
+      add_example_executable(example_conv_fwd_xdl_scaleadd_ab_bf16 conv_fwd_xdl_scaleadd_ab_bf16.cpp)
+      add_example_dependencies(example_convnd_activ_multi_ab_xdl example_conv_fwd_xdl_scaleadd_ab_bf16)
+      add_example_executable(example_conv_fwd_xdl_scaleadd_ab_int8 conv_fwd_xdl_scaleadd_ab_int8.cpp)
+      add_example_dependencies(example_convnd_activ_multi_ab_xdl example_conv_fwd_xdl_scaleadd_ab_int8)
+   set(target 1)
+ endif()
+endforeach()
--- a/example/62_conv_fwd_activ/multi_AB/conv_fwd_xdl_scaleadd_ab_bf16.cpp
+++ b/example/62_conv_fwd_activ/multi_AB/conv_fwd_xdl_scaleadd_ab_bf16.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include "convnd_fwd_activ_multi_ab_common.hpp"

@@ -14,13 +14,13 @@ using BDataTypes  = ck::Tuple<DataType, DataType>;
 using InElementOp  = ck::tensor_operation::element_wise::ScaleAdd;
 using WeiElementOp = ck::tensor_operation::element_wise::ScaleAdd;

-using DeviceGroupedConvNDFwdActivInstance = DeviceGroupedConvNDMultiABFwdInstance<DataType,
-                                                                                  AccDataType,
-                                                                                  ADataTypes,
-                                                                                  BDataTypes,
-                                                                                  InElementOp,
-                                                                                  WeiElementOp>;
+using DeviceGroupedConvNDActivInstance = DeviceGroupedConvNDMultiABFwdInstance<DataType,
+                                                                               AccDataType,
+                                                                               ADataTypes,
+                                                                               BDataTypes,
+                                                                               InElementOp,
+                                                                               WeiElementOp>;

-#include "../run_convnd_fwd_activ_example.inc"
+#include "../run_convnd_activ_example.inc"

-int main(int argc, char* argv[]) { return !run_convnd_fwd_example(argc, argv); }
+int main(int argc, char* argv[]) { return !run_convnd_example(argc, argv); }
--- a/example/62_conv_fwd_activ/multi_AB/conv_fwd_xdl_scaleadd_ab_fp16.cpp
+++ b/example/62_conv_fwd_activ/multi_AB/conv_fwd_xdl_scaleadd_ab_fp16.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include "convnd_fwd_activ_multi_ab_common.hpp"

@@ -14,13 +14,13 @@ using BDataTypes  = ck::Tuple<DataType, DataType>;
 using InElementOp  = ck::tensor_operation::element_wise::ScaleAdd;
 using WeiElementOp = ck::tensor_operation::element_wise::ScaleAdd;

-using DeviceGroupedConvNDFwdActivInstance = DeviceGroupedConvNDMultiABFwdInstance<DataType,
-                                                                                  AccDataType,
-                                                                                  ADataTypes,
-                                                                                  BDataTypes,
-                                                                                  InElementOp,
-                                                                                  WeiElementOp>;
+using DeviceGroupedConvNDActivInstance = DeviceGroupedConvNDMultiABFwdInstance<DataType,
+                                                                               AccDataType,
+                                                                               ADataTypes,
+                                                                               BDataTypes,
+                                                                               InElementOp,
+                                                                               WeiElementOp>;

-#include "../run_convnd_fwd_activ_example.inc"
+#include "../run_convnd_activ_example.inc"

-int main(int argc, char* argv[]) { return !run_convnd_fwd_example(argc, argv); }
+int main(int argc, char* argv[]) { return !run_convnd_example(argc, argv); }
--- a/example/62_conv_fwd_activ/multi_AB/conv_fwd_xdl_scaleadd_ab_fp32.cpp
+++ b/example/62_conv_fwd_activ/multi_AB/conv_fwd_xdl_scaleadd_ab_fp32.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include "convnd_fwd_activ_multi_ab_common.hpp"

@@ -14,13 +14,13 @@ using BDataTypes  = ck::Tuple<DataType, DataType>;
 using InElementOp  = ck::tensor_operation::element_wise::ScaleAdd;
 using WeiElementOp = ck::tensor_operation::element_wise::ScaleAdd;

-using DeviceGroupedConvNDFwdActivInstance = DeviceGroupedConvNDMultiABFwdInstance<DataType,
-                                                                                  AccDataType,
-                                                                                  ADataTypes,
-                                                                                  BDataTypes,
-                                                                                  InElementOp,
-                                                                                  WeiElementOp>;
+using DeviceGroupedConvNDActivInstance = DeviceGroupedConvNDMultiABFwdInstance<DataType,
+                                                                               AccDataType,
+                                                                               ADataTypes,
+                                                                               BDataTypes,
+                                                                               InElementOp,
+                                                                               WeiElementOp>;

-#include "../run_convnd_fwd_activ_example.inc"
+#include "../run_convnd_activ_example.inc"

-int main(int argc, char* argv[]) { return !run_convnd_fwd_example(argc, argv); }
+int main(int argc, char* argv[]) { return !run_convnd_example(argc, argv); }
--- a/example/62_conv_fwd_activ/multi_AB/conv_fwd_xdl_scaleadd_ab_int8.cpp
+++ b/example/62_conv_fwd_activ/multi_AB/conv_fwd_xdl_scaleadd_ab_int8.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include "convnd_fwd_activ_multi_ab_common.hpp"

@@ -14,13 +14,13 @@ using BDataTypes  = ck::Tuple<DataType, DataType>;
 using InElementOp  = ck::tensor_operation::element_wise::ScaleAdd;
 using WeiElementOp = ck::tensor_operation::element_wise::ScaleAdd;

-using DeviceGroupedConvNDFwdActivInstance = DeviceGroupedConvNDMultiABFwdInstance<DataType,
-                                                                                  AccDataType,
-                                                                                  ADataTypes,
-                                                                                  BDataTypes,
-                                                                                  InElementOp,
-                                                                                  WeiElementOp>;
+using DeviceGroupedConvNDActivInstance = DeviceGroupedConvNDMultiABFwdInstance<DataType,
+                                                                               AccDataType,
+                                                                               ADataTypes,
+                                                                               BDataTypes,
+                                                                               InElementOp,
+                                                                               WeiElementOp>;

-#include "../run_convnd_fwd_activ_example.inc"
+#include "../run_convnd_activ_example.inc"

-int main(int argc, char* argv[]) { return !run_convnd_fwd_example(argc, argv); }
+int main(int argc, char* argv[]) { return !run_convnd_example(argc, argv); }
--- a/example/62_conv_fwd_activ/multi_AB/convnd_fwd_activ_multi_ab_common.hpp
+++ b/example/62_conv_fwd_activ/multi_AB/convnd_fwd_activ_multi_ab_common.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include <cstdlib>
 #include <iostream>
@@ -100,16 +100,16 @@ template <ck::index_t NDimSpatial,
          typename WeiElementOp,
          typename OutElementOp,
          typename DeviceConvNDFwdInstance>
-bool run_grouped_conv_fwd(bool do_verification,
-                          int init_method,
-                          bool time_kernel,
-                          const ck::utils::conv::ConvParam& conv_param,
-                          const HostTensorDescriptor& in_g_n_c_wis_desc,
-                          const HostTensorDescriptor& wei_g_k_c_xs_desc,
-                          const HostTensorDescriptor& out_g_n_k_wos_desc,
-                          const InElementOp& in_element_op,
-                          const WeiElementOp& wei_element_op,
-                          const OutElementOp& out_element_op)
+bool run_grouped_conv(bool do_verification,
+                      int init_method,
+                      bool time_kernel,
+                      const ck::utils::conv::ConvParam& conv_param,
+                      const HostTensorDescriptor& in_g_n_c_wis_desc,
+                      const HostTensorDescriptor& wei_g_k_c_xs_desc,
+                      const HostTensorDescriptor& out_g_n_k_wos_desc,
+                      const InElementOp& in_element_op,
+                      const WeiElementOp& wei_element_op,
+                      const OutElementOp& out_element_op)
 {
    constexpr ck::index_t NumAs = 2;
    constexpr ck::index_t NumBs = 2;

--- a/example/62_conv_fwd_activ/run_convnd_fwd_activ_example.inc
+++ b/example/62_conv_fwd_activ/run_convnd_fwd_activ_example.inc
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -11,7 +11,7 @@ void print_helper_msg()
              << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
 }

-bool run_convnd_fwd_example(int argc, char* argv[])
+bool run_convnd_example(int argc, char* argv[])
 {
    print_helper_msg();

@@ -63,23 +63,23 @@ bool run_convnd_fwd_example(int argc, char* argv[])
            ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(
                conv_param);

-        return run_grouped_conv_fwd<NDimSpatial,
-                                    InDataType,
-                                    WeiDataType,
-                                    OutDataType,
-                                    InElementOp,
-                                    WeiElementOp,
-                                    OutElementOp,
-                                    DeviceGroupedConvNDFwdActivInstance>(do_verification,
-                                                                         init_method,
-                                                                         time_kernel,
-                                                                         conv_param,
-                                                                         in_g_n_c_wis_desc,
-                                                                         wei_g_k_c_xs_desc,
-                                                                         out_g_n_k_wos_desc,
-                                                                         in_element_op,
-                                                                         wei_element_op,
-                                                                         out_element_op);
+        return run_grouped_conv<NDimSpatial,
+                                InDataType,
+                                WeiDataType,
+                                OutDataType,
+                                InElementOp,
+                                WeiElementOp,
+                                OutElementOp,
+                                DeviceGroupedConvNDActivInstance>(do_verification,
+                                                                  init_method,
+                                                                  time_kernel,
+                                                                  conv_param,
+                                                                  in_g_n_c_wis_desc,
+                                                                  wei_g_k_c_xs_desc,
+                                                                  out_g_n_k_wos_desc,
+                                                                  in_element_op,
+                                                                  wei_element_op,
+                                                                  out_element_op);
    };

    if(conv_param.num_dim_spatial_ == 3)

--- a/example/62_conv_fwd_activ/CMakeLists.txt
+++ b/example/62_conv_fwd_activ/CMakeLists.txt
@@ -2,48 +2,34 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)
-      add_custom_target(example_convnd_fwd_activ_xdl)
+      add_custom_target(example_convnd_activ_unary_xdl)
      # Sigmoid
      add_example_executable(example_convnd_fwd_xdl_sigmoid_fp16 convnd_fwd_xdl_sigmoid_fp16.cpp)
-      add_example_dependencies(example_convnd_fwd_activ_xdl example_convnd_fwd_xdl_sigmoid_fp16)
+      add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_sigmoid_fp16)
      # Tanh
      add_example_executable(example_convnd_fwd_xdl_tanh_fp16 convnd_fwd_xdl_tanh_fp16.cpp)
-      add_example_dependencies(example_convnd_fwd_activ_xdl example_convnd_fwd_xdl_tanh_fp16)
+      add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_tanh_fp16)
      # Relu
      add_example_executable(example_convnd_fwd_xdl_relu_fp16 convnd_fwd_xdl_relu_fp16.cpp)
-      add_example_dependencies(example_convnd_fwd_activ_xdl example_convnd_fwd_xdl_relu_fp16)
+      add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_relu_fp16)
      # SoftRelu
      add_example_executable(example_convnd_fwd_xdl_softrelu_fp16 convnd_fwd_xdl_softrelu_fp16.cpp)
-      add_example_dependencies(example_convnd_fwd_activ_xdl example_convnd_fwd_xdl_softrelu_fp16)
+      add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_softrelu_fp16)
      # Abs
      add_example_executable(example_convnd_fwd_xdl_abs_fp16 convnd_fwd_xdl_abs_fp16.cpp)
-      add_example_dependencies(example_convnd_fwd_activ_xdl example_convnd_fwd_xdl_abs_fp16)
+      add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_abs_fp16)
      # Pow
      add_example_executable(example_convnd_fwd_xdl_pow_fp16 convnd_fwd_xdl_pow_fp16.cpp)
-      add_example_dependencies(example_convnd_fwd_activ_xdl example_convnd_fwd_xdl_pow_fp16)
+      add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_pow_fp16)
      # Clipped Relu
      add_example_executable(example_convnd_fwd_xdl_clippedrelu_fp16 convnd_fwd_xdl_clippedrelu_fp16.cpp)
-      add_example_dependencies(example_convnd_fwd_activ_xdl example_convnd_fwd_xdl_clippedrelu_fp16)
+      add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_clippedrelu_fp16)
      # Leaky Relu
      add_example_executable(example_convnd_fwd_xdl_leakyrelu_fp16 convnd_fwd_xdl_leakyrelu_fp16.cpp)
-      add_example_dependencies(example_convnd_fwd_activ_xdl example_convnd_fwd_xdl_leakyrelu_fp16)
+      add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_leakyrelu_fp16)
      # Elu
      add_example_executable(example_convnd_fwd_xdl_elu_fp16 convnd_fwd_xdl_elu_fp16.cpp)
-      add_example_dependencies(example_convnd_fwd_activ_xdl example_convnd_fwd_xdl_elu_fp16)
-      # ScaleAdd on A and B
-      add_example_executable(example_conv_fwd_xdl_scaleadd_ab_fp16 multi_AB/conv_fwd_xdl_scaleadd_ab_fp16.cpp)
-      add_example_dependencies(example_convnd_fwd_activ_xdl example_conv_fwd_xdl_scaleadd_ab_fp16)
-      add_example_executable(example_conv_fwd_xdl_scaleadd_ab_fp32 multi_AB/conv_fwd_xdl_scaleadd_ab_fp32.cpp)
-      add_example_dependencies(example_convnd_fwd_activ_xdl example_conv_fwd_xdl_scaleadd_ab_fp32)
-      add_example_executable(example_conv_fwd_xdl_scaleadd_ab_bf16 multi_AB/conv_fwd_xdl_scaleadd_ab_bf16.cpp)
-      add_example_dependencies(example_convnd_fwd_activ_xdl example_conv_fwd_xdl_scaleadd_ab_bf16)
-      add_example_executable(example_conv_fwd_xdl_scaleadd_ab_int8 multi_AB/conv_fwd_xdl_scaleadd_ab_int8.cpp)
-      add_example_dependencies(example_convnd_fwd_activ_xdl example_conv_fwd_xdl_scaleadd_ab_int8)
-      # ScaleAdd ScaleAdd Relu
-      add_example_executable(example_convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16 convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16.cpp)
-      add_example_dependencies(example_convnd_fwd_activ_xdl example_convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16)
-      add_example_executable(example_convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16 convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16.cpp)
-      add_example_dependencies(example_convnd_fwd_activ_xdl example_convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16)
+      add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_elu_fp16)
   set(target 1)
 endif()
 endforeach()
--- a/example/62_conv_fwd_activ/convnd_fwd_activ_common.hpp
+++ b/example/62_conv_fwd_activ/convnd_fwd_activ_common.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -102,16 +102,16 @@ template <ck::index_t NDimSpatial,
          typename WeiElementOp,
          typename OutElementOp,
          typename DeviceConvNDFwdInstance>
-bool run_grouped_conv_fwd(bool do_verification,
-                          int init_method,
-                          bool time_kernel,
-                          const ck::utils::conv::ConvParam& conv_param,
-                          const HostTensorDescriptor& in_g_n_c_wis_desc,
-                          const HostTensorDescriptor& wei_g_k_c_xs_desc,
-                          const HostTensorDescriptor& out_g_n_k_wos_desc,
-                          const InElementOp& in_element_op,
-                          const WeiElementOp& wei_element_op,
-                          const OutElementOp& out_element_op)
+bool run_grouped_conv(bool do_verification,
+                      int init_method,
+                      bool time_kernel,
+                      const ck::utils::conv::ConvParam& conv_param,
+                      const HostTensorDescriptor& in_g_n_c_wis_desc,
+                      const HostTensorDescriptor& wei_g_k_c_xs_desc,
+                      const HostTensorDescriptor& out_g_n_k_wos_desc,
+                      const InElementOp& in_element_op,
+                      const WeiElementOp& wei_element_op,
+                      const OutElementOp& out_element_op)
 {
    Tensor<InDataType> in(in_g_n_c_wis_desc);
    Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);

--- a/example/62_convnd_activ/unary/convnd_fwd_xdl_abs_fp16.cpp
+++ b/example/62_convnd_activ/unary/convnd_fwd_xdl_abs_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "convnd_fwd_activ_unary_common.hpp"
+
+using OutElementOp = ck::tensor_operation::element_wise::UnaryAbs;
+
+using DeviceGroupedConvNDActivInstance = DeviceGroupedConvNDFwdInstance<OutElementOp>;
+#include "../run_convnd_activ_example.inc"
+
+int main(int argc, char* argv[]) { return !run_convnd_example(argc, argv); }