Add dynamic elementwise op (#1426)

* Add dynamic elementwise op

Co-authored-by: ThruptiRajLakshmanaGowda <thruptiraj.lakshmanagowda@amd.com>

* CI issues fix

* Custom parameter value for dynamic functions - Comments addressed

---------

Co-authored-by: ThruptiRajLakshmanaGowda <thruptiraj.lakshmanagowda@amd.com>
Co-authored-by: ThruptiRajLakshmanaGowda <tlakshma@amd.com>

Add dynamic elementwise op (#1426)
* Add dynamic elementwise op

Co-authored-by: ThruptiRajLakshmanaGowda <thruptiraj.lakshmanagowda@amd.com>

* CI issues fix

* Custom parameter value for dynamic functions - Comments addressed

---------

Co-authored-by: ThruptiRajLakshmanaGowda <thruptiraj.lakshmanagowda@amd.com>
Co-authored-by: ThruptiRajLakshmanaGowda <tlakshma@amd.com>
31bf253a · Bartłomiej Kocot · GitHub · 54f0e6f4 · 31bf253a · 31bf253a
Unverified Commit 31bf253a authored Oct 26, 2024 by Bartłomiej Kocot Committed by GitHub Oct 26, 2024
20 changed files
--- a/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp
+++ b/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 /*
 Computes C_m_o = Relu(A0[m, k] * B0[n, k] + D00[m, n] + D01[mn]) * B1[n, o] + D1[m, o]
@@ -60,14 +60,14 @@ struct AddAddRelu
    {
        const ck::half_t x = c + d0 + d1;
-        ck::tensor_operation::element_wise::Relu{}.template operator()<ck::half_t>(e, x);
+        ck::tensor_operation::element_wise::Relu{}.operator()(e, x);
    }
    __host__ __device__ void
    operator()(float& e, const float& c, const ck::half_t& d0, const ck::half_t& d1) const
    {
        const float x = c + (d0 + d1);
-        ck::tensor_operation::element_wise::Relu{}.template operator()<float>(e, x);
+        ck::tensor_operation::element_wise::Relu{}.operator()(e, x);
    }
 };

--- a/example/62_convnd_activ/CMakeLists.txt
+++ b/example/62_convnd_activ/CMakeLists.txt
@@ -6,6 +6,7 @@ add_subdirectory(convscale_add)
 add_subdirectory(convscale_reduce)
 add_subdirectory(multi_AB)
 add_subdirectory(unary)
+add_subdirectory(dynamic_unary)
 add_custom_target(example_convnd_activ_xdl)
 # ScaleAdd ScaleAdd Relu

--- a/example/62_convnd_activ/dynamic_unary/CMakeLists.txt
+++ b/example/62_convnd_activ/dynamic_unary/CMakeLists.txt
+# Register the dynamic-unary convolution examples exactly once, as soon as one of the
+# requested GPU_TARGETS is found in gpu_list.
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+    if(gpu IN_LIST gpu_list AND target EQUAL 0)
+        add_custom_target(example_convnd_activ_dynamic_unary_xdl)
+        # One fp16 example per operator; each source file is named after the operator it uses.
+        foreach(activ IN ITEMS
+                sigmoid
+                tanh
+                relu
+                softrelu
+                abs
+                pow
+                clippedrelu
+                leakyrelu
+                elu
+                swish
+                passthrough
+                logistic)
+            add_example_executable(example_convnd_fwd_xdl_dynamic_${activ}_fp16 convnd_fwd_xdl_dynamic_${activ}_fp16.cpp)
+            add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_${activ}_fp16)
+        endforeach()
+        # Remember that the targets exist so later matching GPUs do not add them again.
+        set(target 1)
+    endif()
+endforeach()
--- a/example/62_convnd_activ/dynamic_unary/convnd_fwd_activ_dynamic_unary_common.hpp
+++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_activ_dynamic_unary_common.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include <cstdlib>
+#include <iostream>
+#include <numeric>
+#include <type_traits>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+// Problem configuration shared by the dynamic-unary examples: three spatial dimensions,
+// half-precision input/weight/output/C-shuffle data and a float accumulator type.
+constexpr ck::index_t NDimSpatial = 3;
+using InDataType                  = ck::half_t;
+using WeiDataType                 = ck::half_t;
+using AccDataType                 = float;
+using CShuffleDataType            = ck::half_t;
+using OutDataType                 = ck::half_t;
+// Shorthand for a compile-time integer sequence.
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+// Tensor layouts used for the input, weight and output tensors.
+using InLayout  = ck::tensor_layout::convolution::GNDHWC;
+using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
+using OutLayout = ck::tensor_layout::convolution::GNDHWK;
+// Element-wise operators handed to the device instance below: pass-through for input and
+// weight, DynamicUnaryOp as the third (output-side) operator.
+using InElementOp      = ck::tensor_operation::element_wise::PassThrough;
+using WeiElementOp     = ck::tensor_operation::element_wise::PassThrough;
+using DynamicElementOp = ck::tensor_operation::element_wise::DynamicUnaryOp;
+static constexpr auto ConvSpec =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+// Device convolution instance that run_convnd_example passes to run_grouped_conv.
+// NOTE(review): the labels marked "presumably" below are not established by this file;
+// confirm them against the template parameter list in
+// device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp.
+using DeviceGroupedConvNDActivInstance =
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
+        NDimSpatial,
+        InLayout,
+        WeiLayout,
+        ck::Tuple<>,
+        OutLayout,
+        InDataType,
+        WeiDataType,
+        AccDataType,
+        CShuffleDataType,
+        ck::Tuple<>,
+        OutDataType,
+        InElementOp,
+        WeiElementOp,
+        DynamicElementOp,
+        ConvSpec,    // ConvForwardSpecialization
+        GemmSpec,    // GemmSpecialization
+        1,           // presumably NumGemmKPrefetchStage - TODO confirm
+        256,         // BlockSize
+        128,         // MPerBlock
+        256,         // NPerBlock
+        32,          // KPerBlock
+        8,           // AK1
+        8,           // BK1
+        32,          // MPerXdl
+        32,          // NPerXdl
+        2,           // MXdlPerWave
+        4,           // NXdlPerWave
+        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
+        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
+        2,           // ABlockTransferSrcVectorDim
+        8,           // ABlockTransferSrcScalarPerVector
+        8,           // ABlockTransferDstScalarPerVector_AK1
+        1,           // ABlockLdsExtraM
+        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
+        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
+        2,           // BBlockTransferSrcVectorDim
+        8,           // BBlockTransferSrcScalarPerVector
+        8,           // BBlockTransferDstScalarPerVector_BK1
+        1,           // BBlockLdsExtraN
+        1, // presumably CShuffleMXdlPerWavePerShuffle - TODO confirm
+        1, // presumably CShuffleNXdlPerWavePerShuffle - TODO confirm
+        S<1, 32, 1, 8>, // presumably the CDE block-transfer thread cluster lengths - TODO confirm
+        8>; // presumably the CDE block-transfer scalars per vector - TODO confirm
+/// Runs one grouped forward convolution through DeviceConvNDFwdInstance and, on request,
+/// compares the device result with the host ReferenceConvFwd result.
+///
+/// @param do_verification    when true, run the host reference and compare with check_err.
+/// @param init_method        0 leaves the host tensors as constructed; 1 fills input and weight
+///                           with GeneratorTensor_2{-2, 2}; any other value fills them with
+///                           GeneratorTensor_3 ({-1.0, 1.0} for input, {-0.05, 0.05} for weight).
+/// @param time_kernel        forwarded to StreamConfig.
+/// @param conv_param         source of strides, dilations, pads and the flop/byte counts.
+/// @param in_g_n_c_wis_desc  host descriptor of the input tensor.
+/// @param wei_g_k_c_xs_desc  host descriptor of the weight tensor.
+/// @param out_g_n_k_wos_desc host descriptor of the output tensor.
+/// @param in_element_op      element-wise operator passed for the input.
+/// @param wei_element_op     element-wise operator passed for the weight.
+/// @param out_element_op     element-wise operator passed for the output.
+/// @return true when verification is disabled, otherwise the result of check_err.
+/// @throws std::runtime_error when IsSupportedArgument rejects the built argument.
+template <ck::index_t NDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename InElementOp,
+          typename WeiElementOp,
+          typename OutElementOp,
+          typename DeviceConvNDFwdInstance>
+bool run_grouped_conv(bool do_verification,
+                      int init_method,
+                      bool time_kernel,
+                      const ck::utils::conv::ConvParam& conv_param,
+                      const HostTensorDescriptor& in_g_n_c_wis_desc,
+                      const HostTensorDescriptor& wei_g_k_c_xs_desc,
+                      const HostTensorDescriptor& out_g_n_k_wos_desc,
+                      const InElementOp& in_element_op,
+                      const WeiElementOp& wei_element_op,
+                      const OutElementOp& out_element_op)
+{
+    // Fixed-size containers expected by MakeArgument.
+    using TensorDims  = std::array<ck::index_t, NDimSpatial + 3>;
+    using SpatialDims = std::array<ck::index_t, NDimSpatial>;
+
+    Tensor<InDataType> input(in_g_n_c_wis_desc);
+    Tensor<WeiDataType> weight(wei_g_k_c_xs_desc);
+    Tensor<OutDataType> host_result(out_g_n_k_wos_desc);
+    Tensor<OutDataType> device_result(out_g_n_k_wos_desc);
+
+    std::cout << "in: " << input.mDesc << std::endl;
+    std::cout << "wei: " << weight.mDesc << std::endl;
+    std::cout << "out: " << host_result.mDesc << std::endl;
+
+    // init_method == 0 intentionally leaves both tensors untouched.
+    if(init_method == 1)
+    {
+        input.GenerateTensorValue(GeneratorTensor_2<InDataType>{-2, 2});
+        weight.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-2, 2});
+    }
+    else if(init_method != 0)
+    {
+        input.GenerateTensorValue(GeneratorTensor_3<InDataType>{-1.0, 1.0});
+        weight.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.05, 0.05});
+    }
+
+    // Device buffers; only input and weight are uploaded.
+    DeviceMem input_buf(sizeof(InDataType) * input.mDesc.GetElementSpaceSize());
+    DeviceMem weight_buf(sizeof(WeiDataType) * weight.mDesc.GetElementSpaceSize());
+    DeviceMem output_buf(sizeof(OutDataType) * device_result.mDesc.GetElementSpaceSize());
+    input_buf.ToDevice(input.mData.data());
+    weight_buf.ToDevice(weight.mData.data());
+
+    // Copy the descriptor and conv_param ranges into std::array form.
+    const auto fill_from = [](const auto& src, auto& dst) { ck::ranges::copy(src, dst.begin()); };
+
+    TensorDims in_lengths{};
+    TensorDims in_strides{};
+    fill_from(in_g_n_c_wis_desc.GetLengths(), in_lengths);
+    fill_from(in_g_n_c_wis_desc.GetStrides(), in_strides);
+
+    TensorDims wei_lengths{};
+    TensorDims wei_strides{};
+    fill_from(wei_g_k_c_xs_desc.GetLengths(), wei_lengths);
+    fill_from(wei_g_k_c_xs_desc.GetStrides(), wei_strides);
+
+    TensorDims out_lengths{};
+    TensorDims out_strides{};
+    fill_from(out_g_n_k_wos_desc.GetLengths(), out_lengths);
+    fill_from(out_g_n_k_wos_desc.GetStrides(), out_strides);
+
+    SpatialDims filter_strides{};
+    SpatialDims filter_dilations{};
+    SpatialDims left_pads{};
+    SpatialDims right_pads{};
+    fill_from(conv_param.conv_filter_strides_, filter_strides);
+    fill_from(conv_param.conv_filter_dilations_, filter_dilations);
+    fill_from(conv_param.input_left_pads_, left_pads);
+    fill_from(conv_param.input_right_pads_, right_pads);
+
+    // Build and launch the device convolution; the zero-length arrays are the (empty) D tensors.
+    auto device_conv     = DeviceConvNDFwdInstance{};
+    auto device_invoker  = device_conv.MakeInvoker();
+    auto device_argument = device_conv.MakeArgument(input_buf.GetDeviceBuffer(),
+                                                    weight_buf.GetDeviceBuffer(),
+                                                    std::array<const void*, 0>{},
+                                                    output_buf.GetDeviceBuffer(),
+                                                    in_lengths,
+                                                    in_strides,
+                                                    wei_lengths,
+                                                    wei_strides,
+                                                    std::array<TensorDims, 0>{{}},
+                                                    std::array<TensorDims, 0>{{}},
+                                                    out_lengths,
+                                                    out_strides,
+                                                    filter_strides,
+                                                    filter_dilations,
+                                                    left_pads,
+                                                    right_pads,
+                                                    in_element_op,
+                                                    wei_element_op,
+                                                    out_element_op);
+
+    if(!device_conv.IsSupportedArgument(device_argument))
+    {
+        throw std::runtime_error("The device op with the specified compilation parameters does "
+                                 "not support this convolution problem.");
+    }
+
+    const float avg_time = device_invoker.Run(device_argument, StreamConfig{nullptr, time_kernel});
+
+    // Performance figures derived from the problem size and the time reported by the invoker.
+    const std::size_t flop      = conv_param.GetFlops();
+    const std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
+    const float tflops          = static_cast<float>(flop) / 1.E9 / avg_time;
+    const float gb_per_sec      = num_btype / 1.E6 / avg_time;
+
+    std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << device_conv.GetTypeString() << std::endl;
+
+    if(!do_verification)
+    {
+        return true;
+    }
+
+    // Host reference with the same element-wise operators, then compare.
+    using ReferenceConv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
+                                                                       InDataType,
+                                                                       WeiDataType,
+                                                                       OutDataType,
+                                                                       InElementOp,
+                                                                       WeiElementOp,
+                                                                       OutElementOp>;
+    auto reference_conv     = ReferenceConv();
+    auto reference_invoker  = reference_conv.MakeInvoker();
+    auto reference_argument = reference_conv.MakeArgument(input,
+                                                          weight,
+                                                          host_result,
+                                                          conv_param.conv_filter_strides_,
+                                                          conv_param.conv_filter_dilations_,
+                                                          conv_param.input_left_pads_,
+                                                          conv_param.input_right_pads_,
+                                                          in_element_op,
+                                                          wei_element_op,
+                                                          out_element_op);
+    reference_invoker.Run(reference_argument);
+
+    output_buf.FromDevice(device_result.mData.data());
+    return ck::utils::check_err(device_result, host_result, "Error: incorrect results!");
+}
--- a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_abs_fp16.cpp
+++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_abs_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+#include "convnd_fwd_activ_dynamic_unary_common.hpp"
+#include "../run_convnd_activ_dynamic_example.inc"
+int main(int argc, char* argv[])
+{
+    // Run the shared convolution example with a default-constructed UnaryAbs operator.
+    ck::tensor_operation::element_wise::UnaryAbs out_element_op;
+    const bool passed = run_convnd_example(argc, argv, out_element_op);
+    // Process exit status: 0 when the example reports success, 1 otherwise.
+    return passed ? 0 : 1;
+}
--- a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_clippedrelu_fp16.cpp
+++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_clippedrelu_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+#include "convnd_fwd_activ_dynamic_unary_common.hpp"
+#include "../run_convnd_activ_dynamic_example.inc"
+int main(int argc, char* argv[])
+{
+    // Run the shared convolution example with ClippedRelu constructed from (0.f, 1.f).
+    ck::tensor_operation::element_wise::ClippedRelu out_element_op(0.f, 1.f);
+    const bool passed = run_convnd_example(argc, argv, out_element_op);
+    // Process exit status: 0 when the example reports success, 1 otherwise.
+    return passed ? 0 : 1;
+}
--- a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_elu_fp16.cpp
+++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_elu_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+#include "convnd_fwd_activ_dynamic_unary_common.hpp"
+#include "../run_convnd_activ_dynamic_example.inc"
+int main(int argc, char* argv[])
+{
+    // Run the shared convolution example with Elu constructed from 2.f.
+    ck::tensor_operation::element_wise::Elu out_element_op(2.f);
+    const bool passed = run_convnd_example(argc, argv, out_element_op);
+    // Process exit status: 0 when the example reports success, 1 otherwise.
+    return passed ? 0 : 1;
+}
--- a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_leakyrelu_fp16.cpp
+++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_leakyrelu_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+#include "convnd_fwd_activ_dynamic_unary_common.hpp"
+#include "../run_convnd_activ_dynamic_example.inc"
+int main(int argc, char* argv[])
+{
+    // Run the shared convolution example with LeakyRelu constructed from 0.f.
+    ck::tensor_operation::element_wise::LeakyRelu out_element_op(0.f);
+    const bool passed = run_convnd_example(argc, argv, out_element_op);
+    // Process exit status: 0 when the example reports success, 1 otherwise.
+    return passed ? 0 : 1;
+}
--- a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_logistic_fp16.cpp
+++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_logistic_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include "convnd_fwd_activ_dynamic_unary_common.hpp"
+#include "../run_convnd_activ_dynamic_example.inc"
+int main(int argc, char* argv[])
+{
+    // Run the shared convolution example with Logistic constructed from 1.0f.
+    ck::tensor_operation::element_wise::Logistic out_element_op(1.0f);
+    const bool passed = run_convnd_example(argc, argv, out_element_op);
+    // Process exit status: 0 when the example reports success, 1 otherwise.
+    return passed ? 0 : 1;
+}
--- a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_passthrough_fp16.cpp
+++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_passthrough_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include "convnd_fwd_activ_dynamic_unary_common.hpp"
+#include "../run_convnd_activ_dynamic_example.inc"
+int main(int argc, char* argv[])
+{
+    // Run the shared convolution example with a default-constructed PassThrough operator.
+    ck::tensor_operation::element_wise::PassThrough out_element_op;
+    const bool passed = run_convnd_example(argc, argv, out_element_op);
+    // Process exit status: 0 when the example reports success, 1 otherwise.
+    return passed ? 0 : 1;
+}
--- a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_pow_fp16.cpp
+++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_pow_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+#include "convnd_fwd_activ_dynamic_unary_common.hpp"
+#include "../run_convnd_activ_dynamic_example.inc"
+int main(int argc, char* argv[])
+{
+    // Run the shared convolution example with Power constructed from (4.f, 1.f, 2.f).
+    ck::tensor_operation::element_wise::Power out_element_op(4.f, 1.f, 2.f);
+    const bool passed = run_convnd_example(argc, argv, out_element_op);
+    // Process exit status: 0 when the example reports success, 1 otherwise.
+    return passed ? 0 : 1;
+}
--- a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_relu_fp16.cpp
+++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_relu_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+#include "convnd_fwd_activ_dynamic_unary_common.hpp"
+#include "../run_convnd_activ_dynamic_example.inc"
+int main(int argc, char* argv[])
+{
+    // Run the shared convolution example with a default-constructed Relu operator.
+    ck::tensor_operation::element_wise::Relu out_element_op;
+    const bool passed = run_convnd_example(argc, argv, out_element_op);
+    // Process exit status: 0 when the example reports success, 1 otherwise.
+    return passed ? 0 : 1;
+}
--- a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_sigmoid_fp16.cpp
+++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_sigmoid_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+#include "convnd_fwd_activ_dynamic_unary_common.hpp"
+#include "../run_convnd_activ_dynamic_example.inc"
+int main(int argc, char* argv[])
+{
+    // Run the shared convolution example with a default-constructed Sigmoid operator.
+    ck::tensor_operation::element_wise::Sigmoid out_element_op;
+    const bool passed = run_convnd_example(argc, argv, out_element_op);
+    // Process exit status: 0 when the example reports success, 1 otherwise.
+    return passed ? 0 : 1;
+}
--- a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_softrelu_fp16.cpp
+++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_softrelu_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+#include "convnd_fwd_activ_dynamic_unary_common.hpp"
+#include "../run_convnd_activ_dynamic_example.inc"
+int main(int argc, char* argv[])
+{
+    // Run the shared convolution example with a default-constructed SoftRelu operator.
+    ck::tensor_operation::element_wise::SoftRelu out_element_op;
+    const bool passed = run_convnd_example(argc, argv, out_element_op);
+    // Process exit status: 0 when the example reports success, 1 otherwise.
+    return passed ? 0 : 1;
+}
--- a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_swish_fp16.cpp
+++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_swish_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include "convnd_fwd_activ_dynamic_unary_common.hpp"
+#include "../run_convnd_activ_dynamic_example.inc"
+int main(int argc, char* argv[])
+{
+    // Run the shared convolution example with Swish constructed from 1.0f.
+    ck::tensor_operation::element_wise::Swish out_element_op(1.0f);
+    const bool passed = run_convnd_example(argc, argv, out_element_op);
+    // Process exit status: 0 when the example reports success, 1 otherwise.
+    return passed ? 0 : 1;
+}
--- a/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_tanh_fp16.cpp
+++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_xdl_dynamic_tanh_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+#include "convnd_fwd_activ_dynamic_unary_common.hpp"
+#include "../run_convnd_activ_dynamic_example.inc"
+int main(int argc, char* argv[])
+{
+    // Run the shared convolution example with a default-constructed TanH operator.
+    ck::tensor_operation::element_wise::TanH out_element_op;
+    const bool passed = run_convnd_example(argc, argv, out_element_op);
+    // Process exit status: 0 when the example reports success, 1 otherwise.
+    return passed ? 0 : 1;
+}
--- a/example/62_convnd_activ/run_convnd_activ_dynamic_example.inc
+++ b/example/62_convnd_activ/run_convnd_activ_dynamic_example.inc
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+// Prints the meaning of the three leading command-line flags followed by the help text
+// returned by ck::utils::conv::get_conv_param_parser_helper_msg().
+void print_helper_msg()
+{
+    std::cout << "arg1: verification (0=no, 1=yes)\n";
+    std::cout << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n";
+    std::cout << "arg3: time kernel (0=no, 1=yes)\n";
+    std::cout << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
+}
+/// Parses the command line, builds packed host tensor descriptors and runs the grouped
+/// convolution example with the supplied output element-wise operator.
+///
+/// Accepted command lines (the help text is always printed first):
+///   - no arguments: built-in defaults (verification on, no timing, default shape);
+///   - exactly 3 arguments: verification flag, initialization method, time-kernel flag;
+///   - more than 3 arguments: the three flags, the number of spatial dimensions in argv[4],
+///     then the values read by ck::utils::conv::parse_conv_param starting at argv[5].
+///
+/// @param argc           argument count as received by main.
+/// @param argv           argument vector as received by main.
+/// @param out_element_op operator forwarded to run_grouped_conv as the output operator.
+/// @return the result of run_grouped_conv; false when the command line has 1 or 2 arguments
+///         or when the requested number of spatial dimensions differs from NDimSpatial.
+template <typename OutElementOp>
+bool run_convnd_example(int argc, char* argv[], const OutElementOp& out_element_op)
+{
+    print_helper_msg();
+    bool do_verification = true;
+    // Use floats for SoftRelu by default to avoid overflow after e^x.
+    int init_method =
+        std::is_same_v<OutElementOp, ck::tensor_operation::element_wise::SoftRelu> ? 2 : 1;
+    bool time_kernel = false;
+    // Following shapes are selected to avoid overflow. Expect inf in case of
+    // size increase for some elementwise ops.
+    ck::utils::conv::ConvParam conv_param{
+        3, 2, 16, 128, 8, {3, 3, 3}, {17, 17, 17}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}};
+
+    if(argc != 1)
+    {
+        // With only 1 or 2 arguments, argv[2] / argv[3] would be the terminating null pointer
+        // (or beyond it); passing that to std::stoi is undefined behaviour, so reject early.
+        if(argc < 4)
+        {
+            std::cerr << "Error: expected no arguments or at least 3 arguments, got " << (argc - 1)
+                      << "." << std::endl;
+            return false;
+        }
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+
+        if(argc > 4)
+        {
+            const ck::index_t num_dim_spatial = std::stoi(argv[4]);
+            // NOTE(review): parse_conv_param reads further argv entries starting at index 5;
+            // how many it needs is not visible here, so their presence is not validated.
+            conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
+        }
+    }
+
+    // The device instance is compiled for NDimSpatial dimensions only.
+    if(conv_param.num_dim_spatial_ != NDimSpatial)
+    {
+        std::cerr << "Error: this example only supports " << NDimSpatial
+                  << " spatial dimensions, got " << conv_param.num_dim_spatial_ << "."
+                  << std::endl;
+        return false;
+    }
+
+    const auto in_element_op  = InElementOp{};
+    const auto wei_element_op = WeiElementOp{};
+
+    const auto in_g_n_c_wis_desc =
+        ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
+    const auto wei_g_k_c_xs_desc =
+        ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(
+            conv_param);
+    const auto out_g_n_k_wos_desc =
+        ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(
+            conv_param);
+
+    return run_grouped_conv<NDimSpatial,
+                            InDataType,
+                            WeiDataType,
+                            OutDataType,
+                            InElementOp,
+                            WeiElementOp,
+                            OutElementOp,
+                            DeviceGroupedConvNDActivInstance>(do_verification,
+                                                              init_method,
+                                                              time_kernel,
+                                                              conv_param,
+                                                              in_g_n_c_wis_desc,
+                                                              wei_g_k_c_xs_desc,
+                                                              out_g_n_k_wos_desc,
+                                                              in_element_op,
+                                                              wei_element_op,
+                                                              out_element_op);
+}
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
@@ -85,9 +85,9 @@ __global__ void
            BsPointer p_bs_grid,
            DsPointer p_ds_grid,
            EDataType* __restrict__ p_e_grid,
-            const AElementwiseOperation a_element_op,
+            AElementwiseOperation a_element_op,
-            const BElementwiseOperation b_element_op,
+            BElementwiseOperation b_element_op,
-            const CDEElementwiseOperation cde_element_op,
+            CDEElementwiseOperation cde_element_op,
            const AGridDesc_AK0_M_AK1 a_grid_desc_k0_m_k1,
            const BGridDesc_BK0_N_BK1 b_grid_desc_k0_n_k1,
            const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
@@ -121,6 +121,19 @@ __global__ void
    static_for<0, NumDTensor, 1>{}(
        [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_group_offset[i]; });
+    if constexpr(is_same_v<AElementwiseOperation, element_wise::DynamicUnaryOp>)
+    {
+        a_element_op.InitUnaryOpPtrOnDevice();
+    }
+    if constexpr(is_same_v<BElementwiseOperation, element_wise::DynamicUnaryOp>)
+    {
+        b_element_op.InitUnaryOpPtrOnDevice();
+    }
+    if constexpr(is_same_v<CDEElementwiseOperation, element_wise::DynamicUnaryOp>)
+    {
+        cde_element_op.InitUnaryOpPtrOnDevice();
+    }
    if constexpr(isMultiA || isMultiB)
    {
        AsPointer p_as_grid_grp;

--- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
@@ -405,7 +405,7 @@ struct ScaleAddScaleAddRelu
                                                                              const float& d1) const
    {
        const float x = c * alpha1_ + alpha2_ * d0 + d1;
-        Relu{}.template operator()<float>(e, x);
+        e             = x > 0 ? x : 0;
    }
    template <>
@@ -416,7 +416,7 @@ struct ScaleAddScaleAddRelu
                        type_convert<float>(d1);
        float result = 0;
-        Relu{}.template operator()<float>(result, x);
+        result       = x > 0 ? x : 0;
        e = type_convert<half_t>(result);
    }
@@ -429,7 +429,7 @@ struct ScaleAddScaleAddRelu
                        type_convert<float>(d1);
        float result = 0;
-        Relu{}.template operator()<float>(result, x);
+        result       = x > 0 ? x : 0;
        e = type_convert<bhalf_t>(result);
    }
@@ -441,7 +441,7 @@ struct ScaleAddScaleAddRelu
        const float x = type_convert<float>(c) * alpha1_ + alpha2_ * d0 + d1;
        float result = 0;
-        Relu{}.template operator()<float>(result, x);
+        result       = x > 0 ? x : 0;
        e = type_convert<int8_t>(result);
    }

--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -7,11 +7,36 @@
 #include "ck/utility/math.hpp"
 #include "ck/utility/math_v2.hpp"
 #include "ck/utility/type_convert.hpp"
+#include <cassert>
 namespace ck {
 namespace tensor_operation {
 namespace element_wise {
+// Abstract base for unary element-wise operators: declares one pure-virtual call operator
+// y = op(x) per element type (float, double, int32_t, int8_t, half_t, bhalf_t), where input
+// and output share the same type.
+struct UnaryOpBase
+{
+    // Default construction plus copy and move operations, all compiler-generated.
+    __host__ __device__ UnaryOpBase()                   = default;
+    __host__ __device__ UnaryOpBase(const UnaryOpBase&) = default;
+    __host__ __device__ UnaryOpBase(UnaryOpBase&&)      = default;
+    __host__ __device__ UnaryOpBase& operator=(const UnaryOpBase&) = default;
+    __host__ __device__ UnaryOpBase& operator=(UnaryOpBase&&) = default;
+
+    // Virtual destructor, declared ahead of the call operators as before.
+    __host__ __device__ virtual ~UnaryOpBase() = default;
+
+    // Same-type overloads every derived operator has to implement.
+    __host__ __device__ virtual void operator()(float& y, const float& x) const     = 0;
+    __host__ __device__ virtual void operator()(double& y, const double& x) const   = 0;
+    __host__ __device__ virtual void operator()(int32_t& y, const int32_t& x) const = 0;
+    __host__ __device__ virtual void operator()(int8_t& y, const int8_t& x) const   = 0;
+    __host__ __device__ virtual void operator()(half_t& y, const half_t& x) const   = 0;
+    __host__ __device__ virtual void operator()(bhalf_t& y, const bhalf_t& x) const = 0;
+};
 struct PassThroughPack2
 {
    template <typename Y, typename X>
@@ -25,17 +50,24 @@ struct PassThroughPack2
    constexpr const static bool is_pack2_invocable = true;
 };
-struct PassThrough
+struct PassThrough : public UnaryOpBase
 {
+    __host__ __device__ inline void operator()(float& y, const float& x) const final { y = x; }
+    __host__ __device__ inline void operator()(double& y, const double& x) const final { y = x; }
+    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final { y = x; }
+    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final { y = x; }
+    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final { y = x; }
+    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final { y = x; }
    template <typename Y, typename X>
    __host__ __device__ void operator()(Y& y, const X& x) const;
-    template <>
-    __host__ __device__ void operator()<double, double>(double& y, const double& x) const
-    {
-        y = x;
-    }
    template <>
    __host__ __device__ void operator()<float, double>(float& y, const double& x) const
    {
@@ -48,36 +80,12 @@ struct PassThrough
        y = type_convert<double>(x);
    }
-    template <>
-    __host__ __device__ void operator()<float, float>(float& y, const float& x) const
-    {
-        y = x;
-    }
-    template <>
-    __host__ __device__ void operator()<half_t, half_t>(half_t& y, const half_t& x) const
-    {
-        y = x;
-    }
    template <>
    __host__ __device__ void operator()<half_t, float>(half_t& y, const float& x) const
    {
        y = type_convert<half_t>(x);
    }
-    template <>
-    __host__ __device__ void operator()<bhalf_t, bhalf_t>(bhalf_t& y, const bhalf_t& x) const
-    {
-        y = x;
-    }
-    template <>
-    __host__ __device__ void operator()<int32_t, int32_t>(int32_t& y, const int32_t& x) const
-    {
-        y = x;
-    }
    template <>
    __host__ __device__ void operator()<bhalf_t, float>(bhalf_t& y, const float& x) const
    {
@@ -102,12 +110,6 @@ struct PassThrough
        y = type_convert<float>(x);
    }
-    template <>
-    __host__ __device__ void operator()<int8_t, int8_t>(int8_t& y, const int8_t& x) const
-    {
-        y = x;
-    }
    template <>
    __host__ __device__ void operator()<half_t, int8_t>(half_t& y, const int8_t& x) const
    {
@@ -407,20 +409,38 @@ struct UnarySquare
    };
 };
-struct UnaryAbs
+struct UnaryAbs : public UnaryOpBase
 {
-    template <typename T>
+    __host__ __device__ inline void operator()(float& y, const float& x) const final
-    __host__ __device__ void operator()(T& y, const T& x) const
    {
-        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+        y = ck::math::abs(x);
-                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
+    }
-                          is_same<T, int8_t>::value,
-                      "Data type is not supported by this operation!");
+    __host__ __device__ inline void operator()(double& y, const double& x) const final
+    {
+        y = ck::math::abs(x);
+    }
+    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
+    {
        y = ck::math::abs(x);
-    };
+    }
+    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
+    {
+        y = ck::math::abs(x);
+    }
+    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
+    {
+        y = ck::math::abs(x);
+    }
+    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
+    {
+        y = ck::math::abs(x);
+    }
-    template <>
    __host__ __device__ void operator()(f8_t& y, const f8_t& x) const
    {
        y = ck::type_convert<f8_t>(ck::math::abs(ck::type_convert<float>(x)));
@@ -439,20 +459,34 @@ struct UnarySqrt
    };
 };
-struct Relu
+struct Relu : public UnaryOpBase
 {
-    template <typename T>
+    __host__ __device__ inline void operator()(float& y, const float& x) const final
-    __host__ __device__ void operator()(T& y, const T& x) const
    {
-        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
-                          is_same<T, int8_t>::value,
-                      "Data type is not supported by this operation!");
        y = x > 0 ? x : 0;
    }
-    template <>
+    __host__ __device__ inline void operator()(double& y, const double& x) const final
-    __host__ __device__ void operator()(bhalf_t& y, const bhalf_t& x) const
+    {
+        y = x > 0 ? x : 0;
+    }
+    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
+    {
+        y = x > 0 ? x : 0;
+    }
+    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
+    {
+        y = x > 0 ? x : 0;
+    }
+    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
+    {
+        y = x > 0 ? x : 0;
+    }
+    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
    {
        float x_f32 = ck::type_convert<float>(x);
        float y_f32 = x_f32 > 0 ? x_f32 : 0;
@@ -599,18 +633,46 @@ struct Gelu
    }
 };
-struct Sigmoid
+struct Sigmoid : public UnaryOpBase
 {
-    template <typename T>
-    __host__ __device__ void operator()(T& y, const T& x) const
+    __host__ __device__ inline void operator()(float& y, const float& x) const final
    {
-        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+        constexpr float one = type_convert<float>(1);
-                          is_same<T, ck::half_t>::value || is_same<T, int8_t>::value ||
-                          is_same<T, int32_t>::value,
-                      "Data type is not supported by this operation!");
-        constexpr T one = type_convert<T>(1);
        y                   = one / (one + ck::math::exp(-x));
-    };
+    }
+    __host__ __device__ inline void operator()(double& y, const double& x) const final
+    {
+        constexpr double one = type_convert<double>(1);
+        y                    = one / (one + ck::math::exp(-x));
+    }
+    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
+    {
+        constexpr int32_t one = type_convert<int32_t>(1);
+        y                     = one / (one + ck::math::exp(-x));
+    }
+    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
+    {
+        constexpr int8_t one = type_convert<int8_t>(1);
+        y                    = one / (one + ck::math::exp(-x));
+    }
+    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
+    {
+        constexpr half_t one = type_convert<half_t>(1);
+        y                    = one / (one + ck::math::exp(-x));
+    }
+    // bhalf_t sigmoid: widen to float, evaluate 1 / (1 + exp(-x)), then narrow back.
+    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
+    {
+        constexpr float one = type_convert<float>(1);
+        float x_f32         = ck::type_convert<float>(x);
+        // The exponent must be negated, as in the other overloads of this struct;
+        // exp(+x) would evaluate sigmoid(-x) instead.
+        float y_f32         = one / (one + ck::math::exp(-x_f32));
+        y                   = ck::type_convert<bhalf_t>(y_f32);
+    }
 };
 struct Silu
@@ -626,18 +688,37 @@ struct Silu
    };
 };
-struct TanH
+struct TanH : public UnaryOpBase
 {
-    template <typename T>
+    __host__ __device__ inline void operator()(float& y, const float& x) const final
-    __host__ __device__ void operator()(T& y, const T& x) const
    {
-        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+        y = ck::math::tanh(x);
-                          is_same<T, ck::half_t>::value || is_same<T, int8_t>::value ||
+    }
-                          is_same<T, int32_t>::value,
-                      "Data type is not supported by this operation!");
+    __host__ __device__ inline void operator()(double& y, const double& x) const final
+    {
        y = ck::math::tanh(x);
-    };
+    }
+    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
+    {
+        y = ck::math::tanh(x);
+    }
+    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
+    {
+        y = ck::math::tanh(x);
+    }
+    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
+    {
+        y = ck::math::tanh(x);
+    }
+    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
+    {
+        y = ck::math::tanh(x);
+    }
 };
 struct ACos
@@ -878,138 +959,393 @@ struct Rcp
    };
 };
-struct Swish
+struct Swish : public UnaryOpBase
 {
-    Swish(float beta = 1.0f) : beta_(beta) {}
+    __host__ __device__ Swish(float beta = 1.0f) : beta_(beta) {}
+    __host__ __device__ float get_beta() const { return beta_; }
+    const float beta_;
+    __host__ __device__ inline void operator()(float& y, const float& x) const final
+    {
+        float bx = -beta_ * type_convert<float>(x);
+        y        = type_convert<float>(x / (1.f + ck::math::exp(bx)));
+    }
+    __host__ __device__ inline void operator()(double& y, const double& x) const final
+    {
+        float bx = -beta_ * type_convert<float>(x);
+        y        = type_convert<double>(x / (1.f + ck::math::exp(bx)));
+    }
+    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
+    {
+        float bx = -beta_ * type_convert<float>(x);
+        y        = type_convert<int32_t>(x / (1.f + ck::math::exp(bx)));
+    }
+    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
+    {
+        float bx = -beta_ * type_convert<float>(x);
+        y        = type_convert<int8_t>(x / (1.f + ck::math::exp(bx)));
+    }
+    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
+    {
+        float bx = -beta_ * type_convert<float>(x);
+        y        = type_convert<half_t>(x / (1.f + ck::math::exp(bx)));
+    }
+    // bhalf_t swish: y = x / (1 + exp(-beta * x)), evaluated entirely in float.
+    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
+    {
+        // Use the converted value for both the exponent and the numerator so the
+        // raw bhalf_t operand never takes part in arithmetic.
+        // NOTE(review): assumes bhalf_t is a storage-only type - confirm.
+        const float x_f32 = type_convert<float>(x);
+        const float bx    = -beta_ * x_f32;
+        y                 = type_convert<bhalf_t>(x_f32 / (1.f + ck::math::exp(bx)));
+    }
    template <typename Y, typename X>
    __host__ __device__ void operator()(Y& y, const X& x) const
    {
        static_assert(is_same<X, float>::value || is_same<X, double>::value ||
-                          is_same<X, ck::half_t>::value,
+                          is_same<X, half_t>::value,
                      "Data type is not supported by this operation!");
        static_assert(is_same<Y, float>::value || is_same<Y, double>::value ||
-                          is_same<Y, ck::half_t>::value,
+                          is_same<Y, half_t>::value,
                      "Data type is not supported by this operation!");
        float bx = -beta_ * type_convert<float>(x);
        y        = type_convert<Y>(x / (1.f + ck::math::exp(bx)));
-    };
+    }
-    const float beta_;
 };
-struct SoftRelu
+struct SoftRelu : public UnaryOpBase
 {
-    SoftRelu(float alpha = 1.f) : alpha_(alpha){};
+    __host__ __device__ SoftRelu(float alpha = 1.0f) : alpha_(alpha) {}
-    template <typename T>
+    __host__ __device__ float get_alpha() const { return alpha_; }
-    __host__ __device__ void operator()(T& y, const T& x) const
+    const float alpha_;
+    __host__ __device__ inline void operator()(float& y, const float& x) const final
    {
-        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+        float casted_alpha  = type_convert<float>(alpha_);
-                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
+        constexpr float one = type_convert<float>(1);
-                          is_same<T, int8_t>::value,
+        y                   = ck::math::log(one + ck::math::exp(x * casted_alpha)) / casted_alpha;
-                      "Data type is not supported by this operation!");
+    }
-        T casted_alpha  = type_convert<T>(alpha_);
-        constexpr T one = type_convert<T>(1);
+    __host__ __device__ inline void operator()(double& y, const double& x) const final
+    {
+        double casted_alpha  = type_convert<double>(alpha_);
+        constexpr double one = type_convert<double>(1);
+        y                    = ck::math::log(one + ck::math::exp(x * casted_alpha)) / casted_alpha;
+    }
+    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
+    {
+        int32_t casted_alpha  = type_convert<int32_t>(alpha_);
+        constexpr int32_t one = type_convert<int32_t>(1);
+        y                     = ck::math::log(one + ck::math::exp(x * casted_alpha)) / casted_alpha;
+    }
+    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
+    {
+        int8_t casted_alpha  = type_convert<int8_t>(alpha_);
+        constexpr int8_t one = type_convert<int8_t>(1);
+        y                    = ck::math::log(one + ck::math::exp(x * casted_alpha)) / casted_alpha;
+    }
+    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
+    {
+        half_t casted_alpha  = type_convert<half_t>(alpha_);
+        constexpr half_t one = type_convert<half_t>(1);
+        y                    = ck::math::log(one + ck::math::exp(x * casted_alpha)) / casted_alpha;
+    }
+    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
+    {
+        bhalf_t casted_alpha  = type_convert<bhalf_t>(alpha_);
+        constexpr bhalf_t one = type_convert<bhalf_t>(1);
        y                     = ck::math::log(one + ck::math::exp(x * casted_alpha)) / casted_alpha;
    }
-    const float alpha_;
 };
-struct Power
+struct Power : public UnaryOpBase
 {
-    Power(float alpha = 0.f, float beta = 1.f, float gamma = 2.f)
+    __host__ __device__ Power(float alpha = 0.f, float beta = 1.f, float gamma = 2.f)
-        : alpha_(alpha), beta_(beta), gamma_(gamma){};
+        : alpha_(alpha), beta_(beta), gamma_(gamma)
-    template <typename T>
-    __host__ __device__ void operator()(T& y, const T& x) const
    {
-        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
-                          is_same<T, int8_t>::value,
-                      "Data type is not supported by this operation!");
-        T casted_alpha     = type_convert<T>(alpha_);
-        T casted_beta      = type_convert<T>(beta_);
-        T casted_gamma     = type_convert<T>(gamma_);
-        T shifted_scaled_x = casted_alpha + casted_beta * x;
-        y                  = ck::math::pow(shifted_scaled_x, casted_gamma);
    }
+    __host__ __device__ float get_alpha() const { return alpha_; }
+    __host__ __device__ float get_beta() const { return beta_; }
+    __host__ __device__ float get_gamma() const { return gamma_; }
    const float alpha_;
    const float beta_;
    const float gamma_;
+    __host__ __device__ inline void operator()(float& y, const float& x) const final
+    {
+        float casted_alpha = type_convert<float>(alpha_);
+        float casted_beta  = type_convert<float>(beta_);
+        float casted_gamma = type_convert<float>(gamma_);
+        float shifted_scaled_x = casted_alpha + casted_beta * x;
+        y                      = ck::math::pow(shifted_scaled_x, casted_gamma);
+    }
+    __host__ __device__ inline void operator()(double& y, const double& x) const final
+    {
+        double casted_alpha = type_convert<double>(alpha_);
+        double casted_beta  = type_convert<double>(beta_);
+        double casted_gamma = type_convert<double>(gamma_);
+        double shifted_scaled_x = casted_alpha + casted_beta * x;
+        y                       = ck::math::pow(shifted_scaled_x, casted_gamma);
+    }
+    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
+    {
+        int32_t casted_alpha = type_convert<int32_t>(alpha_);
+        int32_t casted_beta  = type_convert<int32_t>(beta_);
+        int32_t casted_gamma = type_convert<int32_t>(gamma_);
+        int32_t shifted_scaled_x = casted_alpha + casted_beta * x;
+        y                        = ck::math::pow(shifted_scaled_x, casted_gamma);
+    }
+    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
+    {
+        int8_t casted_alpha = type_convert<int8_t>(alpha_);
+        int8_t casted_beta  = type_convert<int8_t>(beta_);
+        int8_t casted_gamma = type_convert<int8_t>(gamma_);
+        int8_t shifted_scaled_x = casted_alpha + casted_beta * x;
+        y                       = ck::math::pow(shifted_scaled_x, casted_gamma);
+    }
+    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
+    {
+        half_t casted_alpha = type_convert<half_t>(alpha_);
+        half_t casted_beta  = type_convert<half_t>(beta_);
+        half_t casted_gamma = type_convert<half_t>(gamma_);
+        half_t shifted_scaled_x = casted_alpha + casted_beta * x;
+        y                       = ck::math::pow(shifted_scaled_x, casted_gamma);
+    }
+    // bhalf_t power: y = (alpha + beta * x) ^ gamma, evaluated in float.
+    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
+    {
+        // The parameters are already float, so only x is widened and only the
+        // final result is narrowed; no arithmetic is done on bhalf_t operands.
+        // NOTE(review): assumes bhalf_t is a storage-only type - confirm.
+        float x_f32            = type_convert<float>(x);
+        float shifted_scaled_x = alpha_ + beta_ * x_f32;
+        float y_f32            = ck::math::pow(shifted_scaled_x, gamma_);
+        y                      = type_convert<bhalf_t>(y_f32);
+    }
 };
-struct ClippedRelu
+struct ClippedRelu : public UnaryOpBase
 {
-    ClippedRelu(float alpha = 0.f, float beta = 1.f) : alpha_(alpha), beta_(beta){};
+    __host__ __device__ ClippedRelu(float alpha = 0.f, float beta = 1.f)
+        : alpha_(alpha), beta_(beta)
-    template <typename T>
-    __host__ __device__ void operator()(T& y, const T& x) const
    {
-        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
-                          is_same<T, int8_t>::value,
-                      "Data type is not supported by this operation!");
-        T casted_alpha = type_convert<T>(alpha_);
-        T casted_beta  = type_convert<T>(beta_);
-        y              = ck::math::min(casted_beta, ck::math::max(casted_alpha, x));
    }
+    __host__ __device__ float get_alpha() const { return alpha_; }
+    __host__ __device__ float get_beta() const { return beta_; }
    const float alpha_;
    const float beta_;
+    __host__ __device__ inline void operator()(float& y, const float& x) const final
+    {
+        float casted_alpha = type_convert<float>(alpha_);
+        float casted_beta  = type_convert<float>(beta_);
+        y                  = ck::math::min(casted_beta, ck::math::max(casted_alpha, x));
+    }
+    __host__ __device__ inline void operator()(double& y, const double& x) const final
+    {
+        double casted_alpha = type_convert<double>(alpha_);
+        double casted_beta  = type_convert<double>(beta_);
+        y                   = ck::math::min(casted_beta, ck::math::max(casted_alpha, x));
+    }
+    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
+    {
+        int32_t casted_alpha = type_convert<int32_t>(alpha_);
+        int32_t casted_beta  = type_convert<int32_t>(beta_);
+        y                    = ck::math::min(casted_beta, ck::math::max(casted_alpha, x));
+    }
+    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
+    {
+        int8_t casted_alpha = type_convert<int8_t>(alpha_);
+        int8_t casted_beta  = type_convert<int8_t>(beta_);
+        y                   = ck::math::min(casted_beta, ck::math::max(casted_alpha, x));
+    }
+    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
+    {
+        half_t casted_alpha = type_convert<half_t>(alpha_);
+        half_t casted_beta  = type_convert<half_t>(beta_);
+        y                   = ck::math::min(casted_beta, ck::math::max(casted_alpha, x));
+    }
+    // bhalf_t clipped relu: y = min(beta, max(alpha, x)), compared in float.
+    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
+    {
+        // Clamp on float values rather than on raw bhalf_t operands, then narrow.
+        // NOTE(review): assumes bhalf_t is a storage-only type whose raw values do
+        // not order like the numbers they encode - confirm.
+        float casted_alpha = type_convert<float>(alpha_);
+        float casted_beta  = type_convert<float>(beta_);
+        float x_f32        = type_convert<float>(x);
+        float y_f32        = ck::math::min(casted_beta, ck::math::max(casted_alpha, x_f32));
+        y                  = type_convert<bhalf_t>(y_f32);
+    }
 };
-struct LeakyRelu
+struct LeakyRelu : public UnaryOpBase
 {
-    LeakyRelu(float alpha = 0.01f) : alpha_(alpha){};
-    template <typename T>
+    // Default slope kept at 0.01f, the value the constructor used before this change;
+    // a default of 0 would make a default-constructed LeakyRelu zero out negative inputs.
+    __host__ __device__ LeakyRelu(float alpha = 0.01f) : alpha_(alpha) {}
-    __host__ __device__ void operator()(T& y, const T& x) const
+    __host__ __device__ float get_alpha() const { return alpha_; }
+    const float alpha_;
+    __host__ __device__ inline void operator()(float& y, const float& x) const final
    {
-        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+        float casted_alpha = type_convert<float>(alpha_);
-                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
-                          is_same<T, int8_t>::value,
-                      "Data type is not supported by this operation!");
-        T casted_alpha = type_convert<T>(alpha_);
        y                  = x >= 0 ? x : x * casted_alpha;
    }
-    const float alpha_;
+    __host__ __device__ inline void operator()(double& y, const double& x) const final
+    {
+        double casted_alpha = type_convert<double>(alpha_);
+        y                   = x >= 0 ? x : x * casted_alpha;
+    }
+    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
+    {
+        int32_t casted_alpha = type_convert<int32_t>(alpha_);
+        y                    = x >= 0 ? x : x * casted_alpha;
+    }
+    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
+    {
+        int8_t casted_alpha = type_convert<int8_t>(alpha_);
+        y                   = x >= 0 ? x : x * casted_alpha;
+    }
+    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
+    {
+        half_t casted_alpha = type_convert<half_t>(alpha_);
+        y                   = x >= 0 ? x : x * casted_alpha;
+    }
+    // bhalf_t leaky relu: y = x for x >= 0, otherwise alpha * x.
+    // Computed in float and converted back; an empty body would leave y unwritten.
+    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
+    {
+        const float x_f32 = ck::type_convert<float>(x);
+        const float y_f32 = x_f32 >= 0 ? x_f32 : x_f32 * alpha_;
+        y                 = ck::type_convert<bhalf_t>(y_f32);
+    }
 };
-struct Elu
+struct Elu : public UnaryOpBase
 {
-    Elu(float alpha = 1.f) : alpha_(alpha){};
-    template <typename T>
+    __host__ __device__ Elu(float alpha = 1.f) : alpha_(alpha) {}
-    __host__ __device__ void operator()(T& y, const T& x) const
+    __host__ __device__ float get_alpha() const { return alpha_; }
+    const float alpha_;
+    __host__ __device__ inline void operator()(float& y, const float& x) const final
    {
-        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+        float casted_alpha = type_convert<float>(alpha_);
-                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
+        y                  = x > 0 ? x : casted_alpha * ck::math::expm1(x);
-                          is_same<T, int8_t>::value,
+    }
-                      "Data type is not supported by this operation!");
-        T casted_alpha = type_convert<T>(alpha_);
+    __host__ __device__ inline void operator()(double& y, const double& x) const final
+    {
+        double casted_alpha = type_convert<double>(alpha_);
+        y                   = x > 0 ? x : casted_alpha * ck::math::expm1(x);
+    }
+    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
+    {
+        int32_t casted_alpha = type_convert<int32_t>(alpha_);
+        y                    = x > 0 ? x : casted_alpha * ck::math::expm1(x);
+    }
+    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
+    {
+        int8_t casted_alpha = type_convert<int8_t>(alpha_);
+        y                   = x > 0 ? x : casted_alpha * ck::math::expm1(x);
+    }
+    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
+    {
+        half_t casted_alpha = type_convert<half_t>(alpha_);
+        y                   = x > 0 ? x : casted_alpha * ck::math::expm1(x);
+    }
+    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
+    {
+        bhalf_t casted_alpha = type_convert<bhalf_t>(alpha_);
        y                    = x > 0 ? x : casted_alpha * ck::math::expm1(x);
    }
-    const float alpha_;
 };
-struct Logistic
+struct Logistic : public UnaryOpBase
 {
-    Logistic(float alpha = 1.f) : alpha_(alpha){};
-    template <typename T>
+    __host__ __device__ Logistic(float alpha = 1.0f) : alpha_(alpha) {}
-    __host__ __device__ void operator()(T& y, const T& x) const
+    __host__ __device__ float get_alpha() const { return alpha_; }
+    const float alpha_;
+    __host__ __device__ inline void operator()(float& y, const float& x) const final
    {
-        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+        float casted_alpha  = type_convert<float>(alpha_);
-                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
+        constexpr float one = type_convert<float>(1);
-                          is_same<T, int8_t>::value,
+        y                   = casted_alpha / (one + ck::math::exp(-x) * casted_alpha);
-                      "Data type is not supported by this operation!");
+    }
-        T casted_alpha  = type_convert<T>(alpha_);
-        constexpr T one = type_convert<T>(1);
+    __host__ __device__ inline void operator()(double& y, const double& x) const final
+    {
+        double casted_alpha  = type_convert<double>(alpha_);
+        constexpr double one = type_convert<double>(1);
+        y                    = casted_alpha / (one + ck::math::exp(-x) * casted_alpha);
+    }
+    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
+    {
+        int32_t casted_alpha  = type_convert<int32_t>(alpha_);
+        constexpr int32_t one = type_convert<int32_t>(1);
+        y                     = casted_alpha / (one + ck::math::exp(-x) * casted_alpha);
+    }
+    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
+    {
+        int8_t casted_alpha  = type_convert<int8_t>(alpha_);
+        constexpr int8_t one = type_convert<int8_t>(1);
+        y                    = casted_alpha / (one + ck::math::exp(-x) * casted_alpha);
+    }
+    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
+    {
+        half_t casted_alpha  = type_convert<half_t>(alpha_);
+        constexpr half_t one = type_convert<half_t>(1);
+        y                    = casted_alpha / (one + ck::math::exp(-x) * casted_alpha);
+    }
+    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
+    {
+        bhalf_t casted_alpha  = type_convert<bhalf_t>(alpha_);
+        constexpr bhalf_t one = type_convert<bhalf_t>(1);
        y                     = casted_alpha / (one + ck::math::exp(-x) * casted_alpha);
    }
-    const float alpha_;
 };
 struct ConvInvscale
@@ -1074,7 +1410,7 @@ struct ConvScaleRelu
    __host__ __device__ void operator()<f8_t, float>(f8_t& e, const float& c) const
    {
        float x;
-        Relu{}.template operator()<float>(x, c * scale_in_ * scale_wei_);
+        Relu{}(x, c * scale_in_ * scale_wei_);
        e = type_convert<f8_t>(x * scale_out_);
    };
@@ -1153,6 +1489,239 @@ struct FastNumericArrayConverter<uint8_t, ck::half_t, N>
    __device__ OutputArray operator()(InputArray const& Input) { return convert(Input); }
 };
+/// Unary elementwise operation whose concrete functor is chosen at run time.
+///
+/// Each converting constructor records a UnaryOpType tag for the functor it was
+/// given and copies that functor's parameters (via its get_alpha/get_beta/
+/// get_gamma accessors) into alpha/beta/gamma.
+///  - Host operator(): switches on the tag and applies a temporary functor that
+///    is rebuilt from the stored parameters.
+///  - Device operator(): calls through unary_op_ptr_, which is only populated
+///    by InitUnaryOpPtrOnDevice(); call that first.
+///
+/// NOTE(review): copy construction and copy assignment duplicate unary_op_ptr_
+/// shallowly while the destructor deletes it, so copies of an object whose
+/// pointer is set share (and will each delete) one functor. Confirm how kernels
+/// copy an initialised object.
+struct DynamicUnaryOp
+{
+    /// Copies the tag, the functor pointer (shallow) and all three parameters.
+    __host__ __device__ DynamicUnaryOp& operator=(const DynamicUnaryOp& other)
+    {
+        if(this != &other)
+        {
+            unary_op_ptr_  = other.unary_op_ptr_;
+            unary_op_type_ = other.unary_op_type_;
+            // The parameters belong to the tag: without them an assigned-to object
+            // would pair the new operation with the old object's alpha/beta/gamma.
+            alpha = other.alpha;
+            beta  = other.beta;
+            gamma = other.gamma;
+        }
+        return *this;
+    }
+    __host__ __device__ DynamicUnaryOp() = delete;
+    // Converting constructors. Each `const T&` overload stores the tag and the
+    // parameters of the given functor; each `const T&&` overload forwards to it
+    // (the named parameter is an lvalue, so the `const T&` overload is selected).
+    __host__ __device__ DynamicUnaryOp(const Swish& swish)
+        : unary_op_type_(UnaryOpType::Swish), beta(swish.get_beta())
+    {
+    }
+    __host__ __device__ DynamicUnaryOp(const Swish&& swish) : DynamicUnaryOp(swish) {}
+    __host__ __device__ DynamicUnaryOp(const Sigmoid&) : unary_op_type_(UnaryOpType::Sigmoid) {}
+    __host__ __device__ DynamicUnaryOp(const Sigmoid&& sigmoid) : DynamicUnaryOp(sigmoid) {}
+    __host__ __device__ DynamicUnaryOp(const PassThrough&)
+        : unary_op_type_(UnaryOpType::PassThrough)
+    {
+    }
+    __host__ __device__ DynamicUnaryOp(const PassThrough&& passthrough)
+        : DynamicUnaryOp(passthrough)
+    {
+    }
+    __host__ __device__ DynamicUnaryOp(const Logistic& logistic)
+        : unary_op_type_(UnaryOpType::Logistic), alpha(logistic.get_alpha())
+    {
+    }
+    __host__ __device__ DynamicUnaryOp(const Logistic&& logistic) : DynamicUnaryOp(logistic) {}
+    __host__ __device__ DynamicUnaryOp(const TanH&) : unary_op_type_(UnaryOpType::TanH) {}
+    __host__ __device__ DynamicUnaryOp(const TanH&& tanh) : DynamicUnaryOp(tanh) {}
+    __host__ __device__ DynamicUnaryOp(const Relu&) : unary_op_type_(UnaryOpType::Relu) {}
+    __host__ __device__ DynamicUnaryOp(const Relu&& relu) : DynamicUnaryOp(relu) {}
+    __host__ __device__ DynamicUnaryOp(const SoftRelu& softrelu)
+        : unary_op_type_(UnaryOpType::SoftRelu), alpha(softrelu.get_alpha())
+    {
+    }
+    __host__ __device__ DynamicUnaryOp(const SoftRelu&& softrelu) : DynamicUnaryOp(softrelu) {}
+    __host__ __device__ DynamicUnaryOp(const UnaryAbs&) : unary_op_type_(UnaryOpType::UnaryAbs) {}
+    __host__ __device__ DynamicUnaryOp(const UnaryAbs&& unaryabs) : DynamicUnaryOp(unaryabs) {}
+    __host__ __device__ DynamicUnaryOp(const Power& pow)
+        : unary_op_type_(UnaryOpType::Power),
+          alpha(pow.get_alpha()),
+          beta(pow.get_beta()),
+          gamma(pow.get_gamma())
+    {
+    }
+    __host__ __device__ DynamicUnaryOp(const Power&& pow) : DynamicUnaryOp(pow) {}
+    __host__ __device__ DynamicUnaryOp(const ClippedRelu& clippedrelu)
+        : unary_op_type_(UnaryOpType::ClippedRelu),
+          alpha(clippedrelu.get_alpha()),
+          beta(clippedrelu.get_beta())
+    {
+    }
+    __host__ __device__ DynamicUnaryOp(const ClippedRelu&& clippedrelu)
+        : DynamicUnaryOp(clippedrelu)
+    {
+    }
+    __host__ __device__ DynamicUnaryOp(const LeakyRelu& leakyrelu)
+        : unary_op_type_(UnaryOpType::LeakyRelu), alpha(leakyrelu.get_alpha())
+    {
+    }
+    __host__ __device__ DynamicUnaryOp(const LeakyRelu&& leakyrelu) : DynamicUnaryOp(leakyrelu) {}
+    __host__ __device__ DynamicUnaryOp(const Elu& elu)
+        : unary_op_type_(UnaryOpType::Elu), alpha(elu.get_alpha())
+    {
+    }
+    __host__ __device__ DynamicUnaryOp(const Elu&& elu) : DynamicUnaryOp(elu) {}
+    /// Member-wise copy; unary_op_ptr_ is copied as a pointer, not cloned.
+    __host__ __device__ DynamicUnaryOp(const DynamicUnaryOp& dynamic_op)
+        : unary_op_type_(dynamic_op.unary_op_type_),
+          unary_op_ptr_(dynamic_op.unary_op_ptr_),
+          alpha(dynamic_op.alpha),
+          beta(dynamic_op.beta),
+          gamma(dynamic_op.gamma)
+    {
+    }
+    /// Releases the functor created by InitUnaryOpPtrOnDevice(), if any.
+    __host__ __device__ ~DynamicUnaryOp()
+    {
+        if(unary_op_ptr_)
+            delete unary_op_ptr_;
+    }
+    /// Allocates the functor matching unary_op_type_ from the stored parameters.
+    /// The previous value of unary_op_ptr_ is overwritten without being released.
+    __device__ void InitUnaryOpPtrOnDevice()
+    {
+        switch(unary_op_type_)
+        {
+        case(UnaryOpType::Swish): unary_op_ptr_ = new Swish(beta); break;
+        case(UnaryOpType::Sigmoid): unary_op_ptr_ = new Sigmoid; break;
+        case(UnaryOpType::PassThrough): unary_op_ptr_ = new PassThrough; break;
+        case(UnaryOpType::Logistic): unary_op_ptr_ = new Logistic(alpha); break;
+        case(UnaryOpType::TanH): unary_op_ptr_ = new TanH; break;
+        case(UnaryOpType::Relu): unary_op_ptr_ = new Relu; break;
+        case(UnaryOpType::SoftRelu): unary_op_ptr_ = new SoftRelu(alpha); break;
+        case(UnaryOpType::UnaryAbs): unary_op_ptr_ = new UnaryAbs; break;
+        case(UnaryOpType::Power): unary_op_ptr_ = new Power(alpha, beta, gamma); break;
+        case(UnaryOpType::ClippedRelu): unary_op_ptr_ = new ClippedRelu(alpha, beta); break;
+        case(UnaryOpType::LeakyRelu): unary_op_ptr_ = new LeakyRelu(alpha); break;
+        case(UnaryOpType::Elu): unary_op_ptr_ = new Elu(alpha); break;
+        default: unary_op_ptr_ = nullptr; break;
+        }
+    }
+    /// Device path: applies the functor behind unary_op_ptr_ to x, writing y.
+    /// The pointer is dereferenced unchecked, so InitUnaryOpPtrOnDevice() must
+    /// have produced a non-null functor beforehand.
+    template <typename Y, typename X>
+    __device__ void operator()(Y& y, const X& x) const
+    {
+        isSupported<X, Y>();
+        unary_op_ptr_->operator()(y, x);
+    }
+    /// Host path: applies a temporary functor selected by unary_op_type_.
+    /// The functor is built from the stored alpha/beta/gamma with the same
+    /// arguments InitUnaryOpPtrOnDevice() uses, so host and device evaluate the
+    /// same parameterised operation. An unknown tag leaves y untouched.
+    template <typename Y, typename X>
+    __host__ void operator()(Y& y, const X& x) const
+    {
+        isSupported<X, Y>();
+        switch(unary_op_type_)
+        {
+        case(UnaryOpType::Swish): Swish{beta}.operator()(y, x); break;
+        case(UnaryOpType::Sigmoid): Sigmoid{}.operator()(y, x); break;
+        case(UnaryOpType::PassThrough): PassThrough{}.operator()(y, x); break;
+        case(UnaryOpType::Logistic): Logistic{alpha}.operator()(y, x); break;
+        case(UnaryOpType::TanH): TanH{}.operator()(y, x); break;
+        case(UnaryOpType::Relu): Relu{}.operator()(y, x); break;
+        case(UnaryOpType::SoftRelu): SoftRelu{alpha}.operator()(y, x); break;
+        case(UnaryOpType::UnaryAbs): UnaryAbs{}.operator()(y, x); break;
+        case(UnaryOpType::Power): Power{alpha, beta, gamma}.operator()(y, x); break;
+        case(UnaryOpType::ClippedRelu): ClippedRelu{alpha, beta}.operator()(y, x); break;
+        case(UnaryOpType::LeakyRelu): LeakyRelu{alpha}.operator()(y, x); break;
+        case(UnaryOpType::Elu): Elu{alpha}.operator()(y, x); break;
+        default: break;
+        }
+    }
+    /// Compile-time check: X and Y must be the same type, and that type must be
+    /// one of float, double, bhalf_t, half_t, int32_t or int8_t.
+    template <typename X, typename Y>
+    __device__ __host__ constexpr void isSupported() const
+    {
+        static_assert(is_same<X, Y>::value, "X and Y must be of the same type");
+        static_assert(is_same<X, float>::value || is_same<X, double>::value ||
+                          is_same<X, bhalf_t>::value || is_same<X, half_t>::value ||
+                          is_same<X, int32_t>::value || is_same<X, int8_t>::value,
+                      "Data type is not supported by this operation!");
+    }
+    private:
+    // Tag identifying which unary functor this object stands for.
+    enum class UnaryOpType
+    {
+        Swish,
+        Sigmoid,
+        PassThrough,
+        Logistic,
+        TanH,
+        Relu,
+        SoftRelu,
+        UnaryAbs,
+        Power,
+        ClippedRelu,
+        LeakyRelu,
+        Elu
+    };
+    public:
+    UnaryOpType unary_op_type_;
+    // Set only by InitUnaryOpPtrOnDevice() or by copying another object.
+    UnaryOpBase* unary_op_ptr_ = nullptr;
+    // Functor parameters. Zero-initialised so that copying an object whose
+    // operation takes no parameters never reads an indeterminate float.
+    float alpha = 0.f;
+    float beta  = 0.f;
+    float gamma = 0.f;
+};
 } // namespace element_wise
 } // namespace tensor_operation
 } // namespace ck