Merge branch 'develop' into gemm_elementwise_gemm

3c5717df · Illia Silin · GitHub · 171b9030 · d9f1ead3 · 3c5717df
Unverified commit 3c5717df, authored Feb 10, 2025 by Illia Silin; committed by GitHub on Feb 10, 2025.
20 changed files
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -584,6 +584,10 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
        {
            return false;
        }
+        if(!is_bf16_atomic_supported() && std::is_same_v<CDataType, ck::bhalf_t>)
+        {
+            return false;
+        }
        if constexpr(NDimSpatial == 1)
        {
            if constexpr(!is_GNWC_GKXC_GNWK<InLayout, WeiLayout, OutLayout>())

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -9,6 +9,7 @@
 #include <numeric>
 #include <sstream>

+#include "ck/library/utility/numeric.hpp"
 #include "ck/utility/common_header.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
@@ -98,8 +99,7 @@ __global__ void
            const ComputePtrOffsetOfG compute_ptr_offset_of_groups,
            const ComputePtrOffsetOfN compute_ptr_offset_of_n)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx94__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))

    // offset base pointer for each work-group
    const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.y);
@@ -121,19 +121,6 @@ __global__ void
    static_for<0, NumDTensor, 1>{}(
        [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_group_offset[i]; });

-    if constexpr(is_same_v<AElementwiseOperation, element_wise::DynamicUnaryOp>)
-    {
-        a_element_op.InitUnaryOpPtrOnDevice();
-    }
-    if constexpr(is_same_v<BElementwiseOperation, element_wise::DynamicUnaryOp>)
-    {
-        b_element_op.InitUnaryOpPtrOnDevice();
-    }
-    if constexpr(is_same_v<CDEElementwiseOperation, element_wise::DynamicUnaryOp>)
-    {
-        cde_element_op.InitUnaryOpPtrOnDevice();
-    }
-
    if constexpr(isMultiA || isMultiB)
    {
        AsPointer p_as_grid_grp;
@@ -225,9 +212,13 @@ __global__ void
 }

 } // namespace
-
+#ifdef CK_CODE_GEN_RTC
+template <typename T>
+using is_tuple = decltype(ck::declval<T&>().IsTuple());
+#else
 template <typename T>
 using is_tuple = decltype(std::declval<T&>().IsTuple());
+#endif

 //
 // @brief      Device Convolution operation.

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp
@@ -9,6 +9,7 @@
 #include <numeric>
 #include <sstream>

+#include "ck/library/utility/numeric.hpp"
 #include "ck/utility/common_header.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
@@ -117,7 +118,7 @@ __global__ void
                                        c_grid_desc_mblock_mperblock_nblock_nperblock);
 #else
    ignore = karg;
-#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
+#endif // end of if (defined(__gfx9__))
 }

 template <typename GridwiseGemm,
@@ -183,7 +184,7 @@ __global__ void
                                             c_grid_desc_mblock_mperblock_nblock_nperblock);
 #else
    ignore = karg;
-#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
+#endif // end of if (defined(__gfx9__))
 }

 } // namespace

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp
@@ -155,8 +155,7 @@ __global__ void
            const Block2ETileMap block_2_ctile_map,
            const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx94__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
    const index_t num_blocks_per_batch =
        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
    const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp
@@ -52,8 +52,7 @@ __global__ void
            const ComputePtrOffset compute_ptr_offset_of_groups,
            const ComputePtrOffset compute_ptr_offset_of_n)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx94__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

    const index_t block_id_x = __builtin_amdgcn_readfirstlane(blockIdx.x);

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
 #pragma once
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -603,11 +603,11 @@ struct DeviceGroupedGemmMultipleD_Dl : public DeviceGroupedGemm<ALayout,
            }

            hipGetErrorString(
-                hipMemcpyWithStream(arg.p_workspace_,
-                                    arg.gemm_desc_kernel_arg_.data(),
-                                    arg.gemm_desc_kernel_arg_.size() * sizeof(GemmKernelArg),
-                                    hipMemcpyHostToDevice,
-                                    stream_config.stream_id_));
+                hipMemcpyAsync(arg.p_workspace_,
+                               arg.gemm_desc_kernel_arg_.data(),
+                               arg.gemm_desc_kernel_arg_.size() * sizeof(GemmKernelArg),
+                               hipMemcpyHostToDevice,
+                               stream_config.stream_id_));

            auto launch_kernel = [&](auto has_main_k_block_loop,
                                     auto has_double_tail_k_block_loop) {

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp
@@ -18,7 +18,6 @@
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_grouped_gemm_multiple_d_splitk.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
@@ -78,17 +77,17 @@ template <typename ALayout,
          // TODO: change gridwise_gemm_v2r4r2 to support AK1 & BK1
          enable_if_t<AK1 == BK1, bool> = false>
 struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
-    : public DeviceGroupedGemmMultipleDSplitK<ALayout,
-                                              BLayout,
-                                              DsLayout,
-                                              ELayout,
-                                              ADataType,
-                                              BDataType,
-                                              DsDataType,
-                                              EDataType,
-                                              AElementwiseOperation,
-                                              BElementwiseOperation,
-                                              CDEElementwiseOperation>
+    : public DeviceGroupedGemmSplitK<ALayout,
+                                     BLayout,
+                                     DsLayout,
+                                     ELayout,
+                                     ADataType,
+                                     BDataType,
+                                     DsDataType,
+                                     EDataType,
+                                     AElementwiseOperation,
+                                     BElementwiseOperation,
+                                     CDEElementwiseOperation>
 {
    using DeviceOp = DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage;

@@ -530,7 +529,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
        index_t skipped_group_count_;
        index_t grid_size_;
        // Pointer to device memory with GEMM kernel arguments.
-        const void* p_dev_gemm_args_;
+        void* p_dev_gemm_kargs_;

        AElementwiseOperation a_element_op_;
        BElementwiseOperation b_element_op_;
@@ -566,7 +565,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
        /// @return     The average kernel execution time (if time measurement is enabled.)
        ///
        float Run(const Argument& arg,
-                  const void* dev_gemm_args,
+                  void* dev_gemm_args,
                  void* dev_gemm_workspace,
                  const StreamConfig& stream_config = StreamConfig{})
        {
@@ -621,7 +620,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
        ///
        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
        {
-            if(arg.p_dev_gemm_args_ == nullptr)
+            if(arg.p_dev_gemm_kargs_ == nullptr)
            {
                std::ostringstream err;
                err << "The gemm arguments device buffer is not allocated!"
@@ -637,7 +636,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
                throw std::runtime_error(err.str());
            }

-            return Run(arg, arg.p_dev_gemm_args_, arg.p_workspace_, stream_config);
+            return Run(arg, arg.p_dev_gemm_kargs_, arg.p_workspace_, stream_config);
        }

        float Run(const BaseArgument* p_arg,
@@ -723,7 +722,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage

        template <bool HasMainKBlockLoop>
        float DispatchKernel(const Argument& arg,
-                             const void* dev_gemm_args,
+                             void* dev_gemm_kargs,
                             void* dev_gemm_workspace,
                             const StreamConfig& stream_config) const
        {
@@ -746,7 +745,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
            return LaunchKernel(gemm_kernel,
                                elementwise_kernel,
                                arg,
-                                dev_gemm_args,
+                                dev_gemm_kargs,
                                dev_gemm_workspace,
                                stream_config);
        }
@@ -755,12 +754,19 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
        float LaunchKernel(const KernelFunction& gemm_kernel,
                           const KernelFunction2& elementwise_kernel,
                           const Argument& arg,
-                           const void* dev_gemm_args,
+                           void* dev_gemm_kargs,
                           [[maybe_unused]] void* dev_gemm_workspace,
                           const StreamConfig& stream_config) const
        {
            float time{0.f};

+            hip_check_error(
+                hipMemcpyAsync(dev_gemm_kargs,
+                               arg.gemm_kernel_args_.data(),
+                               arg.gemm_kernel_args_.size() * sizeof(GemmTransKernelArg),
+                               hipMemcpyHostToDevice,
+                               stream_config.stream_id_));
+
            auto preprocess = [&]() {
                hip_check_error(hipMemsetAsync(
                    dev_gemm_workspace, 0, arg.GetWorkspaceSizeBytes(), stream_config.stream_id_));
@@ -774,7 +780,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
                dim3(arg.grid_size_),
                dim3(BlockSize),
                0,
-                cast_pointer_to_constant_address_space(dev_gemm_args),
+                cast_pointer_to_constant_address_space(dev_gemm_kargs),
                arg.gemm_kernel_args_.size(),
                arg.a_element_op_,
                arg.b_element_op_,
@@ -930,18 +936,30 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
        return str.str();
    }

-    void SetDeviceKernelArgs(Argument& arg, void* p_dev_kernel_args) const
+    void SetDeviceKernelArgs(BaseArgument* p_arg, void* p_dev_kernel_args) const override
    {
-        arg.p_dev_gemm_args_ = p_dev_kernel_args;
-        hip_check_error(hipMemcpy(p_dev_kernel_args,
-                                  arg.gemm_kernel_args_.data(),
-                                  GetDeviceKernelArgSize(&arg),
-                                  hipMemcpyHostToDevice));
+        auto arg_ptr = dynamic_cast<Argument*>(p_arg);
+        if(arg_ptr)
+        {
+            arg_ptr->p_dev_gemm_kargs_ = p_dev_kernel_args;
+        }
+        else
+            throw std::runtime_error(
+                "The argument pointer is not an object of "
+                "DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage::Argument structure!");
    }

-    void SetDeviceKernelArgs(BaseArgument* p_arg, void* p_dev_kernel_args) const override
+    size_t GetDeviceKernelArgSize(const BaseArgument* p_arg) const override
    {
-        return SetDeviceKernelArgs(*dynamic_cast<Argument*>(p_arg), p_dev_kernel_args);
+        auto arg = dynamic_cast<const Argument*>(p_arg);
+        if(arg)
+        {
+            return arg->gemm_kernel_args_.size() * sizeof(GemmTransKernelArg);
+        }
+        else
+            throw std::runtime_error(
+                "The argument pointer is not an object of "
+                "DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage::Argument structure!");
    }

    size_t GetWorkSpaceSize(const BaseArgument* p_arg) const override
@@ -974,17 +992,22 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
                "DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage::Argument structure!");
    }

-    static void SetKBatchSize(Argument& arg, index_t kbatch) { arg.UpdateKBatch(kbatch); }
-
-    void SetKBatchSize(BaseArgument* p_arg, index_t kbatch) const override
+    [[deprecated]] static void SetKBatchSize(Argument& arg, index_t kbatch)
    {
-        return SetKBatchSize(*dynamic_cast<Argument*>(p_arg), kbatch);
+        arg.UpdateKBatch(kbatch);
    }

-    size_t GetDeviceKernelArgSize(const BaseArgument* p_arg) const override
+    void SetKBatchSize(BaseArgument* p_arg, index_t kbatch) const override
    {
-        return dynamic_cast<const Argument*>(p_arg)->gemm_kernel_args_.size() *
-               sizeof(GemmTransKernelArg);
+        auto p_arg_ = dynamic_cast<Argument*>(p_arg);
+        if(p_arg_)
+        {
+            p_arg_->UpdateKBatch(kbatch);
+        }
+        else
+            throw std::runtime_error(
+                "The argument pointer is not an object of "
+                "DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage::Argument structure!");
    }
 };


--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp
@@ -20,7 +20,6 @@
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include <ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp>
 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp" // stare wywalic
-#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp"

 namespace ck {
@@ -69,8 +68,7 @@ __global__ void
                                           const BElementwiseOperation b_element_op,
                                           const CDEElementwiseOperation cde_element_op)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx94__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))

    constexpr index_t shared_size = GridwiseGemm::GetSharedMemoryNumberOfByte();
    __shared__ uint8_t p_shared[shared_size];
@@ -405,7 +403,7 @@ __global__ void
    ignore = a_element_op;
    ignore = b_element_op;
    ignore = cde_element_op;
-#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
+#endif // end of if (defined(__gfx9__))
 }

 template <typename ALayout,
@@ -522,7 +520,7 @@ struct DeviceGroupedGemmMultipleDXdlCShuffleTileLoop
        ComputeTypeA,
        ComputeTypeB>;

-    using KernelArguments = GroupedGemmTileLoopKernelArguments<NumDTensor>;
+    using KernelArguments = GroupedGemmKernelArgument<NumDTensor>;
    using Block2ETileMap  = BlockToCTileMap_Grouped_M00_N0_M01Adapt<8, MPerBlock, NPerBlock>;
    using OffsettedLocalBlock2ETileMap = OffsettedBlockToCTileMap2<Block2ETileMap>;

@@ -936,12 +934,31 @@ struct DeviceGroupedGemmMultipleDXdlCShuffleTileLoop
        return str.str();
    }

+    void SetDeviceKernelArgs(Argument& arg,
+                             void* p_dev_kernel_args,
+                             const void* p_host_kernel_args) const
+    {
+        arg.p_dev_gemm_args_ = p_dev_kernel_args;
+        hip_check_error(hipMemcpyAsync(p_dev_kernel_args,
+                                       p_host_kernel_args,
+                                       GetDeviceKernelArgSize(&arg),
+                                       hipMemcpyHostToDevice));
+    }
+
+    virtual void SetDeviceKernelArgs(BaseArgument* p_arg,
+                                     void* p_dev_kernel_args,
+                                     const void* p_host_kernel_args) const override
+    {
+        return SetDeviceKernelArgs(
+            *dynamic_cast<Argument*>(p_arg), p_dev_kernel_args, p_host_kernel_args);
+    }
+
    void SetDeviceKernelArgs(Argument& arg, void* p_dev_kernel_args) const
    {
        arg.p_dev_gemm_args_ = p_dev_kernel_args;
    }

-    void SetDeviceKernelArgs(BaseArgument* p_arg, void* p_dev_kernel_args) const override
+    virtual void SetDeviceKernelArgs(BaseArgument* p_arg, void* p_dev_kernel_args) const override
    {
        return SetDeviceKernelArgs(*dynamic_cast<Argument*>(p_arg), p_dev_kernel_args);
    }

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
@@ -43,8 +43,7 @@ __global__ void
            const B1ElementwiseOperation b1_element_op,
            const CElementwiseOperation c_element_op)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx94__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

    const index_t block_id = get_block_1d_id();
@@ -109,7 +108,7 @@ __global__ void
    ignore = acc_element_op;
    ignore = b1_element_op;
    ignore = c_element_op;
-#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
+#endif // end of if (defined(__gfx9__))
 }

 // Computes C = A * B0 * B1

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
 #pragma once
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -38,8 +38,7 @@ __global__ void
                                const BElementwiseOperation b_element_op,
                                const CDEElementwiseOperation c_element_op)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx94__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

    const index_t block_id = get_block_1d_id();
@@ -557,12 +556,12 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm<ALayout,
                }
            }

-            hipGetErrorString(hipMemcpyWithStream(arg.p_workspace_,
-                                                  arg.gemm_desc_kernel_arg_.data(),
-                                                  arg.gemm_desc_kernel_arg_.size() *
-                                                      sizeof(GemmBiasTransKernelArg),
-                                                  hipMemcpyHostToDevice,
-                                                  stream_config.stream_id_));
+            hipGetErrorString(
+                hipMemcpyAsync(arg.p_workspace_,
+                               arg.gemm_desc_kernel_arg_.data(),
+                               arg.gemm_desc_kernel_arg_.size() * sizeof(GemmBiasTransKernelArg),
+                               hipMemcpyHostToDevice,
+                               stream_config.stream_id_));

            float ave_time = 0;

@@ -717,7 +716,24 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm<ALayout,

    size_t GetWorkSpaceSize(const BaseArgument* p_arg) const override
    {
-        return dynamic_cast<const Argument*>(p_arg)->group_count_ * sizeof(GemmBiasTransKernelArg);
+        auto p_arg_ = dynamic_cast<const Argument*>(p_arg);
+        if(p_arg_)
+        {
+            return p_arg_->group_count_ * sizeof(GemmBiasTransKernelArg);
+        }
+        else
+            throw std::runtime_error("The argument pointer is not an object of "
+                                     "DeviceGroupedGemmMultipleDXdlCShuffle::Argument structure!");
+    }
+
+    size_t GetDeviceKernelArgSize(const BaseArgument* p_arg) const override
+    {
+        return GetWorkSpaceSize(p_arg);
+    }
+
+    void SetDeviceKernelArgs(BaseArgument* p_arg, void* p_dev_kernel_args) const override
+    {
+        return this->SetWorkSpacePointer(p_arg, p_dev_kernel_args);
    }
 };


--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_fixed_nk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_fixed_nk.hpp
@@ -50,8 +50,7 @@ __global__ void
                                         const BElementwiseOperation b_element_op,
                                         const CDEElementwiseOperation c_element_op)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx94__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

    const index_t block_id = get_block_1d_id();
@@ -445,6 +444,7 @@ struct DeviceGroupedGemm_Xdl_Fixed_NK : public DeviceGroupedGemmFixedNK<ALayout,
    using Block2ETileMap = BlockToCTileMap_KBatch_M00_N0_M01Adapt_MLoops<MPerBlock, NPerBlock>;
    using GroupedGemmBlock2ETileMap = OffsettedBlockToCTileMapMLoops<Block2ETileMap>;

+    // TODO: replace with GroupedGemmKernelArgument
    struct GemmBiasTransKernelArg
    {
        // pointers
@@ -900,40 +900,58 @@ struct DeviceGroupedGemm_Xdl_Fixed_NK : public DeviceGroupedGemmFixedNK<ALayout,
        return str.str();
    }

-    static void SetDeviceKernelArgs(Argument& arg, const void* kernel_args)
-    {
-        arg.grouped_gemm_kernel_args_dev = kernel_args;
-    }
-
    // polymorphic
-    void SetDeviceKernelArgs(BaseArgument* p_arg, const void* kernel_args) const override
+    void SetDeviceKernelArgs(BaseArgument* p_arg, void* kernel_args) const override
    {
-        return SetDeviceKernelArgs(*dynamic_cast<Argument*>(p_arg), kernel_args);
+        auto arg_ptr = dynamic_cast<Argument*>(p_arg);
+        if(arg_ptr)
+        {
+            arg_ptr->grouped_gemm_kernel_args_dev = kernel_args;
+        }
+        else
+            throw std::runtime_error("The argument pointer is not an object of "
+                                     "DeviceGroupedGemm_Xdl_Fixed_NK::Argument structure!");
    }

    size_t GetWorkSpaceSize(const BaseArgument* p_arg) const override
    {
-        auto arg = *dynamic_cast<const Argument*>(p_arg);
-
-        return arg.group_count_ * arg.barrier_size_grp_ * sizeof(uint32_t);
+        auto arg_ptr = dynamic_cast<const Argument*>(p_arg);
+        if(arg_ptr)
+        {
+            return arg_ptr->group_count_ * arg_ptr->barrier_size_grp_ * sizeof(uint32_t);
+        }
+        else
+            throw std::runtime_error("The argument pointer is not an object of "
+                                     "DeviceGroupedGemm_Xdl_Fixed_NK::Argument structure!");
    }

    size_t GetDeviceKernelArgSize(const BaseArgument* p_arg) const override
    {
-        auto arg = *dynamic_cast<const Argument*>(p_arg);
-
-        return arg.group_count_ * sizeof(GroupedGemmKernelArgument<NumDTensor>);
+        auto arg_ptr = dynamic_cast<const Argument*>(p_arg);
+        if(arg_ptr)
+        {
+            return arg_ptr->group_count_ * sizeof(GroupedGemmKernelArgument<NumDTensor>);
+        }
+        else
+            throw std::runtime_error("The argument pointer is not an object of "
+                                     "DeviceGroupedGemm_Xdl_Fixed_NK::Argument structure!");
    }

    void SetWorkSpacePointer(BaseArgument* p_arg,
                             void* p_workspace,
                             const StreamConfig& stream_config = StreamConfig{}) const override
    {
-        auto p_arg_          = dynamic_cast<Argument*>(p_arg);
-        p_arg_->p_workspace_ = p_workspace;
+        auto arg_ptr = dynamic_cast<Argument*>(p_arg);
+        if(arg_ptr)
+        {
+            arg_ptr->p_workspace_ = p_workspace;
+        }
+        else
+            throw std::runtime_error("The argument pointer is not an object of "
+                                     "DeviceGroupedGemm_Xdl_Fixed_NK::Argument structure!");

        hip_check_error(
-            hipMemsetAsync(p_workspace, 0, GetWorkSpaceSize(p_arg), stream_config.stream_id_));
+            hipMemsetAsync(p_workspace, 0, GetWorkSpaceSize(arg_ptr), stream_config.stream_id_));
    }

    static void SetKBatch(Argument& arg, index_t k_batch) { arg.UpdateKBatch(k_batch); }
@@ -941,7 +959,26 @@ struct DeviceGroupedGemm_Xdl_Fixed_NK : public DeviceGroupedGemmFixedNK<ALayout,
    // polymorphic
    void SetKBatch(BaseArgument* p_arg, index_t k_batch) const override
    {
-        return SetKBatch(*dynamic_cast<Argument*>(p_arg), k_batch);
+        auto arg_ptr = dynamic_cast<Argument*>(p_arg);
+        if(arg_ptr)
+        {
+            arg_ptr->UpdateKBatch(k_batch);
+        }
+        else
+            throw std::runtime_error("The argument pointer is not an object of "
+                                     "DeviceGroupedGemm_Xdl_Fixed_NK::Argument structure!");
+    }
+
+    void SetKBatchSize(BaseArgument* p_arg, index_t kbatch) const override
+    {
+        auto arg_ptr = dynamic_cast<Argument*>(p_arg);
+        if(arg_ptr)
+        {
+            arg_ptr->UpdateKBatch(kbatch);
+        }
+        else
+            throw std::runtime_error("The argument pointer is not an object of "
+                                     "DeviceGroupedGemm_Xdl_Fixed_NK::Argument structure!");
    }
 };


--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
@@ -40,8 +40,7 @@ __global__ void
                                       const BElementwiseOperation b_element_op,
                                       const CElementwiseOperation c_element_op)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx94__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
    constexpr index_t shared_size = GridwiseGemm::GetSharedMemoryNumberOfByte();
    __shared__ uint8_t p_shared[shared_size];

@@ -80,7 +79,7 @@ __global__ void
    ignore = a_element_op;
    ignore = b_element_op;
    ignore = c_element_op;
-#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
+#endif // end of if (defined(__gfx9__))
 }

 template <typename ALayout,
@@ -421,11 +420,11 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
            }

            hip_check_error(
-                hipMemcpyWithStream(arg.p_workspace_,
-                                    arg.gemm_kernel_args_.data(),
-                                    arg.gemm_kernel_args_.size() * sizeof(GemmTransKernelArg),
-                                    hipMemcpyHostToDevice,
-                                    stream_config.stream_id_));
+                hipMemcpyAsync(arg.p_workspace_,
+                               arg.gemm_kernel_args_.data(),
+                               arg.gemm_kernel_args_.size() * sizeof(GemmTransKernelArg),
+                               hipMemcpyHostToDevice,
+                               stream_config.stream_id_));

            float ave_time = 0;

@@ -538,10 +537,16 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
            return false;
        }

+        if(std::is_same_v<EDataType, ck::bhalf_t> && arg.K_BATCH > 1 && !is_bf16_atomic_supported())
+        {
+            return false;
+        }
+
        bool supported = true;
        for(std::size_t i = 0; i < arg.gemm_kernel_args_.size(); ++i)
        {
-            const auto& a        = arg.gemm_kernel_args_[i].karg_;
+            const auto& a = arg.gemm_kernel_args_[i].karg_;
+
            bool group_arg_valid = GridwiseGemm::CheckValidity(a);
            if(not group_arg_valid)
            {
@@ -631,16 +636,42 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo

    size_t GetWorkSpaceSize(const BaseArgument* p_arg) const override
    {
-        return dynamic_cast<const Argument*>(p_arg)->gemm_kernel_args_.size() *
-               sizeof(GemmTransKernelArg);
+        auto p_arg_ = dynamic_cast<const Argument*>(p_arg);
+        if(p_arg_)
+        {
+            return p_arg_->gemm_kernel_args_.size() * sizeof(GemmTransKernelArg);
+        }
+        else
+            throw std::runtime_error(
+                "The argument pointer is not an object of "
+                "DeviceGroupedGemmMultipleDSplitKXdlCShuffle::Argument structure!");
    }

+    size_t GetDeviceKernelArgSize(const BaseArgument* p_arg) const override
+    {
+        return GetWorkSpaceSize(p_arg);
+    }
+
+    // TODO: add a deprecation notice.
    static void SetKBatchSize(Argument& arg, index_t kbatch) { arg.UpdateKBatch(kbatch); }

    // polymorphic
    void SetKBatchSize(BaseArgument* p_arg, index_t kbatch) const override
    {
-        return SetKBatchSize(*dynamic_cast<Argument*>(p_arg), kbatch);
+        auto p_arg_ = dynamic_cast<Argument*>(p_arg);
+        if(p_arg_)
+        {
+            p_arg_->UpdateKBatch(kbatch);
+        }
+        else
+            throw std::runtime_error(
+                "The argument pointer is not an object of "
+                "DeviceGroupedGemmMultipleDSplitKXdlCShuffle::Argument structure!");
+    }
+
+    void SetDeviceKernelArgs(BaseArgument* p_arg, void* p_dev_kernel_args) const override
+    {
+        return this->SetWorkSpacePointer(p_arg, p_dev_kernel_args);
    }
 };


--- a/include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp
@@ -3,6 +3,7 @@

 #pragma once

+#include "ck/library/utility/numeric.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp"

--- a/include/ck/tensor_operation/gpu/device/impl/device_splitk_contraction_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_splitk_contraction_multiple_d_xdl_cshuffle.hpp
@@ -56,8 +56,7 @@ __global__ void
            const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
            const Block2ETileMap block_2_etile_map)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx94__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

    const index_t num_blocks_per_batch =

--- a/include/ck/tensor_operation/gpu/device/tensor_layout.hpp
+++ b/include/ck/tensor_operation/gpu/device/tensor_layout.hpp
@@ -430,6 +430,7 @@ struct G_NDHW : public BaseTensorLayout

 } // namespace convolution

+#ifndef CK_CODE_GEN_RTC
 template <
    typename Layout,
    typename std::enable_if<std::is_base_of<BaseTensorLayout, Layout>::value, bool>::type = false>
@@ -438,6 +439,7 @@ std::ostream& operator<<(std::ostream& os, const Layout&)
    os << Layout::name;
    return os;
 }
+#endif

 } // namespace tensor_layout
 } // namespace ck
--- a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -340,8 +340,8 @@ struct Bilinear
    };

    template <>
-    __host__ __device__ constexpr void operator()<std::int8_t, std::int32_t, std::int8_t>(
-        std::int8_t& y, const std::int32_t& x0, const std::int8_t& x1) const
+    __host__ __device__ constexpr void
+    operator()<int8_t, int32_t, int8_t>(int8_t& y, const int32_t& x0, const int8_t& x1) const
    {
        y = type_convert<int8_t>(alpha_ * type_convert<float>(x0) +
                                 beta_ * type_convert<float>(x1));

--- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -533,7 +533,7 @@ struct NormalizeInInfer
                                                  const T3& gamma,
                                                  const T4& beta) const
    {
-        static_assert(std::is_same<T2, float>::value || std::is_same<T2, double>::value,
+        static_assert(is_same<T2, float>::value || is_same<T2, double>::value,
                      "Data type is not supported by this operation!");

        using ck::type_convert;

--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -7,36 +7,203 @@
 #include "ck/utility/math.hpp"
 #include "ck/utility/math_v2.hpp"
 #include "ck/utility/type_convert.hpp"
+#include "ck/utility/amd_inline_asm.hpp"
 #include <cassert>

 namespace ck {
+
+// Fast int4x4 to half8_t data type conversion based on paper
+// [Who Says Elephants Can't Run: Bringing Large Scale MoE Models into Cloud Scale Production]
+// (https://arxiv.org/abs/2211.10017) and implementation:
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
+//
+// i4_to_half4: converts four 4-bit fields of `q` into four fp16 values, each equal to the
+// unsigned nibble value minus 8. The fields are bits [3:0] and [19:16] (mask LO) and
+// bits [7:4] and [23:20] (mask HI).
+// NOTE(review): assumes amd_assembly_and_or_b32(a, b, c) computes (a & b) | c, as its name
+// suggests -- confirm against ck/utility/amd_inline_asm.hpp.
+__device__ inline half4_t i4_to_half4(int q)
+{
+    const int LO = 0x000f000f; // lowest nibble of each 16-bit half of q
+    const int HI = 0x00f000f0; // second-lowest nibble of each 16-bit half of q
+    const int EX = 0x64006400; // fp16 bit pattern of 1024.0 in both 16-bit halves
+
+    // Extract the two int4 values in the low nibbles and place them in the mantissa of
+    // fp16 1024.0, giving two fp16 numbers equal to 1024 + nibble.
+    int lo = amd_assembly_and_or_b32(q, LO, EX);
+    // Extract the two int4 values in the high nibbles the same way; because they sit four
+    // bits higher in the mantissa, the two fp16 numbers equal 1024 + 16 * nibble.
+    int hi = amd_assembly_and_or_b32(q, HI, EX);
+
+    const int SUB = 0xE408E408; // half2 {-1032, -1032}
+    const int MUL = 0x2c002c00; // half2 {1 / 16, 1 / 16}
+    const int ADD = 0xd480d480; // half2 {-72, -72}
+
+    vector_type<half_t, 4> res;
+
+    // Low nibbles: (1024 + n) - 1032 = n - 8.
+    res.template AsType<half2_t>()(Number<0>{}) =
+        amd_assembly_pk_add_f16(bit_cast<half2_t>(lo), bit_cast<half2_t>(SUB));
+
+    // High nibbles: (1024 + 16 * n) / 16 - 72 = n - 8, done as one packed fused multiply-add.
+    res.template AsType<half2_t>()(Number<1>{}) = amd_assembly_pk_fma_f16(
+        bit_cast<half2_t>(hi), bit_cast<half2_t>(MUL), bit_cast<half2_t>(ADD));
+
+    // Result layout: half2 #0 holds the LO-mask values, half2 #1 the HI-mask values.
+    return res.template AsType<half4_t>()[Number<0>{}];
+}
+
+// Same nibble-to-fp16 conversion as i4_to_half4 (each output is the unsigned nibble value
+// minus 8), followed by a packed multiply of both resulting half2 pairs by `scale`.
+// NOTE(review): assumes amd_assembly_and_or_b32(a, b, c) computes (a & b) | c -- confirm
+// against ck/utility/amd_inline_asm.hpp.
+__device__ inline half4_t i4_to_half4_scale(int q, const ck::half2_t& scale)
+{
+    const int LO = 0x000f000f; // lowest nibble of each 16-bit half of q
+    const int HI = 0x00f000f0; // second-lowest nibble of each 16-bit half of q
+    const int EX = 0x64006400; // fp16 bit pattern of 1024.0 in both 16-bit halves
+
+    // Extract the two int4 values in the low nibbles as fp16 numbers equal to 1024 + nibble.
+    int lo = amd_assembly_and_or_b32(q, LO, EX);
+    // Extract the two int4 values in the high nibbles as fp16 numbers equal to
+    // 1024 + 16 * nibble.
+    int hi = amd_assembly_and_or_b32(q, HI, EX);
+
+    const int SUB = 0xE408E408; // half2 {-1032, -1032}
+    const int MUL = 0x2c002c00; // half2 {1 / 16, 1 / 16}
+    const int ADD = 0xd480d480; // half2 {-72, -72}
+
+    vector_type<half_t, 4> res;
+
+    // Low nibbles: (1024 + n) - 1032 = n - 8.
+    res.template AsType<half2_t>()(Number<0>{}) =
+        amd_assembly_pk_add_f16(bit_cast<half2_t>(lo), bit_cast<half2_t>(SUB));
+
+    // High nibbles: (1024 + 16 * n) / 16 - 72 = n - 8.
+    res.template AsType<half2_t>()(Number<1>{}) = amd_assembly_pk_fma_f16(
+        bit_cast<half2_t>(hi), bit_cast<half2_t>(MUL), bit_cast<half2_t>(ADD));
+
+    // Apply the scale in place to the first half2 pair (v_pk_mul_f16 instruction).
+    asm volatile("v_pk_mul_f16 %0, %1, %2"
+                 : "=v"(res.template AsType<half2_t>()(Number<0>{}))
+                 : "v"(res.template AsType<half2_t>()(Number<0>{})), "v"(scale));
+
+    // Apply the same scale to the second half2 pair.
+    asm volatile("v_pk_mul_f16 %0, %1, %2"
+                 : "=v"(res.template AsType<half2_t>()(Number<1>{}))
+                 : "v"(res.template AsType<half2_t>()(Number<1>{})), "v"(scale));
+
+    return res.template AsType<half4_t>()[Number<0>{}];
+}
+
+// Converts the four nibbles in bits [15:0] of `q` into four bhalf_t values, each derived
+// from (unsigned nibble value - 8) computed in fp32.
+// NOTE(review): the comments below assume __byte_perm has CUDA-style semantics (each
+// selector nibble picks one byte out of the two 32-bit operands) -- confirm for this toolchain.
+__device__ inline bhalf4_t i4_to_bhalf4(int q)
+{
+    // Spread the four low nibbles so that each one sits in the low 4 bits of its own byte.
+    uint32_t i8s = (q & 0xf) | ((q & 0xf0) << 4) | ((q & 0xf00) << 8) | ((q & 0xf000) << 12);
+
+    // Bit pattern of the fp32 value 8388608.0 (2^23).
+    static constexpr uint32_t fp32_base = 0x4B000000;
+
+    float fp32_intermediates[4];
+
+    // The same storage is written as raw 32-bit words and then read back as floats.
+    uint32_t* fp32_intermediates_casted = reinterpret_cast<uint32_t*>(fp32_intermediates);
+
+    // Build one word per nibble: byte k of i8s combined with the upper bytes of fp32_base,
+    // i.e. presumably the fp32 value 2^23 + nibble.
+    fp32_intermediates_casted[0] = __byte_perm(i8s, fp32_base, 0x7650);
+    fp32_intermediates_casted[1] = __byte_perm(i8s, fp32_base, 0x7651);
+    fp32_intermediates_casted[2] = __byte_perm(i8s, fp32_base, 0x7652);
+    fp32_intermediates_casted[3] = __byte_perm(i8s, fp32_base, 0x7653);
+
+    // 8388616 = 2^23 + 8: removes the base and applies the -8 offset in one subtraction.
+    fp32_intermediates[0] -= 8388616.f;
+    fp32_intermediates[1] -= 8388616.f;
+    fp32_intermediates[2] -= 8388616.f;
+    fp32_intermediates[3] -= 8388616.f;
+
+    // Pack pairs of results into bhalf2_t by selecting bytes 2-3 (the upper 16 bits) of
+    // each fp32 word.
+    // NOTE(review): with selector 0x7632 and operands ([1], [0]) the low 16 bits appear to
+    // come from intermediate [1] and the high 16 bits from [0] (likewise [3]/[2]) --
+    // confirm that this element order is intended.
+    vector_type<bhalf_t, 4> res;
+    res.template AsType<bhalf2_t>()(Number<0>{}) = bit_cast<bhalf2_t>(
+        __byte_perm(fp32_intermediates_casted[1], fp32_intermediates_casted[0], 0x7632));
+    res.template AsType<bhalf2_t>()(Number<1>{}) = bit_cast<bhalf2_t>(
+        __byte_perm(fp32_intermediates_casted[3], fp32_intermediates_casted[2], 0x7632));
+
+    return res.template AsType<bhalf4_t>()[Number<0>{}];
+}
+
 namespace tensor_operation {
 namespace element_wise {

-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wnon-virtual-dtor"
-struct UnaryOpBase
+// Element-wise "pass through" functor that converts eight packed int4 values (pk_i4x4_t)
+// into a vector of eight half_t or bhalf_t values in one call.
+struct PassThroughPack8
 {
-    public:
-    __host__ __device__ ~UnaryOpBase() = default;
+    // Generic overload: declared only here; the two concrete overloads below provide the
+    // supported type combinations.
+    template <typename Y, typename X>
+    __host__ __device__ void operator()(Y& y, const X& x) const;
+
+    // pk_i4x4_t -> half8_t.
+    __host__ __device__ constexpr void operator()(ck::half8_t& y, const ck::pk_i4x4_t& x) const
+    {
+#if CK_USE_PK4_LAYOUT_SHUFFLE
+        vector_type<half_t, 8> result;
+
+        // i4_to_half4 consumes nibbles at bits [7:0] and [23:16] of its argument, so the
+        // second call shifts by 8 to expose the remaining four nibbles.
+        result.template AsType<half4_t>()(Number<0>{}) = i4_to_half4(bit_cast<int>(x));
+        result.template AsType<half4_t>()(Number<1>{}) = i4_to_half4(bit_cast<int>(x) >> 8);

-    __host__ __device__ constexpr UnaryOpBase()                   = default;
-    __host__ __device__ constexpr UnaryOpBase(const UnaryOpBase&) = default;
-    __host__ __device__ constexpr UnaryOpBase(UnaryOpBase&&)      = default;
-    __host__ __device__ UnaryOpBase& operator=(const UnaryOpBase&) = default;
-    __host__ __device__ UnaryOpBase& operator=(UnaryOpBase&&) = default;
+        y = result.template AsType<half8_t>()[Number<0>{}];
+#else
+        // Fallback: convert each of the four pk_i4_t elements to a half2_t via type_convert.
+        vector_type<half_t, 8> dst;
+        vector_type<pk_i4_t, 4> src{x};

-    __host__ __device__ virtual inline void operator()(float& y, const float& x) const = 0;
+        dst.template AsType<half2_t>()(Number<0>{}) =
+            type_convert<half2_t>(src.template AsType<pk_i4_t>()[Number<0>{}]);
+        dst.template AsType<half2_t>()(Number<1>{}) =
+            type_convert<half2_t>(src.template AsType<pk_i4_t>()[Number<1>{}]);
+        dst.template AsType<half2_t>()(Number<2>{}) =
+            type_convert<half2_t>(src.template AsType<pk_i4_t>()[Number<2>{}]);
+        dst.template AsType<half2_t>()(Number<3>{}) =
+            type_convert<half2_t>(src.template AsType<pk_i4_t>()[Number<3>{}]);

-    __host__ __device__ virtual inline void operator()(double& y, const double& x) const = 0;
+        y = dst.template AsType<half8_t>()[Number<0>{}];
+#endif
+    }
+
+    // pk_i4x4_t -> bhalf8_t.
+    __host__ __device__ constexpr void operator()(ck::bhalf8_t& y, const ck::pk_i4x4_t& x) const
+    {
+#if CK_USE_PK4_LAYOUT_SHUFFLE
+        vector_type<bhalf_t, 8> result;

-    __host__ __device__ virtual inline void operator()(int32_t& y, const int32_t& x) const = 0;
+        // i4_to_bhalf4 consumes the four nibbles in bits [15:0] of its argument, so the
+        // second call shifts by 16 (not 8 as in the half8_t overload).
+        result.template AsType<bhalf4_t>()(Number<0>{}) = i4_to_bhalf4(bit_cast<int>(x));
+        result.template AsType<bhalf4_t>()(Number<1>{}) = i4_to_bhalf4(bit_cast<int>(x) >> 16);

-    __host__ __device__ virtual inline void operator()(int8_t& y, const int8_t& x) const = 0;
+        y = result.template AsType<bhalf8_t>()[Number<0>{}];
+#else
+        // Fallback: convert each of the four pk_i4_t elements to a bhalf2_t via type_convert.
+        vector_type<bhalf_t, 8> dst;
+        vector_type<pk_i4_t, 4> src{x};

-    __host__ __device__ virtual inline void operator()(half_t& y, const half_t& x) const = 0;
+        dst.template AsType<bhalf2_t>()(Number<0>{}) =
+            type_convert<bhalf2_t>(src.template AsType<pk_i4_t>()[Number<0>{}]);
+        dst.template AsType<bhalf2_t>()(Number<1>{}) =
+            type_convert<bhalf2_t>(src.template AsType<pk_i4_t>()[Number<1>{}]);
+        dst.template AsType<bhalf2_t>()(Number<2>{}) =
+            type_convert<bhalf2_t>(src.template AsType<pk_i4_t>()[Number<2>{}]);
+        dst.template AsType<bhalf2_t>()(Number<3>{}) =
+            type_convert<bhalf2_t>(src.template AsType<pk_i4_t>()[Number<3>{}]);

-    __host__ __device__ virtual inline void operator()(bhalf_t& y, const bhalf_t& x) const = 0;
+        y = dst.template AsType<bhalf8_t>()[Number<0>{}];
+#endif
+    }
+    // Compile-time marker exposed by this functor; presumably tested by callers to detect
+    // pack-of-8 invocation support -- verify against the users of this flag.
+    constexpr const static bool is_pack8_invocable = true;
+};
+
+// Dequantization functor: converts eight packed int4 values (pk_i4x4_t) into a half8_t,
+// taking an additional half2_t scale operand `z`.
+struct DequantPack8
+{
+    // Generic overload: declared only here; the concrete overload below provides the
+    // supported type combination.
+    template <typename Y, typename X, typename Z>
+    __host__ __device__ void operator()(Y& y, const X& x, const Z& z) const;
+
+    // pk_i4x4_t (x) with half2_t scale (z) -> half8_t (y).
+    __host__ __device__ constexpr void
+    operator()(ck::half8_t& y, const ck::pk_i4x4_t& x, const ck::half2_t& z) const
+    {
+#if CK_USE_PK4_LAYOUT_SHUFFLE
+        vector_type<half_t, 8> result;
+
+        // Convert and scale four nibbles at a time; the shift by 8 exposes the four
+        // nibbles that the first call does not consume.
+        result.template AsType<half4_t>()(Number<0>{}) = i4_to_half4_scale(bit_cast<int>(x), z);
+        result.template AsType<half4_t>()(Number<1>{}) =
+            i4_to_half4_scale(bit_cast<int>(x) >> 8, z);
+
+        y = result.template AsType<half8_t>()[Number<0>{}];
+#else
+        // NOTE(review): this fallback never reads `z`; the values are converted with
+        // type_convert but not scaled, unlike the branch above -- confirm this is intended.
+        vector_type<half_t, 8> dst;
+        vector_type<pk_i4_t, 4> src{x};
+
+        dst.template AsType<half2_t>()(Number<0>{}) =
+            type_convert<half2_t>(src.template AsType<pk_i4_t>()[Number<0>{}]);
+        dst.template AsType<half2_t>()(Number<1>{}) =
+            type_convert<half2_t>(src.template AsType<pk_i4_t>()[Number<1>{}]);
+        dst.template AsType<half2_t>()(Number<2>{}) =
+            type_convert<half2_t>(src.template AsType<pk_i4_t>()[Number<2>{}]);
+        dst.template AsType<half2_t>()(Number<3>{}) =
+            type_convert<half2_t>(src.template AsType<pk_i4_t>()[Number<3>{}]);
+
+        y          = dst.template AsType<half8_t>()[Number<0>{}];
+#endif
+    }
+
+    // Compile-time marker exposed by this functor; presumably tested by callers to detect
+    // pack-of-8 invocation support -- verify against the users of this flag.
+    constexpr const static bool is_pack8_invocable = true;
 };

 struct PassThroughPack2
@@ -44,38 +211,49 @@ struct PassThroughPack2
    template <typename Y, typename X>
    __host__ __device__ void operator()(Y& y, const X& x) const;

-    __host__ __device__ constexpr void operator()(ck::half2_t& y, const ck::f8x2_t& x) const
+    __host__ __device__ constexpr void operator()(half2_t& y, const f8x2_t& x) const
    {
        auto t = type_convert<float2_t>(x);
        y      = type_convert<half2_t>(t);
    }
-    constexpr const static bool is_pack2_invocable = true;
-};

-struct PassThrough final : public UnaryOpBase
-{
-    __host__ __device__ constexpr PassThrough()                   = default;
-    __host__ __device__ constexpr PassThrough(const PassThrough&) = default;
-    __host__ __device__ constexpr PassThrough(PassThrough&&)      = default;
-    __host__ __device__ PassThrough& operator=(const PassThrough&) = default;
-    __host__ __device__ PassThrough& operator=(PassThrough&&) = default;
-    __host__ __device__ ~PassThrough()                        = default;
-
-    __host__ __device__ inline void operator()(float& y, const float& x) const final { y = x; }
-
-    __host__ __device__ inline void operator()(double& y, const double& x) const final { y = x; }
-
-    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final { y = x; }
+    // pk_i4_t (two packed 4-bit values in one byte) -> half2_t.
+    __host__ __device__ constexpr void operator()(ck::half2_t& y, const ck::pk_i4_t& x) const
+    {
+#if CK_USE_PK4_LAYOUT_SHUFFLE
+        // Split the byte into its low and high nibble. Both are kept as unsigned values
+        // (0..15); no sign offset is applied in this branch.
+        uint8_t x_u8 = ck::bit_cast<uint8_t>(x);
+        uint8_t x_l  = (x_u8 & 0x0f) >> 0;
+        uint8_t x_h  = (x_u8 & 0xf0) >> 4;

-    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final { y = x; }
+        auto l_f16 = ck::type_convert<ck::half_t>(x_l);
+        auto h_f16 = ck::type_convert<ck::half_t>(x_h);

-    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final { y = x; }
+        // Low nibble becomes element 0, high nibble element 1.
+        y = {l_f16, h_f16};
+#else
+        // NOTE(review): this branch zero-extends the raw byte to 32 bits and reinterprets
+        // those bits as a half2_t; it does not convert the nibbles numerically -- confirm
+        // this is the intended fallback.
+        uint32_t t = ck::bit_cast<uint8_t>(x);
+        y          = ck::bit_cast<half2_t>(t);
+#endif
+    }

-    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final { y = x; }
+    constexpr const static bool is_pack2_invocable = true;
+};

+struct PassThrough
+{
    template <typename Y, typename X>
    __host__ __device__ void operator()(Y& y, const X& x) const;

+    // pk_i4_t -> pk_i4_t: same-type pass-through, plain assignment with no conversion.
+    template <>
+    __host__ __device__ void operator()<pk_i4_t, pk_i4_t>(pk_i4_t& y, const pk_i4_t& x) const
+    {
+        y = x;
+    }
+
+    // double -> double: same-type pass-through, plain assignment with no conversion.
+    template <>
+    __host__ __device__ void operator()<double, double>(double& y, const double& x) const
+    {
+        y = x;
+    }
+
    template <>
    __host__ __device__ void operator()<float, double>(float& y, const double& x) const
    {
@@ -88,12 +266,36 @@ struct PassThrough final : public UnaryOpBase
        y = type_convert<double>(x);
    }

+    // float -> float: same-type pass-through, plain assignment with no conversion.
+    template <>
+    __host__ __device__ void operator()<float, float>(float& y, const float& x) const
+    {
+        y = x;
+    }
+
+    // half_t -> half_t: same-type pass-through, plain assignment with no conversion.
+    template <>
+    __host__ __device__ void operator()<half_t, half_t>(half_t& y, const half_t& x) const
+    {
+        y = x;
+    }
+
    template <>
    __host__ __device__ void operator()<half_t, float>(half_t& y, const float& x) const
    {
        y = type_convert<half_t>(x);
    }

+    // bhalf_t -> bhalf_t: same-type pass-through, plain assignment with no conversion.
+    template <>
+    __host__ __device__ void operator()<bhalf_t, bhalf_t>(bhalf_t& y, const bhalf_t& x) const
+    {
+        y = x;
+    }
+
+    // int32_t -> int32_t: same-type pass-through, plain assignment with no conversion.
+    template <>
+    __host__ __device__ void operator()<int32_t, int32_t>(int32_t& y, const int32_t& x) const
+    {
+        y = x;
+    }
+
    template <>
    __host__ __device__ void operator()<bhalf_t, float>(bhalf_t& y, const float& x) const
    {
@@ -118,6 +320,12 @@ struct PassThrough final : public UnaryOpBase
        y = type_convert<float>(x);
    }

+    // int8_t -> int8_t: same-type pass-through, plain assignment with no conversion.
+    template <>
+    __host__ __device__ void operator()<int8_t, int8_t>(int8_t& y, const int8_t& x) const
+    {
+        y = x;
+    }
+
    template <>
    __host__ __device__ void operator()<half_t, int8_t>(half_t& y, const int8_t& x) const
    {
@@ -230,7 +438,7 @@ struct PassThrough final : public UnaryOpBase
    template <>
    __host__ __device__ void operator()<bf8_t, half_t>(bf8_t& y, const half_t& x) const
    {
-        y = ck::type_convert<bf8_t>(x);
+        y = type_convert<bf8_t>(x);
    }
 };

@@ -303,21 +511,21 @@ struct Scale
    template <typename Y, typename X>
    __host__ __device__ void operator()(Y& y, const X& x) const
    {
+        // Generic path: convert x to float, multiply by scale_, then convert the product to Y.
-        y = ck::type_convert<Y>(ck::type_convert<float>(x) * scale_);
+        y = type_convert<Y>(type_convert<float>(x) * scale_);
    }

    template <>
    __host__ __device__ void operator()<half_t, half_t>(half_t& y, const half_t& x) const
    {
+        // half_t path: scale_ is converted to half_t first, so both operands of the
+        // multiply are half_t (unlike the generic path, which multiplies in float).
-        y = ck::type_convert<half_t>(scale_) * x;
+        y = type_convert<half_t>(scale_) * x;
    };

    template <>
    __host__ __device__ void operator()<bhalf_t, bhalf_t>(bhalf_t& y, const bhalf_t& x) const
    {
+        // bhalf_t path: widen x to float, scale in float, then convert back to bhalf_t.
-        const float x_tmp = ck::type_convert<float>(x);
+        const float x_tmp = type_convert<float>(x);
        const float y_tmp = scale_ * x_tmp;
-        y                 = ck::type_convert<bhalf_t>(y_tmp);
+        y                 = type_convert<bhalf_t>(y_tmp);
    };

    template <>
@@ -335,7 +543,7 @@ struct Scale
    template <>
    __host__ __device__ void operator()<int8_t, int8_t>(int8_t& y, const int8_t& x) const
    {
-        y = ck::type_convert<int8_t>(scale_ * ck::type_convert<float>(x));
+        y = type_convert<int8_t>(scale_ * type_convert<float>(x));
    };

    float scale_;
@@ -351,7 +559,7 @@ struct ScaleAndResetNaNToMinusInfinity
    template <>
    __host__ __device__ void operator()<float, float>(float& y, const float& x) const
    {
-        y = ck::math::isnan(x) ? -ck::NumericLimits<float>::Infinity() : scale_ * x;
+        y = math::isnan(x) ? -NumericLimits<float>::Infinity() : scale_ * x;
    };

    float scale_;
@@ -417,45 +625,21 @@ struct UnarySquare
    };
 };

-struct UnaryAbs final : public UnaryOpBase
+struct UnaryAbs
 {
-    __host__ __device__ constexpr UnaryAbs()                = default;
-    __host__ __device__ constexpr UnaryAbs(const UnaryAbs&) = default;
-    __host__ __device__ constexpr UnaryAbs(UnaryAbs&&)      = default;
-    __host__ __device__ UnaryAbs& operator=(const UnaryAbs&) = default;
-    __host__ __device__ UnaryAbs& operator=(UnaryAbs&&) = default;
-    __host__ __device__ ~UnaryAbs()                     = default;
-
-    __host__ __device__ inline void operator()(float& y, const float& x) const final
-    {
-        y = ck::math::abs(x);
-    }
-
-    __host__ __device__ inline void operator()(double& y, const double& x) const final
-    {
-        y = ck::math::abs(x);
-    }
-
-    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
-    {
-        y = ck::math::abs(x);
-    }
-
-    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
+    // y = |x| for float, double, half_t, int32_t and int8_t; any other T is rejected at
+    // compile time by the static_assert. bhalf_t is not in the accepted list, although the
+    // removed virtual overloads included a bhalf_t version.
+    template <typename T>
+    __host__ __device__ void operator()(T& y, const T& x) const
    {
-        y = ck::math::abs(x);
-    }

-    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
-    {
-        y = ck::math::abs(x);
-    }
+        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
+                          is_same<T, int8_t>::value,
+                      "Data type is not supported by this operation!");

-    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
-    {
-        y = ck::math::abs(x);
-    }
+        y = math::abs(x);
+    };

+    template <>
    __host__ __device__ void operator()(f8_t& y, const f8_t& x) const
    {
        y = ck::type_convert<f8_t>(ck::math::abs(ck::type_convert<float>(x)));
@@ -470,49 +654,28 @@ struct UnarySqrt
        static_assert(is_same<T, float>::value || is_same<T, double>::value,
                      "Data type is not supported by this operation!");

-        y = ck::math::sqrt(x);
+        y = math::sqrt(x);
    };
 };

-struct Relu final : public UnaryOpBase
+struct Relu
 {
-    __host__ __device__ constexpr Relu()            = default;
-    __host__ __device__ constexpr Relu(const Relu&) = default;
-    __host__ __device__ constexpr Relu(Relu&&)      = default;
-    __host__ __device__ Relu& operator=(const Relu&) = default;
-    __host__ __device__ Relu& operator=(Relu&&) = default;
-    __host__ __device__ ~Relu()                 = default;
-
-    __host__ __device__ inline void operator()(float& y, const float& x) const final
-    {
-        y = x > 0 ? x : 0;
-    }
-
-    __host__ __device__ inline void operator()(double& y, const double& x) const final
-    {
-        y = x > 0 ? x : 0;
-    }
-
-    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
-    {
-        y = x > 0 ? x : 0;
-    }
-
-    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
-    {
-        y = x > 0 ? x : 0;
-    }
-
-    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
+    // y = max(x, 0) for float, double, half_t, int32_t and int8_t; any other T is rejected
+    // at compile time by the static_assert.
+    template <typename T>
+    __host__ __device__ void operator()(T& y, const T& x) const
    {
+        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
+                          is_same<T, int8_t>::value,
+                      "Data type is not supported by this operation!");
        y = x > 0 ? x : 0;
    }

-    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
+    // bhalf_t specialization: the comparison against zero is done on a float copy of x,
+    // and the float result is converted back to bhalf_t.
+    template <>
+    __host__ __device__ void operator()(bhalf_t& y, const bhalf_t& x) const
    {
-        float x_f32 = ck::type_convert<float>(x);
+        float x_f32 = type_convert<float>(x);
        float y_f32 = x_f32 > 0 ? x_f32 : 0;
-        y           = ck::type_convert<bhalf_t>(y_f32);
+        y           = type_convert<bhalf_t>(y_f32);
    }
 };

@@ -528,7 +691,7 @@ struct FastGelu

    template <typename Y, typename X>
    __device__ void operator()(Y& y, const X& x) const;
-
+#ifndef CK_CODE_GEN_RTC
    template <>
    __host__ void operator()<float, float>(float& y, const float& x) const
    {
@@ -539,6 +702,7 @@ struct FastGelu
        const float emu = exp(u);
        y               = x / (1.f + emu);
    }
+#endif

    // device code, use lower precision "__ocml_exp_f32" and "rcp"
    template <>
@@ -550,7 +714,7 @@ struct FastGelu
        const float u   = x * (c1 * x * x + c2);
        const float emu = __ocml_exp_f32(u);

-        y = x * ck::math::rcp(1.f + emu);
+        y = x * math::rcp(1.f + emu);
    }

    template <>
@@ -648,59 +812,24 @@ struct Gelu
    }

    template <>
-    __host__ __device__ void operator()<ck::half_t, ck::half_t>(ck::half_t& y,
-                                                                const ck::half_t& x) const
+    __host__ __device__ void operator()<half_t, half_t>(half_t& y, const half_t& x) const
    {
+        // half_t path: y = 0.5 * x * (1 + erf(0.70710678118 * x)). The erf argument is
+        // evaluated as float and the erf result is converted back to half_t before the
+        // final products (0.70710678118 is approximately 1 / sqrt(2)).
-        y = ck::half_t(0.5) * x * (ck::half_t(1) + ck::half_t(erf(float(0.70710678118f * x))));
+        y = half_t(0.5) * x * (half_t(1) + half_t(erf(float(0.70710678118f * x))));
    }
 };

-struct Sigmoid final : public UnaryOpBase
+struct Sigmoid
 {
-    __host__ __device__ constexpr Sigmoid()               = default;
-    __host__ __device__ constexpr Sigmoid(const Sigmoid&) = default;
-    __host__ __device__ constexpr Sigmoid(Sigmoid&&)      = default;
-    __host__ __device__ Sigmoid& operator=(const Sigmoid&) = default;
-    __host__ __device__ Sigmoid& operator=(Sigmoid&&) = default;
-    __host__ __device__ ~Sigmoid()                    = default;
-
-    __host__ __device__ inline void operator()(float& y, const float& x) const final
-    {
-        constexpr float one = type_convert<float>(1);
-        y                   = one / (one + ck::math::exp(-x));
-    }
-
-    __host__ __device__ inline void operator()(double& y, const double& x) const final
-    {
-        constexpr double one = type_convert<double>(1);
-        y                    = one / (one + ck::math::exp(-x));
-    }
-
-    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
-    {
-        constexpr int32_t one = type_convert<int32_t>(1);
-        y                     = one / (one + ck::math::exp(-x));
-    }
-
-    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
-    {
-        constexpr int8_t one = type_convert<int8_t>(1);
-        y                    = one / (one + ck::math::exp(-x));
-    }
-
-    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
-    {
-        constexpr half_t one = type_convert<half_t>(1);
-        y                    = one / (one + ck::math::exp(-x));
-    }
-
-    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
+    // y = 1 / (1 + exp(-x)), evaluated in type T. Accepted types are float, double,
+    // half_t, int8_t and int32_t; any other T (including bhalf_t, which the removed code
+    // handled through a float round-trip) is rejected by the static_assert.
+    template <typename T>
+    __host__ __device__ void operator()(T& y, const T& x) const
    {
-        constexpr float one = type_convert<float>(1);
-        float x_f32         = ck::type_convert<float>(x);
-        float y_f32         = one / (one + ck::math::exp(x_f32));
-        y                   = ck::type_convert<bhalf_t>(y_f32);
-    }
+        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+                          is_same<T, ck::half_t>::value || is_same<T, int8_t>::value ||
+                          is_same<T, int32_t>::value,
+                      "Data type is not supported by this operation!");
+        // `one` is the constant 1 in T, so the arithmetic below stays in type T.
+        constexpr T one = type_convert<T>(1);
+        y               = one / (one + math::exp(-x));
+    };
 };

 struct Silu
@@ -708,52 +837,26 @@ struct Silu
    template <typename T>
    __host__ __device__ void operator()(T& y, const T& x) const
    {
+        // y = x * sigmoid(x) = x / (1 + exp(-x)), evaluated in type T; only the types
+        // listed in the static_assert are accepted.
-        static_assert(is_same_v<T, float> || is_same_v<T, double> || is_same_v<T, ck::half_t> ||
+        static_assert(is_same_v<T, float> || is_same_v<T, double> || is_same_v<T, half_t> ||
                          is_same_v<T, int8_t> || is_same_v<T, int32_t>,
                      "Data type is not supported by this operation!");
        constexpr T one = type_convert<T>(1);
-        y               = x * (one / (one + ck::math::exp(-x)));
+        y               = x * (one / (one + math::exp(-x)));
    };
 };

-struct TanH final : public UnaryOpBase
+struct TanH
 {
-    __host__ __device__ constexpr TanH()            = default;
-    __host__ __device__ constexpr TanH(const TanH&) = default;
-    __host__ __device__ constexpr TanH(TanH&&)      = default;
-    __host__ __device__ TanH& operator=(const TanH&) = default;
-    __host__ __device__ TanH& operator=(TanH&&) = default;
-    __host__ __device__ ~TanH()                 = default;
-
-    __host__ __device__ inline void operator()(float& y, const float& x) const final
-    {
-        y = ck::math::tanh(x);
-    }
-
-    __host__ __device__ inline void operator()(double& y, const double& x) const final
-    {
-        y = ck::math::tanh(x);
-    }
-
-    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
-    {
-        y = ck::math::tanh(x);
-    }
-
-    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
-    {
-        y = ck::math::tanh(x);
-    }
-
-    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
+    // y = tanh(x) via math::tanh. Accepted types are float, double, half_t, int8_t and
+    // int32_t; any other T (including bhalf_t, which the removed virtual overloads
+    // covered) is rejected by the static_assert.
+    template <typename T>
+    __host__ __device__ void operator()(T& y, const T& x) const
    {
-        y = ck::math::tanh(x);
-    }
+        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+                          is_same<T, ck::half_t>::value || is_same<T, int8_t>::value ||
+                          is_same<T, int32_t>::value,
+                      "Data type is not supported by this operation!");

-    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
-    {
-        y = ck::math::tanh(x);
-    }
+        y = math::tanh(x);
+    };
 };

 struct ACos
@@ -762,11 +865,11 @@ struct ACos
    __host__ __device__ void operator()(T& y, const T& x) const
    {
        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, ck::half_t>::value || is_same<T, int8_t>::value ||
+                          is_same<T, half_t>::value || is_same<T, int8_t>::value ||
                          is_same<T, int32_t>::value,
                      "Data type is not supported by this operation!");

-        y = ck::math::acos(x);
+        y = math::acos(x);
    };
 };

@@ -776,11 +879,11 @@ struct Neg
    __host__ __device__ void operator()(T& y, const T& x) const
    {
        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, ck::half_t>::value || is_same<T, int8_t>::value ||
+                          is_same<T, half_t>::value || is_same<T, int8_t>::value ||
                          is_same<T, int32_t>::value,
                      "Data type is not supported by this operation!");

-        y = ck::math::neg(x);
+        y = math::neg(x);
    };
 };

@@ -790,11 +893,11 @@ struct ATan
    __host__ __device__ void operator()(T& y, const T& x) const
    {
        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, ck::half_t>::value || is_same<T, int8_t>::value ||
+                          is_same<T, half_t>::value || is_same<T, int8_t>::value ||
                          is_same<T, int32_t>::value,
                      "Data type is not supported by this operation!");

-        y = ck::math::atan(x);
+        y = math::atan(x);
    };
 };

@@ -804,11 +907,11 @@ struct Sin
    __host__ __device__ void operator()(T& y, const T& x) const
    {
        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, ck::half_t>::value || is_same<T, int8_t>::value ||
+                          is_same<T, half_t>::value || is_same<T, int8_t>::value ||
                          is_same<T, int32_t>::value,
                      "Data type is not supported by this operation!");

-        y = ck::math::sin(x);
+        y = math::sin(x);
    };
 };

@@ -818,11 +921,11 @@ struct ASinH
    __host__ __device__ void operator()(T& y, const T& x) const
    {
        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, ck::half_t>::value || is_same<T, int8_t>::value ||
+                          is_same<T, half_t>::value || is_same<T, int8_t>::value ||
                          is_same<T, int32_t>::value,
                      "Data type is not supported by this operation!");

-        y = ck::math::asinh(x);
+        y = math::asinh(x);
    };
 };

@@ -832,11 +935,11 @@ struct Cos
    __host__ __device__ void operator()(T& y, const T& x) const
    {
        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, ck::half_t>::value || is_same<T, int8_t>::value ||
+                          is_same<T, half_t>::value || is_same<T, int8_t>::value ||
                          is_same<T, int32_t>::value,
                      "Data type is not supported by this operation!");

-        y = ck::math::cos(x);
+        y = math::cos(x); // qualified like the sibling functors (Sin, Tan, ...)
    };
 };

@@ -846,11 +949,11 @@ struct ACosH
    __host__ __device__ void operator()(T& y, const T& x) const
    {
        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, ck::half_t>::value || is_same<T, int8_t>::value ||
+                          is_same<T, half_t>::value || is_same<T, int8_t>::value ||
                          is_same<T, int32_t>::value,
                      "Data type is not supported by this operation!");

-        y = ck::math::acosh(x);
+        y = math::acosh(x);
    };
 };

@@ -860,11 +963,11 @@ struct Tan
    __host__ __device__ void operator()(T& y, const T& x) const
    {
        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, ck::half_t>::value || is_same<T, int8_t>::value ||
+                          is_same<T, half_t>::value || is_same<T, int8_t>::value ||
                          is_same<T, int32_t>::value,
                      "Data type is not supported by this operation!");

-        y = ck::math::tan(x);
+        y = math::tan(x);
    };
 };

@@ -874,11 +977,11 @@ struct ATanH
    __host__ __device__ void operator()(T& y, const T& x) const
    {
        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, ck::half_t>::value || is_same<T, int8_t>::value ||
+                          is_same<T, half_t>::value || is_same<T, int8_t>::value ||
                          is_same<T, int32_t>::value,
                      "Data type is not supported by this operation!");

-        y = ck::math::atanh(x);
+        y = math::atanh(x);
    };
 };

@@ -888,11 +991,11 @@ struct SinH
    __host__ __device__ void operator()(T& y, const T& x) const
    {
        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, ck::half_t>::value || is_same<T, int8_t>::value ||
+                          is_same<T, half_t>::value || is_same<T, int8_t>::value ||
                          is_same<T, int32_t>::value,
                      "Data type is not supported by this operation!");

-        y = ck::math::sinh(x);
+        y = math::sinh(x);
    };
 };

@@ -902,11 +1005,11 @@ struct Ceil
    __host__ __device__ void operator()(T& y, const T& x) const
    {
        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, ck::half_t>::value || is_same<T, int8_t>::value ||
+                          is_same<T, half_t>::value || is_same<T, int8_t>::value ||
                          is_same<T, int32_t>::value,
                      "Data type is not supported by this operation!");

-        y = ck::math::ceil(x);
+        y = math::ceil(x);
    };
 };

@@ -916,11 +1019,11 @@ struct Exp
    __host__ __device__ void operator()(T& y, const T& x) const
    {
        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, ck::half_t>::value || is_same<T, int8_t>::value ||
+                          is_same<T, half_t>::value || is_same<T, int8_t>::value ||
                          is_same<T, int32_t>::value,
                      "Data type is not supported by this operation!");

-        y = ck::math::exp(x);
+        y = math::exp(x);
    };
 };

@@ -930,11 +1033,11 @@ struct CosH
    __host__ __device__ void operator()(T& y, const T& x) const
    {
        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, ck::half_t>::value || is_same<T, int8_t>::value ||
+                          is_same<T, half_t>::value || is_same<T, int8_t>::value ||
                          is_same<T, int32_t>::value,
                      "Data type is not supported by this operation!");

-        y = ck::math::cosh(x);
+        y = math::cosh(x);
    };
 };

@@ -944,11 +1047,11 @@ struct Floor
    __host__ __device__ void operator()(T& y, const T& x) const
    {
        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, ck::half_t>::value || is_same<T, int8_t>::value ||
+                          is_same<T, half_t>::value || is_same<T, int8_t>::value ||
                          is_same<T, int32_t>::value,
                      "Data type is not supported by this operation!");

-        y = ck::math::floor(x);
+        y = math::floor(x);
    };
 };

@@ -958,11 +1061,11 @@ struct Log
    __host__ __device__ void operator()(T& y, const T& x) const
    {
        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, ck::half_t>::value || is_same<T, int8_t>::value ||
+                          is_same<T, half_t>::value || is_same<T, int8_t>::value ||
                          is_same<T, int32_t>::value,
                      "Data type is not supported by this operation!");

-        y = ck::math::log(x);
+        y = math::log(x);
    };
 };

@@ -972,11 +1075,11 @@ struct ASin
    __host__ __device__ void operator()(T& y, const T& x) const
    {
        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, ck::half_t>::value || is_same<T, int8_t>::value ||
+                          is_same<T, half_t>::value || is_same<T, int8_t>::value ||
                          is_same<T, int32_t>::value,
                      "Data type is not supported by this operation!");

-        y = ck::math::asin(x);
+        y = math::asin(x);
    };
 };

@@ -986,426 +1089,146 @@ struct Rcp
    __host__ __device__ void operator()(T& y, const T& x) const
    {
        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, ck::half_t>::value || is_same<T, int8_t>::value ||
+                          is_same<T, half_t>::value || is_same<T, int8_t>::value ||
                          is_same<T, int32_t>::value,
                      "Data type is not supported by this operation!");

-        y = ck::math::rcp(x);
+        y = math::rcp(x);
    };
 };

-struct Swish final : public UnaryOpBase
+struct Swish
 {
-    __host__ __device__ constexpr Swish(const Swish&) = default;
-    __host__ __device__ constexpr Swish(Swish&&)      = default;
-    __host__ __device__ ~Swish()                      = default;
-
-    __host__ __device__ Swish(float beta = 1.0f) : beta_(beta) {}
-
-    __host__ __device__ float get_beta() const { return beta_; }
-
-    const float beta_;
-
-    __host__ __device__ inline void operator()(float& y, const float& x) const final
-    {
-        float bx = -beta_ * type_convert<float>(x);
-        y        = type_convert<float>(x / (1.f + ck::math::exp(bx)));
-    }
-
-    __host__ __device__ inline void operator()(double& y, const double& x) const final
-    {
-        float bx = -beta_ * type_convert<float>(x);
-        y        = type_convert<double>(x / (1.f + ck::math::exp(bx)));
-    }
-
-    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
-    {
-        float bx = -beta_ * type_convert<float>(x);
-        y        = type_convert<int32_t>(x / (1.f + ck::math::exp(bx)));
-    }
-
-    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
-    {
-        float bx = -beta_ * type_convert<float>(x);
-        y        = type_convert<int8_t>(x / (1.f + ck::math::exp(bx)));
-    }
-
-    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
-    {
-        float bx = -beta_ * type_convert<float>(x);
-        y        = type_convert<half_t>(x / (1.f + ck::math::exp(bx)));
-    }
-
-    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
-    {
-        float bx = -beta_ * type_convert<float>(x);
-        y        = type_convert<bhalf_t>(x / (1.f + ck::math::exp(bx)));
-    }
+    __host__ __device__ Swish(float beta = 1.0f) : beta_(beta) {} // device-callable

    template <typename Y, typename X>
    __host__ __device__ void operator()(Y& y, const X& x) const
    {
        static_assert(is_same<X, float>::value || is_same<X, double>::value ||
-                          is_same<X, half_t>::value,
+                          is_same<X, ck::half_t>::value || is_same<X, int8_t>::value,
                      "Data type is not supported by this operation!");

        static_assert(is_same<Y, float>::value || is_same<Y, double>::value ||
-                          is_same<Y, half_t>::value,
+                          is_same<Y, ck::half_t>::value || is_same<Y, int8_t>::value,
                      "Data type is not supported by this operation!");

        float bx = -beta_ * type_convert<float>(x);
-        y        = type_convert<Y>(x / (1.f + ck::math::exp(bx)));
-    }
+        y        = type_convert<Y>(x / (1.f + math::exp(bx)));
+    };
+
+    const float beta_;
 };

-struct SoftRelu final : public UnaryOpBase
+struct SoftRelu
 {
-    __host__ __device__ constexpr SoftRelu(const SoftRelu&) = default;
-    __host__ __device__ constexpr SoftRelu(SoftRelu&&)      = default;
-    __host__ __device__ ~SoftRelu()                         = default;
+    __host__ __device__ SoftRelu(float alpha = 1.f) : alpha_(alpha) {} // device-callable

-    __host__ __device__ SoftRelu(float alpha = 1.0f) : alpha_(alpha) {}
-
-    __host__ __device__ float get_alpha() const { return alpha_; }
-
-    const float alpha_;
-
-    __host__ __device__ inline void operator()(float& y, const float& x) const final
-    {
-        float casted_alpha  = type_convert<float>(alpha_);
-        constexpr float one = type_convert<float>(1);
-        y                   = ck::math::log(one + ck::math::exp(x * casted_alpha)) / casted_alpha;
-    }
-
-    __host__ __device__ inline void operator()(double& y, const double& x) const final
-    {
-        double casted_alpha  = type_convert<double>(alpha_);
-        constexpr double one = type_convert<double>(1);
-        y                    = ck::math::log(one + ck::math::exp(x * casted_alpha)) / casted_alpha;
-    }
-
-    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
-    {
-        int32_t casted_alpha  = type_convert<int32_t>(alpha_);
-        constexpr int32_t one = type_convert<int32_t>(1);
-        y                     = ck::math::log(one + ck::math::exp(x * casted_alpha)) / casted_alpha;
-    }
-
-    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
-    {
-        int8_t casted_alpha  = type_convert<int8_t>(alpha_);
-        constexpr int8_t one = type_convert<int8_t>(1);
-        y                    = ck::math::log(one + ck::math::exp(x * casted_alpha)) / casted_alpha;
-    }
-
-    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
-    {
-        half_t casted_alpha  = type_convert<half_t>(alpha_);
-        constexpr half_t one = type_convert<half_t>(1);
-        y                    = ck::math::log(one + ck::math::exp(x * casted_alpha)) / casted_alpha;
-    }
-
-    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
+    template <typename T>
+    __host__ __device__ void operator()(T& y, const T& x) const
    {
-        bhalf_t casted_alpha  = type_convert<bhalf_t>(alpha_);
-        constexpr bhalf_t one = type_convert<bhalf_t>(1);
-        y                     = ck::math::log(one + ck::math::exp(x * casted_alpha)) / casted_alpha;
+        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
+                          is_same<T, int8_t>::value,
+                      "Data type is not supported by this operation!");
+        T casted_alpha  = type_convert<T>(alpha_);
+        constexpr T one = type_convert<T>(1);
+        y               = math::log(one + math::exp(x * casted_alpha)) / casted_alpha;
    }
+    const float alpha_;
 };

-struct Power final : public UnaryOpBase
+struct Power
 {
-    __host__ __device__ constexpr Power(const Power&) = default;
-    __host__ __device__ constexpr Power(Power&&)      = default;
-    __host__ __device__ ~Power()                      = default;
+    __host__ __device__ Power(float alpha = 0.f, float beta = 1.f, float gamma = 2.f)
+        : alpha_(alpha), beta_(beta), gamma_(gamma) {} // device-callable

-    __host__ __device__ Power(float alpha = 0.f, float beta = 1.f, float gamma = 2.f)
-        : alpha_(alpha), beta_(beta), gamma_(gamma)
+    template <typename T>
+    __host__ __device__ void operator()(T& y, const T& x) const
    {
+        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
+                          is_same<T, int8_t>::value,
+                      "Data type is not supported by this operation!");
+        T casted_alpha     = type_convert<T>(alpha_);
+        T casted_beta      = type_convert<T>(beta_);
+        T casted_gamma     = type_convert<T>(gamma_);
+        T shifted_scaled_x = casted_alpha + casted_beta * x;
+        y                  = math::pow(shifted_scaled_x, casted_gamma);
    }
-
-    __host__ __device__ float get_alpha() const { return alpha_; }
-
-    __host__ __device__ float get_beta() const { return beta_; }
-
-    __host__ __device__ float get_gamma() const { return gamma_; }
-
    const float alpha_;
    const float beta_;
    const float gamma_;
-
-    __host__ __device__ inline void operator()(float& y, const float& x) const final
-    {
-        float casted_alpha = type_convert<float>(alpha_);
-        float casted_beta  = type_convert<float>(beta_);
-        float casted_gamma = type_convert<float>(gamma_);
-
-        float shifted_scaled_x = casted_alpha + casted_beta * x;
-        y                      = ck::math::pow(shifted_scaled_x, casted_gamma);
-    }
-
-    __host__ __device__ inline void operator()(double& y, const double& x) const final
-    {
-        double casted_alpha = type_convert<double>(alpha_);
-        double casted_beta  = type_convert<double>(beta_);
-        double casted_gamma = type_convert<double>(gamma_);
-
-        double shifted_scaled_x = casted_alpha + casted_beta * x;
-        y                       = ck::math::pow(shifted_scaled_x, casted_gamma);
-    }
-
-    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
-    {
-        int32_t casted_alpha = type_convert<int32_t>(alpha_);
-        int32_t casted_beta  = type_convert<int32_t>(beta_);
-        int32_t casted_gamma = type_convert<int32_t>(gamma_);
-
-        int32_t shifted_scaled_x = casted_alpha + casted_beta * x;
-        y                        = ck::math::pow(shifted_scaled_x, casted_gamma);
-    }
-
-    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
-    {
-        int8_t casted_alpha = type_convert<int8_t>(alpha_);
-        int8_t casted_beta  = type_convert<int8_t>(beta_);
-        int8_t casted_gamma = type_convert<int8_t>(gamma_);
-
-        int8_t shifted_scaled_x = casted_alpha + casted_beta * x;
-        y                       = ck::math::pow(shifted_scaled_x, casted_gamma);
-    }
-
-    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
-    {
-        half_t casted_alpha = type_convert<half_t>(alpha_);
-        half_t casted_beta  = type_convert<half_t>(beta_);
-        half_t casted_gamma = type_convert<half_t>(gamma_);
-
-        half_t shifted_scaled_x = casted_alpha + casted_beta * x;
-        y                       = ck::math::pow(shifted_scaled_x, casted_gamma);
-    }
-
-    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
-    {
-        bhalf_t casted_alpha = type_convert<bhalf_t>(alpha_);
-        bhalf_t casted_beta  = type_convert<bhalf_t>(beta_);
-        bhalf_t casted_gamma = type_convert<bhalf_t>(gamma_);
-
-        bhalf_t shifted_scaled_x = casted_alpha + casted_beta * x;
-        y                        = ck::math::pow(shifted_scaled_x, casted_gamma);
-    }
 };

-struct ClippedRelu final : public UnaryOpBase
+struct ClippedRelu
 {
-    __host__ __device__ constexpr ClippedRelu(const ClippedRelu&) = default;
-    __host__ __device__ constexpr ClippedRelu(ClippedRelu&&)      = default;
-    __host__ __device__ ~ClippedRelu()                            = default;
+    __host__ __device__ ClippedRelu(float alpha = 0.f, float beta = 1.f) : alpha_(alpha), beta_(beta) {}

-    __host__ __device__ ClippedRelu(float alpha = 0.f, float beta = 1.f)
-        : alpha_(alpha), beta_(beta)
+    template <typename T>
+    __host__ __device__ void operator()(T& y, const T& x) const
    {
+        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
+                          is_same<T, int8_t>::value,
+                      "Data type is not supported by this operation!");
+        T casted_alpha = type_convert<T>(alpha_);
+        T casted_beta  = type_convert<T>(beta_);
+        y              = math::min(casted_beta, math::max(casted_alpha, x));
    }
-
-    __host__ __device__ float get_alpha() const { return alpha_; }
-
-    __host__ __device__ float get_beta() const { return beta_; }
-
    const float alpha_;
    const float beta_;
-
-    __host__ __device__ inline void operator()(float& y, const float& x) const final
-    {
-        float casted_alpha = type_convert<float>(alpha_);
-        float casted_beta  = type_convert<float>(beta_);
-        y                  = ck::math::min(casted_beta, ck::math::max(casted_alpha, x));
-    }
-
-    __host__ __device__ inline void operator()(double& y, const double& x) const final
-    {
-        double casted_alpha = type_convert<double>(alpha_);
-        double casted_beta  = type_convert<double>(beta_);
-        y                   = ck::math::min(casted_beta, ck::math::max(casted_alpha, x));
-    }
-
-    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
-    {
-        int32_t casted_alpha = type_convert<int32_t>(alpha_);
-        int32_t casted_beta  = type_convert<int32_t>(beta_);
-        y                    = ck::math::min(casted_beta, ck::math::max(casted_alpha, x));
-    }
-
-    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
-    {
-        int8_t casted_alpha = type_convert<int8_t>(alpha_);
-        int8_t casted_beta  = type_convert<int8_t>(beta_);
-        y                   = ck::math::min(casted_beta, ck::math::max(casted_alpha, x));
-    }
-
-    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
-    {
-        half_t casted_alpha = type_convert<half_t>(alpha_);
-        half_t casted_beta  = type_convert<half_t>(beta_);
-        y                   = ck::math::min(casted_beta, ck::math::max(casted_alpha, x));
-    }
-
-    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
-    {
-        bhalf_t casted_alpha = type_convert<bhalf_t>(alpha_);
-        bhalf_t casted_beta  = type_convert<bhalf_t>(beta_);
-        y                    = ck::math::min(casted_beta, ck::math::max(casted_alpha, x));
-    }
 };

-struct LeakyRelu final : public UnaryOpBase
+struct LeakyRelu
 {
-    __host__ __device__ constexpr LeakyRelu(const LeakyRelu&) = default;
-    __host__ __device__ constexpr LeakyRelu(LeakyRelu&&)      = default;
-    __host__ __device__ ~LeakyRelu()                          = default;
-
-    __host__ __device__ LeakyRelu(float alpha = 0.f) : alpha_(alpha) {}
-
-    __host__ __device__ float get_alpha() const { return alpha_; }
-
-    const float alpha_;
-
-    __host__ __device__ inline void operator()(float& y, const float& x) const final
-    {
-        float casted_alpha = type_convert<float>(alpha_);
-        y                  = x >= 0 ? x : x * casted_alpha;
-    }
-
-    __host__ __device__ inline void operator()(double& y, const double& x) const final
-    {
-        double casted_alpha = type_convert<double>(alpha_);
-        y                   = x >= 0 ? x : x * casted_alpha;
-    }
-
-    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
-    {
-        int32_t casted_alpha = type_convert<int32_t>(alpha_);
-        y                    = x >= 0 ? x : x * casted_alpha;
-    }
+    __host__ __device__ LeakyRelu(float alpha = 0.01f) : alpha_(alpha) {} // device-callable

-    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
-    {
-        int8_t casted_alpha = type_convert<int8_t>(alpha_);
-        y                   = x >= 0 ? x : x * casted_alpha;
-    }
-
-    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
-    {
-        half_t casted_alpha = type_convert<half_t>(alpha_);
-        y                   = x >= 0 ? x : x * casted_alpha;
-    }
-
-    __host__ __device__ inline void operator()([[maybe_unused]] bhalf_t& y,
-                                               [[maybe_unused]] const bhalf_t& x) const final
+    template <typename T>
+    __host__ __device__ void operator()(T& y, const T& x) const
    {
+        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
+                          is_same<T, int8_t>::value,
+                      "Data type is not supported by this operation!");
+        T casted_alpha = type_convert<T>(alpha_);
+        y              = x >= 0 ? x : x * casted_alpha;
    }
+    const float alpha_;
 };

-struct Elu final : public UnaryOpBase
+struct Elu
 {
-    __host__ __device__ constexpr Elu(const Elu&) = default;
-    __host__ __device__ constexpr Elu(Elu&&)      = default;
-    __host__ __device__ ~Elu()                    = default;
-
-    __host__ __device__ Elu(float alpha = 1.f) : alpha_(alpha) {}
-
-    __host__ __device__ float get_alpha() const { return alpha_; }
+    __host__ __device__ Elu(float alpha = 1.f) : alpha_(alpha) {} // device-callable

-    const float alpha_;
-
-    __host__ __device__ inline void operator()(float& y, const float& x) const final
-    {
-        float casted_alpha = type_convert<float>(alpha_);
-        y                  = x > 0 ? x : casted_alpha * ck::math::expm1(x);
-    }
-
-    __host__ __device__ inline void operator()(double& y, const double& x) const final
-    {
-        double casted_alpha = type_convert<double>(alpha_);
-        y                   = x > 0 ? x : casted_alpha * ck::math::expm1(x);
-    }
-
-    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
-    {
-        int32_t casted_alpha = type_convert<int32_t>(alpha_);
-        y                    = x > 0 ? x : casted_alpha * ck::math::expm1(x);
-    }
-
-    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
-    {
-        int8_t casted_alpha = type_convert<int8_t>(alpha_);
-        y                   = x > 0 ? x : casted_alpha * ck::math::expm1(x);
-    }
-
-    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
-    {
-        half_t casted_alpha = type_convert<half_t>(alpha_);
-        y                   = x > 0 ? x : casted_alpha * ck::math::expm1(x);
-    }
-
-    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
+    template <typename T>
+    __host__ __device__ void operator()(T& y, const T& x) const
    {
-        bhalf_t casted_alpha = type_convert<bhalf_t>(alpha_);
-        y                    = x > 0 ? x : casted_alpha * ck::math::expm1(x);
+        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
+                          is_same<T, int8_t>::value,
+                      "Data type is not supported by this operation!");
+        T casted_alpha = type_convert<T>(alpha_);
+        y              = x > 0 ? x : casted_alpha * math::expm1(x);
    }
+    const float alpha_;
 };

-struct Logistic final : public UnaryOpBase
+struct Logistic
 {
-    __host__ __device__ constexpr Logistic(const Logistic&) = default;
-    __host__ __device__ constexpr Logistic(Logistic&&)      = default;
-    __host__ __device__ ~Logistic()                         = default;
-
-    __host__ __device__ Logistic(float alpha = 1.0f) : alpha_(alpha) {}
-
-    __host__ __device__ float get_alpha() const { return alpha_; }
+    __host__ __device__ Logistic(float alpha = 1.f) : alpha_(alpha) {} // device-callable

-    const float alpha_;
-
-    __host__ __device__ inline void operator()(float& y, const float& x) const final
-    {
-        float casted_alpha  = type_convert<float>(alpha_);
-        constexpr float one = type_convert<float>(1);
-        y                   = casted_alpha / (one + ck::math::exp(-x) * casted_alpha);
-    }
-
-    __host__ __device__ inline void operator()(double& y, const double& x) const final
-    {
-        double casted_alpha  = type_convert<double>(alpha_);
-        constexpr double one = type_convert<double>(1);
-        y                    = casted_alpha / (one + ck::math::exp(-x) * casted_alpha);
-    }
-
-    __host__ __device__ inline void operator()(int32_t& y, const int32_t& x) const final
-    {
-        int32_t casted_alpha  = type_convert<int32_t>(alpha_);
-        constexpr int32_t one = type_convert<int32_t>(1);
-        y                     = casted_alpha / (one + ck::math::exp(-x) * casted_alpha);
-    }
-
-    __host__ __device__ inline void operator()(int8_t& y, const int8_t& x) const final
-    {
-        int8_t casted_alpha  = type_convert<int8_t>(alpha_);
-        constexpr int8_t one = type_convert<int8_t>(1);
-        y                    = casted_alpha / (one + ck::math::exp(-x) * casted_alpha);
-    }
-
-    __host__ __device__ inline void operator()(half_t& y, const half_t& x) const final
-    {
-        half_t casted_alpha  = type_convert<half_t>(alpha_);
-        constexpr half_t one = type_convert<half_t>(1);
-        y                    = casted_alpha / (one + ck::math::exp(-x) * casted_alpha);
-    }
-
-    __host__ __device__ inline void operator()(bhalf_t& y, const bhalf_t& x) const final
+    template <typename T>
+    __host__ __device__ void operator()(T& y, const T& x) const
    {
-        bhalf_t casted_alpha  = type_convert<bhalf_t>(alpha_);
-        constexpr bhalf_t one = type_convert<bhalf_t>(1);
-        y                     = casted_alpha / (one + ck::math::exp(-x) * casted_alpha);
+        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
+                          is_same<T, int8_t>::value,
+                      "Data type is not supported by this operation!");
+        T casted_alpha  = type_convert<T>(alpha_);
+        constexpr T one = type_convert<T>(1);
+        y               = casted_alpha / (one + math::exp(-x) * casted_alpha);
    }
+    const float alpha_;
 };

 struct ConvInvscale
@@ -1470,7 +1293,7 @@ struct ConvScaleRelu
    __host__ __device__ void operator()<f8_t, float>(f8_t& e, const float& c) const
    {
        float x;
-        Relu{}(x, c * scale_in_ * scale_wei_);
+        Relu{}.template operator()<float>(x, c * scale_in_ * scale_wei_);
        e = type_convert<f8_t>(x * scale_out_);
    };

@@ -1487,10 +1310,10 @@ struct FastNumericArrayConverter
 };

 template <>
-struct FastNumericArrayConverter<uint8_t, ck::half_t, 4>
+struct FastNumericArrayConverter<uint8_t, half_t, 4>
 {
    using InputArray  = vector_type<uint8_t, 4>;
-    using OutputArray = vector_type<ck::half_t, 4>;
+    using OutputArray = vector_type<half_t, 4>;

    __device__ static OutputArray convert(InputArray const& Input)
    {
@@ -1520,13 +1343,13 @@ struct FastNumericArrayConverter<uint8_t, ck::half_t, 4>
 };

 template <index_t N>
-struct FastNumericArrayConverter<uint8_t, ck::half_t, N>
+struct FastNumericArrayConverter<uint8_t, half_t, N>
 {
    static constexpr int VEC_WIDTH = 4;
    static_assert(!(N % VEC_WIDTH), "N must be multiple of 4.");

    using InputArray  = vector_type<uint8_t, N>;
-    using OutputArray = vector_type<ck::half_t, N>;
+    using OutputArray = vector_type<half_t, N>;

    __device__ static OutputArray convert(InputArray const& Input)
    {
@@ -1535,7 +1358,7 @@ struct FastNumericArrayConverter<uint8_t, ck::half_t, N>
        OutputArray Output;

        using Vec_InputArray  = vector_type<uint8_t, 4>;
-        using Vec_OutputArray = vector_type<ck::half_t, 4>;
+        using Vec_OutputArray = vector_type<half_t, 4>;

        Vec_OutputArray* half_4_ptr       = reinterpret_cast<Vec_OutputArray*>(&Output);
        Vec_InputArray const* uint8_4_ptr = reinterpret_cast<Vec_InputArray const*>(&Input);
@@ -1551,225 +1374,138 @@ struct FastNumericArrayConverter<uint8_t, ck::half_t, N>

 struct DynamicUnaryOp
 {
-
-    DynamicUnaryOp& operator=(const DynamicUnaryOp& other)
-    {
-        if(this != &other)
-        {
-            unary_op_ptr_  = other.unary_op_ptr_;
-            unary_op_type_ = other.unary_op_type_;
-        }
-        return *this;
-    }
-
    __host__ __device__ DynamicUnaryOp() = delete;

    __host__ __device__ DynamicUnaryOp(const Swish& swish)
+        : unary_op_type_(UnaryOpType::Swish), swish_{swish.beta_}
    {
-        unary_op_type_ = UnaryOpType::Swish;
-        beta           = swish.get_beta();
    }

    __host__ __device__ DynamicUnaryOp(const Swish&& swish)
+        : unary_op_type_(UnaryOpType::Swish), swish_{swish.beta_}
    {
-        unary_op_type_ = UnaryOpType::Swish;
-        beta           = swish.get_beta();
    }

-    __host__ __device__ DynamicUnaryOp(const Sigmoid&) { unary_op_type_ = UnaryOpType::Sigmoid; }
+    __host__ __device__ DynamicUnaryOp(const Sigmoid&) : unary_op_type_(UnaryOpType::Sigmoid) {}

-    __host__ __device__ DynamicUnaryOp(const Sigmoid&&) { unary_op_type_ = UnaryOpType::Sigmoid; }
+    __host__ __device__ DynamicUnaryOp(const Sigmoid&&) : unary_op_type_(UnaryOpType::Sigmoid) {}

    __host__ __device__ DynamicUnaryOp(const PassThrough&)
+        : unary_op_type_(UnaryOpType::PassThrough)
    {
-        unary_op_type_ = UnaryOpType::PassThrough;
    }

    __host__ __device__ DynamicUnaryOp(const PassThrough&&)
+        : unary_op_type_(UnaryOpType::PassThrough)
    {
-        unary_op_type_ = UnaryOpType::PassThrough;
    }

    __host__ __device__ DynamicUnaryOp(const Logistic& logistic)
+        : unary_op_type_(UnaryOpType::Logistic), logistic_{logistic.alpha_}
    {
-        unary_op_type_ = UnaryOpType::Logistic;
-        alpha          = logistic.get_alpha();
    }

    __host__ __device__ DynamicUnaryOp(const Logistic&& logistic)
+        : unary_op_type_(UnaryOpType::Logistic), logistic_{logistic.alpha_}
    {
-        unary_op_type_ = UnaryOpType::Logistic;
-        alpha          = logistic.get_alpha();
    }

-    __host__ __device__ DynamicUnaryOp(const TanH&) { unary_op_type_ = UnaryOpType::TanH; }
+    __host__ __device__ DynamicUnaryOp(const TanH&) : unary_op_type_(UnaryOpType::TanH) {}

-    __host__ __device__ DynamicUnaryOp(const TanH&&) { unary_op_type_ = UnaryOpType::TanH; }
+    __host__ __device__ DynamicUnaryOp(const TanH&&) : unary_op_type_(UnaryOpType::TanH) {}

-    __host__ __device__ DynamicUnaryOp(const Relu&) { unary_op_type_ = UnaryOpType::Relu; }
+    __host__ __device__ DynamicUnaryOp(const Relu&) : unary_op_type_(UnaryOpType::Relu) {}

-    __host__ __device__ DynamicUnaryOp(const Relu&&) { unary_op_type_ = UnaryOpType::Relu; }
+    __host__ __device__ DynamicUnaryOp(const Relu&&) : unary_op_type_(UnaryOpType::Relu) {}

    __host__ __device__ DynamicUnaryOp(const SoftRelu& softrelu)
+        : unary_op_type_(UnaryOpType::SoftRelu), soft_relu_{softrelu.alpha_}
    {
-        unary_op_type_ = UnaryOpType::SoftRelu;
-        alpha          = softrelu.get_alpha();
    }

    __host__ __device__ DynamicUnaryOp(const SoftRelu&& softrelu)
+        : unary_op_type_(UnaryOpType::SoftRelu), soft_relu_{softrelu.alpha_}
    {
-        unary_op_type_ = UnaryOpType::SoftRelu;
-        alpha          = softrelu.get_alpha();
    }

-    __host__ __device__ DynamicUnaryOp(const UnaryAbs&) { unary_op_type_ = UnaryOpType::UnaryAbs; }
+    __host__ __device__ DynamicUnaryOp(const UnaryAbs&) : unary_op_type_(UnaryOpType::UnaryAbs) {}

-    __host__ __device__ DynamicUnaryOp(const UnaryAbs&&) { unary_op_type_ = UnaryOpType::UnaryAbs; }
+    __host__ __device__ DynamicUnaryOp(const UnaryAbs&&) : unary_op_type_(UnaryOpType::UnaryAbs) {}

    __host__ __device__ DynamicUnaryOp(const Power& pow)
+        : unary_op_type_(UnaryOpType::Power), power_(pow.alpha_, pow.beta_, pow.gamma_)
    {
-        unary_op_type_ = UnaryOpType::Power;
-        alpha          = pow.get_alpha();
-        beta           = pow.get_beta();
-        gamma          = pow.get_gamma();
    }

    __host__ __device__ DynamicUnaryOp(const Power&& pow)
+        : unary_op_type_(UnaryOpType::Power), power_(pow.alpha_, pow.beta_, pow.gamma_)
    {
-        unary_op_type_ = UnaryOpType::Power;
-        alpha          = pow.get_alpha();
-        beta           = pow.get_beta();
-        gamma          = pow.get_gamma();
    }

    __host__ __device__ DynamicUnaryOp(const ClippedRelu& clippedrelu)
+        : unary_op_type_(UnaryOpType::ClippedRelu),
+          clipped_relu_{clippedrelu.alpha_, clippedrelu.beta_}
    {
-        unary_op_type_ = UnaryOpType::ClippedRelu;
-        alpha          = clippedrelu.get_alpha();
-        beta           = clippedrelu.get_beta();
    }

    __host__ __device__ DynamicUnaryOp(const ClippedRelu&& clippedrelu)
+        : unary_op_type_(UnaryOpType::ClippedRelu),
+          clipped_relu_{clippedrelu.alpha_, clippedrelu.beta_}
    {
-        unary_op_type_ = UnaryOpType::ClippedRelu;
-        alpha          = clippedrelu.get_alpha();
-        beta           = clippedrelu.get_beta();
    }

    __host__ __device__ DynamicUnaryOp(const LeakyRelu& leakyrelu)
+        : unary_op_type_(UnaryOpType::LeakyRelu), leaky_relu_{leakyrelu.alpha_}
    {
-        unary_op_type_ = UnaryOpType::LeakyRelu;
-        alpha          = leakyrelu.get_alpha();
    }

    __host__ __device__ DynamicUnaryOp(const LeakyRelu&& leakyrelu)
+        : unary_op_type_(UnaryOpType::LeakyRelu), leaky_relu_{leakyrelu.alpha_}
    {
-        unary_op_type_ = UnaryOpType::LeakyRelu;
-        alpha          = leakyrelu.get_alpha();
    }

    __host__ __device__ DynamicUnaryOp(const Elu& elu)
+        : unary_op_type_(UnaryOpType::Elu), elu_{elu.alpha_}
    {
-        unary_op_type_ = UnaryOpType::Elu;
-        alpha          = elu.get_alpha();
    }

    __host__ __device__ DynamicUnaryOp(const Elu&& elu)
+        : unary_op_type_(UnaryOpType::Elu), elu_{elu.alpha_}
    {
-        unary_op_type_ = UnaryOpType::Elu;
-        alpha          = elu.get_alpha();
    }

-    __host__ __device__ DynamicUnaryOp(const DynamicUnaryOp& dynamic_op)
-        : unary_op_type_(dynamic_op.unary_op_type_),
-          unary_op_ptr_(dynamic_op.unary_op_ptr_),
-          alpha(dynamic_op.alpha),
-          beta(dynamic_op.beta),
-          gamma(dynamic_op.gamma)
-    {
-    }
-
-    __host__ __device__ ~DynamicUnaryOp()
-    {
-        switch(unary_op_type_)
-        {
-        case(UnaryOpType::Swish): delete static_cast<Swish*>(unary_op_ptr_); break;
-        case(UnaryOpType::Sigmoid): delete static_cast<Sigmoid*>(unary_op_ptr_); break;
-        case(UnaryOpType::PassThrough): delete static_cast<PassThrough*>(unary_op_ptr_); break;
-        case(UnaryOpType::Logistic): delete static_cast<Logistic*>(unary_op_ptr_); break;
-        case(UnaryOpType::TanH): delete static_cast<TanH*>(unary_op_ptr_); break;
-        case(UnaryOpType::Relu): delete static_cast<Relu*>(unary_op_ptr_); break;
-        case(UnaryOpType::SoftRelu): delete static_cast<SoftRelu*>(unary_op_ptr_); break;
-        case(UnaryOpType::UnaryAbs): delete static_cast<UnaryAbs*>(unary_op_ptr_); break;
-        case(UnaryOpType::Power): delete static_cast<Power*>(unary_op_ptr_); break;
-        case(UnaryOpType::ClippedRelu): delete static_cast<ClippedRelu*>(unary_op_ptr_); break;
-        case(UnaryOpType::LeakyRelu): delete static_cast<LeakyRelu*>(unary_op_ptr_); break;
-        case(UnaryOpType::Elu): delete static_cast<Elu*>(unary_op_ptr_); break;
-
-        default: break;
-        }
-    }
-
-    __device__ void InitUnaryOpPtrOnDevice()
-    {
-        switch(unary_op_type_)
-        {
-        case(UnaryOpType::Swish): unary_op_ptr_ = new Swish(beta); break;
-        case(UnaryOpType::Sigmoid): unary_op_ptr_ = new Sigmoid; break;
-        case(UnaryOpType::PassThrough): unary_op_ptr_ = new PassThrough; break;
-        case(UnaryOpType::Logistic): unary_op_ptr_ = new Logistic(alpha); break;
-        case(UnaryOpType::TanH): unary_op_ptr_ = new TanH; break;
-        case(UnaryOpType::Relu): unary_op_ptr_ = new Relu; break;
-        case(UnaryOpType::SoftRelu): unary_op_ptr_ = new SoftRelu(alpha); break;
-        case(UnaryOpType::UnaryAbs): unary_op_ptr_ = new UnaryAbs; break;
-        case(UnaryOpType::Power): unary_op_ptr_ = new Power(alpha, beta, gamma); break;
-        case(UnaryOpType::ClippedRelu): unary_op_ptr_ = new ClippedRelu(alpha, beta); break;
-        case(UnaryOpType::LeakyRelu): unary_op_ptr_ = new LeakyRelu(alpha); break;
-        case(UnaryOpType::Elu): unary_op_ptr_ = new Elu(alpha); break;
-
-        default: unary_op_ptr_ = nullptr; break;
-        }
-    }
+    __host__ __device__ DynamicUnaryOp(const DynamicUnaryOp& dynamic_op) = default;

-    template <typename Y, typename X>
-    __device__ void operator()(Y& y, const X& x) const
-    {
-        isSupported<X, Y>();
-        unary_op_ptr_->operator()(y, x);
-    }
+    __host__ __device__ ~DynamicUnaryOp() {}

    template <typename Y, typename X>
-    __host__ void operator()(Y& y, const X& x) const
+    __host__ __device__ void operator()(Y& y, const X& x) const
    {
-        isSupported<X, Y>();
        switch(unary_op_type_)
        {
-        case(UnaryOpType::Swish): Swish{}.operator()(y, x); break;
-        case(UnaryOpType::Sigmoid): Sigmoid{}.operator()(y, x); break;
-        case(UnaryOpType::PassThrough): PassThrough{}.operator()(y, x); break;
-        case(UnaryOpType::Logistic): Logistic{}.operator()(y, x); break;
-        case(UnaryOpType::TanH): TanH{}.operator()(y, x); break;
-        case(UnaryOpType::Relu): Relu{}.operator()(y, x); break;
-        case(UnaryOpType::SoftRelu): SoftRelu{}.operator()(y, x); break;
-        case(UnaryOpType::UnaryAbs): UnaryAbs{}.operator()(y, x); break;
-        case(UnaryOpType::Power): Power{}.operator()(y, x); break;
-        case(UnaryOpType::ClippedRelu): ClippedRelu{}.operator()(y, x); break;
-        case(UnaryOpType::LeakyRelu): LeakyRelu{}.operator()(y, x); break;
-        case(UnaryOpType::Elu): Elu{}.operator()(y, x); break;
+        case(UnaryOpType::Swish): swish_(y, x); break;
+        case(UnaryOpType::Sigmoid): sigmoid_(y, x); break;
+        case(UnaryOpType::PassThrough): pass_through_(y, x); break;
+        case(UnaryOpType::Logistic): logistic_(y, x); break;
+        case(UnaryOpType::TanH): tanh_(y, x); break;
+        case(UnaryOpType::Relu): relu_(y, x); break;
+        case(UnaryOpType::SoftRelu): soft_relu_(y, x); break;
+        case(UnaryOpType::UnaryAbs): unary_abs_(y, x); break;
+        case(UnaryOpType::Power): power_(y, x); break;
+        case(UnaryOpType::ClippedRelu): clipped_relu_(y, x); break;
+        case(UnaryOpType::LeakyRelu): leaky_relu_(y, x); break;
+        case(UnaryOpType::Elu): elu_(y, x); break;
        default: break;
        }
    }

-    template <typename X, typename Y>
-    __device__ __host__ constexpr void isSupported() const
+    template <>
+    __host__ __device__ void operator()<bhalf_t, bhalf_t>(bhalf_t& y, const bhalf_t& x) const
    {
-
-        static_assert(std::is_same<X, Y>::value, "X and Y must be of the same type");
-
-        static_assert(is_same<X, float>::value || is_same<X, double>::value ||
-                          is_same<X, bhalf_t>::value || is_same<X, half_t>::value ||
-                          is_same<X, int32_t>::value || is_same<X, int8_t>::value,
-                      "Data type is not supported by this operation!");
+        float y_float;
+        float x_float = type_convert<float>(x);
+        this->operator()(y_float, x_float);
+        y = type_convert<bhalf_t>(y_float);
    }

    private:
@@ -1791,12 +1527,20 @@ struct DynamicUnaryOp

    public:
    UnaryOpType unary_op_type_;
-    UnaryOpBase* unary_op_ptr_ = nullptr;
-    float alpha;
-    float beta;
-    float gamma;
+
+    Swish swish_;
+    Sigmoid sigmoid_;
+    PassThrough pass_through_;
+    Logistic logistic_;
+    TanH tanh_;
+    Relu relu_;
+    SoftRelu soft_relu_;
+    UnaryAbs unary_abs_;
+    Power power_;
+    ClippedRelu clipped_relu_;
+    LeakyRelu leaky_relu_;
+    Elu elu_;
 };
-#pragma clang diagnostic pop

 } // namespace element_wise
 } // namespace tensor_operation

--- a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
+++ b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

 #include "ck/utility/math.hpp"
 #include "ck/utility/number.hpp"
+#include "ck/utility/tuple.hpp"
 #include "ck/tensor_description/tensor_adaptor.hpp"
 #include "ck/tensor_description/multi_index_transform_helper.hpp"
+#ifndef CK_CODE_GEN_RTC
 #include <limits>
 #include <stdlib.h>
+#endif

 namespace ck {

@@ -978,8 +981,7 @@ struct BlockToCTileMap_3DGrid_KSplit
        // Create 3D grid
        const auto M0 = math::integer_divide_ceil(M, MPerBlock);
        const auto N0 = math::integer_divide_ceil(N, NPerBlock);
-
-        return std::make_tuple(N0, M0, k_split);
+        return make_tuple(N0, M0, k_split);
    }

    template <typename TopIdx>
@@ -1103,7 +1105,7 @@ struct BlockToCTileMap_GemmStreamK
            uint32_t dp_for_sk_iters = k_iters_per_tile.get();

            uint32_t best_sk_score =
-                std::numeric_limits<int>::max(); // we need to find the smallest sk iters
+                NumericLimits<int32_t>::Max(); // we need to find the smallest sk iters
            for(uint32_t tentative_sk_blocks = min_sk_tiles; tentative_sk_blocks < max_sk_tiles;
                tentative_sk_blocks++)
            {

--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp
@@ -607,6 +607,7 @@ struct GridwiseBatchedGemmGemm_Xdl_CShuffle
        // with 'group_size' amount of contiguous elements. Having Gemm1KPack greater than A1K1 will
        // cause mismatch in summation index for example c[0:7] = a1[[0:3, 8:11]] * b1[0:7].
        // therefore we may just as well assign Gemm1KPack = group_size
+
        constexpr index_t Gemm1KPack =
            MfmaSelector<FloatAB, MPerXdl, NPerXdl>::selected_mfma.group_size;