Commit b097be17 authored by root

merge changes for upstream/latest update

parents 8a891bbd a49115b9
@@ -557,10 +557,8 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
        float ave_time = 0;

-       using Add =
-           ck::tensor_operation::binary_element_wise::Add<CDataType, CDataType, CDataType>;
-       using Substract = ck::tensor_operation::binary_element_wise::
-           Substract<CDataType, CDataType, CDataType>;
+       using Add      = ck::tensor_operation::element_wise::Add;
+       using Subtract = ck::tensor_operation::element_wise::Subtract;

        using GridwiseBinAdd = GridwiseBinaryElementwise_1D<CDataType,
                                                            CDataType,
                                                            CDataType,
@@ -573,14 +571,14 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
                                                            AScalarPerVector,
                                                            BScalarPerVector,
                                                            CScalarPerVector>;

-       using GridwiseBinSubstract = GridwiseBinaryElementwise_1D<CDataType,
+       using GridwiseBinSubtract = GridwiseBinaryElementwise_1D<CDataType,
                                                                  CDataType,
                                                                  CDataType,
                                                                  CDataType,
                                                                  CGridDesc_M,
                                                                  CGridDesc_M,
                                                                  CGridDesc_M,
-                                                                 Substract,
+                                                                 Subtract,
                                                                  MPerThread,
                                                                  AScalarPerVector,
                                                                  BScalarPerVector,
@@ -593,14 +591,14 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
                                                        CGridDesc_M,
                                                        CGridDesc_M,
                                                        Add>;

-       const auto substract_kernel = kernel_binary_elementwise_1d<GridwiseBinSubstract,
+       const auto subtract_kernel = kernel_binary_elementwise_1d<GridwiseBinSubtract,
                                                                   CDataType,
                                                                   CDataType,
                                                                   CDataType,
                                                                   CGridDesc_M,
                                                                   CGridDesc_M,
                                                                   CGridDesc_M,
-                                                                  Substract>;
+                                                                  Subtract>;

        if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
        {
@@ -653,7 +651,7 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
            // c_real = aux - aux_2
            ave_time += launch_and_time_kernel(stream_config,
-                                              substract_kernel,
+                                              subtract_kernel,
                                               dim3(grid_size),
                                               dim3(BlockSize),
                                               0,
@@ -663,7 +661,7 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
                                               arg.c_grid_desc_m_,
                                               arg.c_grid_desc_m_,
                                               arg.c_grid_desc_m_,
-                                              Substract{});
+                                              Subtract{});

            ave_time +=
                launch_and_time_kernel(stream_config,
@@ -764,7 +762,7 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
            // c_real = aux - aux_2
            ave_time += launch_and_time_kernel(stream_config,
-                                              substract_kernel,
+                                              subtract_kernel,
                                               dim3(grid_size),
                                               dim3(BlockSize),
                                               0,
@@ -774,7 +772,7 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
                                               arg.c_grid_desc_m_,
                                               arg.c_grid_desc_m_,
                                               arg.c_grid_desc_m_,
-                                              Substract{});
+                                              Subtract{});

            ave_time +=
                launch_and_time_kernel(stream_config,
...
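Note (annotation, not part of the diff): the add/subtract kernels above stitch a complex GEMM out of four real-valued GEMMs. With C = (A_re + i A_im)(B_re + i B_im), the parts are

    C_{\mathrm{re}} = A_{\mathrm{re}} B_{\mathrm{re}} - A_{\mathrm{im}} B_{\mathrm{im}}, \qquad
    C_{\mathrm{im}} = A_{\mathrm{re}} B_{\mathrm{im}} + A_{\mathrm{im}} B_{\mathrm{re}}

so once two of the real GEMMs have produced aux = A_re * B_re and aux_2 = A_im * B_im (the names used in the `// c_real = aux - aux_2` comments), the subtract kernel forms the real part and the add kernel forms the imaginary part from the other two products.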
@@ -11,6 +11,7 @@
 #include "tensor_descriptor.hpp"
 #include "tensor_descriptor_helper.hpp"
 #include "gridwise_gemm_xdlops_bwd_weight.hpp"
+#include "gridwise_unary_elementwise_1d.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -432,7 +433,7 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
        using namespace ck;

        const index_t Di = input_spatial_lengths[0];
-       const index_t Hi = input_spatial_lengths[2];
+       const index_t Hi = input_spatial_lengths[1];
        const index_t Wi = input_spatial_lengths[2];
        const index_t Do = output_spatial_lengths[0];
@@ -628,6 +629,57 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
            1);
    }

+   // type convert descs
+   template <typename Desc_M0>
+   static auto PadDescriptor_M0_1d(Desc_M0 desc_m0, index_t gridSize, index_t blockSize)
+   {
+       const auto m0           = desc_m0.GetLength(I0);
+       const index_t loop_step = gridSize * blockSize * 4;
+       const auto pad          = math::integer_least_multiple(m0, loop_step) - m0;
+       const auto desc_m0_pad =
+           transform_tensor_descriptor(desc_m0,
+                                       make_tuple(make_right_pad_transform(m0, pad)),
+                                       make_tuple(Sequence<0>{}),
+                                       make_tuple(Sequence<0>{}));
+       return desc_m0_pad;
+   }
+
+   template <index_t Dim>
+   static auto MakeDescriptor_M0(const std::vector<index_t>& shape,
+                                 const std::vector<index_t>& stride,
+                                 index_t gridSize,
+                                 index_t blockSize)
+   {
+       auto tupleOfShape  = generate_tuple([&](auto I) { return shape[I]; }, Number<Dim>{});
+       auto tupleOfStride = generate_tuple([&](auto I) { return stride[I]; }, Number<Dim>{});
+
+       // nd desc - [s0, s1, s2, ...]
+       const auto desc = make_naive_tensor_descriptor(tupleOfShape, tupleOfStride);
+
+       // merge nd to 1d desc - [s0 * s1 * ...]
+       if constexpr(Dim > 1)
+       {
+           const auto desc_m0 = transform_tensor_descriptor(
+               desc,
+               make_tuple(make_merge_transform(tupleOfShape)),
+               make_tuple(generate_sequence_v2([&](auto I) { return I; }, Number<Dim>{})),
+               make_tuple(Sequence<0>{}));
+
+           return PadDescriptor_M0_1d(desc_m0, gridSize, blockSize);
+       }
+       else
+           return PadDescriptor_M0_1d(desc, gridSize, blockSize);
+   }
+
+   using TypeConvertFp32ToBf16Functor =
+       ck::tensor_operation::element_wise::UnaryTypeConvert<ck::bhalf_t, float>;
+
+   using GridDesc_M0 = decltype(MakeDescriptor_M0<1>({1}, {1}, 1, 1));
+
+   using GridwiseUEltwise = GridwiseUnaryElementwise_1D<AccDataType,
+                                                        InDataType,
+                                                        GridDesc_M0,
+                                                        TypeConvertFp32ToBf16Functor,
+                                                        4>;
+
    using ABCGridDescs = decltype(GetABCGridDesc<NumDimSpatial>());

    using AGridDesc_K0_M_K1 = remove_cvref_t<decltype(ABCGridDescs{}[I0])>;
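Note (annotation): a quick check of the padding arithmetic in PadDescriptor_M0_1d above, as a standalone sketch. The sizes are made up; integer_least_multiple(m0, step) rounds m0 up to the next multiple of step, and the factor 4 appears to match the trailing 4 in the GridwiseUEltwise instantiation.

#include <cstdio>

int main()
{
    // illustrative numbers, not taken from the diff
    const int m0        = 1000;              // merged 1-D tensor length
    const int gridSize  = 2;
    const int blockSize = 256;
    const int loop_step = gridSize * blockSize * 4;                     // 2048
    const int padded    = (m0 + loop_step - 1) / loop_step * loop_step; // integer_least_multiple -> 2048
    const int pad       = padded - m0;                                  // 1048
    std::printf("loop_step=%d pad=%d padded=%d\n", loop_step, pad, padded);
    return 0;
}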
@@ -733,6 +785,55 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
                                                       true,
                                                       true>;

+   using GridwiseGemmAtomicAddFloatBf16Splitk = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight<
+       BlockSize,
+       ADataType, // TODO: distinguish A/B datatype
+       AccDataType,
+       AccDataType,
+       InMemoryDataOperationEnum::AtomicAdd,
+       AGridDesc_K0_M_K1,
+       BGridDesc_K0_N_K1,
+       CGridDesc_M_N,
+       AElementwiseOperation,
+       BElementwiseOperation,
+       CElementwiseOperation,
+       MPerBlock,
+       NPerBlock,
+       K0PerBlock,
+       MPerXdl,
+       NPerXdl,
+       K1,
+       MXdlPerWave,
+       NXdlPerWave,
+       ABlockTransferThreadClusterLengths_K0_M_K1,
+       ABlockTransferThreadClusterArrangeOrder,
+       ABlockTransferSrcAccessOrder,
+       ABlockTransferSrcVectorDim,
+       ABlockTransferSrcScalarPerVector,
+       ABlockTransferDstScalarPerVector_K1,
+       false, // AThreadTransferSrcResetCoordinateAfterRun,
+       ABlockLdsAddExtraM,
+       ABlockLdsM1PerBlock,
+       ABlockLdsM0PerBlock,
+       ABlockLdsM1Padding,
+       BBlockTransferThreadClusterLengths_K0_N_K1,
+       BBlockTransferThreadClusterArrangeOrder,
+       BBlockTransferSrcAccessOrder,
+       BBlockTransferSrcVectorDim,
+       BBlockTransferSrcScalarPerVector,
+       BBlockTransferDstScalarPerVector_K1,
+       false, // BThreadTransferSrcResetCoordinateAfterRun,
+       BBlockLdsAddExtraN,
+       BBlockLdsN1PerBlock,
+       BBlockLdsN0PerBlock,
+       BBlockLdsN1Padding,
+       CShuffleMXdlPerWavePerShuffle,
+       CShuffleNXdlPerWavePerShuffle,
+       CBlockTransferScalarPerVector_NWaveNPerXdl,
+       CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+       true,
+       true>;
+
    // Argument
    using CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
        decltype(GridwiseGemm::MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(CGridDesc_M_N{}));
@@ -881,19 +982,18 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
            const auto K0 = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I1);

-           const bool has_main_k0_block_loop = GridwiseGemm::CalculateHasMainK0BlockLoop(K0);
-
            float ave_time = 0;

-           const auto Run = [&](const auto& kernel) {
+           const bool has_main_k0_block_loop = GridwiseGemm::CalculateHasMainK0BlockLoop(K0);
+
+           const auto run_conv = [&](const auto& kernel) {
                hipGetErrorString(hipMemset(
                    arg.p_c_grid_,
                    0,
                    arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() *
                        sizeof(CDataType)));

-               ave_time =
-                   launch_and_time_kernel(stream_config,
+               return launch_and_time_kernel(stream_config,
                                           kernel,
                                           dim3(grid_size),
                                           dim3(BlockSize),
@@ -910,47 +1010,76 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
                                           arg.block_2_ctile_map_);
            };

-           if constexpr(std::is_same<InDataType, ck::bhalf_t>::value)
-           {
-               if(has_main_k0_block_loop)
-               {
-                   const auto kernel = kernel_gemm_xdlops_bwd_weight<
-                       GridwiseGemm,
-                       ADataType, // TODO: distiguish A/B datatype
-                       CDataType,
-                       remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
-                       remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
-                       remove_reference_t<DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
-                       OutElementwiseOperation,
-                       InElementwiseOperation,
-                       WeiElementwiseOperation,
-                       remove_reference_t<DeviceOp::Block2CTileMap>,
-                       true>;
-
-                   Run(kernel);
-               }
-               else
-               {
-                   const auto kernel = kernel_gemm_xdlops_bwd_weight<
-                       GridwiseGemm,
-                       ADataType, // TODO: distiguish A/B datatype
-                       CDataType,
-                       remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
-                       remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
-                       remove_reference_t<DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
-                       OutElementwiseOperation,
-                       InElementwiseOperation,
-                       WeiElementwiseOperation,
-                       remove_reference_t<DeviceOp::Block2CTileMap>,
-                       false>;
-
-                   Run(kernel);
-               }
-           }
-           else
+           // run kernel for bf16 with splitk
+           const auto run_bf16_splitk = [&](const auto& kernel) {
+               hipGetErrorString(hipMemset(
+                   arg.p_workspace_,
+                   0,
+                   arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() *
+                       sizeof(AccDataType)));
+
+               return launch_and_time_kernel(stream_config,
+                                             kernel,
+                                             dim3(grid_size),
+                                             dim3(BlockSize),
+                                             0,
+                                             arg.p_a_grid_,
+                                             arg.p_b_grid_,
+                                             static_cast<AccDataType*>(arg.p_workspace_),
+                                             arg.a_grid_desc_kbatch_k0_m_k1_,
+                                             arg.b_grid_desc_kbatch_k0_n_k1_,
+                                             arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                             arg.a_element_op_,
+                                             arg.b_element_op_,
+                                             arg.c_element_op_,
+                                             arg.block_2_ctile_map_);
+           };
+
+           // kernel for type conversion
+           std::vector<std::size_t> filter_dims{static_cast<std::size_t>(arg.Conv_K_),
+                                                static_cast<std::size_t>(arg.Conv_C_)};
+
+           filter_dims.insert(std::end(filter_dims),
+                              std::begin(arg.filter_spatial_lengths_),
+                              std::end(arg.filter_spatial_lengths_));
+
+           int tensor_size =
+               std::accumulate(filter_dims.begin(), filter_dims.end(), 1, std::multiplies<int>{});
+
+           const index_t type_convert_grid_size = GridwiseUEltwise::CalculateGridSize(tensor_size);
+
+           GridDesc_M0 a_grid_desc_m0_ =
+               MakeDescriptor_M0<1>({tensor_size}, {1}, type_convert_grid_size, 256);
+           GridDesc_M0 b_grid_desc_m0_ =
+               MakeDescriptor_M0<1>({tensor_size}, {1}, type_convert_grid_size, 256);
+
+           if(!GridwiseUEltwise::CheckValidity(a_grid_desc_m0_, b_grid_desc_m0_))
            {
-               if(has_main_k0_block_loop)
-               {
+               throw std::runtime_error("wrong! GridwiseUnaryElementwise_1D has invalid setting");
+           }
+
+           // run kernel for type conversion
+           void* p_c_grid_tmp_            = static_cast<void*>(arg.p_c_grid_);
+           InDataType* p_c_grid_tmp_bf16_ = static_cast<InDataType*>(p_c_grid_tmp_);
+
+           const auto run_type_convert = [&](const auto& kernel) {
+               float elapsed_time =
+                   launch_and_time_kernel(stream_config,
+                                          kernel,
+                                          dim3(type_convert_grid_size),
+                                          dim3(256),
+                                          0,
+                                          static_cast<AccDataType*>(arg.p_workspace_),
+                                          p_c_grid_tmp_bf16_,
+                                          a_grid_desc_m0_,
+                                          b_grid_desc_m0_,
+                                          TypeConvertFp32ToBf16Functor{});
+               return elapsed_time;
+           };
+
+           if constexpr(std::is_same<InDataType, ck::bhalf_t>::value)
+           {
+               auto launch_kernel = [&](auto has_main_k_block_loop) {
+                   constexpr bool has_main_loop = has_main_k_block_loop.value;
+
                    if(kbatch == 1)
                    {
                        const auto kernel = kernel_gemm_xdlops_bwd_weight<
@@ -965,16 +1094,23 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
                            InElementwiseOperation,
                            WeiElementwiseOperation,
                            remove_reference_t<DeviceOp::Block2CTileMap>,
-                           true>;
+                           has_main_loop>;

-                       Run(kernel);
+                       return run_conv(kernel);
                    }
                    else
                    {
-                       const auto kernel = kernel_gemm_xdlops_bwd_weight<
-                           GridwiseGemmAtomicAdd,
+                       const auto kernel_type_convert =
+                           kernel_unary_elementwise_1d<GridwiseUEltwise,
+                                                       AccDataType,
+                                                       InDataType,
+                                                       GridDesc_M0,
+                                                       TypeConvertFp32ToBf16Functor>;
+
+                       const auto kernel_conv = kernel_gemm_xdlops_bwd_weight<
+                           GridwiseGemmAtomicAddFloatBf16Splitk,
                            ADataType, // TODO: distiguish A/B datatype
-                           CDataType,
+                           AccDataType,
                            remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
                            remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
                            remove_reference_t<
@@ -983,13 +1119,28 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
                            InElementwiseOperation,
                            WeiElementwiseOperation,
                            remove_reference_t<DeviceOp::Block2CTileMap>,
-                           true>;
+                           has_main_loop>;

-                       Run(kernel);
+                       float elapsed_time = 0;
+                       elapsed_time += run_bf16_splitk(kernel_conv);
+                       elapsed_time += run_type_convert(kernel_type_convert);
+                       return elapsed_time;
+                   }
+               };
+
+               if(has_main_k0_block_loop)
+               {
+                   ave_time = launch_kernel(integral_constant<bool, true>{});
+               }
+               else
+               {
+                   ave_time = launch_kernel(integral_constant<bool, false>{});
                }
            }
            else
            {
+               auto launch_kernel = [&](auto has_main_k_block_loop) {
+                   constexpr bool has_main_loop = has_main_k_block_loop.value;
+
                if(kbatch == 1)
                {
                    const auto kernel = kernel_gemm_xdlops_bwd_weight<
@@ -1004,9 +1155,9 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
                    InElementwiseOperation,
                    WeiElementwiseOperation,
                    remove_reference_t<DeviceOp::Block2CTileMap>,
-                   false>;
+                   has_main_loop>;

-               Run(kernel);
+               return run_conv(kernel);
                }
                else
                {
@@ -1022,10 +1173,18 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
                    InElementwiseOperation,
                    WeiElementwiseOperation,
                    remove_reference_t<DeviceOp::Block2CTileMap>,
-                   false>;
+                   has_main_loop>;

-               Run(kernel);
+               return run_conv(kernel);
                }
+               };
+
+               if(has_main_k0_block_loop)
+               {
+                   ave_time = launch_kernel(integral_constant<bool, true>{});
+               }
+               else
+               {
+                   ave_time = launch_kernel(integral_constant<bool, false>{});
+               }
            }
        }
...
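Note (annotation): the new bf16 split-K path never writes bf16 directly from the GEMM; the added GridwiseGemmAtomicAddFloatBf16Splitk instance emits float with InMemoryDataOperationEnum::AtomicAdd into the fp32 workspace, and only afterwards does the unary elementwise kernel convert fp32 to bf16 into the weight tensor. The three-step flow in miniature, as plain host-side C++ (illustrative only; the real work happens in the two kernels launched by run_bf16_splitk and run_type_convert above):

#include <cstdint>
#include <cstring>
#include <vector>

using bf16_raw = std::uint16_t; // bf16 carried as raw bits to keep the sketch short

static bf16_raw float_to_bf16(float x)
{
    std::uint32_t bits;
    std::memcpy(&bits, &x, sizeof(bits));
    return static_cast<bf16_raw>(bits >> 16); // truncation rounding, enough for the sketch
}

void splitk_accumulate_then_convert(const std::vector<std::vector<float>>& partials,
                                    std::vector<bf16_raw>& weights)
{
    std::vector<float> workspace(weights.size(), 0.0f); // step 1: the hipMemset on p_workspace_
    for(const auto& slice : partials)                   // step 2: one AtomicAdd GEMM per K batch
        for(std::size_t i = 0; i < workspace.size(); ++i)
            workspace[i] += slice[i];
    for(std::size_t i = 0; i < weights.size(); ++i)     // step 3: the unary fp32 -> bf16 pass
        weights[i] = float_to_bf16(workspace[i]);
}

int main()
{
    std::vector<std::vector<float>> partials = {{1.0f, 2.0f}, {0.5f, 0.25f}};
    std::vector<bf16_raw> weights(2);
    splitk_accumulate_then_convert(partials, weights);
    return 0;
}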
#pragma once
#include <array>
#include "device_base.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
// input : A[M, K], B[K, N],
// input : D0[M, N], D1[M, N], ...
// output : E[M, N]
// C = a_op(A) * b_op(B)
// E = cde_op(C, D0, D1, ...)
template <ck::index_t NumDTensor,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CDEElementwiseOperation>
struct DeviceGemmMultipleD : public BaseOperator
{
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_a,
const void* p_b,
std::array<const void*, NumDTensor> p_ds,
void* p_e,
ck::index_t M,
ck::index_t N,
ck::index_t K,
ck::index_t StrideA,
ck::index_t StrideB,
std::array<ck::index_t, NumDTensor> StrideDs,
ck::index_t StrideE,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
template <ck::index_t NumDTensor,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation>
using DeviceGemmMultipleDPtr = std::unique_ptr<DeviceGemmMultipleD<NumDTensor,
AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation>>;
} // namespace device
} // namespace tensor_operation
} // namespace ck
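Note (annotation): to pin down the contract stated in the header's comment block (C = a_op(A) * b_op(B); E = cde_op(C, D0, D1, ...)), here is a reference computation for NumDTensor == 1 in plain C++. This is a sketch; the data and the Add-style cde_op are illustrative, and no CK types are involved:

#include <cstdio>
#include <vector>

int main()
{
    const int M = 2, N = 2, K = 2;
    std::vector<float> A  = {1, 2, 3, 4}; // M x K, row-major
    std::vector<float> B  = {5, 6, 7, 8}; // K x N, row-major
    std::vector<float> D0 = {1, 1, 1, 1}; // M x N, e.g. a bias tensor
    std::vector<float> E(M * N);

    auto cde_op = [](float c, float d0) { return c + d0; }; // an "Add" style CDE op

    for(int m = 0; m < M; ++m)
        for(int n = 0; n < N; ++n)
        {
            float c = 0; // C[m][n] = sum_k A[m][k] * B[k][n]
            for(int k = 0; k < K; ++k)
                c += A[m * K + k] * B[k * N + n];
            E[m * N + n] = cde_op(c, D0[m * N + n]); // E = cde_op(C, D0)
        }

    std::printf("E = [%g %g; %g %g]\n", E[0], E[1], E[2], E[3]); // 20 23; 44 51
    return 0;
}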
@@ -6,19 +6,18 @@ namespace ck {
 namespace tensor_operation {
 namespace device {

-template <typename DPtrsGlobal,
-          typename AElementwiseOperation,
+template <typename AElementwiseOperation,
           typename BElementwiseOperation,
           typename CElementwiseOperation,
           typename DxsInElementwiseOperation,
-          typename DxsAccElementwiseOperation>
+          typename DxsReduceAccElementwiseOperation>
 struct DeviceGemmReduce : public BaseOperator
 {
     virtual std::unique_ptr<BaseArgument>
     MakeArgumentPointer(const void* p_a,
                         const void* p_b,
                         void* p_c,
-                        DPtrsGlobal p_dxs,
+                        void* p_dxs,
                         ck::index_t M,
                         ck::index_t N,
                         ck::index_t K,
@@ -29,24 +28,69 @@ struct DeviceGemmReduce : public BaseOperator
                         BElementwiseOperation b_element_op,
                         CElementwiseOperation c_element_op,
                         DxsInElementwiseOperation dxs_in_element_op,
-                        DxsAccElementwiseOperation dxs_out_element_op,
+                        DxsReduceAccElementwiseOperation dxs_out_element_op,
                         ck::index_t BatchCount = 1) = 0;

     virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
 };

-template <typename DPtrsGlobal,
-          typename AElementwiseOperation,
+template <typename AElementwiseOperation,
           typename BElementwiseOperation,
           typename CElementwiseOperation,
           typename DxsInElementwiseOperation,
-          typename DxsAccElementwiseOperation>
-using DeviceGemmReducePtr = std::unique_ptr<DeviceGemmReduce<DPtrsGlobal,
-                                                             AElementwiseOperation,
+          typename DxsReduceAccElementwiseOperation>
+using DeviceGemmReducePtr = std::unique_ptr<DeviceGemmReduce<AElementwiseOperation,
                                                              BElementwiseOperation,
                                                              CElementwiseOperation,
                                                              DxsInElementwiseOperation,
-                                                             DxsAccElementwiseOperation>>;
+                                                             DxsReduceAccElementwiseOperation>>;
+
+template <typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation,
+          typename C1ElementwiseOperation,
+          typename DxsInElementwiseOperation,
+          typename DxsReduceAccElementwiseOperation>
+struct DeviceGemmBiasAddReduce : public BaseOperator
+{
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_a,
+                        const void* p_b,
+                        void* p_c,
+                        const void* p_c0,
+                        const void* p_c1,
+                        void* p_dxs,
+                        ck::index_t M,
+                        ck::index_t N,
+                        ck::index_t K,
+                        ck::index_t StrideA,
+                        ck::index_t StrideB,
+                        ck::index_t StrideC,
+                        ck::index_t StrideC1,
+                        AElementwiseOperation a_element_op,
+                        BElementwiseOperation b_element_op,
+                        CElementwiseOperation c_element_op,
+                        C1ElementwiseOperation c1_element_op,
+                        DxsInElementwiseOperation dxs_in_element_op,
+                        DxsReduceAccElementwiseOperation dxs_out_element_op,
+                        ck::index_t BatchCount = 1) = 0;
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+
+template <typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation,
+          typename C1ElementwiseOperation,
+          typename DxsInElementwiseOperation,
+          typename DxsReduceAccElementwiseOperation>
+using DeviceGemmBiasAddReducePtr =
+    std::unique_ptr<DeviceGemmBiasAddReduce<AElementwiseOperation,
+                                            BElementwiseOperation,
+                                            CElementwiseOperation,
+                                            C1ElementwiseOperation,
+                                            DxsInElementwiseOperation,
+                                            DxsReduceAccElementwiseOperation>>;

 } // namespace device
 } // namespace tensor_operation
...
@@ -32,7 +32,7 @@ template <typename ALayout,
          typename CElementwiseOperation,
          typename DxsReduceOperation,
          typename DxsInElementwiseOperation,
-         typename DxsAccElementwiseOperation,
+         typename DxsReduceAccElementwiseOperation,
          typename DGlobalMemoryDataOperation,
          GemmSpecialization GemmSpec,
          index_t NumGemmKPrefetchStage,
@@ -68,12 +68,11 @@ template <typename ALayout,
          index_t CReduceThreadLds2VGprCopySrcDstScalarPerVector_NPerBlock,
          index_t CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock,
          LoopScheduler LoopSched = make_default_loop_scheduler()>
-struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<DPtrsGlobal,
-                                                               AElementwiseOperation,
+struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwiseOperation,
                                                                BElementwiseOperation,
                                                                CElementwiseOperation,
                                                                DxsInElementwiseOperation,
-                                                               DxsAccElementwiseOperation>
+                                                               DxsReduceAccElementwiseOperation>
 {
    using DeviceOp = DeviceGemmReduce_Xdl_CShuffle;
@@ -389,7 +388,7 @@ struct DeviceGemmReduce_Xdl_CShuffle
                CElementwiseOperation,
                DxsReduceOperation,
                DxsInElementwiseOperation,
-               DxsAccElementwiseOperation,
+               DxsReduceAccElementwiseOperation,
                InMemoryDataOperationEnum::Set,
                DGlobalMemoryDataOperation,
                AGridDesc_AK0_M_AK1,
@@ -449,7 +448,7 @@ struct DeviceGemmReduce_Xdl_CShuffle
                 BElementwiseOperation b_element_op,
                 CElementwiseOperation c_element_op,
                 DxsInElementwiseOperation dxs_in_element_op,
-                DxsAccElementwiseOperation dxs_out_element_op)
+                DxsReduceAccElementwiseOperation dxs_out_element_op)
            : p_a_grid_{p_a_grid},
              p_b_grid_{p_b_grid},
              p_c_grid_{p_c_grid},
@@ -498,7 +497,7 @@ struct DeviceGemmReduce_Xdl_CShuffle
        BElementwiseOperation b_element_op_;
        CElementwiseOperation c_element_op_;
        DxsInElementwiseOperation dxs_in_element_op_;
-       DxsAccElementwiseOperation dxs_out_element_op_;
+       DxsReduceAccElementwiseOperation dxs_out_element_op_;
    };

    // Invoker
@@ -554,7 +553,7 @@ struct DeviceGemmReduce_Xdl_CShuffle
                BElementwiseOperation,
                CElementwiseOperation,
                DxsInElementwiseOperation,
-               DxsAccElementwiseOperation,
+               DxsReduceAccElementwiseOperation,
                DeviceOp::AGridDesc_AK0_M_AK1,
                DeviceOp::BGridDesc_BK0_N_BK1,
                typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
@@ -594,7 +593,7 @@ struct DeviceGemmReduce_Xdl_CShuffle
                BElementwiseOperation,
                CElementwiseOperation,
                DxsInElementwiseOperation,
-               DxsAccElementwiseOperation,
+               DxsReduceAccElementwiseOperation,
                DeviceOp::AGridDesc_AK0_M_AK1,
                DeviceOp::BGridDesc_BK0_N_BK1,
                typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
@@ -669,7 +668,7 @@ struct DeviceGemmReduce_Xdl_CShuffle
                BElementwiseOperation b_element_op,
                CElementwiseOperation c_element_op,
                DxsInElementwiseOperation dxs_in_element_op,
-               DxsAccElementwiseOperation dxs_out_element_op)
+               DxsReduceAccElementwiseOperation dxs_out_element_op)
    {
        return Argument{p_a,
                        p_b,
@@ -691,10 +690,11 @@ struct DeviceGemmReduce_Xdl_CShuffle
    static auto MakeInvoker() { return Invoker{}; }

    // polymorphic
-   std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
+   std::unique_ptr<BaseArgument>
+   MakeArgumentPointer(const void* p_a,
                        const void* p_b,
                        void* p_c,
-                       DPtrsGlobal p_dxs,
+                       void* p_dxs,
                        index_t MRaw,
                        index_t NRaw,
                        index_t KRaw,
@@ -705,13 +705,14 @@ struct DeviceGemmReduce_Xdl_CShuffle
                        BElementwiseOperation b_element_op,
                        CElementwiseOperation c_element_op,
                        DxsInElementwiseOperation dxs_in_element_op,
-                       DxsAccElementwiseOperation dxs_out_element_op,
+                       DxsReduceAccElementwiseOperation dxs_out_element_op,
                        index_t /* KBatch */ = 1) override
    {
+       DPtrsGlobal dxs_tuple = *(static_cast<DPtrsGlobal*>(p_dxs));
+
        return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
                                          static_cast<const BDataType*>(p_b),
                                          static_cast<CDataType*>(p_c),
-                                         p_dxs,
+                                         dxs_tuple,
                                          MRaw,
                                          NRaw,
                                          KRaw,
...
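Note (annotation): the void* p_dxs change above type-erases the tuple of reduction output pointers at the virtual interface, so the abstract base no longer needs DPtrsGlobal as a template parameter; the derived op recovers the concrete tuple by a cast, as the new dxs_tuple line shows. The pattern in miniature (plain C++; DPtrs and the names are illustrative, not CK types):

#include <cstdio>
#include <tuple>

// Stand-in for DPtrsGlobal: a concrete tuple of device pointers known only
// to the derived op, passed through the base-class API as an opaque void*.
using DPtrs = std::tuple<float*, float*>;

void make_argument(void* p_dxs) // base-class style signature: type-erased
{
    DPtrs dxs_tuple = *static_cast<DPtrs*>(p_dxs); // derived op recovers the real type
    std::printf("d0=%p d1=%p\n",
                static_cast<void*>(std::get<0>(dxs_tuple)),
                static_cast<void*>(std::get<1>(dxs_tuple)));
}

int main()
{
    float d0 = 0, d1 = 0;
    DPtrs dxs{&d0, &d1};
    make_argument(&dxs); // caller keeps the tuple alive across the call
    return 0;
}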
@@ -362,7 +362,7 @@ struct DeviceGroupedGemmXdl
        {
            grid_size_ = 0;

-           gemm_descs_args_workspace_ = nullptr;
+           p_workspace_ = nullptr;

            group_count_ = ck::type_convert<ck::index_t>(gemm_shapes.size());
@@ -437,8 +437,6 @@ struct DeviceGroupedGemmXdl
        std::vector<GemmDescKernelArg> gemm_desc_kernel_arg_;

-       void* gemm_descs_args_workspace_;
-
        index_t grid_size_;
    };
@@ -488,7 +486,7 @@ struct DeviceGroupedGemmXdl
            }

            hipGetErrorString(
-               hipMemcpy(arg.gemm_descs_args_workspace_,
+               hipMemcpy(arg.p_workspace_,
                          arg.gemm_desc_kernel_arg_.data(),
                          arg.gemm_desc_kernel_arg_.size() * sizeof(GemmDescKernelArg),
                          hipMemcpyHostToDevice));
@@ -507,13 +505,13 @@ struct DeviceGroupedGemmXdl
                    CElementwiseOperation,
                    true>;

-               ave_time = launch_and_time_kernel(
-                   stream_config,
-                   kernel,
-                   dim3(arg.grid_size_),
-                   dim3(BlockSize),
-                   0,
-                   cast_pointer_to_constant_address_space(arg.gemm_descs_args_workspace_),
+               ave_time =
+                   launch_and_time_kernel(stream_config,
+                                          kernel,
+                                          dim3(arg.grid_size_),
+                                          dim3(BlockSize),
+                                          0,
+                                          cast_pointer_to_constant_address_space(arg.p_workspace_),
                                           arg.gemm_desc_kernel_arg_.size(),
                                           arg.a_element_op_,
                                           arg.b_element_op_,
@@ -531,13 +529,13 @@ struct DeviceGroupedGemmXdl
                    CElementwiseOperation,
                    false>;

-               ave_time = launch_and_time_kernel(
-                   stream_config,
-                   kernel,
-                   dim3(arg.grid_size_),
-                   dim3(BlockSize),
-                   0,
-                   cast_pointer_to_constant_address_space(arg.gemm_descs_args_workspace_),
+               ave_time =
+                   launch_and_time_kernel(stream_config,
+                                          kernel,
+                                          dim3(arg.grid_size_),
+                                          dim3(BlockSize),
+                                          0,
+                                          cast_pointer_to_constant_address_space(arg.p_workspace_),
                                           arg.gemm_desc_kernel_arg_.size(),
                                           arg.a_element_op_,
                                           arg.b_element_op_,
@@ -635,11 +633,6 @@ struct DeviceGroupedGemmXdl
    {
        return dynamic_cast<const Argument*>(p_arg)->group_count_ * sizeof(GemmDescKernelArg);
    }
-
-   void SetWorkSpacePointer(BaseArgument* p_arg, void* workspace_ptr) const override
-   {
-       dynamic_cast<Argument*>(p_arg)->gemm_descs_args_workspace_ = workspace_ptr;
-   }
 };

 } // namespace device
...
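Note (annotation): with gemm_descs_args_workspace_ and the local SetWorkSpacePointer override gone, the grouped GEMM presumably relies on a p_workspace_ member and setter provided by the common base classes (hedged: the base-class side is not shown in this diff). The handshake the Run path depends on, modeled standalone:

#include <cstdio>
#include <cstdlib>

// Miniature model of the workspace handshake this change moves into the base
// classes: the argument owns an opaque p_workspace_ slot, the operator reports
// the required size, and the caller allocates and installs the buffer.
struct BaseArgument
{
    void* p_workspace_ = nullptr;
};

struct Operator
{
    int group_count_ = 4;
    std::size_t GetWorkSpaceSize(const BaseArgument*) const
    {
        return group_count_ * 64; // stands in for group_count_ * sizeof(GemmDescKernelArg)
    }
    void SetWorkSpacePointer(BaseArgument* arg, void* ws) const { arg->p_workspace_ = ws; }
};

int main()
{
    Operator op;
    BaseArgument arg;
    void* ws = std::malloc(op.GetWorkSpaceSize(&arg)); // hipMalloc on the device in real code
    op.SetWorkSpacePointer(&arg, ws);
    std::printf("workspace installed: %p\n", arg.p_workspace_);
    std::free(ws);
    return 0;
}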
@@ -35,14 +35,13 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
    using IndexDataType = int32_t;

-   using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;
+   using ReduceOperation = typename reduce_binary_operator<ReduceOpId>::opType;

    using InElementwiseOperation =
-       typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation;
+       typename reduce_unary_operator<ReduceOpId, true, true>::InElementwiseOperation;

    using AccElementwiseOperation =
-       typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::
-           AccElementwiseOperation;
+       typename reduce_unary_operator<ReduceOpId, true, true>::AccElementwiseOperation;

    static constexpr index_t InSrcOutDstVectorDim =
        0; // for NHWC, the dim C is the vector Dim for both input and output in memory, which is
@@ -178,13 +177,10 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
            invariant_lowest_length_ = C;
            reduce_lowest_length_    = window_spatial_lengths[1];

-           // TODO: is this correct?
-           if constexpr(ReduceOpId == ck::ReduceTensorOp::AVG)
-           {
-               ck::index_t divider = window_spatial_lengths[0] * window_spatial_lengths[1];
-
-               in_element_op_  = InElementwiseOperation{divider};
-               acc_element_op_ = AccElementwiseOperation{divider};
-           }
+           int32_t reduceLength = window_spatial_lengths[0] * window_spatial_lengths[1];
+
+           std::tie(in_element_op_, acc_element_op_) =
+               reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(reduceLength);
        }

        const InDataType* p_in_dev_;
...
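Note (annotation): for AVG pooling, GetElementwiseOperator gives back, in effect, a pass-through input op and a divide-by-window-size accumulation op. A standalone sketch with worked numbers (the functors are modeled on the PassThrough/UnaryDivide definitions appearing later in this commit; the window size is illustrative):

#include <cstdio>

// Miniature AVG-pool elementwise pair: the input op passes values through and
// the accumulation op divides the window sum by the window size, here 2 x 3 = 6.
struct PassThrough
{
    void operator()(float& y, const float& x) const { y = x; }
};

struct UnaryDivide
{
    int divider_;
    void operator()(float& y, const float& x) const { y = x / divider_; }
};

int main()
{
    const int reduceLength = 2 * 3; // window_spatial_lengths[0] * window_spatial_lengths[1]
    UnaryDivide acc_op{reduceLength};
    float window_sum = 27.0f, avg = 0.0f;
    acc_op(avg, window_sum);
    std::printf("avg = %g\n", avg); // 4.5
    return 0;
}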
@@ -61,12 +61,9 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
    static constexpr bool use_multiblock =
        (OutMemoryDataOperation == InMemoryDataOperationEnum::AtomicAdd);

-   static constexpr bool out_type_compatible_with_atomic_op =
-       std::is_same<OutDataType, float>::value || std::is_same<OutDataType, double>::value;
-
-   static_assert(
-       !use_multiblock || (use_multiblock && out_type_compatible_with_atomic_op),
-       "The OutDataType must support the atomic operation for using MultiBlock reduction");
+   static_assert(ck::reduce::InMemoryDataOperatonSupportedOnDataType<OutMemoryDataOperation,
+                                                                     OutDataType>::value,
+                 "The OutDataType must support the specified OutMemoryDataOperation!");

    static_assert(!use_multiblock || (use_multiblock && !OutputIndex),
                  "MultiBlock reduction can only be used when outputing index is not required");
@@ -349,7 +346,7 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
            if constexpr(use_multiblock)
            {
                const auto identityVal =
-                   ck::reduce::GetIdentityValueueForInMemoryDataOperation<OutDataType>(
+                   ck::reduce::GetIdentityValueForInMemoryDataOperation<OutDataType>(
                        OutMemoryDataOperation);

                const auto kernel_pre =
@@ -393,10 +390,8 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
        };
    };

-   bool IsSupportedArgument(const BaseArgument* p_arg) override
+   static bool IsSupportedArgument(const Argument* pArg)
    {
-       const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
-
        if constexpr(use_multiblock)
        {
            if(static_cast<float>(pArg->beta_) != 0.0f)
@@ -445,11 +440,16 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
        else
        {
            // cases with very small reduce_total_length should be handled by ThreadWise kernel
-           if(pArg->reduce_total_length / KThreadSliceSize < 2)
-               return (false);
+           // if(pArg->reduce_total_length / KThreadSliceSize < 2)
+           //     return (false);
        };

        return (true);
+   }
+
+   bool IsSupportedArgument(const BaseArgument* p_arg) override
+   {
+       return IsSupportedArgument(dynamic_cast<const Argument*>(p_arg));
    };

    std::unique_ptr<BaseArgument>
@@ -492,7 +492,7 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
        auto str = std::stringstream();

        // clang-format off
-       str << "DeviceReduceMultiBlockAtomicAdd<" << BlockSize << ",";
+       str << (OutMemoryDataOperation == InMemoryDataOperationEnum::Set? "DeviceReduceBlockWise<" : "DeviceReduceMultiBlock<") << BlockSize << ",";
        str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
        str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
        str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">";
...
#ifndef DEVICE_SOFTMAX_HPP
#define DEVICE_SOFTMAX_HPP
#include <iostream>
#include <sstream>
#include "device.hpp"
#include "device_base.hpp"
#include "device_reduce.hpp"
#include "device_reduce_multiblock.hpp"
#include "device_reduce_common.hpp"
#include "gridwise_softmax.hpp"
#include "gridwise_set_buffer_value.hpp"
#include "reduction_operator.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <typename InDataType,
typename AccDataType,
typename OutDataType,
index_t Rank,
index_t NumReduceDim,
index_t BlockSize,
index_t MThreadClusterSize,
index_t KThreadClusterSize,
index_t MThreadSliceSize,
index_t KThreadSliceSize,
index_t InSrcVectorDim,
index_t InSrcVectorSize,
index_t OutDstVectorSize>
struct DeviceSoftmax : public BaseOperator
{
using PassThrough = tensor_operation::element_wise::PassThrough;
// Used for freeloading of some handy functions from DeviceReduceMultiBlock
using Reduction = DeviceReduceMultiBlock<InDataType,
AccDataType,
OutDataType,
Rank,
NumReduceDim,
reduce::Add,
PassThrough, // InElementwiseOperation
PassThrough, // AccElementwiseOperation
InMemoryDataOperationEnum::Set,
false, // PropagateNan
false, // OutputIndex
false, // HaveIndexInputIfOutputIndex
BlockSize,
MThreadClusterSize,
KThreadClusterSize,
MThreadSliceSize,
KThreadSliceSize,
InSrcVectorDim,
InSrcVectorSize,
1>; // OutDstVectorSize
using GridDesc_M_K = decltype(Reduction::MakeSrc2dDescriptor({1}, {1}, 1, 1));
using GridwiseReduce = GridwiseSoftmax_mk_to_mk<InDataType,
OutDataType,
AccDataType,
GridDesc_M_K,
BlockSize,
MThreadClusterSize,
KThreadClusterSize,
MThreadSliceSize,
KThreadSliceSize,
InSrcVectorDim,
InSrcVectorSize,
OutDstVectorSize>;
struct Argument : public Reduction::Argument
{
Argument(const std::vector<index_t> inLengths,
const std::vector<index_t> inStrides,
const std::vector<index_t> reduceDims,
AccDataType alpha,
AccDataType beta,
const InDataType* in_dev,
OutDataType* out_dev)
: Reduction::Argument(inLengths,
inStrides,
{},
{},
reduceDims,
0.0f, // alpha
0.0f, // beta
in_dev,
nullptr,
out_dev,
nullptr,
PassThrough{},
PassThrough{}),
// FIXME: The base class DeviceReduceMultiBlock::Argument only supports alpha/beta of
// float32 precision. Make it support any data type so the fields can be removed.
alpha_(alpha),
beta_(beta)
{
// std::cout << "blkGroupSize= " << this->blkGroupSize
// << ", numBlockTileIteration= " << this->numBlockTileIteration
// << ", gridSize=" << this->gridSize
// << ", invariant_total_length=" << this->invariant_total_length <<
// std::endl;
}
AccDataType alpha_;
AccDataType beta_;
};
struct Invoker : public BaseInvoker
{
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
const auto in_grid_desc_m_k = Reduction::MakeSrc2dDescriptor(
arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.numBlockTileIteration);
const auto out_grid_desc_m_k = Reduction::MakeSrc2dDescriptor(
arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.numBlockTileIteration);
const auto kernel_main =
kernel_softmax<GridwiseReduce, InDataType, OutDataType, AccDataType, GridDesc_M_K>;
float avg_time = 0;
avg_time += launch_and_time_kernel(stream_config,
kernel_main,
dim3(arg.gridSize),
dim3(BlockSize),
0,
in_grid_desc_m_k,
out_grid_desc_m_k,
arg.blkGroupSize,
arg.numBlockTileIteration,
arg.alpha_,
arg.in_dev_,
arg.beta_,
arg.out_dev_);
return (avg_time);
};
float Run(const BaseArgument* p_arg,
const StreamConfig& stream_config = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
};
};
bool IsSupportedArgument(const BaseArgument* p_arg) override
{
const Argument* p_arg_ = dynamic_cast<const Argument*>(p_arg);
if(!Reduction::IsSupportedArgument(p_arg_))
{
return false;
}
if(p_arg_->inLengths_[Rank - 1] % OutDstVectorSize != 0)
{
return false;
}
return true;
};
std::unique_ptr<BaseArgument> MakeArgumentPointer(const std::vector<index_t> inLengths,
const std::vector<index_t> inStrides,
const std::vector<int> reduceDims,
AccDataType alpha,
AccDataType beta,
const void* in_dev,
void* out_dev)
{
return std::make_unique<Argument>(inLengths,
inStrides,
reduceDims,
alpha,
beta,
static_cast<const InDataType*>(in_dev),
static_cast<OutDataType*>(out_dev));
};
std::unique_ptr<BaseInvoker> MakeInvokerPointer() { return std::make_unique<Invoker>(); };
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "DeviceReduceSoftmax<" << BlockSize << ",";
str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">";
// clang-format on
return str.str();
}
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
#endif // DEVICE_SOFTMAX_HPP
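Note (annotation): judging from the invoker's argument order (alpha, in_dev, beta, out_dev) and the usual CK softmax contract, the kernel computes a scaled, numerically stabilized softmax blended with the prior output contents (hedged: the gridwise kernel body is not part of this diff):

    y_i \leftarrow \alpha \, \frac{e^{x_i - \max_j x_j}}{\sum_j e^{x_j - \max_j x_j}} + \beta \, y_i

where the reduction index j runs over the NumReduceDim dimensions selected by reduceDims, and the max-subtraction is the standard trick to keep the exponentials from overflowing.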
#pragma once
#include "data_type.hpp"
namespace ck {
namespace tensor_operation {
namespace element_wise {
} // namespace element_wise
} // namespace tensor_operation
} // namespace ck
#pragma once
#include "data_type.hpp"
#include "math_v2.hpp"
namespace ck {
namespace tensor_operation {
namespace element_wise {
struct PassThrough
{
template <typename T>
__host__ __device__ void operator()(T& y, const T& x) const
{
static_assert(is_same<T, float>::value || is_same<T, double>::value ||
is_same<T, half_t>::value || is_same<T, bhalf_t>::value ||
is_same<T, int32_t>::value || is_same<T, int8_t>::value,
"Data type is not supported by this operation!");
y = x;
};
};
struct UnaryDivide
{
__host__ __device__ UnaryDivide(const int32_t divider = 1) : divider_(divider){};
template <typename T>
__host__ __device__ void operator()(T& y, const T& x) const
{
static_assert(is_same<T, float>::value || is_same<T, double>::value ||
is_same<T, int32_t>::value,
"Data type is not supported by this operation!");
y = x / type_convert<T>(divider_);
};
int32_t divider_ = 1;
};
struct UnarySquare
{
template <typename T>
__host__ __device__ void operator()(T& y, const T& x) const
{
static_assert(is_same<T, float>::value || is_same<T, double>::value,
"Data type is not supported by this operation!");
y = x * x;
};
};
struct UnaryAbs
{
template <typename T>
__host__ __device__ void operator()(T& y, const T& x) const
{
static_assert(is_same<T, float>::value || is_same<T, double>::value ||
is_same<T, half_t>::value || is_same<T, int32_t>::value ||
is_same<T, int8_t>::value,
"Data type is not supported by this operation!");
y = ck::math::abs(x);
};
};
struct UnarySqrt
{
template <typename T>
__host__ __device__ void operator()(T& y, const T& x) const
{
static_assert(is_same<T, float>::value || is_same<T, double>::value,
"Data type is not supported by this operation!");
y = ck::math::sqrt(x);
};
};
struct Relu
{
template <typename T>
__host__ __device__ void operator()(T& y, const T& x) const
{
static_assert(is_same<T, float>::value || is_same<T, double>::value ||
is_same<T, half_t>::value || is_same<T, int32_t>::value ||
is_same<T, int8_t>::value,
"Data type is not supported by this operation!");
y = x > 0 ? x : 0;
}
template <>
__host__ __device__ void operator()(bhalf_t& y, const bhalf_t& x) const
{
float x_f32 = ck::type_convert<float>(x);
float y_f32 = x_f32 > 0 ? x_f32 : 0;
y = ck::type_convert<bhalf_t>(y_f32);
}
};
// https://paperswithcode.com/method/gelu
// y = 0.5*x*(1+tanh(sqrt(2/pi)*(x+0.044715*x^3)))
struct FastGelu
{
template <typename Y, typename X>
__host__ __device__ void operator()(Y& y, const X& x) const;
template <>
__host__ __device__ void operator()<float, float>(float& y, const float& x) const
{
const float u = float(2) * x * (float(0.035677) * x * x + float(0.797885));
const float emu = exp(-u);
const float cdf = float(0.5) + float(0.5) * (float(2) / (float(1) + emu) - float(1));
y = x * cdf;
}
};
} // namespace element_wise
} // namespace tensor_operation
} // namespace ck
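Note (annotation): the FastGelu body above matches the tanh formula quoted in its comment. Using \tanh(t) = \frac{2}{1 + e^{-2t}} - 1 and noting 0.797885 \approx \sqrt{2/\pi} and 0.035677 \approx 0.797885 \cdot 0.044715:

    u   = 2x(0.035677 x^2 + 0.797885) = 2\sqrt{2/\pi}\,(x + 0.044715 x^3)
    cdf = 0.5 + 0.5\left(\frac{2}{1 + e^{-u}} - 1\right) = 0.5\left(1 + \tanh\!\left(\sqrt{2/\pi}\,(x + 0.044715 x^3)\right)\right)

so y = x * cdf reproduces y = 0.5 x (1 + tanh(sqrt(2/pi) (x + 0.044715 x^3))) while needing a single exp instead of a tanh evaluation.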
@@ -171,15 +171,15 @@ struct GridwiseReduction_mk_to_m_multiblock
        AccDataType beta,
        OutDataType* const __restrict__ p_out_value_global)
    {
-       const auto identityVal = ReduceOperation::GetIdentityValue();
+       const auto identityVal = ReduceOperation::template GetIdentityValue<AccDataType>();

        // LDS
        __shared__ AccDataType p_reduce_work_buffer[BlockSize];

-       const auto in_global_val_buf =
-           make_dynamic_buffer<AddressSpaceEnum::Global>(p_in_value_global,
-                                                         in_grid_desc_m_k.GetElementSpaceSize(),
-                                                         type_convert<InDataType>(identityVal));
+       const auto in_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+           p_in_value_global,
+           in_grid_desc_m_k.GetElementSpaceSize(),
+           ReduceOperation::template GetIdentityValue<InDataType>());

        auto out_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_out_value_global, out_grid_desc_m.GetElementSpaceSize());
@@ -358,12 +358,12 @@ struct GridwiseReduction_mk_to_m_multiblock
        __shared__ AccDataType p_reduce_work_val_buffer[BlockSize];
        __shared__ IndexDataType p_reduce_work_idx_buffer[BlockSize];

-       const auto identityVal = ReduceOperation::GetIdentityValue();
+       const auto identityVal = ReduceOperation::template GetIdentityValue<AccDataType>();

-       const auto in_global_val_buf =
-           make_dynamic_buffer<AddressSpaceEnum::Global>(p_in_value_global,
-                                                         in_grid_desc_m_k.GetElementSpaceSize(),
-                                                         type_convert<InDataType>(identityVal));
+       const auto in_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+           p_in_value_global,
+           in_grid_desc_m_k.GetElementSpaceSize(),
+           ReduceOperation::template GetIdentityValue<InDataType>());

        const auto in_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_in_index_global, in_grid_desc_m_k.GetElementSpaceSize());

        auto out_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
...
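Note (annotation): the switch to a per-type GetIdentityValue<T>() lets the out-of-range padding value match the buffer's element type directly instead of round-tripping through type_convert. Padding with the reduction's neutral element leaves the result unchanged; the idea modeled standalone (Add/Max here are stand-ins, not CK's reduce operators):

#include <cstdio>
#include <limits>

struct Add
{
    template <typename T>
    static T GetIdentityValue() { return T{0}; } // x + 0 == x
};

struct Max
{
    template <typename T>
    static T GetIdentityValue() { return std::numeric_limits<T>::lowest(); } // max(x, lowest) == x
};

int main()
{
    std::printf("Add<float> identity: %g\n", Add::GetIdentityValue<float>()); // 0
    std::printf("Max<float> identity: %g\n", Max::GetIdentityValue<float>()); // -3.40282e+38
    return 0;
}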