Merge remote-tracking branch 'upstream/develop' into gemm-layernorm-4

7e610626 · Anthony Chang · 6c496076 · 86185bd7 · 7e610626 · 7e610626
Commit 7e610626 authored Jun 02, 2022 by Anthony Chang
18 changed files
--- a/include/ck/tensor_operation/gpu/device/device_gemm_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_dl.hpp
@@ -60,8 +60,8 @@ template <
    index_t CThreadTransferDstScalarPerVector,
    enable_if_t<
        is_same_v<AElementwiseOperation, ck::tensor_operation::element_wise::PassThrough> &&
-            is_same_v<AElementwiseOperation, ck::tensor_operation::element_wise::PassThrough> &&
-            is_same_v<AElementwiseOperation, ck::tensor_operation::element_wise::PassThrough>,
+            is_same_v<BElementwiseOperation, ck::tensor_operation::element_wise::PassThrough> &&
+            is_same_v<CElementwiseOperation, ck::tensor_operation::element_wise::PassThrough>,
        bool> = false>
 struct DeviceGemmDl
    : public DeviceGemm<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>

--- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp
@@ -24,57 +24,33 @@ template <typename GridwiseGemm,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CElementwiseOperation,
-          bool HasMainKBlockLoop,
-          index_t MaxGroupCount>
+          bool HasMainKBlockLoop>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-        kernel_grouped_gemm_xdlops_v2r3(
-            const StaticallyIndexedArray<GemmDesc, MaxGroupCount> gemm_descs,
-            const index_t group_count,
-            const AElementwiseOperation a_element_op,
-            const BElementwiseOperation b_element_op,
-            const CElementwiseOperation c_element_op)
+        kernel_grouped_gemm_xdlops_v2r3(const void CK_CONSTANT_ADDRESS_SPACE* gemm_descs_const,
+                                        const index_t group_count,
+                                        const AElementwiseOperation a_element_op,
+                                        const BElementwiseOperation b_element_op,
+                                        const CElementwiseOperation c_element_op)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

    const index_t block_id = get_block_1d_id();

-#if 1
-    static_for<0, MaxGroupCount, 1>{}([&](auto i) {
-        if(block_id >= gemm_descs[i].BlockStart_ && block_id < gemm_descs[i].BlockEnd_ &&
-           i < group_count)
-        {
-            auto group_id = i;
-
-            GridwiseGemm::template Run<HasMainKBlockLoop>(
-                gemm_descs[group_id].a_ptr,
-                gemm_descs[group_id].b_ptr,
-                gemm_descs[group_id].c_ptr,
-                p_shared,
-                gemm_descs[group_id].a_grid_desc_k0_m_k1_,
-                gemm_descs[group_id].b_grid_desc_k0_n_k1_,
-                gemm_descs[group_id].c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
-                a_element_op,
-                b_element_op,
-                c_element_op,
-                gemm_descs[group_id].grouped_gemm_block_2_ctile_map_);
-        }
-    });
-#else
-    const auto gemm_desc_ptr = reinterpret_cast<const GemmDesc*>(&gemm_descs);
+    const auto gemm_desc_ptr =
+        reinterpret_cast<const GemmDesc*>(cast_pointer_to_generic_address_space(gemm_descs_const));

    index_t group_id = 0;
-    static_for<0, MaxGroupCount, 1>{}([&](auto i) {
-        group_id = (block_id >= gemm_descs[i].BlockStart && block_id < gemm_descs[i].BlockEnd &&
-                    i < group_count)
-                       ? i
-                       : group_id;
-    });
-
-    const index_t block_id_grp = block_id - gemm_desc_ptr[group_id].BlockStart;
+    for(index_t i = 0; i < group_count; i++)
+    {
+        group_id =
+            (block_id >= gemm_desc_ptr[i].BlockStart_ && block_id < gemm_desc_ptr[i].BlockEnd_)
+                ? i
+                : group_id;
+    }

    GridwiseGemm::template Run<HasMainKBlockLoop>(
        gemm_desc_ptr[group_id].a_ptr,
@@ -87,11 +63,9 @@ __global__ void
        a_element_op,
        b_element_op,
        c_element_op,
-        gemm_desc_ptr[group_id].block_2_ctile_map_,
-        block_id_grp);
-#endif
+        gemm_desc_ptr[group_id].grouped_gemm_block_2_ctile_map_);
 #else
-    ignore = gemm_descs;
+    ignore = gemm_descs_const;
    ignore = group_count;
    ignore = a_element_op;
    ignore = b_element_op;
@@ -388,6 +362,8 @@ struct DeviceGroupedGemmXdl
        {
            grid_size_ = 0;

+            gemm_descs_args_workspace_ = nullptr;
+
            group_count_ = ck::type_convert<ck::index_t>(gemm_shapes.size());

            if(!(group_count_ == ck::type_convert<ck::index_t>(p_a.size()) &&
@@ -461,6 +437,8 @@ struct DeviceGroupedGemmXdl

        std::vector<GemmDescKernelArg> gemm_desc_kernel_arg_;

+        void* gemm_descs_args_workspace_;
+
        index_t grid_size_;
    };

@@ -471,49 +449,49 @@ struct DeviceGroupedGemmXdl

        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
        {
-            StaticallyIndexedArray<GemmDescKernelArg, MaxGroupCount> gemm_desc_kernel_args;
-
            bool has_main_k_block_loop = true;

-            static_for<0, MaxGroupCount, 1>{}([&](auto i) {
-                if(i < arg.gemm_desc_kernel_arg_.size())
+            for(std::size_t i = 0; i < arg.gemm_desc_kernel_arg_.size(); i++)
+            {
+                std::cout << "group: " << i << " arg.a_grid_desc_k0_m_k1_{"
+                          << arg.gemm_desc_kernel_arg_[i].a_grid_desc_k0_m_k1_.GetLength(I0) << ", "
+                          << arg.gemm_desc_kernel_arg_[i].a_grid_desc_k0_m_k1_.GetLength(I1) << ", "
+                          << arg.gemm_desc_kernel_arg_[i].a_grid_desc_k0_m_k1_.GetLength(I2) << "}";
+
+                std::cout << ", arg.b_grid_desc_k0_n_k1_{"
+                          << arg.gemm_desc_kernel_arg_[i].b_grid_desc_k0_n_k1_.GetLength(I0) << ", "
+                          << arg.gemm_desc_kernel_arg_[i].b_grid_desc_k0_n_k1_.GetLength(I1) << ", "
+                          << arg.gemm_desc_kernel_arg_[i].b_grid_desc_k0_n_k1_.GetLength(I2) << "}";
+
+                std::cout << ", arg.c_grid_desc_m_n_{ "
+                          << arg.gemm_desc_kernel_arg_[i].c_grid_desc_m_n_.GetLength(I0) << ", "
+                          << arg.gemm_desc_kernel_arg_[i].c_grid_desc_m_n_.GetLength(I1) << "}"
+                          << std::endl;
+
+                if(!GridwiseGemm::CheckValidity(
+                       arg.gemm_desc_kernel_arg_[i].a_grid_desc_k0_m_k1_,
+                       arg.gemm_desc_kernel_arg_[i].b_grid_desc_k0_n_k1_,
+                       arg.gemm_desc_kernel_arg_[i].c_grid_desc_m_n_,
+                       arg.gemm_desc_kernel_arg_[i].grouped_gemm_block_2_ctile_map_))
                {
-                    gemm_desc_kernel_args(i) = arg.gemm_desc_kernel_arg_[i];
-
-                    std::cout << "group: " << i << " arg.a_grid_desc_k0_m_k1_{"
-                              << gemm_desc_kernel_args[i].a_grid_desc_k0_m_k1_.GetLength(I0) << ", "
-                              << gemm_desc_kernel_args[i].a_grid_desc_k0_m_k1_.GetLength(I1) << ", "
-                              << gemm_desc_kernel_args[i].a_grid_desc_k0_m_k1_.GetLength(I2) << "}";
-
-                    std::cout << ", arg.b_grid_desc_k0_n_k1_{"
-                              << gemm_desc_kernel_args[i].b_grid_desc_k0_n_k1_.GetLength(I0) << ", "
-                              << gemm_desc_kernel_args[i].b_grid_desc_k0_n_k1_.GetLength(I1) << ", "
-                              << gemm_desc_kernel_args[i].b_grid_desc_k0_n_k1_.GetLength(I2) << "}";
-
-                    std::cout << ", arg.c_grid_desc_m_n_{ "
-                              << gemm_desc_kernel_args[i].c_grid_desc_m_n_.GetLength(I0) << ", "
-                              << gemm_desc_kernel_args[i].c_grid_desc_m_n_.GetLength(I1) << "}"
-                              << std::endl;
-
-                    if(!GridwiseGemm::CheckValidity(
-                           gemm_desc_kernel_args[i].a_grid_desc_k0_m_k1_,
-                           gemm_desc_kernel_args[i].b_grid_desc_k0_n_k1_,
-                           gemm_desc_kernel_args[i].c_grid_desc_m_n_,
-                           gemm_desc_kernel_args[i].grouped_gemm_block_2_ctile_map_))
-                    {
-                        throw std::runtime_error(
-                            "wrong! GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3 has invalid setting");
-                    }
-
-                    const auto K = gemm_desc_kernel_args[i].a_grid_desc_k0_m_k1_.GetLength(I0) *
-                                   gemm_desc_kernel_args[i].a_grid_desc_k0_m_k1_.GetLength(I2);
-
-                    if(GridwiseGemm::CalculateHasMainKBlockLoop(K) != has_main_k_block_loop)
-                    {
-                        throw std::runtime_error("wrong! not all gemm has_main_k_block_loop");
-                    }
+                    throw std::runtime_error(
+                        "wrong! GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3 has invalid setting");
                }
-            });
+
+                const auto K = arg.gemm_desc_kernel_arg_[i].a_grid_desc_k0_m_k1_.GetLength(I0) *
+                               arg.gemm_desc_kernel_arg_[i].a_grid_desc_k0_m_k1_.GetLength(I2);
+
+                if(GridwiseGemm::CalculateHasMainKBlockLoop(K) != has_main_k_block_loop)
+                {
+                    throw std::runtime_error("wrong! not all gemm has_main_k_block_loop");
+                }
+            }
+
+            hipGetErrorString(
+                hipMemcpy(arg.gemm_descs_args_workspace_,
+                          arg.gemm_desc_kernel_arg_.data(),
+                          arg.gemm_desc_kernel_arg_.size() * sizeof(GemmDescKernelArg),
+                          hipMemcpyHostToDevice));

            float ave_time = 0;

@@ -523,23 +501,23 @@ struct DeviceGroupedGemmXdl
                    kernel_grouped_gemm_xdlops_v2r3<GridwiseGemm,
                                                    ADataType, // TODO: distinguish A/B datatype
                                                    CDataType,
-                                                    remove_reference_t<GemmDescKernelArg>,
+                                                    GemmDescKernelArg,
                                                    AElementwiseOperation,
                                                    BElementwiseOperation,
                                                    CElementwiseOperation,
-                                                    true,
-                                                    MaxGroupCount>;
-
-                ave_time = launch_and_time_kernel(stream_config,
-                                                  kernel,
-                                                  dim3(arg.grid_size_),
-                                                  dim3(BlockSize),
-                                                  0,
-                                                  gemm_desc_kernel_args,
-                                                  arg.gemm_desc_kernel_arg_.size(),
-                                                  arg.a_element_op_,
-                                                  arg.b_element_op_,
-                                                  arg.c_element_op_);
+                                                    true>;
+
+                ave_time = launch_and_time_kernel(
+                    stream_config,
+                    kernel,
+                    dim3(arg.grid_size_),
+                    dim3(BlockSize),
+                    0,
+                    cast_pointer_to_constant_address_space(arg.gemm_descs_args_workspace_),
+                    arg.gemm_desc_kernel_arg_.size(),
+                    arg.a_element_op_,
+                    arg.b_element_op_,
+                    arg.c_element_op_);
            }
            else
            {
@@ -547,23 +525,23 @@ struct DeviceGroupedGemmXdl
                    kernel_grouped_gemm_xdlops_v2r3<GridwiseGemm,
                                                    ADataType, // TODO: distiguish A/B datatype
                                                    CDataType,
-                                                    remove_reference_t<GemmDescKernelArg>,
+                                                    GemmDescKernelArg,
                                                    AElementwiseOperation,
                                                    BElementwiseOperation,
                                                    CElementwiseOperation,
-                                                    false,
-                                                    MaxGroupCount>;
-
-                ave_time = launch_and_time_kernel(stream_config,
-                                                  kernel,
-                                                  dim3(arg.grid_size_),
-                                                  dim3(BlockSize),
-                                                  0,
-                                                  gemm_desc_kernel_args,
-                                                  arg.gemm_desc_kernel_arg_.size(),
-                                                  arg.a_element_op_,
-                                                  arg.b_element_op_,
-                                                  arg.c_element_op_);
+                                                    false>;
+
+                ave_time = launch_and_time_kernel(
+                    stream_config,
+                    kernel,
+                    dim3(arg.grid_size_),
+                    dim3(BlockSize),
+                    0,
+                    cast_pointer_to_constant_address_space(arg.gemm_descs_args_workspace_),
+                    arg.gemm_desc_kernel_arg_.size(),
+                    arg.a_element_op_,
+                    arg.b_element_op_,
+                    arg.c_element_op_);
            }

            return ave_time;
@@ -652,6 +630,16 @@ struct DeviceGroupedGemmXdl

        return str.str();
    }
+
+    size_t GetWorkSpaceSize(const BaseArgument* p_arg) const override
+    {
+        return dynamic_cast<const Argument*>(p_arg)->group_count_ * sizeof(GemmDescKernelArg);
+    }
+
+    void SetWorkSpacePointer(BaseArgument* p_arg, void* workspace_ptr) const override
+    {
+        dynamic_cast<Argument*>(p_arg)->gemm_descs_args_workspace_ = workspace_ptr;
+    }
 };

 } // namespace device

--- a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp
@@ -348,8 +348,8 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE

            if constexpr(use_multiblock)
            {
-                const auto zeroVal =
-                    ck::reduce::GetReductionZeroValueForInMemoryDataOperation<OutDataType>(
+                const auto identityVal =
+                    ck::reduce::GetIdentityValueueForInMemoryDataOperation<OutDataType>(
                        OutMemoryDataOperation);

                const auto kernel_pre =
@@ -362,7 +362,7 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
                                                   0,
                                                   out_grid_desc_m_2,
                                                   arg.out_dev_,
-                                                   zeroVal);
+                                                   identityVal);
            };

            avg_time += launch_and_time_kernel(stream_config,

--- a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
 #pragma once
 #include "data_type.hpp"

@@ -5,14 +30,22 @@ namespace ck {
 namespace tensor_operation {
 namespace binary_element_wise {

-struct Add
+template <typename Y, typename X1, typename X2>
+struct Add;
+
+template <>
+struct Add<double, double, double>
 {
    __host__ __device__ constexpr void
    operator()(double& dst, const double& src1, const double& src2) const
    {
        dst = src1 + src2;
    }
+};

+template <>
+struct Add<float, float, float>
+{
    __host__ __device__ constexpr void
    operator()(float& dst, const float& src1, const float& src2) const
    {
@@ -20,6 +53,75 @@ struct Add
    }
 };

+template <>
+struct Add<half_t, half_t, half_t>
+{
+    __host__ __device__ constexpr void
+    operator()(half_t& dst, const half_t& src1, const half_t& src2) const
+    {
+        dst = src1 + src2;
+    }
+};
+
+template <>
+struct Add<bhalf_t, bhalf_t, bhalf_t>
+{
+    __host__ __device__ constexpr void
+    operator()(bhalf_t& dst, const bhalf_t& src1, const bhalf_t& src2) const
+    {
+        const float x1 = ck::type_convert<float>(src1);
+        const float x2 = ck::type_convert<float>(src2);
+        const float y  = x1 + x2;
+        dst            = ck::type_convert<bhalf_t>(y);
+    }
+};
+
+template <typename Y, typename X1, typename X2>
+struct Substract;
+
+template <>
+struct Substract<double, double, double>
+{
+    __host__ __device__ constexpr void
+    operator()(double& dst, const double& src1, const double& src2) const
+    {
+        dst = src1 - src2;
+    }
+};
+
+template <>
+struct Substract<float, float, float>
+{
+    __host__ __device__ constexpr void
+    operator()(float& dst, const float& src1, const float& src2) const
+    {
+        dst = src1 - src2;
+    }
+};
+
+template <>
+struct Substract<half_t, half_t, half_t>
+{
+    __host__ __device__ constexpr void
+    operator()(half_t& dst, const half_t& src1, const half_t& src2) const
+    {
+        dst = src1 - src2;
+    }
+};
+
+template <>
+struct Substract<bhalf_t, bhalf_t, bhalf_t>
+{
+    __host__ __device__ constexpr void
+    operator()(bhalf_t& dst, const bhalf_t& src1, const bhalf_t& src2) const
+    {
+        const float x1 = ck::type_convert<float>(src1);
+        const float x2 = ck::type_convert<float>(src2);
+        const float y  = x1 - x2;
+        dst            = ck::type_convert<bhalf_t>(y);
+    }
+};
+
 } // namespace binary_element_wise
 } // namespace tensor_operation
 } // namespace ck
--- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
 #pragma once
 #include "data_type.hpp"
+#include "math_v2.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -296,7 +297,7 @@ struct UnaryAbs<float, float>
 {
    __host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; };

-    __host__ __device__ void operator()(float& y, const float& x) const { y = abs(x); };
+    __host__ __device__ void operator()(float& y, const float& x) const { y = ck::math::abs(x); };
 };

 template <>
@@ -304,7 +305,7 @@ struct UnaryAbs<half_t, half_t>
 {
    __host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; };

-    __host__ __device__ void operator()(half_t& y, const half_t& x) const { y = __habs(x); };
+    __host__ __device__ void operator()(half_t& y, const half_t& x) const { y = ck::math::abs(x); };
 };

 template <>
@@ -312,7 +313,7 @@ struct UnaryAbs<double, double>
 {
    __host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; };

-    __host__ __device__ void operator()(double& y, const double& x) const { y = abs(x); };
+    __host__ __device__ void operator()(double& y, const double& x) const { y = ck::math::abs(x); };
 };

 template <>
@@ -320,12 +321,7 @@ struct UnaryAbs<int8_t, int8_t>
 {
    __host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; };

-    __host__ __device__ void operator()(int8_t& y, const int8_t& x) const
-    {
-        int8_t sgn = x >> (8 - 1);
-
-        y = (x ^ sgn) - sgn;
-    };
+    __host__ __device__ void operator()(int8_t& y, const int8_t& x) const { y = ck::math::abs(x); };
 };

 template <typename Y, typename X>
@@ -336,7 +332,7 @@ struct UnarySqrt<float, float>
 {
    __host__ __device__ UnarySqrt(const int32_t divider = 1) { (void)divider; };

-    __host__ __device__ void operator()(float& y, const float& x) const { y = sqrtf(x); };
+    __host__ __device__ void operator()(float& y, const float& x) const { y = ck::math::sqrt(x); };
 };

 template <>
@@ -344,7 +340,10 @@ struct UnarySqrt<double, double>
 {
    __host__ __device__ UnarySqrt(const int32_t divider = 1) { (void)divider; };

-    __host__ __device__ void operator()(double& y, const double& x) const { y = sqrt(x); };
+    __host__ __device__ void operator()(double& y, const double& x) const
+    {
+        y = ck::math::sqrt(x);
+    };
 };

 } // namespace element_wise

--- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp
@@ -171,7 +171,7 @@ struct GridwiseReduction_mk_to_m_multiblock
                               AccDataType beta,
                               OutDataType* const __restrict__ p_out_value_global)
    {
-        const auto zeroVal = ReduceOperation::GetReductionZeroVal();
+        const auto identityVal = ReduceOperation::GetIdentityValue();

        // LDS
        __shared__ AccDataType p_reduce_work_buffer[BlockSize];
@@ -179,7 +179,7 @@ struct GridwiseReduction_mk_to_m_multiblock
        const auto in_global_val_buf =
            make_dynamic_buffer<AddressSpaceEnum::Global>(p_in_value_global,
                                                          in_grid_desc_m_k.GetElementSpaceSize(),
-                                                          type_convert<InDataType>(zeroVal));
+                                                          type_convert<InDataType>(identityVal));
        auto out_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_out_value_global, out_grid_desc_m.GetElementSpaceSize());

@@ -191,7 +191,7 @@ struct GridwiseReduction_mk_to_m_multiblock

        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;

-        static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; });
+        static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = identityVal; });

        const index_t thread_local_id = get_thread_local_1d_id();
        const index_t block_global_id = get_block_1d_id();
@@ -358,12 +358,12 @@ struct GridwiseReduction_mk_to_m_multiblock
        __shared__ AccDataType p_reduce_work_val_buffer[BlockSize];
        __shared__ IndexDataType p_reduce_work_idx_buffer[BlockSize];

-        const auto zeroVal = ReduceOperation::GetReductionZeroVal();
+        const auto identityVal = ReduceOperation::GetIdentityValue();

        const auto in_global_val_buf =
            make_dynamic_buffer<AddressSpaceEnum::Global>(p_in_value_global,
                                                          in_grid_desc_m_k.GetElementSpaceSize(),
-                                                          type_convert<InDataType>(zeroVal));
+                                                          type_convert<InDataType>(identityVal));
        const auto in_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_in_index_global, in_grid_desc_m_k.GetElementSpaceSize());
        auto out_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
@@ -418,7 +418,7 @@ struct GridwiseReduction_mk_to_m_multiblock
                                 thread_k_cluster_id * KThreadSliceSize));

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
-            accu_value_buf(I) = zeroVal;
+            accu_value_buf(I) = identityVal;
            accu_index_buf(I) = 0;
        });

@@ -459,7 +459,7 @@ struct GridwiseReduction_mk_to_m_multiblock
                                            in_thread_idx_buf);

                static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
-                    AccDataType tmpValue   = zeroVal;
+                    AccDataType tmpValue   = identityVal;
                    IndexDataType tmpIndex = 0;

                    static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
@@ -512,7 +512,7 @@ struct GridwiseReduction_mk_to_m_multiblock
                                          in_thread_val_buf(Number<offset>{}));
                    });

-                    AccDataType tmpValue   = zeroVal;
+                    AccDataType tmpValue   = identityVal;
                    IndexDataType tmpIndex = 0;

                    static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {

--- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp
@@ -135,12 +135,12 @@ struct GridwiseReduction_mk_to_m_threadwise
                                                     ReduceOperation,
                                                     PropagateNan>;

-        const auto zeroVal = ReduceOperation::GetReductionZeroVal();
+        const auto identityVal = ReduceOperation::GetIdentityValue();

        const auto in_global_val_buf =
            make_dynamic_buffer<AddressSpaceEnum::Global>(p_in_value_global,
                                                          in_grid_desc_m_k.GetElementSpaceSize(),
-                                                          type_convert<InDataType>(zeroVal));
+                                                          type_convert<InDataType>(identityVal));
        auto dst_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_out_value_global, out_grid_desc_m.GetElementSpaceSize());

@@ -149,7 +149,7 @@ struct GridwiseReduction_mk_to_m_threadwise

        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;

-        static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; });
+        static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = identityVal; });

        const auto toReduceLength = in_grid_desc_m_k.GetLength(Number<1>{});

@@ -276,12 +276,12 @@ struct GridwiseReduction_mk_to_m_threadwise

        (void)acc_elementwise_op;

-        const auto zeroVal = ReduceOperation::GetReductionZeroVal();
+        const auto identityVal = ReduceOperation::GetIdentityValue();

        const auto in_global_val_buf =
            make_dynamic_buffer<AddressSpaceEnum::Global>(p_in_value_global,
                                                          in_grid_desc_m_k.GetElementSpaceSize(),
-                                                          type_convert<InDataType>(zeroVal));
+                                                          type_convert<InDataType>(identityVal));
        const auto in_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_in_index_global, in_grid_desc_m_k.GetElementSpaceSize());

@@ -303,7 +303,7 @@ struct GridwiseReduction_mk_to_m_threadwise
        StaticBuffer<AddressSpaceEnum::Vgpr, IndexDataType, MThreadSliceSize, true> accu_index_buf;

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
-            accu_value_buf(I) = zeroVal;
+            accu_value_buf(I) = identityVal;
            accu_index_buf(I) = 0;
        });


--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp
@@ -816,10 +816,10 @@ struct GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1
                                                false>;

                        // Global write Gemm shuffle + reduction
-                        const auto d_zeroVal = DReduceOperation::GetReductionZeroVal();
+                        const auto d_identityVal = DReduceOperation::GetIdentityValue();

                        static_for<0, mreduce_per_thread, 1>{}(
-                            [&](auto I) { d_thread_buf(I) = d_zeroVal; });
+                            [&](auto I) { d_thread_buf(I) = d_identityVal; });

                        // reduce in VGPR
                        static_for<0, mreduce_per_thread, 1>{}([&](auto im) {

--- a/include/ck/utility/math_v2.hpp
+++ b/include/ck/utility/math_v2.hpp
@@ -3,11 +3,13 @@

 #include <cmath>
 #include "data_type.hpp"
-#include "half.hpp"
+#include "type.hpp"

 namespace ck {
 namespace math {

+// Math functions for the host; some are implemented by calling C++ std functions.
+
 static inline __host__ float abs(float x) { return std::abs(x); };

 static inline __host__ double abs(double x) { return std::abs(x); };
@@ -28,26 +30,26 @@ static inline __host__ int32_t abs(int32_t x)

 static inline __host__ half_t abs(half_t x)
 {
-    half_float::half xx = *reinterpret_cast<half_float::half*>(&x);
+    uint16_t xx = ck::bit_cast<uint16_t>(x);

-    half_float::half abs_xx = half_float::abs(xx);
+    uint16_t abs_xx = xx & 0x7fff;

-    half_t abs_x = *reinterpret_cast<half_t*>(&abs_xx);
+    half_t abs_x = ck::bit_cast<half_t>(abs_xx);

    return abs_x;
 };

-static inline __host__ float isnan(float x) { return std::isnan(x); };
+static inline __host__ bool isnan(float x) { return std::isnan(x); };

-static inline __host__ double isnan(double x) { return std::isnan(x); };
+static inline __host__ bool isnan(double x) { return std::isnan(x); };

-static inline __host__ int8_t isnan(int8_t x)
+static inline __host__ bool isnan(int8_t x)
 {
    (void)x;
    return false;
 };

-static inline __host__ int32_t isnan(int32_t x)
+static inline __host__ bool isnan(int32_t x)
 {
    (void)x;
    return false;
@@ -55,11 +57,59 @@ static inline __host__ int32_t isnan(int32_t x)

 static inline __host__ bool isnan(half_t x)
 {
-    half_float::half xx = *reinterpret_cast<half_float::half*>(&x);
+    uint16_t xx = ck::bit_cast<uint16_t>(x);
+
+    return (xx & 0x7FFF) > 0x7C00;
+};
+
+// Host square root, single precision; forwards to std::sqrt.
+static inline __host__ float sqrt(float x)
+{
+    const float root = std::sqrt(x);
+    return root;
+};
+
+// Host square root, double precision; forwards to std::sqrt.
+static inline __host__ double sqrt(double x)
+{
+    const double root = std::sqrt(x);
+    return root;
+};
+
+// math functions for the HIP kernel; some are implemented by calling HIP builtin functions
+
+// Device absolute value, single precision; forwards to the global-namespace abs overload.
+static inline __device__ float abs(float x)
+{
+    const float magnitude = ::abs(x);
+    return magnitude;
+};
+
+// Device absolute value, double precision; forwards to the global-namespace abs overload.
+static inline __device__ double abs(double x)
+{
+    const double magnitude = ::abs(x);
+    return magnitude;
+};
+
+// Device absolute value for int8_t, computed without a branch.
+static inline __device__ int8_t abs(int8_t x)
+{
+    // Shifting by (width - 1) yields 0 for non-negative x and, presumably, -1 (all ones) for
+    // negative x -- this relies on the compiler performing an arithmetic right shift.
+    const int8_t sign_mask = x >> (8 - 1);
+
+    // XOR with the mask flips the bits of a negative x; subtracting the mask then adds one.
+    const int flipped = x ^ sign_mask;
+
+    return flipped - sign_mask;
+};
+
+// Device absolute value for int32_t, computed without a branch.
+//
+// The xor/subtract step is carried out on uint32_t so that x == INT32_MIN wraps around
+// instead of overflowing a signed integer (which is undefined behaviour). For that single
+// input the result is INT32_MIN itself, because its magnitude is not representable.
+static inline __device__ int32_t abs(int32_t x)
+{
+    // Shifting by (width - 1) yields 0 for non-negative x and, presumably, all ones for
+    // negative x -- this relies on the compiler performing an arithmetic right shift.
+    const uint32_t sign_mask = static_cast<uint32_t>(x >> (32 - 1));
+    const uint32_t bits      = static_cast<uint32_t>(x);
+
+    // XOR with the mask flips the bits of a negative x; subtracting the mask then adds one.
+    return static_cast<int32_t>((bits ^ sign_mask) - sign_mask);
+};
+
+// Device absolute value for half_t; forwards to the __habs builtin.
+static inline __device__ half_t abs(half_t x)
+{
+    const half_t magnitude = ::__habs(x);
+    return magnitude;
+};
+
+// Device NaN test, single precision; forwards to the global-namespace isnan.
+static inline __device__ bool isnan(float x)
+{
+    const bool is_nan_value = ::isnan(x);
+    return is_nan_value;
+};
+
+// Device NaN test, double precision; forwards to the global-namespace isnan.
+static inline __device__ bool isnan(double x)
+{
+    const bool is_nan_value = ::isnan(x);
+    return is_nan_value;
+};
+
+// Device NaN test for int8_t: the argument is ignored and the result is always false.
+static inline __device__ bool isnan(int8_t x)
+{
+    static_cast<void>(x);
+
+    return false;
+};

-    return half_float::isnan(xx);
+// Device NaN test for int32_t: the argument is ignored and the result is always false.
+static inline __device__ bool isnan(int32_t x)
+{
+    static_cast<void>(x);
+
+    return false;
 };

+// Device NaN test for half_t; forwards to the __hisnan builtin.
+static inline __device__ bool isnan(half_t x)
+{
+    const bool is_nan_value = ::__hisnan(x);
+    return is_nan_value;
+};
+
+// Device square root, single precision; forwards to sqrtf.
+static inline __device__ float sqrt(float x)
+{
+    const float root = ::sqrtf(x);
+    return root;
+};
+
+// Device square root, double precision; forwards to the global-namespace sqrt.
+static inline __device__ double sqrt(double x)
+{
+    const double root = ::sqrt(x);
+    return root;
+};
+
 } // namespace math
 } // namespace ck


--- a/include/ck/utility/reduction_functions_accumulate.hpp
+++ b/include/ck/utility/reduction_functions_accumulate.hpp
@@ -27,6 +27,7 @@
 #define CK_REDUCTION_FUNCTIONS_BINOP_HPP

 #include "data_type.hpp"
+#include "math_v2.hpp"

 #include "reduction_common.hpp"
 #include "reduction_operator.hpp"
@@ -34,18 +35,6 @@
 namespace ck {
 namespace detail {

-template <typename T>
-static inline __device__ bool is_nan(T x)
-{
-    return (isnan(x));
-};
-
-template <>
-inline __device__ bool is_nan<half_t>(half_t x)
-{
-    return (__hisnan(x));
-};
-
 template <bool PropagateNan, typename ReduceOperation, typename AccDataType>
 struct AccumulateWithNanCheck;

@@ -53,7 +42,7 @@ template <typename ReduceOperation, typename AccDataType>
 struct AccumulateWithNanCheck<false, ReduceOperation, AccDataType>
 {
    // cppcheck-suppress constParameter
-    __device__ static inline void Calculate(AccDataType& accuVal, AccDataType currVal)
+    __host__ __device__ static inline void Calculate(AccDataType& accuVal, AccDataType currVal)
    {
        ReduceOperation{}(accuVal, currVal);
    };
@@ -62,9 +51,11 @@ struct AccumulateWithNanCheck<false, ReduceOperation, AccDataType>
 template <typename ReduceOperation, typename AccDataType>
 struct AccumulateWithNanCheck<true, ReduceOperation, AccDataType>
 {
-    __device__ static inline void Calculate(AccDataType& accuVal, AccDataType currVal)
+    __host__ __device__ static inline void Calculate(AccDataType& accuVal, AccDataType currVal)
    {
-        if(is_nan(currVal))
+        using ck::math::isnan;
+
+        if(isnan(currVal))
        {
            accuVal = currVal;
        }
@@ -81,7 +72,7 @@ struct AccumulateWithIndexAndNanCheck;
 template <typename ReduceOperation, typename AccDataType, typename IndexDataType>
 struct AccumulateWithIndexAndNanCheck<false, ReduceOperation, AccDataType, IndexDataType>
 {
-    __device__ static inline void
+    __host__ __device__ static inline void
    // cppcheck-suppress constParameter
    Calculate(AccDataType& accuVal,
              AccDataType currVal,
@@ -101,12 +92,14 @@ template <typename ReduceOperation, typename AccDataType, typename IndexDataType
 struct AccumulateWithIndexAndNanCheck<true, ReduceOperation, AccDataType, IndexDataType>
 {
    // The method is called when the ReduceOperation is indexable and the user asked for indices
-    __device__ static inline void Calculate(AccDataType& accuVal,
-                                            AccDataType currVal,
-                                            IndexDataType& accuIndex,
-                                            IndexDataType currIndex)
+    __host__ __device__ static inline void Calculate(AccDataType& accuVal,
+                                                     AccDataType currVal,
+                                                     IndexDataType& accuIndex,
+                                                     IndexDataType currIndex)
    {
-        if(is_nan(currVal))
+        using ck::math::isnan;
+
+        if(isnan(currVal))
        {
            accuVal   = currVal;
            accuIndex = currIndex;

--- a/include/ck/utility/reduction_operator.hpp
+++ b/include/ck/utility/reduction_operator.hpp
@@ -36,7 +36,7 @@ namespace reduce {
 // Every binary operator used in reduction is represented by a templated functor class. Each functor
 // class must provide at least
 // three members:
-// 1) GetReductionZeroVal() -- the interface to return the "identity element" for the binary
+// 1) GetIdentityValue() -- the interface to return the "identity element" for the binary
 // operator, "identity element" is the unique
 //                    element in the algebraic space that doesn't affect the value of other elements
 //                    when operated against them, and the concept is similar to zero vector in
@@ -59,7 +59,7 @@ struct Add
 {
    using dataType = T;

-    __host__ __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(0.0f); };
+    __host__ __device__ static constexpr T GetIdentityValue() { return static_cast<T>(0.0f); };

    __device__ static constexpr bool
    IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation)
@@ -86,7 +86,7 @@ struct Mul
 {
    using dataType = T;

-    __host__ __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(1.0f); };
+    __host__ __device__ static constexpr T GetIdentityValue() { return static_cast<T>(1.0f); };

    __device__ static constexpr bool
    IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation)
@@ -102,7 +102,7 @@ struct Max
 {
    using dataType = T;

-    __host__ __device__ static constexpr T GetReductionZeroVal()
+    __host__ __device__ static constexpr T GetIdentityValue()
    {
        return NumericLimits<T>::Lowest();
    };
@@ -135,10 +135,7 @@ struct Min
 {
    using dataType = T;

-    __host__ __device__ static constexpr T GetReductionZeroVal()
-    {
-        return NumericLimits<T>::Max();
-    };
+    __host__ __device__ static constexpr T GetIdentityValue() { return NumericLimits<T>::Max(); };

    __device__ static constexpr bool
    IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation)
@@ -168,7 +165,7 @@ struct AMax
 {
    using dataType = T;

-    __host__ __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(0.0f); };
+    __host__ __device__ static constexpr T GetIdentityValue() { return static_cast<T>(0.0f); };

    __device__ static constexpr bool
    IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation)
@@ -194,7 +191,7 @@ struct AMax
 };

 template <typename T>
-T GetReductionZeroValueForInMemoryDataOperation(InMemoryDataOperationEnum operation)
+T GetIdentityValueueForInMemoryDataOperation(InMemoryDataOperationEnum operation)
 {
    T result = ck::type_convert<T>(0.0f);


--- a/library/include/ck/library/host_tensor/host_reduce_util.hpp
+++ b/library/include/ck/library/host_tensor/host_reduce_util.hpp
-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2020 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-#ifndef GUARD_HOST_REDUCE_UTIL_HPP
-#define GUARD_HOST_REDUCE_UTIL_HPP
-
-#include <limits>
-#include <cmath>
-#include <functional>
-
-#include "reduction_enums.hpp"
-#include "data_type.hpp"
-#include "math_v2.hpp"
-
-namespace ck {
-
-namespace host_reduce {
-
-using ck::NanPropagation;
-using ck::ReduceTensorOp;
-
-template <typename AccDataType, ReduceTensorOp ReduceOpId>
-__host__ static inline std::function<void(AccDataType&)> PreUnaryOpFn(int)
-{
-    using ck::math::abs;
-
-    if constexpr(ReduceOpId == ReduceTensorOp::NORM1)
-    {
-        return ([&](AccDataType& a_) { a_ = abs(a_); });
-    }
-    else if constexpr(ReduceOpId == ReduceTensorOp::NORM2)
-    {
-        return ([&](AccDataType& a_) { a_ = a_ * a_; });
-    }
-    else if constexpr(ReduceOpId == ReduceTensorOp::AMAX)
-    {
-        return ([&](AccDataType& a_) { a_ = abs(a_); });
-    }
-    else
-    {
-        // ReduceTensorOp::AVG:
-        // ReduceTensorOp::ADD:
-        // ReduceTensorOp::MUL:
-        // ReduceTensorOp::MIN:
-        // ReduceTensorOp::MAX:
-        return ([&](AccDataType&) {});
-    };
-};
-
-template <typename AccDataType, ReduceTensorOp ReduceOpId>
-__host__ static inline std::function<void(AccDataType&)> PosUnaryOpFn(int32_t divider)
-{
-    using std::sqrt;
-
-    if constexpr(ReduceOpId == ReduceTensorOp::NORM2)
-    {
-        return ([&](AccDataType& a_) { a_ = sqrt(a_); });
-    }
-    else if constexpr(ReduceOpId == ReduceTensorOp::AVG)
-    {
-        return ([&, divider](AccDataType& a_) {
-            a_ = a_ / static_cast<AccDataType>(static_cast<float>(divider));
-        });
-    }
-    else
-    {
-        // ReduceTensorOp::ADD:
-        // ReduceTensorOp::NORM1:
-        // ReduceTensorOp::MUL:
-        // ReduceTensorOp::MIN:
-        // ReduceTensorOp::MAX:
-        // ReduceTensorOp::AMAX:
-        return ([&](AccDataType&) {});
-    }
-};
-
-template <typename AccDataType, ReduceTensorOp ReduceOpId>
-__host__ static inline std::function<void(AccDataType&, AccDataType)> ReduceOpFn()
-{
-    if constexpr(ReduceOpId == ReduceTensorOp::ADD || ReduceOpId == ReduceTensorOp::AVG ||
-                 ReduceOpId == ReduceTensorOp::NORM1 || ReduceOpId == ReduceTensorOp::NORM2)
-    {
-        return ([&](AccDataType& a_, AccDataType b_) { a_ = a_ + b_; });
-    }
-    else if constexpr(ReduceOpId == ReduceTensorOp::MUL)
-    {
-        return ([&](AccDataType& a_, AccDataType b_) { a_ = a_ * b_; });
-    }
-    else if constexpr(ReduceOpId == ReduceTensorOp::MIN)
-    {
-        return ([&](AccDataType& a_, AccDataType b_) {
-            if(a_ > b_)
-                a_ = b_;
-        });
-    }
-    else if constexpr(ReduceOpId == ReduceTensorOp::MAX || ReduceOpId == ReduceTensorOp::AMAX)
-    {
-        return ([&](AccDataType& a_, AccDataType b_) {
-            if(a_ < b_)
-                a_ = b_;
-        });
-    }
-};
-
-template <typename AccDataType, ReduceTensorOp ReduceOpId>
-__host__ static inline std::function<void(AccDataType&, AccDataType, bool& changed)> ReduceOpFn2()
-{
-    if constexpr(ReduceOpId == ReduceTensorOp::MIN)
-    {
-        return ([&](AccDataType& a_, AccDataType b_, bool& changed) {
-            if(a_ > b_)
-            {
-                a_      = b_;
-                changed = true;
-            }
-            else
-                changed = false;
-        });
-    }
-    else if constexpr(ReduceOpId == ReduceTensorOp::MAX || ReduceOpId == ReduceTensorOp::AMAX)
-    {
-        return ([&](AccDataType& a_, AccDataType b_, bool& changed) {
-            if(a_ < b_)
-            {
-                a_      = b_;
-                changed = true;
-            }
-            else
-                changed = false;
-        });
-    }
-    else
-    {
-        // ReduceTensorOp::ADD:
-        // ReduceTensorOp::MUL:
-        // ReduceTensorOp::AVG:
-        // ReduceTensorOp::NORM1:
-        // ReduceTensorOp::NORM2:
-        return (std::function<void(AccDataType&, AccDataType, bool&)>{});
-    };
-};
-
-template <typename AccDataType, ReduceTensorOp ReduceOpId>
-__host__ static inline AccDataType ReduceOpZeroVal()
-{
-    if constexpr(ReduceOpId == ReduceTensorOp::MUL)
-    {
-        return (static_cast<AccDataType>(1.0f));
-    }
-    else if constexpr(ReduceOpId == ReduceTensorOp::MIN)
-    {
-        return (ck::NumericLimits<AccDataType>::Max());
-    }
-    else if constexpr(ReduceOpId == ReduceTensorOp::MAX)
-    {
-        return (ck::NumericLimits<AccDataType>::Lowest());
-    }
-    else if constexpr(ReduceOpId == ReduceTensorOp::AMAX)
-    {
-        return (static_cast<AccDataType>(0.0f));
-    }
-    else
-    {
-        // ReduceTensorOp::ADD
-        // ReduceTensorOp::AVG
-        // ReduceTensorOp::NORM1
-        // ReduceTensorOp::NORM2
-        return (static_cast<AccDataType>(0.0f));
-    };
-};
-
-template <typename AccDataType, bool PropagateNan>
-__host__ static inline void
-binop_with_nan_check(std::function<void(AccDataType&, AccDataType)> opReduce,
-                     AccDataType& accuVal,
-                     AccDataType currVal)
-{
-    using ck::math::isnan;
-
-    if constexpr(!PropagateNan)
-    {
-        opReduce(accuVal, currVal);
-    }
-    else
-    {
-        if(isnan(currVal))
-            accuVal = currVal;
-        else
-            opReduce(accuVal, currVal);
-    };
-};
-
-template <typename AccDataType, typename IndexDataType, bool PropagateNan>
-__host__ static inline void
-binop_with_index_and_nan_check(std::function<void(AccDataType&, AccDataType, bool&)> opReduce,
-                               AccDataType& accuVal,
-                               AccDataType currVal,
-                               IndexDataType& accuIndex,
-                               IndexDataType currIndex)
-{
-    using ck::math::isnan;
-
-    if constexpr(!PropagateNan)
-    {
-        bool changed;
-
-        opReduce(accuVal, currVal, changed);
-
-        if(changed)
-            accuIndex = currIndex;
-    }
-    else
-    {
-        if(isnan(currVal))
-        {
-            accuVal   = currVal;
-            accuIndex = currIndex;
-        }
-        else
-        {
-            bool changed;
-
-            opReduce(accuVal, currVal, changed);
-
-            if(changed)
-                accuIndex = currIndex;
-        };
-    };
-};
-
-}; // namespace host_reduce
-
-}; // namespace ck
-
-#endif
--- a/library/include/ck/library/host_tensor/host_reduction.hpp
+++ b/library/include/ck/library/host_tensor/host_reduction.hpp
@@ -33,10 +33,10 @@

 #include "reduction_enums.hpp"
 #include "reduction_common.hpp"
-#include "host_reduce_util.hpp"
 #include "host_common_util.hpp"
 #include "host_tensor.hpp"
 #include "data_type.hpp"
+#include "reduction_functions_accumulate.hpp"

 template <int NDim>
 static void get_all_indexes(const std::array<size_t, NDim>& dimLengths,
@@ -106,11 +106,13 @@ static size_t get_offset_from_index(const std::vector<size_t>& strides,
 template <typename InDataType,
          typename AccDataType,
          typename OutDataType,
-          ck::ReduceTensorOp ReduceOpId,
+          typename ReduceOperation,
+          typename InElementwiseOperation,
+          typename AccElementwiseOperation,
          int Rank,
          int NumReduceDim,
          bool PropagateNan,
-          bool NeedIndices>
+          bool OutputIndex>
 struct ReductionHost
 {
    using IndexDataType = int32_t;
@@ -122,8 +124,6 @@ struct ReductionHost
    std::vector<int> reduceDims;

    IndexDataType divider;
-    std::function<void(AccDataType&)> preUnaryOp;
-    std::function<void(AccDataType&)> posUnaryOp;
    std::array<size_t, NumReduceDim> reduceLengths;
    std::array<size_t, NumReduceDim> reduceStrides;
    std::array<size_t, NumInvariantDim> invariantLengths;
@@ -137,9 +137,6 @@ struct ReductionHost
                  const std::vector<int>& invariantDims_,
                  const std::vector<int>& reduceDims_)
    {
-        using ck::host_reduce::PosUnaryOpFn;
-        using ck::host_reduce::PreUnaryOpFn;
-
        // this->outLengths = to_int_vector(outDesc.GetLengths());
        this->outStrides = outDesc.GetStrides();

@@ -171,9 +168,6 @@ struct ReductionHost
            invariant_dim_indexes.clear();
            get_all_indexes<NumInvariantDim>(invariantLengths, invariant_dim_indexes);
        };
-
-        preUnaryOp = PreUnaryOpFn<AccDataType, ReduceOpId>(divider);
-        posUnaryOp = PosUnaryOpFn<AccDataType, ReduceOpId>(divider);
    };

    void Run(float alpha,
@@ -182,7 +176,7 @@ struct ReductionHost
             OutDataType* out_data,
             IndexDataType* out_indices)
    {
-        if constexpr(NeedIndices)
+        if constexpr(OutputIndex)
        {
            RunImpl_with_index(alpha, in_data, beta, out_data, out_indices);
        }
@@ -201,15 +195,17 @@ struct ReductionHost
        using ck::float_equal_one;
        using ck::float_equal_zero;
        using ck::type_convert;
-        using ck::host_reduce::binop_with_index_and_nan_check;
-        using ck::host_reduce::ReduceOpFn2;
-        using ck::host_reduce::ReduceOpZeroVal;

-        auto opReduce2 = ReduceOpFn2<AccDataType, ReduceOpId>();
+        using Accumulation = ck::detail::AccumulateWithIndexAndNanCheck<PropagateNan,
+                                                                        ReduceOperation,
+                                                                        AccDataType,
+                                                                        IndexDataType>;
+        InElementwiseOperation in_elementwise_op(divider);
+        AccElementwiseOperation acc_elementwise_op(divider);

        if constexpr(NumInvariantDim == 0)
        {
-            AccDataType accuVal     = ReduceOpZeroVal<AccDataType, ReduceOpId>();
+            AccDataType accuVal     = ReduceOperation::GetIdentityValue();
            IndexDataType accuIndex = 0;

            for(std::size_t i = 0; i < reduce_dim_indexes.size(); i++)
@@ -219,15 +215,14 @@ struct ReductionHost

                auto currVal = type_convert<AccDataType>(in_data[offset_reduce]);

-                preUnaryOp(currVal);
+                in_elementwise_op(currVal, currVal);

                auto currIndex = static_cast<IndexDataType>(i);

-                binop_with_index_and_nan_check<AccDataType, IndexDataType, PropagateNan>(
-                    opReduce2, accuVal, currVal, accuIndex, currIndex);
+                Accumulation::Calculate(accuVal, currVal, accuIndex, currIndex);
            };

-            posUnaryOp(accuVal);
+            acc_elementwise_op(accuVal, accuVal);

            if(!float_equal_one{}(alpha))
                accuVal *= type_convert<AccDataType>(alpha);
@@ -241,7 +236,7 @@ struct ReductionHost
        else
        {
            auto thread_reduce_func = [&](auto invariant_index) {
-                AccDataType accuVal     = ReduceOpZeroVal<AccDataType, ReduceOpId>();
+                AccDataType accuVal     = ReduceOperation::GetIdentityValue();
                IndexDataType accuIndex = 0;

                auto offset_invariant =
@@ -255,15 +250,14 @@ struct ReductionHost
                    auto currVal =
                        type_convert<AccDataType>(in_data[offset_invariant + offset_reduce]);

-                    preUnaryOp(currVal);
+                    in_elementwise_op(currVal, currVal);

                    auto currIndex = static_cast<IndexDataType>(i);

-                    binop_with_index_and_nan_check<AccDataType, IndexDataType, PropagateNan>(
-                        opReduce2, accuVal, currVal, accuIndex, currIndex);
+                    Accumulation::Calculate(accuVal, currVal, accuIndex, currIndex);
                };

-                posUnaryOp(accuVal);
+                acc_elementwise_op(accuVal, accuVal);

                if(!float_equal_one{}(alpha))
                    accuVal *= type_convert<AccDataType>(alpha);
@@ -308,15 +302,16 @@ struct ReductionHost
        using ck::float_equal_one;
        using ck::float_equal_zero;
        using ck::type_convert;
-        using ck::host_reduce::binop_with_nan_check;
-        using ck::host_reduce::ReduceOpFn;
-        using ck::host_reduce::ReduceOpZeroVal;

-        auto opReduce = ReduceOpFn<AccDataType, ReduceOpId>();
+        using Accumulation =
+            ck::detail::AccumulateWithNanCheck<PropagateNan, ReduceOperation, AccDataType>;
+
+        InElementwiseOperation in_elementwise_op(divider);
+        AccElementwiseOperation acc_elementwise_op(divider);

        if constexpr(NumInvariantDim == 0)
        {
-            AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
+            AccDataType accuVal = ReduceOperation::GetIdentityValue();

            for(const auto& reduce_index : reduce_dim_indexes)
            {
@@ -325,12 +320,12 @@ struct ReductionHost

                auto currVal = type_convert<AccDataType>(in_data[offset_reduce]);

-                preUnaryOp(currVal);
+                in_elementwise_op(currVal, currVal);

-                binop_with_nan_check<AccDataType, PropagateNan>(opReduce, accuVal, currVal);
+                Accumulation::Calculate(accuVal, currVal);
            };

-            posUnaryOp(accuVal);
+            acc_elementwise_op(accuVal, accuVal);

            if(!float_equal_one{}(alpha))
                accuVal *= type_convert<AccDataType>(alpha);
@@ -343,7 +338,7 @@ struct ReductionHost
        else
        {
            auto thread_reduce_func = [&](auto invariant_index) {
-                AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
+                AccDataType accuVal = ReduceOperation::GetIdentityValue();

                auto offset_invariant =
                    get_offset_from_index<NumInvariantDim>(invariantStrides, invariant_index);
@@ -356,12 +351,12 @@ struct ReductionHost
                    auto currVal =
                        type_convert<AccDataType>(in_data[offset_invariant + offset_reduce]);

-                    preUnaryOp(currVal);
+                    in_elementwise_op(currVal, currVal);

-                    binop_with_nan_check<AccDataType, PropagateNan>(opReduce, accuVal, currVal);
+                    Accumulation::Calculate(accuVal, currVal);
                };

-                posUnaryOp(accuVal);
+                acc_elementwise_op(accuVal, accuVal);

                if(!float_equal_one{}(alpha))
                    accuVal *= type_convert<AccDataType>(alpha);

--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+#include <iostream>
+#include <sstream>
+#include "device_base.hpp"
+#include "host_tensor.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace host {
+
+/// Host (CPU) reference implementation of a complex GEMM, C = A * B, where the real and
+/// imaginary parts of every matrix are held in separate tensors.
+///
+/// Products are accumulated in float and then assigned to the C tensors. The element-wise
+/// operations are stored in the argument but are not applied by Run(); the enable_if below
+/// restricts all three of them to PassThrough.
+// FIXME: support arbitrary elementwise operation for A/B/C
+template <
+    typename ADataType,
+    typename BDataType,
+    typename CDataType,
+    typename AElementwiseOperation,
+    typename BElementwiseOperation,
+    typename CElementwiseOperation,
+    enable_if_t<
+        is_same_v<AElementwiseOperation, ck::tensor_operation::element_wise::PassThrough> &&
+            is_same_v<BElementwiseOperation, ck::tensor_operation::element_wise::PassThrough> &&
+            is_same_v<CElementwiseOperation, ck::tensor_operation::element_wise::PassThrough>,
+        bool> = false>
+struct ReferenceCGemm : public device::BaseOperator
+{
+    /// Bundles the input/output tensors and the element-wise operations of one CGEMM run.
+    /// The tensors are held by reference, so they must outlive the Argument.
+    struct Argument : public device::BaseArgument
+    {
+        Argument(const Tensor<ADataType>& a_m_k_real,
+                 const Tensor<ADataType>& a_m_k_imag,
+                 const Tensor<BDataType>& b_k_n_real,
+                 const Tensor<BDataType>& b_k_n_imag,
+                 Tensor<CDataType>& c_m_n_real,
+                 Tensor<CDataType>& c_m_n_imag,
+                 AElementwiseOperation a_element_op,
+                 BElementwiseOperation b_element_op,
+                 CElementwiseOperation c_element_op)
+            : a_m_k_real_{a_m_k_real},
+              a_m_k_imag_{a_m_k_imag},
+              b_k_n_real_{b_k_n_real},
+              b_k_n_imag_{b_k_n_imag},
+              c_m_n_real_{c_m_n_real},
+              c_m_n_imag_{c_m_n_imag},
+              a_element_op_{a_element_op},
+              b_element_op_{b_element_op},
+              c_element_op_{c_element_op}
+        {
+        }
+
+        // Inputs: A is indexed as (m, k), B as (k, n).
+        const Tensor<ADataType>& a_m_k_real_;
+        const Tensor<ADataType>& a_m_k_imag_;
+        const Tensor<BDataType>& b_k_n_real_;
+        const Tensor<BDataType>& b_k_n_imag_;
+        // Outputs: C is indexed as (m, n).
+        Tensor<CDataType>& c_m_n_real_;
+        Tensor<CDataType>& c_m_n_imag_;
+
+        AElementwiseOperation a_element_op_;
+        BElementwiseOperation b_element_op_;
+        CElementwiseOperation c_element_op_;
+    };
+
+    /// Executes the reference computation on the host.
+    struct Invoker : public device::BaseInvoker
+    {
+        using Argument = ReferenceCGemm::Argument;
+
+        /// Computes
+        ///   c_real(m, n) = sum_k a_real(m, k) * b_real(k, n) - a_imag(m, k) * b_imag(k, n)
+        ///   c_imag(m, n) = sum_k a_real(m, k) * b_imag(k, n) + a_imag(m, k) * b_real(k, n)
+        /// for every (m, n) of the respective C tensor.
+        ///
+        /// @return always 0
+        /// @throws std::runtime_error if the K extents of the A and B tensors disagree.
+        float Run(const Argument& arg)
+        {
+            const std::size_t K = arg.a_m_k_real_.mDesc.GetLengths()[1];
+
+            if(K != arg.a_m_k_imag_.mDesc.GetLengths()[1])
+            {
+                throw std::runtime_error("wrong! Incompatible real and imag sizes in CGEMM");
+            }
+
+            // B is read as (k, n) for every k < K below, so its leading extent has to agree
+            // with K; otherwise the accumulation would index past the end of B.
+            if(K != arg.b_k_n_real_.mDesc.GetLengths()[0] ||
+               K != arg.b_k_n_imag_.mDesc.GetLengths()[0])
+            {
+                throw std::runtime_error("wrong! Incompatible A and B sizes in CGEMM");
+            }
+
+            // Real part of one output element.
+            auto f_mk_kn_mn_real = [&](auto m, auto n) {
+                float v_c_real = 0;
+
+                for(std::size_t k = 0; k < K; ++k)
+                {
+                    float v_a_real = ck::type_convert<float>(arg.a_m_k_real_(m, k));
+                    float v_a_imag = ck::type_convert<float>(arg.a_m_k_imag_(m, k));
+                    float v_b_real = ck::type_convert<float>(arg.b_k_n_real_(k, n));
+                    float v_b_imag = ck::type_convert<float>(arg.b_k_n_imag_(k, n));
+
+                    v_c_real += v_a_real * v_b_real - v_a_imag * v_b_imag;
+                }
+
+                arg.c_m_n_real_(m, n) = v_c_real;
+            };
+
+            // Imaginary part of one output element.
+            auto f_mk_kn_mn_imag = [&](auto m, auto n) {
+                float v_c_imag = 0;
+
+                for(std::size_t k = 0; k < K; ++k)
+                {
+                    float v_a_real = ck::type_convert<float>(arg.a_m_k_real_(m, k));
+                    float v_a_imag = ck::type_convert<float>(arg.a_m_k_imag_(m, k));
+                    float v_b_real = ck::type_convert<float>(arg.b_k_n_real_(k, n));
+                    float v_b_imag = ck::type_convert<float>(arg.b_k_n_imag_(k, n));
+
+                    v_c_imag += v_a_real * v_b_imag + v_a_imag * v_b_real;
+                }
+
+                arg.c_m_n_imag_(m, n) = v_c_imag;
+            };
+
+            // hardware_concurrency() is allowed to return 0 when the value cannot be
+            // determined; fall back to a single thread in that case.
+            const unsigned int reported_threads = std::thread::hardware_concurrency();
+            const unsigned int num_thread       = reported_threads == 0 ? 1 : reported_threads;
+
+            make_ParallelTensorFunctor(f_mk_kn_mn_real,
+                                       arg.c_m_n_real_.mDesc.GetLengths()[0],
+                                       arg.c_m_n_real_.mDesc.GetLengths()[1])(num_thread);
+            make_ParallelTensorFunctor(f_mk_kn_mn_imag,
+                                       arg.c_m_n_imag_.mDesc.GetLengths()[0],
+                                       arg.c_m_n_imag_.mDesc.GetLengths()[1])(num_thread);
+
+            return 0;
+        }
+
+        /// Type-erased entry point; the stream configuration is ignored.
+        ///
+        /// @throws std::runtime_error if p_arg is null or is not a ReferenceCGemm::Argument.
+        float Run(const device::BaseArgument* p_arg,
+                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
+        {
+            const auto* p_cgemm_arg = dynamic_cast<const Argument*>(p_arg);
+
+            // A failed cast yields nullptr; dereferencing it would be undefined behaviour.
+            if(p_cgemm_arg == nullptr)
+            {
+                throw std::runtime_error("wrong! ReferenceCGemm received an incompatible argument");
+            }
+
+            return Run(*p_cgemm_arg);
+        }
+    };
+
+    static constexpr bool IsValidCompilationParameter()
+    {
+        // TODO: properly implement this check
+        return true;
+    }
+
+    /// Accepts every argument; no validation is performed here.
+    bool IsSupportedArgument(const device::BaseArgument*) override { return true; }
+
+    /// Builds an Argument that refers to the given tensors and copies the element-wise ops.
+    static auto MakeArgument(const Tensor<ADataType>& a_m_k_real,
+                             const Tensor<ADataType>& a_m_k_imag,
+                             const Tensor<BDataType>& b_k_n_real,
+                             const Tensor<BDataType>& b_k_n_imag,
+                             Tensor<CDataType>& c_m_n_real,
+                             Tensor<CDataType>& c_m_n_imag,
+                             AElementwiseOperation a_element_op,
+                             BElementwiseOperation b_element_op,
+                             CElementwiseOperation c_element_op)
+    {
+        return Argument{a_m_k_real,
+                        a_m_k_imag,
+                        b_k_n_real,
+                        b_k_n_imag,
+                        c_m_n_real,
+                        c_m_n_imag,
+                        a_element_op,
+                        b_element_op,
+                        c_element_op};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    /// Returns a heap-allocated Invoker behind the BaseInvoker interface.
+    virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
+    {
+        return std::make_unique<Invoker>();
+    }
+
+    /// Returns the operator name followed by a newline.
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        // clang-format off
+        str << "ReferenceCGemm"
+            << std::endl;
+        // clang-format on
+
+        return str.str();
+    }
+};
+
+} // namespace host
+} // namespace tensor_operation
+} // namespace ck
--- a/profiler/include/profile_batched_gemm_reduce_impl.hpp
+++ b/profiler/include/profile_batched_gemm_reduce_impl.hpp
@@ -171,8 +171,8 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
        {
            for(int m = 0; m < M; ++m)
            {
-                float d0_acc = d0_reduce_op.GetReductionZeroVal();
-                float d1_acc = d1_reduce_op.GetReductionZeroVal();
+                float d0_acc = d0_reduce_op.GetIdentityValue();
+                float d1_acc = d1_reduce_op.GetIdentityValue();

                for(int n = 0; n < N; ++n)
                {

--- a/profiler/include/profile_gemm_reduce_impl.hpp
+++ b/profiler/include/profile_gemm_reduce_impl.hpp
@@ -165,8 +165,8 @@ bool profile_gemm_reduce_impl(int do_verification,

        for(int m = 0; m < M; ++m)
        {
-            float d0_acc = d0_reduce_op.GetReductionZeroVal();
-            float d1_acc = d1_reduce_op.GetReductionZeroVal();
+            float d0_acc = d0_reduce_op.GetIdentityValue();
+            float d1_acc = d1_reduce_op.GetIdentityValue();

            for(int n = 0; n < N; ++n)
            {

--- a/profiler/include/profile_reduce_impl.hpp
+++ b/profiler/include/profile_reduce_impl.hpp
@@ -138,7 +138,6 @@ bool profile_reduce_impl_impl(bool do_verification,
 {
    using namespace ck::tensor_operation::device;
    using namespace ck::tensor_operation::device::device_reduce_instance;
-    using namespace ck::host_reduce;
    using ck::host_common::dumpBufferToFile;

    constexpr bool op_support_indices =
@@ -261,15 +260,17 @@ bool profile_reduce_impl_impl(bool do_verification,
        float best_avg_time   = 0;
        float best_gb_per_sec = 0;

-        using InElementwiseOperation_0 =
+        using InElementwiseOperation =
            typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::
                InElementwiseOperation;
-        using AccElementwiseOperation_0 =
+        using AccElementwiseOperation =
            typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::
                AccElementwiseOperation;

+        using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;
+
        using DeviceReduceInstPtr0 =
-            DeviceReducePtr<InElementwiseOperation_0, AccElementwiseOperation_0>;
+            DeviceReducePtr<InElementwiseOperation, AccElementwiseOperation>;

        std::vector<DeviceReduceInstPtr0> reduce0_ptrs;

@@ -313,7 +314,9 @@ bool profile_reduce_impl_impl(bool do_verification,
            ReductionHost<InDataType,
                          AccDataType,
                          OutDataType,
-                          ReduceOpId,
+                          ReduceOperation,
+                          InElementwiseOperation,
+                          AccElementwiseOperation,
                          Rank,
                          NumReduceDim,
                          PropagateNan,
@@ -337,9 +340,8 @@ bool profile_reduce_impl_impl(bool do_verification,
        for(auto& reduce_ptr : reduce0_ptrs)
        {

-            InElementwiseOperation_0 in_elementwise_op_0(static_cast<int32_t>(reduce_total_length));
-            AccElementwiseOperation_0 acc_elementwise_op_0(
-                static_cast<int32_t>(reduce_total_length));
+            InElementwiseOperation in_elementwise_op(static_cast<int32_t>(reduce_total_length));
+            AccElementwiseOperation acc_elementwise_op(static_cast<int32_t>(reduce_total_length));

            auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths,
                                                                i_inStrides,
@@ -352,8 +354,8 @@ bool profile_reduce_impl_impl(bool do_verification,
                                                                nullptr,
                                                                out_dev.GetDeviceBuffer(),
                                                                out_indices_dev.GetDeviceBuffer(),
-                                                                in_elementwise_op_0,
-                                                                acc_elementwise_op_0);
+                                                                in_elementwise_op,
+                                                                acc_elementwise_op);

            if(!reduce_ptr->IsSupportedArgument(argument_ptr.get()))
                continue;

--- a/test/grouped_gemm/grouped_gemm_fp16.cpp
+++ b/test/grouped_gemm/grouped_gemm_fp16.cpp
@@ -141,10 +141,15 @@ bool TestGroupedGemm(DeviceGroupedGemmPtr_& groupedGemmPtr)
    auto c_element_op = PassThrough{};

    // do GEMM
-    auto invoker_ptr  = groupedGemmPtr->MakeInvokerPointer();
+    auto invoker_ptr = groupedGemmPtr->MakeInvokerPointer();
+
    auto argument_ptr = groupedGemmPtr->MakeArgumentPointer(
        p_a, p_b, p_c, gemm_shapes, a_element_op, b_element_op, c_element_op);

+    DeviceMem gemm_desc_workspace(groupedGemmPtr->GetWorkSpaceSize(argument_ptr.get()));
+
+    groupedGemmPtr->SetWorkSpacePointer(argument_ptr.get(), gemm_desc_workspace.GetDeviceBuffer());
+
    invoker_ptr->Run(argument_ptr.get());

    for(std::size_t i = 0; i < gemm_shapes.size(); i++)