Merge branch 'develop' into bmatrix_skip_lds

f9c478e2 · ltqin · 7d85d04a · 91d8b7d6 · f9c478e2 · f9c478e2
Commit f9c478e2 authored May 30, 2022 by ltqin
7 changed files
--- a/test/gemm_split_k/gemm_split_k.cpp
+++ b/test/gemm_split_k/gemm_split_k.cpp
@@ -45,7 +45,7 @@ static bool check_out(const Tensor<T>& ref, const Tensor<T>& result)
 {
    float max_diff = 1e-6;
-    for(int i = 0; i < ref.mData.size(); ++i)
+    for(std::size_t i = 0; i < ref.mData.size(); ++i)
    {
        float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
        if(max_diff < diff)
@@ -187,9 +187,10 @@ int test_gemm(const gemmArgs& args)
        if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
        {
-            invoker_ptr->Run(argument_ptr.get(), 0);
+            invoker_ptr->Run(argument_ptr.get());
            c_device_buf.FromDevice(c_m_n_device_result.mData.data());
            if(!check_out(c_m_n_host_result, c_m_n_device_result))
            {
                success = false;

--- a/test/grouped_gemm/grouped_gemm_fp16.cpp
+++ b/test/grouped_gemm/grouped_gemm_fp16.cpp
@@ -104,7 +104,7 @@ bool TestGroupedGemm(DeviceGroupedGemmPtr_& groupedGemmPtr)
    b_tensors_device.reserve(group_count);
    c_tensors_device.reserve(group_count);
-    for(int i = 0; i < gemm_shapes.size(); i++)
+    for(std::size_t i = 0; i < gemm_shapes.size(); i++)
    {
        a_tensors.emplace_back(Tensor<ADataType>(f_host_tensor_descriptor(
            gemm_shapes[i].M, gemm_shapes[i].K, gemm_shapes[i].StrideA, ALayout{})));
@@ -119,7 +119,7 @@ bool TestGroupedGemm(DeviceGroupedGemmPtr_& groupedGemmPtr)
        b_tensors[i].GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
    }
-    for(int i = 0; i < gemm_shapes.size(); i++)
+    for(std::size_t i = 0; i < gemm_shapes.size(); i++)
    {
        a_tensors_device.emplace_back(
            std::make_unique<DeviceMem>(sizeof(ADataType) * a_tensors[i].mDesc.GetElementSize()));
@@ -147,12 +147,17 @@ bool TestGroupedGemm(DeviceGroupedGemmPtr_& groupedGemmPtr)
    invoker_ptr->Run(argument_ptr.get());
-    for(int i = 0; i < gemm_shapes.size(); i++)
+    for(std::size_t i = 0; i < gemm_shapes.size(); i++)
    {
        c_tensors_device[i]->FromDevice(c_device_tensors[i].mData.data());
-        using ReferenceGemmInstance = ck::tensor_operation::host::
+        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
-            ReferenceGemm<ADataType, BDataType, CDataType, PassThrough, PassThrough, PassThrough>;
+                                                                                BDataType,
+                                                                                CDataType,
+                                                                                AccDataType,
+                                                                                PassThrough,
+                                                                                PassThrough,
+                                                                                PassThrough>;
        auto ref_gemm    = ReferenceGemmInstance{};
        auto ref_invoker = ref_gemm.MakeInvoker();

--- a/test/reduce/reduce_no_index.cpp
+++ b/test/reduce/reduce_no_index.cpp
 #include "getopt.h"
-#include "check_err.hpp"
+#include "host_common_util.hpp"
-#include "device_reduce_instance.hpp"
+#include "profile_reduce_impl.hpp"
-#include "reduction_enums.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "host_reduction.hpp"
-#include "reduce_util.hpp"
 using namespace ck;
-namespace {
-template <index_t Rank, index_t NumReduceDim>
-static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduceDims)
-{
-    assert(NumReduceDim == reduceDims.size());
-    int reduceFlag = 0;
-    // flag the bits for the reduceDims
-    for(int i = 0; i < NumReduceDim; i++)
-    {
-        reduceFlag |= 1 << reduceDims[i];
-    };
-    std::vector<int> invariantDims;
-    // collect invariant dimensions
-    for(int i = 0; i < Rank; i++)
-        if((reduceFlag & (1 << i)) == 0)
-        {
-            invariantDims.push_back(i);
-        };
-    return invariantDims;
-};
-constexpr int Rank = 4;
-constexpr ReduceTensorOp ReduceOpId      = ReduceTensorOp::AVG;
-constexpr NanPropagation NanOpt          = NanPropagation::PROPAGATE_NAN;
-constexpr bool PropagateNan              = false;
-constexpr ReduceTensorIndices IndicesOpt = ReduceTensorIndices::NO_INDICES;
-constexpr bool NeedIndices               = false;
-template <typename InDataType,
-          typename AccDataType,
-          typename OutDataType,
-          int Rank,
-          int NumReduceDim>
-bool test_reduce_no_index_impl(int init_method,
-                               const std::vector<size_t>& inLengths,
-                               const std::vector<int>& reduceDims,
-                               float alpha,
-                               float beta)
-{
-    using namespace ck::tensor_operation::device;
-    using namespace ck::tensor_operation::device::device_reduce_instance;
-    using namespace ck::host_reduce;
-    constexpr bool out_support_atomic_add = std::is_same<OutDataType, float>::value;
-    constexpr bool op_support_atomic_add  = true;
-    constexpr bool use_atomic_add         = (out_support_atomic_add && op_support_atomic_add);
-    Tensor<InDataType> in(inLengths);
-    std::vector<size_t> outLengths;
-    const auto invariantDims = get_invariant_dims<Rank, NumReduceDim>(reduceDims);
-    if(reduceDims.size() == Rank)
-        outLengths.push_back(1);
-    else
-        for(auto dim : invariantDims)
-            outLengths.push_back(inLengths[dim]);
-    Tensor<OutDataType> out_ref(outLengths);
-    Tensor<OutDataType> out(outLengths);
-    // only used when the OutDataType is bhalf_t
-    Tensor<float> out_ref_fp32(outLengths);
-    Tensor<float> out_fp32(outLengths);
-    auto inStrides  = in.mDesc.GetStrides();
-    auto outStrides = out.mDesc.GetStrides();
-    size_t invariant_total_length = out.mDesc.GetElementSize();
-    size_t reduce_total_length    = in.mDesc.GetElementSize() / invariant_total_length;
-    std::size_t num_thread = 1;
-    switch(init_method)
-    {
-    case 0: break;
-    case 1:
-        in.GenerateTensorValue(GeneratorTensor_1<InDataType>{1}, num_thread);
-        if(beta != 0.0f)
-            out_ref.GenerateTensorValue(GeneratorTensor_1<InDataType>{1}, num_thread);
-        break;
-    case 2:
-        in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}, num_thread);
-        if(beta != 0.0f)
-            out_ref.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}, num_thread);
-        break;
-    default:
-        in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0}, num_thread);
-        if(beta != 0.0f)
-            out_ref.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0}, num_thread);
-    }
-    if(beta != 0.0f)
-        for(size_t i = 0; i < out_ref.mDesc.GetElementSpace(); i++)
-            out.mData[i] = out_ref.mData[i];
-    // these buffers are usually provided by the user application
-    DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpace());
-    DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpace());
-    in_dev.ToDevice(in.mData.data());
-    if(beta != 0.0f)
-        out_dev.ToDevice(out.mData.data());
-    using InElementwiseOperation_0 =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation;
-    using AccElementwiseOperation_0 =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::
-            AccElementwiseOperation;
-    using InElementwiseOperation_1 =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::
-            InElementwiseOperation;
-    using AccElementwiseOperation_1 =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::
-            AccElementwiseOperation;
-    using InElementwiseOperation_2 =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::
-            InElementwiseOperation;
-    using AccElementwiseOperation_2 =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::
-            AccElementwiseOperation;
-    using DeviceReduceInstPtr0 =
-        DeviceReducePtr<InElementwiseOperation_0, AccElementwiseOperation_0>;
-    using DeviceReduceInstPtr1 =
-        DeviceReducePtr<InElementwiseOperation_1, AccElementwiseOperation_1>;
-    using DeviceReduceInstPtr2 =
-        DeviceReducePtr<InElementwiseOperation_2, AccElementwiseOperation_2>;
-    std::vector<DeviceReduceInstPtr0> reduce0_ptrs;
-    std::vector<DeviceReduceInstPtr1> reduce1_ptrs;
-    std::vector<DeviceReduceInstPtr2> reduce2_ptrs;
-    add_device_reduce_instance_threadwise<InDataType,
-                                          AccDataType,
-                                          OutDataType,
-                                          Rank,
-                                          NumReduceDim,
-                                          ReduceOpId,
-                                          NanOpt,
-                                          IndicesOpt>(reduce0_ptrs);
-    add_device_reduce_instance_blockwise<InDataType,
-                                         AccDataType,
-                                         OutDataType,
-                                         Rank,
-                                         NumReduceDim,
-                                         ReduceOpId,
-                                         NanOpt,
-                                         IndicesOpt>(reduce0_ptrs);
-    if constexpr(use_atomic_add)
-    {
-        add_device_reduce_instance_multiblock_atomic_add<InDataType,
-                                                         AccDataType,
-                                                         OutDataType,
-                                                         Rank,
-                                                         NumReduceDim,
-                                                         ReduceOpId,
-                                                         NanOpt,
-                                                         IndicesOpt>(reduce0_ptrs);
-    }
-    else
-    {
-        add_device_reduce_instance_multiblock_partial_reduce<InDataType,
-                                                             AccDataType,
-                                                             OutDataType,
-                                                             Rank,
-                                                             NumReduceDim,
-                                                             ReduceOpId,
-                                                             NanOpt,
-                                                             IndicesOpt>(reduce1_ptrs);
-    };
-    // used for secondary reduction
-    if constexpr(!use_atomic_add)
-    {
-        add_device_reduce_instance_blockwise_second_call<AccDataType,
-                                                         AccDataType,
-                                                         OutDataType,
-                                                         Rank,
-                                                         NumReduceDim,
-                                                         ReduceOpId,
-                                                         NanOpt,
-                                                         IndicesOpt>(reduce2_ptrs);
-    };
-    if(reduce0_ptrs.empty() && reduce1_ptrs.empty())
-    {
-        throw std::runtime_error("Wrong! No device REDUCE instance found");
-    };
-    bool result = true;
-    ReductionHost<InDataType,
-                  AccDataType,
-                  OutDataType,
-                  ReduceOpId,
-                  Rank,
-                  NumReduceDim,
-                  PropagateNan,
-                  NeedIndices>
-        hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
-    hostReduce.Run(alpha, in.mData.data(), beta, out_ref.mData.data(), nullptr);
-    const auto i_inLengths  = to_int_vector(inLengths);
-    const auto i_inStrides  = to_int_vector(inStrides);
-    const auto i_outLengths = to_int_vector(outLengths);
-    const auto i_outStrides = to_int_vector(outStrides);
-    for(auto& reduce_ptr : reduce0_ptrs)
-    {
-        auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths, reduceDims);
-        DeviceMem ws_dev(wsSizeInBytes);
-        InElementwiseOperation_0 in_elementwise_op_0(static_cast<int32_t>(reduce_total_length));
-        AccElementwiseOperation_0 acc_elementwise_op_0(static_cast<int32_t>(reduce_total_length));
-        auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths,
-                                                            i_inStrides,
-                                                            i_outLengths,
-                                                            i_outStrides,
-                                                            reduceDims,
-                                                            alpha,
-                                                            beta,
-                                                            in_dev.GetDeviceBuffer(),
-                                                            out_dev.GetDeviceBuffer(),
-                                                            nullptr,
-                                                            ws_dev.GetDeviceBuffer(),
-                                                            in_elementwise_op_0,
-                                                            acc_elementwise_op_0);
-        if(!reduce_ptr->IsSupportedArgument(argument_ptr.get()))
-            continue;
-        auto invoker_ptr = reduce_ptr->MakeInvokerPointer();
-        (void)invoker_ptr->Run(argument_ptr.get());
-        out_dev.FromDevice(out.mData.data());
-        bool single_result = true;
-        if constexpr(std::is_same<OutDataType, ck::half_t>::value ||
-                     std::is_same<OutDataType, ck::bhalf_t>::value)
-        {
-            reduce_util::to_f32_vector(out, out_fp32);
-            reduce_util::to_f32_vector(out_ref, out_ref_fp32);
-            single_result = ck::utils::check_err(
-                out_fp32.mData, out_ref_fp32.mData, "Error: incorrect data result!");
-        }
-        else
-        {
-            single_result =
-                ck::utils::check_err(out.mData, out_ref.mData, "Error: incorrect data result!");
-        };
-        if(!single_result)
-        {
-            std::cout << "Fail Info: " << reduce_ptr->GetTypeString() << std::endl;
-            result = false;
-        }
-    };
-    for(auto& reduce_ptr : reduce1_ptrs)
-    {
-        auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths, reduceDims);
-        DeviceMem ws_dev(wsSizeInBytes);
-        InElementwiseOperation_1 in_elementwise_op_1(static_cast<int32_t>(reduce_total_length));
-        AccElementwiseOperation_1 acc_elementwise_op_1(static_cast<int32_t>(reduce_total_length));
-        auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths,
-                                                            i_inStrides,
-                                                            i_outLengths,
-                                                            i_outStrides,
-                                                            reduceDims,
-                                                            alpha,
-                                                            beta,
-                                                            in_dev.GetDeviceBuffer(),
-                                                            out_dev.GetDeviceBuffer(),
-                                                            nullptr,
-                                                            ws_dev.GetDeviceBuffer(),
-                                                            in_elementwise_op_1,
-                                                            acc_elementwise_op_1);
-        if(!reduce_ptr->IsSupportedArgument(argument_ptr.get()))
-            continue;
-        auto invoker_ptr = reduce_ptr->MakeInvokerPointer();
-        (void)invoker_ptr->Run(argument_ptr.get());
-        std::vector<int> inLengths2 = reduce_ptr->GetWorkspace2dLengths(argument_ptr.get());
-        std::vector<int> inStrides2{inLengths2[1], 1};
-        for(auto& reduce2_ptr : reduce2_ptrs)
-        {
-            InElementwiseOperation_2 in_elementwise_op_2(static_cast<int32_t>(reduce_total_length));
-            AccElementwiseOperation_2 acc_elementwise_op_2(
-                static_cast<int32_t>(reduce_total_length));
-            auto argument2_ptr = reduce2_ptr->MakeArgumentPointer(inLengths2,
-                                                                  inStrides2,
-                                                                  i_outLengths,
-                                                                  i_outStrides,
-                                                                  reduceDims,
-                                                                  alpha,
-                                                                  beta,
-                                                                  ws_dev.GetDeviceBuffer(),
-                                                                  out_dev.GetDeviceBuffer(),
-                                                                  nullptr,
-                                                                  ws_dev.GetDeviceBuffer(),
-                                                                  in_elementwise_op_2,
-                                                                  acc_elementwise_op_2);
-            if(!reduce2_ptr->IsSupportedArgument(argument2_ptr.get()))
-                continue;
-            std::string reduce2_name = reduce2_ptr->GetTypeString();
-            auto invoker2_ptr = reduce2_ptr->MakeInvokerPointer();
-            (void)invoker2_ptr->Run(argument2_ptr.get());
-            out_dev.FromDevice(out.mData.data());
-            bool single_result = true;
-            if constexpr(std::is_same<OutDataType, ck::half_t>::value ||
-                         std::is_same<OutDataType, ck::bhalf_t>::value)
-            {
-                reduce_util::to_f32_vector(out, out_fp32);
-                reduce_util::to_f32_vector(out_ref, out_ref_fp32);
-                single_result = ck::utils::check_err(
-                    out_fp32.mData, out_ref_fp32.mData, "Error: incorrect data result!");
-            }
-            else
-            {
-                single_result =
-                    ck::utils::check_err(out.mData, out_ref.mData, "Error: incorrect data result!");
-            };
-            if(!single_result)
-            {
-                std::cout << "Fail Info: " << reduce_ptr->GetTypeString() << " => "
-                          << reduce2_ptr->GetTypeString() << std::endl;
-                result = false;
-            }
-        };
-    };
-    return (result);
-};
-} // anonymous namespace
 static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
                                       {"reduceDimensions", required_argument, nullptr, 'R'},
                                       {"scales", required_argument, nullptr, 'S'},
@@ -387,48 +13,6 @@ static struct option long_options[] = {{"inLengths", required_argument, nullptr,
 class SimpleAppArgs
 {
-    template <typename T>
-    static T getSingleValueFromString(const std::string& valueStr)
-    {
-        std::istringstream iss(valueStr);
-        T ret;
-        iss >> ret;
-        return (ret);
-    };
-    template <typename T>
-    static std::vector<T> getTypeValuesFromString(const char* cstr_values)
-    {
-        std::string valuesStr(cstr_values);
-        std::vector<T> values;
-        std::size_t pos = 0;
-        std::size_t new_pos;
-        new_pos = valuesStr.find(',', pos);
-        while(new_pos != std::string::npos)
-        {
-            const std::string sliceStr = valuesStr.substr(pos, new_pos - pos);
-            T val = getSingleValueFromString<T>(sliceStr);
-            values.push_back(val);
-            pos     = new_pos + 1;
-            new_pos = valuesStr.find(',', pos);
-        };
-        std::string sliceStr = valuesStr.substr(pos);
-        T val                = getSingleValueFromString<T>(sliceStr);
-        values.push_back(val);
-        return (values);
-    };
    private:
    int option_index = 0;
@@ -460,7 +44,9 @@ class SimpleAppArgs
    int processArgs(int argc, char* argv[])
    {
-        unsigned int ch;
+        using ck::host_common::getTypeValuesFromString;
+        int ch;
        while(1)
        {
@@ -514,7 +100,7 @@ class SimpleAppArgs
           (reduceDims.size() != 1 && reduceDims.size() != 3 && reduceDims.size() != 4))
            return (-1);
-        if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5)
+        if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5 && data_type != 6)
            return (-1);
        return (0);
@@ -525,87 +111,92 @@ bool test_reduce_no_index(int data_type,
                          int init_method,
                          std::vector<int> reduceDims,
                          std::vector<size_t> inLengths,
+                          ReduceTensorOp reduceOpId,
+                          bool propagateNan,
                          float alpha,
                          float beta)
 {
+    using ck::profiler::profile_reduce_impl;
    bool result = true;
    if(data_type == 0)
    {
-        switch(reduceDims.size())
+        result = profile_reduce_impl<float, float, float>(true,
-        {
+                                                          init_method,
-        case 1:
+                                                          false,
-            result = test_reduce_no_index_impl<float, float, float, Rank, 1>(
+                                                          false,
-                init_method, inLengths, reduceDims, alpha, beta);
+                                                          inLengths,
-            break;
+                                                          reduceDims,
-        case 3:
+                                                          reduceOpId,
-            result = test_reduce_no_index_impl<float, float, float, Rank, 3>(
+                                                          propagateNan,
-                init_method, inLengths, reduceDims, alpha, beta);
+                                                          false,
-            break;
+                                                          alpha,
-        case 4:
+                                                          beta);
-            result = test_reduce_no_index_impl<float, float, float, Rank, 4>(
-                init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        };
    }
    else if(data_type == 1)
    {
-        switch(reduceDims.size())
+        result = profile_reduce_impl<ck::half_t, float, ck::half_t>(true,
-        {
+                                                                    init_method,
-        case 1:
+                                                                    false,
-            result = test_reduce_no_index_impl<ck::half_t, float, ck::half_t, Rank, 1>(
+                                                                    false,
-                init_method, inLengths, reduceDims, alpha, beta);
+                                                                    inLengths,
-            break;
+                                                                    reduceDims,
-        case 3:
+                                                                    reduceOpId,
-            result = test_reduce_no_index_impl<ck::half_t, float, ck::half_t, Rank, 3>(
+                                                                    propagateNan,
-                init_method, inLengths, reduceDims, alpha, beta);
+                                                                    false,
-            break;
+                                                                    alpha,
-        case 4:
+                                                                    beta);
-            result = test_reduce_no_index_impl<ck::half_t, float, ck::half_t, Rank, 4>(
-                init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        };
    }
    else if(data_type == 3)
    {
-        switch(reduceDims.size())
+        result = profile_reduce_impl<int8_t, int32_t, int8_t>(true,
-        {
+                                                              init_method,
-        case 1:
+                                                              false,
-            result = test_reduce_no_index_impl<int8_t, int32_t, int8_t, Rank, 1>(
+                                                              false,
-                init_method, inLengths, reduceDims, alpha, beta);
+                                                              inLengths,
-            break;
+                                                              reduceDims,
-        case 3:
+                                                              reduceOpId,
-            result = test_reduce_no_index_impl<int8_t, int32_t, int8_t, Rank, 3>(
+                                                              propagateNan,
-                init_method, inLengths, reduceDims, alpha, beta);
+                                                              false,
-            break;
+                                                              alpha,
-        case 4:
+                                                              beta);
-            result = test_reduce_no_index_impl<int8_t, int32_t, int8_t, Rank, 4>(
-                init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        };
    }
    else if(data_type == 5)
    {
-        switch(reduceDims.size())
+        result = profile_reduce_impl<ck::bhalf_t, float, ck::bhalf_t>(true,
-        {
+                                                                      init_method,
-        case 1:
+                                                                      false,
-            result = test_reduce_no_index_impl<ck::bhalf_t, float, ck::bhalf_t, Rank, 1>(
+                                                                      false,
-                init_method, inLengths, reduceDims, alpha, beta);
+                                                                      inLengths,
-            break;
+                                                                      reduceDims,
-        case 3:
+                                                                      reduceOpId,
-            result = test_reduce_no_index_impl<ck::bhalf_t, float, ck::bhalf_t, Rank, 3>(
+                                                                      propagateNan,
-                init_method, inLengths, reduceDims, alpha, beta);
+                                                                      false,
-            break;
+                                                                      alpha,
-        case 4:
+                                                                      beta);
-            result = test_reduce_no_index_impl<ck::bhalf_t, float, ck::bhalf_t, Rank, 4>(
+    }
-                init_method, inLengths, reduceDims, alpha, beta);
+    else if(data_type == 6)
-            break;
+    {
-        };
+        result = profile_reduce_impl<double, double, double>(true,
+                                                             init_method,
+                                                             false,
+                                                             false,
+                                                             inLengths,
+                                                             reduceDims,
+                                                             reduceOpId,
+                                                             propagateNan,
+                                                             false,
+                                                             alpha,
+                                                             beta);
    }
    return (result);
 };
+constexpr ReduceTensorOp reduceOpId = ReduceTensorOp::AVG;
+constexpr bool propagateNan         = false;
 int main(int argc, char* argv[])
 {
    SimpleAppArgs args;
@@ -621,8 +212,14 @@ int main(int argc, char* argv[])
            {0, 1, 2, 3}, {0, 1, 2}, {1, 2, 3}, {0, 1, 3}, {0, 2, 3}, {0}, {1}, {2}, {3}};
        for(auto& reduceDims : v_reduceDims)
-            result = result && test_reduce_no_index(
+            result = result && test_reduce_no_index(data_type,
-                                   data_type, init_method, reduceDims, inLengths, 1.0f, 0.0f);
+                                                    init_method,
+                                                    reduceDims,
+                                                    inLengths,
+                                                    reduceOpId,
+                                                    propagateNan,
+                                                    1.0f,
+                                                    0.0f);
    }
    else
    {
@@ -636,6 +233,8 @@ int main(int argc, char* argv[])
                                      args.init_method,
                                      args.reduceDims,
                                      args.inLengths,
+                                      reduceOpId,
+                                      propagateNan,
                                      args.scales[0],
                                      args.scales[1]);
    }

--- a/test/reduce/reduce_util.hpp
+++ b/test/reduce/reduce_util.hpp
-#ifndef REDUCE_UTILS_HPP
-#define REDUCE_UTILS_HPP
-#include "data_type.hpp"
-namespace ck {
-namespace reduce_util {
-template <typename T>
-void to_f32_vector(const Tensor<T>& src, Tensor<float>& dst)
-{
-    for(int i = 0; i < src.mData.size(); ++i)
-        dst.mData[i] = type_convert<float>(src.mData[i]);
-}
-} // namespace reduce_util
-} // namespace ck
-#endif
--- a/test/reduce/reduce_with_index.cpp
+++ b/test/reduce/reduce_with_index.cpp
 #include "getopt.h"
-#include "device_reduce_instance.hpp"
-#include "reduction_enums.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "host_reduction.hpp"
-#include "check_err.hpp"
-#include "reduce_util.hpp"
-using namespace ck;
+#include "host_common_util.hpp"
+#include "profile_reduce_impl.hpp"
-namespace {
-template <index_t Rank, index_t NumReduceDim>
-static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduceDims)
-{
-    assert(NumReduceDim == reduceDims.size());
-    int reduceFlag = 0;
-    // flag the bits for the reduceDims
-    for(int i = 0; i < NumReduceDim; i++)
-    {
-        reduceFlag |= 1 << reduceDims[i];
-    };
-    std::vector<int> invariantDims;
-    // collect invariant dimensions
-    for(int i = 0; i < Rank; i++)
-        if((reduceFlag & (1 << i)) == 0)
-        {
-            invariantDims.push_back(i);
-        };
-    return invariantDims;
-};
-constexpr int Rank = 4;
-constexpr ReduceTensorOp ReduceOpId      = ReduceTensorOp::AMAX;
-constexpr NanPropagation NanOpt          = NanPropagation::PROPAGATE_NAN;
-constexpr bool PropagateNan              = false;
-constexpr ReduceTensorIndices IndicesOpt = ReduceTensorIndices::FLATTENED_INDICES;
-constexpr bool NeedIndices               = true;
-template <typename InDataType,
-          typename AccDataType,
-          typename OutDataType,
-          int Rank,
-          int NumReduceDim>
-bool test_reduce_with_index_impl(int init_method,
-                                 const std::vector<size_t>& inLengths,
-                                 const std::vector<int>& reduceDims,
-                                 float alpha,
-                                 float beta)
-{
-    using namespace ck::tensor_operation::device;
-    using namespace ck::tensor_operation::device::device_reduce_instance;
-    using namespace ck::host_reduce;
-    Tensor<InDataType> in(inLengths);
-    std::vector<size_t> outLengths;
-    const auto invariantDims = get_invariant_dims<Rank, NumReduceDim>(reduceDims);
-    if(reduceDims.size() == Rank)
-        outLengths.push_back(1);
-    else
-        for(auto dim : invariantDims)
-            outLengths.push_back(inLengths[dim]);
-    Tensor<OutDataType> out_ref(outLengths);
-    Tensor<OutDataType> out(outLengths);
-    Tensor<int32_t> out_indices_ref(outLengths);
-    Tensor<int32_t> out_indices(outLengths);
-    // only used when the OutDataType is bhalf_t
-    Tensor<float> out_ref_fp32(outLengths);
-    Tensor<float> out_fp32(outLengths);
-    auto inStrides  = in.mDesc.GetStrides();
-    auto outStrides = out.mDesc.GetStrides();
-    size_t invariant_total_length = out.mDesc.GetElementSize();
-    size_t reduce_total_length    = in.mDesc.GetElementSize() / invariant_total_length;
-    std::size_t num_thread = 1;
-    switch(init_method)
-    {
-    case 0: break;
-    case 1:
-        in.GenerateTensorValue(GeneratorTensor_1<InDataType>{1}, num_thread);
-        if(beta != 0.0f)
-            out_ref.GenerateTensorValue(GeneratorTensor_1<InDataType>{1}, num_thread);
-        break;
-    case 2:
-        in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}, num_thread);
-        if(beta != 0.0f)
-            out_ref.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}, num_thread);
-        break;
-    default:
-        in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0}, num_thread);
-        if(beta != 0.0f)
-            out_ref.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0}, num_thread);
-    }
-    if(beta != 0.0f)
-        for(size_t i = 0; i < out_ref.mDesc.GetElementSpace(); i++)
-            out.mData[i] = out_ref.mData[i];
-    // these buffers are usually provided by the user application
-    DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpace());
-    DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpace());
-    in_dev.ToDevice(in.mData.data());
-    if(beta != 0.0f)
-        out_dev.ToDevice(out.mData.data());
-    size_t indicesSizeInBytes = NeedIndices ? out.mDesc.GetElementSize() * sizeof(int) : 0;
-    DeviceMem out_indices_dev(indicesSizeInBytes);
-    using InElementwiseOperation_0 =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation;
-    using AccElementwiseOperation_0 =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::
-            AccElementwiseOperation;
-    using InElementwiseOperation_1 =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::
-            InElementwiseOperation;
-    using AccElementwiseOperation_1 =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::
-            AccElementwiseOperation;
-    using InElementwiseOperation_2 =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::
-            InElementwiseOperation;
-    using AccElementwiseOperation_2 =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::
-            AccElementwiseOperation;
-    using DeviceReduceInstPtr0 =
-        DeviceReducePtr<InElementwiseOperation_0, AccElementwiseOperation_0>;
-    using DeviceReduceInstPtr1 =
-        DeviceReducePtr<InElementwiseOperation_1, AccElementwiseOperation_1>;
-    using DeviceReduceInstPtr2 =
-        DeviceReducePtr<InElementwiseOperation_2, AccElementwiseOperation_2>;
-    std::vector<DeviceReduceInstPtr0> reduce0_ptrs;
-    std::vector<DeviceReduceInstPtr1> reduce1_ptrs;
-    std::vector<DeviceReduceInstPtr2> reduce2_ptrs;
-    add_device_reduce_instance_threadwise<InDataType,
-                                          AccDataType,
-                                          OutDataType,
-                                          Rank,
-                                          NumReduceDim,
-                                          ReduceOpId,
-                                          NanOpt,
-                                          IndicesOpt>(reduce0_ptrs);
-    add_device_reduce_instance_blockwise<InDataType,
-                                         AccDataType,
-                                         OutDataType,
-                                         Rank,
-                                         NumReduceDim,
-                                         ReduceOpId,
-                                         NanOpt,
-                                         IndicesOpt>(reduce0_ptrs);
-    add_device_reduce_instance_multiblock_partial_reduce<InDataType,
-                                                         AccDataType,
-                                                         OutDataType,
-                                                         Rank,
-                                                         NumReduceDim,
-                                                         ReduceOpId,
-                                                         NanOpt,
-                                                         IndicesOpt>(reduce1_ptrs);
-    add_device_reduce_instance_blockwise_second_call<AccDataType,
-                                                     AccDataType,
-                                                     OutDataType,
-                                                     Rank,
-                                                     NumReduceDim,
-                                                     ReduceOpId,
-                                                     NanOpt,
-                                                     IndicesOpt>(reduce2_ptrs);
-    if(reduce0_ptrs.empty() && reduce1_ptrs.empty())
-    {
-        throw std::runtime_error("Wrong! No device REDUCE instance found");
-    };
-    bool result = true;
-    ReductionHost<InDataType,
-                  AccDataType,
-                  OutDataType,
-                  ReduceOpId,
-                  Rank,
-                  NumReduceDim,
-                  PropagateNan,
-                  NeedIndices>
-        hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
-    hostReduce.Run(
-        alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data());
-    const auto i_inLengths  = to_int_vector(inLengths);
-    const auto i_inStrides  = to_int_vector(inStrides);
-    const auto i_outLengths = to_int_vector(outLengths);
-    const auto i_outStrides = to_int_vector(outStrides);
-    for(auto& reduce_ptr : reduce0_ptrs)
-    {
-        auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths, reduceDims);
-        DeviceMem ws_dev(wsSizeInBytes);
-        InElementwiseOperation_0 in_elementwise_op_0(static_cast<int32_t>(reduce_total_length));
-        AccElementwiseOperation_0 acc_elementwise_op_0(static_cast<int32_t>(reduce_total_length));
-        auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths,
-                                                            i_inStrides,
-                                                            i_outLengths,
-                                                            i_outStrides,
-                                                            reduceDims,
-                                                            alpha,
-                                                            beta,
-                                                            in_dev.GetDeviceBuffer(),
-                                                            out_dev.GetDeviceBuffer(),
-                                                            out_indices_dev.GetDeviceBuffer(),
-                                                            ws_dev.GetDeviceBuffer(),
-                                                            in_elementwise_op_0,
-                                                            acc_elementwise_op_0);
-        if(!reduce_ptr->IsSupportedArgument(argument_ptr.get()))
-            continue;
-        auto invoker_ptr = reduce_ptr->MakeInvokerPointer();
-        (void)invoker_ptr->Run(argument_ptr.get());
-        out_dev.FromDevice(out.mData.data());
-        bool single_result = true;
-        if constexpr(std::is_same<OutDataType, ck::half_t>::value ||
-                     std::is_same<OutDataType, ck::bhalf_t>::value)
-        {
-            reduce_util::to_f32_vector(out, out_fp32);
-            reduce_util::to_f32_vector(out_ref, out_ref_fp32);
-            single_result = ck::utils::check_err(
-                out_fp32.mData, out_ref_fp32.mData, "Error: incorrect data result!");
-        }
-        else
-        {
-            single_result =
-                ck::utils::check_err(out.mData, out_ref.mData, "Error: incorrect data result!");
-        };
-        if(NeedIndices)
-        {
-            out_indices_dev.FromDevice(out_indices.mData.data());
-            single_result = single_result && ck::utils::check_err(out_indices_ref.mData,
-                                                                  out_indices.mData,
-                                                                  "Error: incorrect index result!");
-        };
-        if(!single_result)
+using namespace ck;
-        {
-            std::cout << "Fail Info: " << reduce_ptr->GetTypeString() << std::endl;
-            result = false;
-        }
-    };
-    for(auto& reduce_ptr : reduce1_ptrs)
-    {
-        auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths, reduceDims);
-        DeviceMem ws_dev(wsSizeInBytes);
-        InElementwiseOperation_1 in_elementwise_op_1(static_cast<int32_t>(reduce_total_length));
-        AccElementwiseOperation_1 acc_elementwise_op_1(static_cast<int32_t>(reduce_total_length));
-        auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths,
-                                                            i_inStrides,
-                                                            i_outLengths,
-                                                            i_outStrides,
-                                                            reduceDims,
-                                                            alpha,
-                                                            beta,
-                                                            in_dev.GetDeviceBuffer(),
-                                                            out_dev.GetDeviceBuffer(),
-                                                            out_indices_dev.GetDeviceBuffer(),
-                                                            ws_dev.GetDeviceBuffer(),
-                                                            in_elementwise_op_1,
-                                                            acc_elementwise_op_1);
-        if(!reduce_ptr->IsSupportedArgument(argument_ptr.get()))
-            continue;
-        std::string reduce_name = reduce_ptr->GetTypeString();
-        auto invoker_ptr = reduce_ptr->MakeInvokerPointer();
-        (void)invoker_ptr->Run(argument_ptr.get());
-        std::vector<int> inLengths2 = reduce_ptr->GetWorkspace2dLengths(argument_ptr.get());
-        std::vector<int> inStrides2{inLengths2[1], 1};
-        for(auto& reduce2_ptr : reduce2_ptrs)
-        {
-            InElementwiseOperation_2 in_elementwise_op_2(static_cast<int32_t>(reduce_total_length));
-            AccElementwiseOperation_2 acc_elementwise_op_2(
-                static_cast<int32_t>(reduce_total_length));
-            auto argument2_ptr = reduce2_ptr->MakeArgumentPointer(inLengths2,
-                                                                  inStrides2,
-                                                                  i_outLengths,
-                                                                  i_outStrides,
-                                                                  reduceDims,
-                                                                  alpha,
-                                                                  beta,
-                                                                  ws_dev.GetDeviceBuffer(),
-                                                                  out_dev.GetDeviceBuffer(),
-                                                                  out_indices_dev.GetDeviceBuffer(),
-                                                                  ws_dev.GetDeviceBuffer(),
-                                                                  in_elementwise_op_2,
-                                                                  acc_elementwise_op_2);
-            if(!reduce2_ptr->IsSupportedArgument(argument2_ptr.get()))
-                continue;
-            std::string reduce2_name = reduce2_ptr->GetTypeString();
-            auto invoker2_ptr = reduce2_ptr->MakeInvokerPointer();
-            (void)invoker2_ptr->Run(argument2_ptr.get());
-            out_dev.FromDevice(out.mData.data());
-            bool single_result = true;
-            if constexpr(std::is_same<OutDataType, ck::half_t>::value ||
-                         std::is_same<OutDataType, ck::bhalf_t>::value)
-            {
-                reduce_util::to_f32_vector(out, out_fp32);
-                reduce_util::to_f32_vector(out_ref, out_ref_fp32);
-                single_result = ck::utils::check_err(
-                    out_fp32.mData, out_ref_fp32.mData, "Error: incorrect data result!");
-            }
-            else
-            {
-                single_result =
-                    ck::utils::check_err(out.mData, out_ref.mData, "Error: incorrect data result!");
-            };
-            if(NeedIndices)
-            {
-                out_indices_dev.FromDevice(out_indices.mData.data());
-                single_result =
-                    single_result && ck::utils::check_err(out_indices_ref.mData,
-                                                          out_indices.mData,
-                                                          "Error: incorrect index result!");
-            };
-            if(!single_result)
-            {
-                std::cout << "Fail Info: " << reduce_ptr->GetTypeString() << " => "
-                          << reduce2_ptr->GetTypeString() << std::endl;
-                result = false;
-            }
-        };
-    };
-    return (result);
-};
-} // anonymous namespace
 static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
                                       {"reduceDimensions", required_argument, nullptr, 'R'},
@@ -390,48 +13,6 @@ static struct option long_options[] = {{"inLengths", required_argument, nullptr,
 class SimpleAppArgs
 {
-    template <typename T>
-    static T getSingleValueFromString(const std::string& valueStr)
-    {
-        std::istringstream iss(valueStr);
-        T ret;
-        iss >> ret;
-        return (ret);
-    };
-    template <typename T>
-    static std::vector<T> getTypeValuesFromString(const char* cstr_values)
-    {
-        std::string valuesStr(cstr_values);
-        std::vector<T> values;
-        std::size_t pos = 0;
-        std::size_t new_pos;
-        new_pos = valuesStr.find(',', pos);
-        while(new_pos != std::string::npos)
-        {
-            const std::string sliceStr = valuesStr.substr(pos, new_pos - pos);
-            T val = getSingleValueFromString<T>(sliceStr);
-            values.push_back(val);
-            pos     = new_pos + 1;
-            new_pos = valuesStr.find(',', pos);
-        };
-        std::string sliceStr = valuesStr.substr(pos);
-        T val                = getSingleValueFromString<T>(sliceStr);
-        values.push_back(val);
-        return (values);
-    };
    private:
    int option_index = 0;
@@ -463,7 +44,9 @@ class SimpleAppArgs
    int processArgs(int argc, char* argv[])
    {
-        unsigned int ch;
+        using ck::host_common::getTypeValuesFromString;
+        int ch;
        while(1)
        {
@@ -517,7 +100,7 @@ class SimpleAppArgs
           (reduceDims.size() != 1 && reduceDims.size() != 3 && reduceDims.size() != 4))
            return (-1);
-        if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5)
+        if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5 && data_type != 6)
            return (-1);
        return (0);
@@ -528,87 +111,92 @@ bool test_reduce_with_index(int data_type,
                            int init_method,
                            std::vector<int> reduceDims,
                            std::vector<size_t> inLengths,
+                            ReduceTensorOp reduceOpId,
+                            bool propagateNan,
                            float alpha,
                            float beta)
 {
+    using ck::profiler::profile_reduce_impl;
    bool result = true;
    if(data_type == 0)
    {
-        switch(reduceDims.size())
+        result = profile_reduce_impl<float, float, float>(true,
-        {
+                                                          init_method,
-        case 1:
+                                                          false,
-            result = test_reduce_with_index_impl<float, float, float, Rank, 1>(
+                                                          false,
-                init_method, inLengths, reduceDims, alpha, beta);
+                                                          inLengths,
-            break;
+                                                          reduceDims,
-        case 3:
+                                                          reduceOpId,
-            result = test_reduce_with_index_impl<float, float, float, Rank, 3>(
+                                                          propagateNan,
-                init_method, inLengths, reduceDims, alpha, beta);
+                                                          true,
-            break;
+                                                          alpha,
-        case 4:
+                                                          beta);
-            result = test_reduce_with_index_impl<float, float, float, Rank, 4>(
-                init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        };
    }
    else if(data_type == 1)
    {
-        switch(reduceDims.size())
+        result = profile_reduce_impl<ck::half_t, ck::half_t, ck::half_t>(true,
-        {
+                                                                         init_method,
-        case 1:
+                                                                         false,
-            result = test_reduce_with_index_impl<ck::half_t, ck::half_t, ck::half_t, Rank, 1>(
+                                                                         false,
-                init_method, inLengths, reduceDims, alpha, beta);
+                                                                         inLengths,
-            break;
+                                                                         reduceDims,
-        case 3:
+                                                                         reduceOpId,
-            result = test_reduce_with_index_impl<ck::half_t, ck::half_t, ck::half_t, Rank, 3>(
+                                                                         propagateNan,
-                init_method, inLengths, reduceDims, alpha, beta);
+                                                                         true,
-            break;
+                                                                         alpha,
-        case 4:
+                                                                         beta);
-            result = test_reduce_with_index_impl<ck::half_t, ck::half_t, ck::half_t, Rank, 4>(
-                init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        };
    }
    else if(data_type == 3)
    {
-        switch(reduceDims.size())
+        result = profile_reduce_impl<int8_t, int8_t, int8_t>(true,
-        {
+                                                             init_method,
-        case 1:
+                                                             false,
-            result = test_reduce_with_index_impl<int8_t, int8_t, int8_t, Rank, 1>(
+                                                             false,
-                init_method, inLengths, reduceDims, alpha, beta);
+                                                             inLengths,
-            break;
+                                                             reduceDims,
-        case 3:
+                                                             reduceOpId,
-            result = test_reduce_with_index_impl<int8_t, int8_t, int8_t, Rank, 3>(
+                                                             propagateNan,
-                init_method, inLengths, reduceDims, alpha, beta);
+                                                             true,
-            break;
+                                                             alpha,
-        case 4:
+                                                             beta);
-            result = test_reduce_with_index_impl<int8_t, int8_t, int8_t, Rank, 4>(
-                init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        };
    }
    else if(data_type == 5)
    {
-        switch(reduceDims.size())
+        result = profile_reduce_impl<ck::bhalf_t, float, ck::bhalf_t>(true,
-        {
+                                                                      init_method,
-        case 1:
+                                                                      false,
-            result = test_reduce_with_index_impl<ck::bhalf_t, float, ck::bhalf_t, Rank, 1>(
+                                                                      false,
-                init_method, inLengths, reduceDims, alpha, beta);
+                                                                      inLengths,
-            break;
+                                                                      reduceDims,
-        case 3:
+                                                                      reduceOpId,
-            result = test_reduce_with_index_impl<ck::bhalf_t, float, ck::bhalf_t, Rank, 3>(
+                                                                      propagateNan,
-                init_method, inLengths, reduceDims, alpha, beta);
+                                                                      true,
-            break;
+                                                                      alpha,
-        case 4:
+                                                                      beta);
-            result = test_reduce_with_index_impl<ck::bhalf_t, float, ck::bhalf_t, Rank, 4>(
+    }
-                init_method, inLengths, reduceDims, alpha, beta);
+    else if(data_type == 6)
-            break;
+    {
-        };
+        result = profile_reduce_impl<double, double, double>(true,
+                                                             init_method,
+                                                             false,
+                                                             false,
+                                                             inLengths,
+                                                             reduceDims,
+                                                             reduceOpId,
+                                                             propagateNan,
+                                                             true,
+                                                             alpha,
+                                                             beta);
    }
    return (result);
 };
+constexpr ReduceTensorOp reduceOpId = ReduceTensorOp::AMAX;
+constexpr bool propagateNan         = false;
 int main(int argc, char* argv[])
 {
    SimpleAppArgs args;
@@ -624,8 +212,14 @@ int main(int argc, char* argv[])
            {0, 1, 2, 3}, {0, 1, 2}, {1, 2, 3}, {0, 1, 3}, {0, 2, 3}, {0}, {1}, {2}, {3}};
        for(auto& reduceDims : v_reduceDims)
-            result = result && test_reduce_with_index(
+            result = result && test_reduce_with_index(data_type,
-                                   data_type, init_method, reduceDims, inLengths, 1.0f, 0.0f);
+                                                      init_method,
+                                                      reduceDims,
+                                                      inLengths,
+                                                      reduceOpId,
+                                                      propagateNan,
+                                                      1.0f,
+                                                      0.0f);
    }
    else
    {
@@ -639,6 +233,8 @@ int main(int argc, char* argv[])
                                        args.init_method,
                                        args.reduceDims,
                                        args.inLengths,
+                                        reduceOpId,
+                                        propagateNan,
                                        args.scales[0],
                                        args.scales[1]);
    }

--- a/test/reference_conv_fwd/CMakeLists.txt
+++ b/test/reference_conv_fwd/CMakeLists.txt
 add_gtest_executable(test_reference_conv_fwd reference_conv_fwd.cpp)
-target_link_libraries(test_reference_conv_fwd PRIVATE host_tensor conv_fwd_util)
+target_link_libraries(test_reference_conv_fwd PRIVATE host_tensor conv_util)
--- a/test/reference_conv_fwd/reference_conv_fwd.cpp
+++ b/test/reference_conv_fwd/reference_conv_fwd.cpp
@@ -8,7 +8,7 @@
 #include "check_err.hpp"
 #include "config.hpp"
-#include "conv_fwd_util.hpp"
+#include "conv_util.hpp"
 #include "element_wise_operation.hpp"
 #include "fill.hpp"
 #include "host_tensor.hpp"
@@ -34,21 +34,21 @@ run_reference_convolution_forward(const ck::utils::conv::ConvParams& params,
                                  const FillInputOp& fill_input_op     = FillInputOp{},
                                  const FillWeightsOp& fill_weights_op = FillWeightsOp{0.5f})
 {
-    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N),
+    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N_),
-                                        static_cast<std::size_t>(params.C)};
+                                        static_cast<std::size_t>(params.C_)};
    input_dims.insert(std::end(input_dims),
-                      std::begin(params.input_spatial_lengths),
+                      std::begin(params.input_spatial_lengths_),
-                      std::end(params.input_spatial_lengths));
+                      std::end(params.input_spatial_lengths_));
-    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K),
+    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K_),
-                                         static_cast<std::size_t>(params.C)};
+                                         static_cast<std::size_t>(params.C_)};
    filter_dims.insert(std::end(filter_dims),
-                       std::begin(params.filter_spatial_lengths),
+                       std::begin(params.filter_spatial_lengths_),
-                       std::end(params.filter_spatial_lengths));
+                       std::end(params.filter_spatial_lengths_));
    const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
-    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N),
+    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N_),
-                                         static_cast<std::size_t>(params.K)};
+                                         static_cast<std::size_t>(params.K_)};
    output_dims.insert(std::end(output_dims),
                       std::begin(output_spatial_lengths),
                       std::end(output_spatial_lengths));
@@ -74,10 +74,10 @@ run_reference_convolution_forward(const ck::utils::conv::ConvParams& params,
    auto ref_argument = ref_conv.MakeArgument(input,
                                              weights,
                                              host_output,
-                                              params.conv_filter_strides,
+                                              params.conv_filter_strides_,
-                                              params.conv_filter_dilations,
+                                              params.conv_filter_dilations_,
-                                              params.input_left_pads,
+                                              params.input_left_pads_,
-                                              params.input_right_pads,
+                                              params.input_right_pads_,
                                              InElementOp{},
                                              WeiElementOp{},
                                              OutElementOp{});
@@ -91,15 +91,15 @@ run_reference_convolution_forward(const ck::utils::conv::ConvParams& params,
 TEST(ReferenceConvolutionFWD, Conv2DNHWC)
 {
    ck::utils::conv::ConvParams params;
-    params.N                      = 1;
+    params.N_                      = 1;
-    params.K                      = 1;
+    params.K_                      = 1;
-    params.C                      = 2;
+    params.C_                      = 2;
-    params.filter_spatial_lengths = std::vector<ck::index_t>{3, 3};
+    params.filter_spatial_lengths_ = std::vector<ck::index_t>{3, 3};
-    params.input_spatial_lengths  = std::vector<ck::index_t>{6, 6};
+    params.input_spatial_lengths_  = std::vector<ck::index_t>{6, 6};
-    params.conv_filter_strides    = std::vector<ck::index_t>{1, 1};
+    params.conv_filter_strides_    = std::vector<ck::index_t>{1, 1};
-    params.conv_filter_dilations  = std::vector<ck::index_t>{1, 1};
+    params.conv_filter_dilations_  = std::vector<ck::index_t>{1, 1};
-    params.input_left_pads        = std::vector<ck::index_t>{0, 0};
+    params.input_left_pads_        = std::vector<ck::index_t>{0, 0};
-    params.input_right_pads       = std::vector<ck::index_t>{0, 0};
+    params.input_right_pads_       = std::vector<ck::index_t>{0, 0};
    auto out_tensor = run_reference_convolution_forward<2>(params);
    std::vector<std::size_t> ref_dims{1, 1, 4, 4};
@@ -127,15 +127,15 @@ TEST(ReferenceConvolutionFWD, Conv2DNHWC)
 TEST(ReferenceConvolutionFWD, Conv2DNHWCStridesDilationsPadding)
 {
    ck::utils::conv::ConvParams params;
-    params.N                      = 1;
+    params.N_                      = 1;
-    params.K                      = 2;
+    params.K_                      = 2;
-    params.C                      = 2;
+    params.C_                      = 2;
-    params.filter_spatial_lengths = std::vector<ck::index_t>{3, 3};
+    params.filter_spatial_lengths_ = std::vector<ck::index_t>{3, 3};
-    params.input_spatial_lengths  = std::vector<ck::index_t>{12, 12};
+    params.input_spatial_lengths_  = std::vector<ck::index_t>{12, 12};
-    params.conv_filter_strides    = std::vector<ck::index_t>{2, 2};
+    params.conv_filter_strides_    = std::vector<ck::index_t>{2, 2};
-    params.conv_filter_dilations  = std::vector<ck::index_t>{2, 2};
+    params.conv_filter_dilations_  = std::vector<ck::index_t>{2, 2};
-    params.input_left_pads        = std::vector<ck::index_t>{1, 1};
+    params.input_left_pads_        = std::vector<ck::index_t>{1, 1};
-    params.input_right_pads       = std::vector<ck::index_t>{1, 1};
+    params.input_right_pads_       = std::vector<ck::index_t>{1, 1};
    auto out_tensor                   = run_reference_convolution_forward<2>(params);
    std::vector<std::size_t> ref_dims = std::vector<std::size_t>{1, 2, 5, 5};
@@ -153,16 +153,16 @@ TEST(ReferenceConvolutionFWD, Conv2DNHWCStridesDilationsPadding)
 TEST(ReferenceConvolutionFWD, Conv1DNWC)
 {
    ck::utils::conv::ConvParams params;
-    params.num_dim_spatial        = 1;
+    params.num_dim_spatial_        = 1;
-    params.N                      = 1;
+    params.N_                      = 1;
-    params.K                      = 1;
+    params.K_                      = 1;
-    params.C                      = 2;
+    params.C_                      = 2;
-    params.filter_spatial_lengths = std::vector<ck::index_t>{3};
+    params.filter_spatial_lengths_ = std::vector<ck::index_t>{3};
-    params.input_spatial_lengths  = std::vector<ck::index_t>{6};
+    params.input_spatial_lengths_  = std::vector<ck::index_t>{6};
-    params.conv_filter_strides    = std::vector<ck::index_t>{1};
+    params.conv_filter_strides_    = std::vector<ck::index_t>{1};
-    params.conv_filter_dilations  = std::vector<ck::index_t>{1};
+    params.conv_filter_dilations_  = std::vector<ck::index_t>{1};
-    params.input_left_pads        = std::vector<ck::index_t>{0};
+    params.input_left_pads_        = std::vector<ck::index_t>{0};
-    params.input_right_pads       = std::vector<ck::index_t>{0};
+    params.input_right_pads_       = std::vector<ck::index_t>{0};
    auto out_tensor =
        run_reference_convolution_forward<1,
@@ -182,16 +182,16 @@ TEST(ReferenceConvolutionFWD, Conv1DNWC)
 TEST(ReferenceConvolutionFWD, Conv1DNWCStridesDilationsPadding)
 {
    ck::utils::conv::ConvParams params;
-    params.num_dim_spatial        = 1;
+    params.num_dim_spatial_        = 1;
-    params.N                      = 1;
+    params.N_                      = 1;
-    params.K                      = 2;
+    params.K_                      = 2;
-    params.C                      = 2;
+    params.C_                      = 2;
-    params.filter_spatial_lengths = std::vector<ck::index_t>{3};
+    params.filter_spatial_lengths_ = std::vector<ck::index_t>{3};
-    params.input_spatial_lengths  = std::vector<ck::index_t>{12};
+    params.input_spatial_lengths_  = std::vector<ck::index_t>{12};
-    params.conv_filter_strides    = std::vector<ck::index_t>{2};
+    params.conv_filter_strides_    = std::vector<ck::index_t>{2};
-    params.conv_filter_dilations  = std::vector<ck::index_t>{2};
+    params.conv_filter_dilations_  = std::vector<ck::index_t>{2};
-    params.input_left_pads        = std::vector<ck::index_t>{1};
+    params.input_left_pads_        = std::vector<ck::index_t>{1};
-    params.input_right_pads       = std::vector<ck::index_t>{1};
+    params.input_right_pads_       = std::vector<ck::index_t>{1};
    auto out_tensor =
        run_reference_convolution_forward<1,
@@ -211,16 +211,16 @@ TEST(ReferenceConvolutionFWD, Conv1DNWCStridesDilationsPadding)
 TEST(ReferenceConvolutionFWD, Conv1DNWCSameOutputSize)
 {
    ck::utils::conv::ConvParams params;
-    params.num_dim_spatial        = 1;
+    params.num_dim_spatial_        = 1;
-    params.N                      = 2;
+    params.N_                      = 2;
-    params.K                      = 16;
+    params.K_                      = 16;
-    params.C                      = 4;
+    params.C_                      = 4;
-    params.filter_spatial_lengths = std::vector<ck::index_t>{3};
+    params.filter_spatial_lengths_ = std::vector<ck::index_t>{3};
-    params.input_spatial_lengths  = std::vector<ck::index_t>{16};
+    params.input_spatial_lengths_  = std::vector<ck::index_t>{16};
-    params.conv_filter_strides    = std::vector<ck::index_t>{1};
+    params.conv_filter_strides_    = std::vector<ck::index_t>{1};
-    params.conv_filter_dilations  = std::vector<ck::index_t>{1};
+    params.conv_filter_dilations_  = std::vector<ck::index_t>{1};
-    params.input_left_pads        = std::vector<ck::index_t>{1};
+    params.input_left_pads_        = std::vector<ck::index_t>{1};
-    params.input_right_pads       = std::vector<ck::index_t>{1};
+    params.input_right_pads_       = std::vector<ck::index_t>{1};
    auto out_tensor2 = run_reference_convolution_forward<1,
                                                         float,
@@ -305,16 +305,16 @@ TEST(ReferenceConvolutionFWD, Conv1DNWCSameOutputSize)
 TEST(ReferenceConvolutionFWD, Conv3DNCDHW)
 {
    ck::utils::conv::ConvParams params;
-    params.num_dim_spatial        = 3;
+    params.num_dim_spatial_        = 3;
-    params.N                      = 1;
+    params.N_                      = 1;
-    params.K                      = 1;
+    params.K_                      = 1;
-    params.C                      = 2;
+    params.C_                      = 2;
-    params.filter_spatial_lengths = std::vector<ck::index_t>{3, 3, 3};
+    params.filter_spatial_lengths_ = std::vector<ck::index_t>{3, 3, 3};
-    params.input_spatial_lengths  = std::vector<ck::index_t>{6, 6, 6};
+    params.input_spatial_lengths_  = std::vector<ck::index_t>{6, 6, 6};
-    params.conv_filter_strides    = std::vector<ck::index_t>{1, 1, 1};
+    params.conv_filter_strides_    = std::vector<ck::index_t>{1, 1, 1};
-    params.conv_filter_dilations  = std::vector<ck::index_t>{1, 1, 1};
+    params.conv_filter_dilations_  = std::vector<ck::index_t>{1, 1, 1};
-    params.input_left_pads        = std::vector<ck::index_t>{0, 0, 0};
+    params.input_left_pads_        = std::vector<ck::index_t>{0, 0, 0};
-    params.input_right_pads       = std::vector<ck::index_t>{0, 0, 0};
+    params.input_right_pads_       = std::vector<ck::index_t>{0, 0, 0};
    auto out_tensor = run_reference_convolution_forward<3,
                                                        float,
@@ -344,16 +344,16 @@ TEST(ReferenceConvolutionFWD, Conv3DNCDHW)
 TEST(ReferenceConvolutionFWD, Conv3DNCDHWStridesDilations)
 {
    ck::utils::conv::ConvParams params;
-    params.num_dim_spatial        = 3;
+    params.num_dim_spatial_        = 3;
-    params.N                      = 1;
+    params.N_                      = 1;
-    params.K                      = 2;
+    params.K_                      = 2;
-    params.C                      = 2;
+    params.C_                      = 2;
-    params.filter_spatial_lengths = std::vector<ck::index_t>{3, 3, 3};
+    params.filter_spatial_lengths_ = std::vector<ck::index_t>{3, 3, 3};
-    params.input_spatial_lengths  = std::vector<ck::index_t>{12, 12, 12};
+    params.input_spatial_lengths_  = std::vector<ck::index_t>{12, 12, 12};
-    params.conv_filter_strides    = std::vector<ck::index_t>{3, 3, 3};
+    params.conv_filter_strides_    = std::vector<ck::index_t>{3, 3, 3};
-    params.conv_filter_dilations  = std::vector<ck::index_t>{1, 1, 1};
+    params.conv_filter_dilations_  = std::vector<ck::index_t>{1, 1, 1};
-    params.input_left_pads        = std::vector<ck::index_t>{0, 0, 0};
+    params.input_left_pads_        = std::vector<ck::index_t>{0, 0, 0};
-    params.input_right_pads       = std::vector<ck::index_t>{0, 0, 0};
+    params.input_right_pads_       = std::vector<ck::index_t>{0, 0, 0};
    auto out_tensor = run_reference_convolution_forward<3,
                                                        float,