merging with latest develop branch

68886f7d · raman jana · a9ee2960 · 1677cf70 · 68886f7d · 68886f7d
Commit 68886f7d authored Jun 14, 2022 by raman jana
20 changed files
--- a/profiler/include/profile_grouped_gemm_impl.hpp
+++ b/profiler/include/profile_grouped_gemm_impl.hpp
@@ -43,13 +43,14 @@ namespace profiler {
 template <typename ADataType,
          typename BDataType,
          typename CDataType,
+          typename AccDataType,
          typename ALayout,
          typename BLayout,
          typename CLayout>
 void profile_grouped_gemm_impl(int do_verification,
                               int init_method,
                               bool do_log,
-                               int nrepeat,
+                               bool time_kernel,
                               const std::vector<int>& Ms,
                               const std::vector<int>& Ns,
                               const std::vector<int>& Ks,
@@ -231,7 +232,8 @@ void profile_grouped_gemm_impl(int do_verification,
        {
            std::string gemm_name = gemm_ptr->GetTypeString();

-            float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+            float ave_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

            std::size_t flop = 0, num_btype = 0;
            for(std::size_t i = 0; i < gemm_shapes.size(); i++)
@@ -270,6 +272,7 @@ void profile_grouped_gemm_impl(int do_verification,
                        ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                  BDataType,
                                                                  CDataType,
+                                                                  AccDataType,
                                                                  AElementOp,
                                                                  BElementOp,
                                                                  CElementOp>;

--- a/profiler/include/profile_reduce_impl.hpp
+++ b/profiler/include/profile_reduce_impl.hpp
@@ -5,74 +5,77 @@
 #include "device_reduce_instance.hpp"
 #include "reduction_enums.hpp"
 #include "host_reduction.hpp"
+#include "host_common_util.hpp"
+#include "host_tensor_generator.hpp"

 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace device_reduce_instance {

-template <int Rank, int NumReduceDim, int ReduceOpId, int NanOpt, int IndicesOpt>
+template <int Rank, int NumReduceDim, int ReduceOpId, bool PropagateNan, bool UseIndex>
 struct ReduceDescription
 {
    static constexpr int Rank_         = Rank;
    static constexpr int NumReduceDim_ = NumReduceDim;
    static constexpr int ReduceOpId_   = ReduceOpId;
-    static constexpr int NanOpt_       = NanOpt;
-    static constexpr int IndicesOpt_   = IndicesOpt;
+    static constexpr int PropagateNan_ = PropagateNan;
+    static constexpr int UseIndex_     = UseIndex;
 };

-using reduce_description_instances = std::tuple<ReduceDescription<4, 3, 0, 0, 0>, // for ADD
-                                                ReduceDescription<4, 4, 0, 0, 0>,
-                                                ReduceDescription<4, 1, 0, 0, 0>,
-                                                ReduceDescription<2, 1, 0, 0, 0>,
-
-                                                ReduceDescription<4, 3, 5, 0, 0>, // for AVG
-                                                ReduceDescription<4, 4, 5, 0, 0>,
-                                                ReduceDescription<4, 1, 5, 0, 0>,
-                                                ReduceDescription<2, 1, 5, 0, 0>,
-
-                                                ReduceDescription<4, 3, 7, 0, 0>, // for NORM2
-                                                ReduceDescription<4, 4, 7, 0, 0>,
-                                                ReduceDescription<4, 1, 7, 0, 0>,
-                                                ReduceDescription<2, 1, 7, 0, 0>,
-
-                                                ReduceDescription<4, 3, 2, 0, 0>, // for MIN
-                                                ReduceDescription<4, 4, 2, 0, 0>,
-                                                ReduceDescription<4, 1, 2, 0, 0>,
-                                                ReduceDescription<2, 1, 2, 0, 0>,
-                                                ReduceDescription<4, 3, 3, 0, 0>, // for MAX
-                                                ReduceDescription<4, 4, 3, 0, 0>,
-                                                ReduceDescription<4, 1, 3, 0, 0>,
-                                                ReduceDescription<2, 1, 3, 0, 0>,
-                                                ReduceDescription<4, 3, 4, 0, 0>, // for AMAX
-                                                ReduceDescription<4, 4, 4, 0, 0>,
-                                                ReduceDescription<4, 1, 4, 0, 0>,
-                                                ReduceDescription<2, 1, 4, 0, 0>,
-
-                                                ReduceDescription<4, 3, 2, 0, 1>, // for MIN
-                                                ReduceDescription<4, 4, 2, 0, 1>,
-                                                ReduceDescription<4, 1, 2, 0, 1>,
-                                                ReduceDescription<2, 1, 2, 0, 1>,
-                                                ReduceDescription<4, 3, 3, 0, 1>, // for MAX
-                                                ReduceDescription<4, 4, 3, 0, 1>,
-                                                ReduceDescription<4, 1, 3, 0, 1>,
-                                                ReduceDescription<2, 1, 3, 0, 1>,
-                                                ReduceDescription<4, 3, 4, 0, 1>, // for AMAX
-                                                ReduceDescription<4, 4, 4, 0, 1>,
-                                                ReduceDescription<4, 1, 4, 0, 1>,
-                                                ReduceDescription<2, 1, 4, 0, 1>>;
+using reduce_description_instances =
+    std::tuple<ReduceDescription<4, 3, 0, false, false>, // for ADD
+               ReduceDescription<4, 4, 0, false, false>,
+               ReduceDescription<4, 1, 0, false, false>,
+               ReduceDescription<2, 1, 0, false, false>,
+
+               ReduceDescription<4, 3, 5, false, false>, // for AVG
+               ReduceDescription<4, 4, 5, false, false>,
+               ReduceDescription<4, 1, 5, false, false>,
+               ReduceDescription<2, 1, 5, false, false>,
+
+               ReduceDescription<4, 3, 7, false, false>, // for NORM2
+               ReduceDescription<4, 4, 7, false, false>,
+               ReduceDescription<4, 1, 7, false, false>,
+               ReduceDescription<2, 1, 7, false, false>,
+
+               ReduceDescription<4, 3, 2, false, false>, // for MIN
+               ReduceDescription<4, 4, 2, false, false>,
+               ReduceDescription<4, 1, 2, false, false>,
+               ReduceDescription<2, 1, 2, false, false>,
+               ReduceDescription<4, 3, 3, false, false>, // for MAX
+               ReduceDescription<4, 4, 3, false, false>,
+               ReduceDescription<4, 1, 3, false, false>,
+               ReduceDescription<2, 1, 3, false, false>,
+               ReduceDescription<4, 3, 4, false, false>, // for AMAX
+               ReduceDescription<4, 4, 4, false, false>,
+               ReduceDescription<4, 1, 4, false, false>,
+               ReduceDescription<2, 1, 4, false, false>,
+
+               ReduceDescription<4, 3, 2, false, true>, // for MIN
+               ReduceDescription<4, 4, 2, false, true>,
+               ReduceDescription<4, 1, 2, false, true>,
+               ReduceDescription<2, 1, 2, false, true>,
+               ReduceDescription<4, 3, 3, false, true>, // for MAX
+               ReduceDescription<4, 4, 3, false, true>,
+               ReduceDescription<4, 1, 3, false, true>,
+               ReduceDescription<2, 1, 3, false, true>,
+               ReduceDescription<4, 3, 4, false, true>, // for AMAX
+               ReduceDescription<4, 4, 4, false, true>,
+               ReduceDescription<4, 1, 4, false, true>,
+               ReduceDescription<2, 1, 4, false, true>>;

 template <typename DescriptionType>
 bool description_match(const DescriptionType& description,
                       int Rank,
                       const std::vector<int>& reduceDims,
                       ReduceTensorOp ReduceOpId,
-                       NanPropagation NanOpt,
-                       ReduceTensorIndices IndicesOpt)
+                       bool PropagateNan,
+                       bool UseIndex)
 {
    if(description.Rank_ != Rank || description.ReduceOpId_ != static_cast<int>(ReduceOpId) ||
-       description.NanOpt_ != static_cast<int>(NanOpt) ||
-       description.IndicesOpt_ != static_cast<int>(IndicesOpt))
+       description.PropagateNan_ != static_cast<int>(PropagateNan) ||
+       description.UseIndex_ != static_cast<int>(UseIndex))
        return (false);

    if(DescriptionType::NumReduceDim_ != reduceDims.size())
@@ -116,48 +119,18 @@ static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduce
    return invariantDims;
 };

-template <typename T>
-static void dumpBufferToFile(const char* fileName, T* data, size_t dataNumItems)
-{
-    std::ofstream outFile(fileName, std::ios::binary);
-    if(outFile)
-    {
-        outFile.write(reinterpret_cast<char*>(data), dataNumItems * sizeof(T));
-        outFile.close();
-        std::cout << "Write output to file " << fileName << std::endl;
-    }
-    else
-    {
-        std::cout << "Could not open file " << fileName << " for writing" << std::endl;
-    }
-};
-
-// map the data type used by the GPU kernels to the corresponding type used by the host codes
-template <typename InType>
-struct type_mapping
-{
-    using OutType = InType;
-};
-
-template <>
-struct type_mapping<ck::half_t>
-{
-    using OutType = half_float::half;
-};
-
 template <typename InDataType,
          typename AccDataType,
          typename OutDataType,
          int Rank,
          int NumReduceDim,
          ReduceTensorOp ReduceOpId,
-          NanPropagation NanOpt,
-          ReduceTensorIndices IndicesOpt>
-void profile_reduce_impl_impl(bool do_verification,
+          bool PropagateNan,
+          bool UseIndex>
+bool profile_reduce_impl_impl(bool do_verification,
                              int init_method,
-                              bool do_log,
                              bool do_dumpout,
-                              int nrepeat,
+                              bool time_kernel,
                              const std::vector<size_t>& inLengths,
                              const std::vector<int>& reduceDims,
                              float alpha,
@@ -165,16 +138,13 @@ void profile_reduce_impl_impl(bool do_verification,
 {
    using namespace ck::tensor_operation::device;
    using namespace ck::tensor_operation::device::device_reduce_instance;
-    using namespace ck::host_reduce;
+    using ck::host_common::dumpBufferToFile;

    constexpr bool op_support_indices =
        (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
         ReduceOpId == ReduceTensorOp::AMAX);

-    constexpr bool NeedIndices =
-        (op_support_indices && (IndicesOpt != ReduceTensorIndices::NO_INDICES));
-
-    constexpr bool PropagateNan = (NanOpt == NanPropagation::PROPAGATE_NAN);
+    constexpr bool OutputIndex = (op_support_indices && UseIndex);

    constexpr bool out_support_atomic_add = std::is_same<OutDataType, float>::value;
    constexpr bool op_support_atomic_add =
@@ -195,8 +165,7 @@ void profile_reduce_impl_impl(bool do_verification,
        (op_support_indices && !std::is_same<AccDataType, float>::value);

    // 1) The indices can only be used when the reduction operation is indexable
-    constexpr bool invalid_reduce_3 =
-        (!op_support_indices && IndicesOpt != ReduceTensorIndices::NO_INDICES);
+    constexpr bool invalid_reduce_3 = (!op_support_indices && UseIndex);

    // 1) If InDataType is int8_t, must use int8_t as AccDataType for indexable reduction operations
    // 2) If InDataType is int8_t, must use int32_t as AccDataType for non-indexable reduction
@@ -219,6 +188,8 @@ void profile_reduce_impl_impl(bool do_verification,
    constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3 ||
                                     invalid_reduce_4 || invalid_reduce_5 || invalid_reduce_6);

+    bool pass = true;
+
    if constexpr(!invalid_reduce)
    {
        Tensor<InDataType> in(inLengths);
@@ -282,42 +253,26 @@ void profile_reduce_impl_impl(bool do_verification,
        if(beta != 0.0f)
            out_dev.ToDevice(out.mData.data());

-        size_t indicesSizeInBytes = NeedIndices ? out.mDesc.GetElementSize() * sizeof(int) : 0;
+        size_t indicesSizeInBytes = OutputIndex ? out.mDesc.GetElementSize() * sizeof(int) : 0;

        DeviceMem out_indices_dev(indicesSizeInBytes);

        float best_avg_time   = 0;
        float best_gb_per_sec = 0;

-        using InElementwiseOperation_0 =
+        using InElementwiseOperation =
            typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::
                InElementwiseOperation;
-        using AccElementwiseOperation_0 =
+        using AccElementwiseOperation =
            typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::
                AccElementwiseOperation;
-        using InElementwiseOperation_1 =
-            typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::
-                InElementwiseOperation;
-        using AccElementwiseOperation_1 =
-            typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::
-                AccElementwiseOperation;
-        using InElementwiseOperation_2 =
-            typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::
-                InElementwiseOperation;
-        using AccElementwiseOperation_2 =
-            typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::
-                AccElementwiseOperation;
+
+        using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;

        using DeviceReduceInstPtr0 =
-            DeviceReducePtr<InElementwiseOperation_0, AccElementwiseOperation_0>;
-        using DeviceReduceInstPtr1 =
-            DeviceReducePtr<InElementwiseOperation_1, AccElementwiseOperation_1>;
-        using DeviceReduceInstPtr2 =
-            DeviceReducePtr<InElementwiseOperation_2, AccElementwiseOperation_2>;
+            DeviceReducePtr<InElementwiseOperation, AccElementwiseOperation>;

        std::vector<DeviceReduceInstPtr0> reduce0_ptrs;
-        std::vector<DeviceReduceInstPtr1> reduce1_ptrs;
-        std::vector<DeviceReduceInstPtr2> reduce2_ptrs;

        add_device_reduce_instance_threadwise<InDataType,
                                              AccDataType,
@@ -325,8 +280,8 @@ void profile_reduce_impl_impl(bool do_verification,
                                              Rank,
                                              NumReduceDim,
                                              ReduceOpId,
-                                              NanOpt,
-                                              IndicesOpt>(reduce0_ptrs);
+                                              PropagateNan,
+                                              UseIndex>(reduce0_ptrs);

        add_device_reduce_instance_blockwise<InDataType,
                                             AccDataType,
@@ -334,8 +289,8 @@ void profile_reduce_impl_impl(bool do_verification,
                                             Rank,
                                             NumReduceDim,
                                             ReduceOpId,
-                                             NanOpt,
-                                             IndicesOpt>(reduce0_ptrs);
+                                             PropagateNan,
+                                             UseIndex>(reduce0_ptrs);

        if constexpr(use_atomic_add)
        {
@@ -345,35 +300,11 @@ void profile_reduce_impl_impl(bool do_verification,
                                                             Rank,
                                                             NumReduceDim,
                                                             ReduceOpId,
-                                                             NanOpt,
-                                                             IndicesOpt>(reduce0_ptrs);
+                                                             PropagateNan,
+                                                             UseIndex>(reduce0_ptrs);
        }
-        else
-        {
-            add_device_reduce_instance_multiblock_partial_reduce<InDataType,
-                                                                 AccDataType,
-                                                                 OutDataType,
-                                                                 Rank,
-                                                                 NumReduceDim,
-                                                                 ReduceOpId,
-                                                                 NanOpt,
-                                                                 IndicesOpt>(reduce1_ptrs);
-        };
-
-        // used for secondary reduction
-        if constexpr(!use_atomic_add)
-        {
-            add_device_reduce_instance_blockwise_second_call<AccDataType,
-                                                             AccDataType,
-                                                             OutDataType,
-                                                             Rank,
-                                                             NumReduceDim,
-                                                             ReduceOpId,
-                                                             NanOpt,
-                                                             IndicesOpt>(reduce2_ptrs);
-        };

-        if(reduce0_ptrs.empty() && reduce1_ptrs.empty())
+        if(reduce0_ptrs.empty())
        {
            throw std::runtime_error("Wrong! No device REDUCE instance found");
        };
@@ -383,31 +314,34 @@ void profile_reduce_impl_impl(bool do_verification,
            ReductionHost<InDataType,
                          AccDataType,
                          OutDataType,
-                          ReduceOpId,
+                          ReduceOperation,
+                          InElementwiseOperation,
+                          AccElementwiseOperation,
                          Rank,
                          NumReduceDim,
                          PropagateNan,
-                          NeedIndices>
+                          OutputIndex>
                hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);

            hostReduce.Run(
                alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data());
        };

-        const auto i_inLengths  = to_int_vector(inLengths);
-        const auto i_inStrides  = to_int_vector(inStrides);
-        const auto i_outLengths = to_int_vector(outLengths);
-        const auto i_outStrides = to_int_vector(outStrides);
+        std::vector<ck::index_t> i_inLengths;
+        std::vector<ck::index_t> i_inStrides;
+        std::vector<ck::index_t> i_outLengths;
+        std::vector<ck::index_t> i_outStrides;
+
+        i_inLengths.assign(inLengths.begin(), inLengths.end());
+        i_inStrides.assign(inStrides.begin(), inStrides.end());
+        i_outLengths.assign(outLengths.begin(), outLengths.end());
+        i_outStrides.assign(outStrides.begin(), outStrides.end());

        for(auto& reduce_ptr : reduce0_ptrs)
        {
-            auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths, reduceDims);
-
-            DeviceMem ws_dev(wsSizeInBytes);

-            InElementwiseOperation_0 in_elementwise_op_0(static_cast<int32_t>(reduce_total_length));
-            AccElementwiseOperation_0 acc_elementwise_op_0(
-                static_cast<int32_t>(reduce_total_length));
+            InElementwiseOperation in_elementwise_op(static_cast<int32_t>(reduce_total_length));
+            AccElementwiseOperation acc_elementwise_op(static_cast<int32_t>(reduce_total_length));

            auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths,
                                                                i_inStrides,
@@ -417,11 +351,11 @@ void profile_reduce_impl_impl(bool do_verification,
                                                                alpha,
                                                                beta,
                                                                in_dev.GetDeviceBuffer(),
+                                                                nullptr,
                                                                out_dev.GetDeviceBuffer(),
                                                                out_indices_dev.GetDeviceBuffer(),
-                                                                ws_dev.GetDeviceBuffer(),
-                                                                in_elementwise_op_0,
-                                                                acc_elementwise_op_0);
+                                                                in_elementwise_op,
+                                                                acc_elementwise_op);

            if(!reduce_ptr->IsSupportedArgument(argument_ptr.get()))
                continue;
@@ -430,7 +364,8 @@ void profile_reduce_impl_impl(bool do_verification,

            auto invoker_ptr = reduce_ptr->MakeInvokerPointer();

-            float avg_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+            float avg_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

            std::size_t num_bytes =
                invariant_total_length * reduce_total_length * sizeof(InDataType) +
@@ -438,8 +373,9 @@ void profile_reduce_impl_impl(bool do_verification,

            float gb_per_sec = num_bytes / 1.E6 / avg_time;

-            std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " << reduce_name
-                      << std::endl;
+            if(time_kernel)
+                std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, "
+                          << reduce_name << std::endl;

            if(gb_per_sec > best_gb_per_sec)
            {
@@ -449,22 +385,24 @@ void profile_reduce_impl_impl(bool do_verification,

            if(do_verification)
            {
+                bool single_pass;
+
                out_dev.FromDevice(out.mData.data());
-                ck::utils::check_err(out.mData, out_ref.mData);
+                single_pass = ck::utils::check_err(out.mData, out_ref.mData);

-                if(NeedIndices)
+                if(OutputIndex)
                {
                    out_indices_dev.FromDevice(out_indices.mData.data());
-                    ck::utils::check_err(out_indices.mData, out_indices_ref.mData);
-                    ;
+                    single_pass = single_pass &&
+                                  ck::utils::check_err(out_indices.mData, out_indices_ref.mData);
                };

-                if(do_log)
+                if(!single_pass)
                {
-                    LogRangeAsType<float>(std::cout << "out_host  : ", out_ref.mData, ",")
-                        << std::endl;
-                    LogRangeAsType<float>(std::cout << "out_device: ", out.mData, ",") << std::endl;
-                };
+                    std::cout << "Fail Info: " << reduce_ptr->GetTypeString() << std::endl;
+                }
+
+                pass = pass && single_pass;
            };

            if(do_dumpout)
@@ -473,7 +411,7 @@ void profile_reduce_impl_impl(bool do_verification,
                dumpBufferToFile("dump_out.bin", out.mData.data(), out.mDesc.GetElementSize());
                dumpBufferToFile(
                    "dump_out_host.bin", out_ref.mData.data(), out_ref.mDesc.GetElementSize());
-                if(NeedIndices)
+                if(OutputIndex)
                {
                    dumpBufferToFile("dump_indices.bin",
                                     out_indices.mData.data(),
@@ -485,156 +423,34 @@ void profile_reduce_impl_impl(bool do_verification,
            };
        };

-        for(auto& reduce_ptr : reduce1_ptrs)
-        {
-            auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths, reduceDims);
-
-            DeviceMem ws_dev(wsSizeInBytes);
-
-            InElementwiseOperation_1 in_elementwise_op_1(static_cast<int32_t>(reduce_total_length));
-            AccElementwiseOperation_1 acc_elementwise_op_1(
-                static_cast<int32_t>(reduce_total_length));
-
-            auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths,
-                                                                i_inStrides,
-                                                                i_outLengths,
-                                                                i_outStrides,
-                                                                reduceDims,
-                                                                alpha,
-                                                                beta,
-                                                                in_dev.GetDeviceBuffer(),
-                                                                out_dev.GetDeviceBuffer(),
-                                                                out_indices_dev.GetDeviceBuffer(),
-                                                                ws_dev.GetDeviceBuffer(),
-                                                                in_elementwise_op_1,
-                                                                acc_elementwise_op_1);
-
-            if(!reduce_ptr->IsSupportedArgument(argument_ptr.get()))
-                continue;
-
-            std::string reduce_name = reduce_ptr->GetTypeString();
-
-            auto invoker_ptr = reduce_ptr->MakeInvokerPointer();
-
-            float avg_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
-
-            std::size_t num_bytes =
-                invariant_total_length * reduce_total_length * sizeof(InDataType) +
-                invariant_total_length * sizeof(OutDataType);
-
-            std::vector<int> inLengths2 = reduce_ptr->GetWorkspace2dLengths(argument_ptr.get());
-            std::vector<int> inStrides2{inLengths2[1], 1};
-
-            for(auto& reduce2_ptr : reduce2_ptrs)
-            {
-                InElementwiseOperation_2 in_elementwise_op_2(
-                    static_cast<int32_t>(reduce_total_length));
-                AccElementwiseOperation_2 acc_elementwise_op_2(
-                    static_cast<int32_t>(reduce_total_length));
-
-                auto argument2_ptr =
-                    reduce2_ptr->MakeArgumentPointer(inLengths2,
-                                                     inStrides2,
-                                                     i_outLengths,
-                                                     i_outStrides,
-                                                     reduceDims,
-                                                     alpha,
-                                                     beta,
-                                                     ws_dev.GetDeviceBuffer(),
-                                                     out_dev.GetDeviceBuffer(),
-                                                     out_indices_dev.GetDeviceBuffer(),
-                                                     ws_dev.GetDeviceBuffer(),
-                                                     in_elementwise_op_2,
-                                                     acc_elementwise_op_2);
-
-                if(!reduce2_ptr->IsSupportedArgument(argument2_ptr.get()))
-                    continue;
-
-                std::string reduce2_name = reduce2_ptr->GetTypeString();
-
-                auto invoker2_ptr = reduce2_ptr->MakeInvokerPointer();
-
-                float avg_time_2 = invoker2_ptr->Run(argument2_ptr.get(), nrepeat);
-
-                std::size_t num_bytes_2 =
-                    static_cast<size_t>(inLengths2[0]) * inLengths2[1] * sizeof(AccDataType);
-
-                float gb_per_sec = (num_bytes + num_bytes_2) / 1.E6 / (avg_time + avg_time_2);
-
-                std::cout << "Perf: " << (avg_time + avg_time_2) << " ms, " << gb_per_sec
-                          << " GB/s, " << reduce_name << " => " << reduce2_name << std::endl;
-
-                if(gb_per_sec > best_gb_per_sec)
-                {
-                    best_avg_time   = avg_time + avg_time_2;
-                    best_gb_per_sec = gb_per_sec;
-                }
-
-                if(do_verification)
-                {
-                    out_dev.FromDevice(out.mData.data());
-                    ck::utils::check_err(out.mData, out_ref.mData);
-
-                    if(NeedIndices)
-                    {
-                        out_indices_dev.FromDevice(out_indices.mData.data());
-                        ck::utils::check_err(out_indices.mData, out_indices_ref.mData);
-                        ;
-                    };
-
-                    if(do_log)
-                    {
-                        LogRangeAsType<float>(std::cout << "out_host  : ", out_ref.mData, ",")
-                            << std::endl;
-                        LogRangeAsType<float>(std::cout << "out_device: ", out.mData, ",")
-                            << std::endl;
-                    }
-                }
-
-                if(do_dumpout)
-                {
-                    dumpBufferToFile("dump_in.bin", in.mData.data(), in.mDesc.GetElementSize());
-                    dumpBufferToFile("dump_out.bin", out.mData.data(), out.mDesc.GetElementSize());
-                    dumpBufferToFile(
-                        "dump_out_host.bin", out_ref.mData.data(), out_ref.mDesc.GetElementSize());
-                    if(NeedIndices)
-                    {
-                        dumpBufferToFile("dump_indices.bin",
-                                         out_indices.mData.data(),
-                                         out_indices.mDesc.GetElementSize());
-                        dumpBufferToFile("dump_indices_host.bin",
-                                         out_indices_ref.mData.data(),
-                                         out_indices_ref.mDesc.GetElementSize());
-                    };
-                };
-            };
-        };
-
-        std::cout << "Best Perf: " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s"
-                  << std::endl;
+        if(time_kernel)
+            std::cout << "Best Perf: " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s"
+                      << std::endl;
    }
    else
    {
        std::cout << "The requested reduction operation is not supported, please check !!!"
                  << std::endl;
    };
+
+    return pass;
 };

 template <typename InDataType, typename AccDataType, typename OutDataType>
-void profile_reduce_impl(bool do_verification,
+bool profile_reduce_impl(bool do_verification,
                         int init_method,
-                         bool do_log,
                         bool do_dumpout,
-                         int nrepeat,
+                         bool time_kernel,
                         const std::vector<size_t>& inLengths,
                         const std::vector<int>& reduceDims,
                         ReduceTensorOp ReduceOpId,
-                         NanPropagation NanOpt,
-                         ReduceTensorIndices IndicesOpt,
+                         bool PropagateNan,
+                         bool UseIndex,
                         float alpha,
                         float beta)
 {
    bool matched = false;
+    bool pass    = true;

    using tuple_of_description_instances =
        tensor_operation::device::device_reduce_instance::reduce_description_instances;
@@ -648,29 +464,30 @@ void profile_reduce_impl(bool do_verification,
        using descType = remove_cvref_t<decltype(std::get<i>(tuple_object))>;

        if(!description_match(
-               descType{}, inLengths.size(), reduceDims, ReduceOpId, NanOpt, IndicesOpt))
+               descType{}, inLengths.size(), reduceDims, ReduceOpId, PropagateNan, UseIndex))
            return;

-        profile_reduce_impl_impl<InDataType,
-                                 AccDataType,
-                                 OutDataType,
-                                 descType::Rank_,
-                                 descType::NumReduceDim_,
-                                 static_cast<ReduceTensorOp>(descType::ReduceOpId_),
-                                 static_cast<NanPropagation>(descType::NanOpt_),
-                                 static_cast<ReduceTensorIndices>(descType::IndicesOpt_)>(
-            do_verification,
-            init_method,
-            do_log,
-            do_dumpout,
-            nrepeat,
-            inLengths,
-            reduceDims,
-            alpha,
-            beta);
+        pass = pass &&
+               profile_reduce_impl_impl<InDataType,
+                                        AccDataType,
+                                        OutDataType,
+                                        descType::Rank_,
+                                        descType::NumReduceDim_,
+                                        static_cast<ReduceTensorOp>(descType::ReduceOpId_),
+                                        static_cast<bool>(descType::PropagateNan_),
+                                        static_cast<bool>(descType::UseIndex_)>(do_verification,
+                                                                                init_method,
+                                                                                do_dumpout,
+                                                                                time_kernel,
+                                                                                inLengths,
+                                                                                reduceDims,
+                                                                                alpha,
+                                                                                beta);

        matched = true;
    });
+
+    return pass;
 };

 } // namespace profiler

--- a/profiler/src/profile_batched_gemm.cpp
+++ b/profiler/src/profile_batched_gemm.cpp
@@ -48,8 +48,8 @@ int profile_batched_gemm(int argc, char* argv[])
        printf("                     3: A[g, k, m] * B[g, n, k] = C[g, m, n])\n");
        printf("arg4: verification (0: no; 1: yes)\n");
        printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
-        printf("arg8: print tensor value (0: no; 1: yes)\n");
-        printf("arg7: run kernel # of times (>1)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: time kernel (0=no, 1=yes)\n");
        printf("arg8 to 14: M, N, K, StrideA, StrideB, StrideC, BatchCount\n");
        exit(1);
    }
@@ -59,7 +59,7 @@ int profile_batched_gemm(int argc, char* argv[])
    const bool do_verification = std::stoi(argv[4]);
    const int init_method      = std::stoi(argv[5]);
    const bool do_log          = std::stoi(argv[6]);
-    const int nrepeat          = std::stoi(argv[7]);
+    const bool time_kernel     = std::stoi(argv[7]);

    const int M = std::stoi(argv[8]);
    const int N = std::stoi(argv[9]);
@@ -82,7 +82,7 @@ int profile_batched_gemm(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -102,7 +102,7 @@ int profile_batched_gemm(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -122,7 +122,7 @@ int profile_batched_gemm(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -142,7 +142,7 @@ int profile_batched_gemm(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -162,7 +162,7 @@ int profile_batched_gemm(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -182,7 +182,7 @@ int profile_batched_gemm(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -202,7 +202,7 @@ int profile_batched_gemm(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -222,7 +222,7 @@ int profile_batched_gemm(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -242,7 +242,7 @@ int profile_batched_gemm(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -262,7 +262,7 @@ int profile_batched_gemm(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -282,7 +282,7 @@ int profile_batched_gemm(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -302,7 +302,7 @@ int profile_batched_gemm(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -322,7 +322,7 @@ int profile_batched_gemm(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -342,7 +342,7 @@ int profile_batched_gemm(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -362,7 +362,7 @@ int profile_batched_gemm(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -382,7 +382,7 @@ int profile_batched_gemm(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -396,5 +396,5 @@ int profile_batched_gemm(int argc, char* argv[])
        throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented");
    }

-    return 1;
+    return 0;
 }
--- a/profiler/src/profile_batched_gemm_reduce.cpp
+++ b/profiler/src/profile_batched_gemm_reduce.cpp
@@ -33,8 +33,8 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
        printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
        printf("arg4: verification (0: no; 1: yes)\n");
        printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
-        printf("arg8: print tensor value (0: no; 1: yes)\n");
-        printf("arg7: run kernel # of times (>1)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: time kernel (0=no, 1=yes)\n");
        printf("arg8 to 14: M, N, K, StrideA, StrideB, StrideC, BatchCount\n");
        printf("arg15: split k into  mulitiple batch\n");
        exit(1);
@@ -45,7 +45,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
    const bool do_verification = std::stoi(argv[4]);
    const int init_method      = std::stoi(argv[5]);
    const bool do_log          = std::stoi(argv[6]);
-    const int nrepeat          = std::stoi(argv[7]);
+    const bool time_kernel     = std::stoi(argv[7]);

    const int M = std::stoi(argv[8]);
    const int N = std::stoi(argv[9]);
@@ -69,7 +69,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -91,7 +91,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -113,7 +113,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -135,7 +135,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -149,5 +149,5 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
        throw std::runtime_error("wrong! this data_type & layout is not implemented");
    }

-    return 1;
+    return 0;
 }
--- a/profiler/src/profile_conv_bwd_data.cpp
+++ b/profiler/src/profile_conv_bwd_data.cpp
-#include <iostream>
-#include <numeric>
-#include <initializer_list>
-#include <cstdlib>
-#include <stdlib.h>
-#include <half.hpp>
-#include "profile_conv_bwd_data_impl.hpp"
-
-enum struct ConvDataType
-{
-    F32_F32_F32,    // 0
-    F16_F16_F16,    // 1
-    BF16_BF16_BF16, // 2
-    INT8_INT8_INT8, // 3
-};
-
-enum struct ConvInputLayout
-{
-    NCHW, // 0
-    NHWC, // 1
-};
-
-enum struct ConvWeightLayout
-{
-    KCYX, // 0
-    KYXC, // 1
-};
-
-enum struct ConvOutputLayout
-{
-    NKHW, // 0
-    NHWK, // 1
-};
-
-int profile_conv_bwd_data(int argc, char* argv[])
-{
-    if(argc != 25)
-    {
-        printf("arg1: tensor operation (conv_bwd: BackwardConvolution)\n");
-        printf("arg2: data type (0: fp32; 1: fp16)\n");
-        printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n");
-        printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n");
-        printf("arg5: output tensor layout (0: NKHW; 1: NHWK)\n");
-        printf("arg6: verification (0: no; 1: yes)\n");
-        printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
-        printf("arg8: print tensor value (0: no; 1: yes)\n");
-        printf("arg9: run kernel # of times (>1)\n");
-        printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
-               "RightPx\n");
-        exit(1);
-    }
-
-    const auto data_type       = static_cast<ConvDataType>(std::stoi(argv[2]));
-    const auto in_layout       = static_cast<ConvInputLayout>(std::stoi(argv[3]));
-    const auto wei_layout      = static_cast<ConvWeightLayout>(std::stoi(argv[4]));
-    const auto out_layout      = static_cast<ConvOutputLayout>(std::stoi(argv[5]));
-    const bool do_verification = std::stoi(argv[6]);
-    const int init_method      = std::stoi(argv[7]);
-    const bool do_log          = std::stoi(argv[8]);
-    const int nrepeat          = std::stoi(argv[9]);
-
-    const ck::index_t N  = std::stoi(argv[10]);
-    const ck::index_t K  = std::stoi(argv[11]);
-    const ck::index_t C  = std::stoi(argv[12]);
-    const ck::index_t Y  = std::stoi(argv[13]);
-    const ck::index_t X  = std::stoi(argv[14]);
-    const ck::index_t Hi = std::stoi(argv[15]);
-    const ck::index_t Wi = std::stoi(argv[16]);
-
-    const ck::index_t conv_stride_h   = std::stoi(argv[17]);
-    const ck::index_t conv_stride_w   = std::stoi(argv[18]);
-    const ck::index_t conv_dilation_h = std::stoi(argv[19]);
-    const ck::index_t conv_dilation_w = std::stoi(argv[20]);
-    const ck::index_t in_left_pad_h   = std::stoi(argv[21]);
-    const ck::index_t in_left_pad_w   = std::stoi(argv[22]);
-    const ck::index_t in_right_pad_h  = std::stoi(argv[23]);
-    const ck::index_t in_right_pad_w  = std::stoi(argv[24]);
-
-    const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1;
-    const ck::index_t XEff = (X - 1) * conv_dilation_w + 1;
-
-    const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
-    const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
-
-    if(data_type == ConvDataType::F32_F32_F32 && in_layout == ConvInputLayout::NHWC &&
-       wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK)
-    {
-        ck::profiler::profile_conv_bwd_data_impl<2,
-                                                 float,
-                                                 float,
-                                                 float,
-                                                 float,
-                                                 ck::tensor_layout::convolution::NHWC,
-                                                 ck::tensor_layout::convolution::KYXC,
-                                                 ck::tensor_layout::convolution::NHWK>(
-            do_verification,
-            init_method,
-            do_log,
-            nrepeat,
-            N,
-            K,
-            C,
-            std::vector<ck::index_t>{Hi, Wi},
-            std::vector<ck::index_t>{Y, X},
-            std::vector<ck::index_t>{Ho, Wo},
-            std::vector<ck::index_t>{conv_stride_h, conv_stride_w},
-            std::vector<ck::index_t>{conv_dilation_h, conv_dilation_w},
-            std::vector<ck::index_t>{in_left_pad_h, in_left_pad_w},
-            std::vector<ck::index_t>{in_right_pad_h, in_right_pad_w});
-    }
-    else if(data_type == ConvDataType::F16_F16_F16 && in_layout == ConvInputLayout::NHWC &&
-            wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK)
-    {
-        ck::profiler::profile_conv_bwd_data_impl<2,
-                                                 ck::half_t,
-                                                 ck::half_t,
-                                                 ck::half_t,
-                                                 float,
-                                                 ck::tensor_layout::convolution::NHWC,
-                                                 ck::tensor_layout::convolution::KYXC,
-                                                 ck::tensor_layout::convolution::NHWK>(
-            do_verification,
-            init_method,
-            do_log,
-            nrepeat,
-            N,
-            K,
-            C,
-            std::vector<ck::index_t>{Hi, Wi},
-            std::vector<ck::index_t>{Y, X},
-            std::vector<ck::index_t>{Ho, Wo},
-            std::vector<ck::index_t>{conv_stride_h, conv_stride_w},
-            std::vector<ck::index_t>{conv_dilation_h, conv_dilation_w},
-            std::vector<ck::index_t>{in_left_pad_h, in_left_pad_w},
-            std::vector<ck::index_t>{in_right_pad_h, in_right_pad_w});
-    }
-    else if(data_type == ConvDataType::BF16_BF16_BF16 && in_layout == ConvInputLayout::NHWC &&
-            wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK)
-    {
-        ck::profiler::profile_conv_bwd_data_impl<2,
-                                                 uint16_t,
-                                                 uint16_t,
-                                                 uint16_t,
-                                                 float,
-                                                 ck::tensor_layout::convolution::NHWC,
-                                                 ck::tensor_layout::convolution::KYXC,
-                                                 ck::tensor_layout::convolution::NHWK>(
-            do_verification,
-            init_method,
-            do_log,
-            nrepeat,
-            N,
-            K,
-            C,
-            std::vector<ck::index_t>{Hi, Wi},
-            std::vector<ck::index_t>{Y, X},
-            std::vector<ck::index_t>{Ho, Wo},
-            std::vector<ck::index_t>{conv_stride_h, conv_stride_w},
-            std::vector<ck::index_t>{conv_dilation_h, conv_dilation_w},
-            std::vector<ck::index_t>{in_left_pad_h, in_left_pad_w},
-            std::vector<ck::index_t>{in_right_pad_h, in_right_pad_w});
-    }
-    else if(data_type == ConvDataType::INT8_INT8_INT8 && in_layout == ConvInputLayout::NHWC &&
-            wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK)
-    {
-        ck::profiler::profile_conv_bwd_data_impl<2,
-                                                 int8_t,
-                                                 int8_t,
-                                                 int8_t,
-                                                 int32_t,
-                                                 ck::tensor_layout::convolution::NHWC,
-                                                 ck::tensor_layout::convolution::KYXC,
-                                                 ck::tensor_layout::convolution::NHWK>(
-            do_verification,
-            init_method,
-            do_log,
-            nrepeat,
-            N,
-            K,
-            C,
-            std::vector<ck::index_t>{Hi, Wi},
-            std::vector<ck::index_t>{Y, X},
-            std::vector<ck::index_t>{Ho, Wo},
-            std::vector<ck::index_t>{conv_stride_h, conv_stride_w},
-            std::vector<ck::index_t>{conv_dilation_h, conv_dilation_w},
-            std::vector<ck::index_t>{in_left_pad_h, in_left_pad_w},
-            std::vector<ck::index_t>{in_right_pad_h, in_right_pad_w});
-    }
-    else
-    {
-        throw std::runtime_error("wrong! this Conv data_type & layout is not implemented");
-    }
-
-    return 1;
-}
--- a/profiler/src/profile_conv_bwd_weight.cpp
+++ b/profiler/src/profile_conv_bwd_weight.cpp
@@ -58,7 +58,7 @@ int profile_conv_bwd_weight(int argc, char* argv[])
    const bool do_verification = std::stoi(argv[6]);
    const int init_method      = std::stoi(argv[7]);
    const bool do_log          = std::stoi(argv[8]);
-    const int nrepeat          = std::stoi(argv[9]);
+    const bool time_kernel     = std::stoi(argv[9]);

    const ck::index_t N  = std::stoi(argv[10]);
    const ck::index_t K  = std::stoi(argv[11]);
@@ -98,7 +98,7 @@ int profile_conv_bwd_weight(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            N,
            K,
            C,
@@ -124,7 +124,7 @@ int profile_conv_bwd_weight(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            N,
            K,
            C,
@@ -142,5 +142,5 @@ int profile_conv_bwd_weight(int argc, char* argv[])
        throw std::runtime_error("wrong! this Conv data_type & layout is not implemented");
    }

-    return 1;
+    return 0;
 }
--- a/profiler/src/profile_conv_fwd_bias_relu.cpp
+++ b/profiler/src/profile_conv_fwd_bias_relu.cpp
@@ -42,7 +42,7 @@ int profile_conv_fwd_bias_relu(int argc, char* argv[])
        printf("arg6: verification (0: no; 1: yes)\n");
        printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
        printf("arg8: print tensor value (0: no; 1: yes)\n");
-        printf("arg9: run kernel # of times (>1)\n");
+        printf("arg9: time kernel (0=no, 1=yes)\n");
        printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
               "RightPx\n");
        exit(1);
@@ -55,7 +55,7 @@ int profile_conv_fwd_bias_relu(int argc, char* argv[])
    const bool do_verification = std::stoi(argv[6]);
    const int init_method      = std::stoi(argv[7]);
    const bool do_log          = std::stoi(argv[8]);
-    const int nrepeat          = std::stoi(argv[9]);
+    const bool time_kernel     = std::stoi(argv[9]);

    const ck::index_t N  = std::stoi(argv[10]);
    const ck::index_t K  = std::stoi(argv[11]);
@@ -93,7 +93,7 @@ int profile_conv_fwd_bias_relu(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            N,
            K,
            C,
@@ -110,5 +110,5 @@ int profile_conv_fwd_bias_relu(int argc, char* argv[])
        throw std::runtime_error("wrong! data_type & layout for this operator is not implemented");
    }

-    return 1;
+    return 0;
 }
--- a/profiler/src/profile_conv_fwd_bias_relu_add.cpp
+++ b/profiler/src/profile_conv_fwd_bias_relu_add.cpp
@@ -43,7 +43,7 @@ int profile_conv_fwd_bias_relu_add(int argc, char* argv[])
        printf("arg6: verification (0: no; 1: yes)\n");
        printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
        printf("arg8: print tensor value (0: no; 1: yes)\n");
-        printf("arg9: run kernel # of times (>1)\n");
+        printf("arg9: time kernel (0=no, 1=yes)\n");
        printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
               "RightPx\n");
        exit(1);
@@ -56,7 +56,7 @@ int profile_conv_fwd_bias_relu_add(int argc, char* argv[])
    const bool do_verification = std::stoi(argv[6]);
    const int init_method      = std::stoi(argv[7]);
    const bool do_log          = std::stoi(argv[8]);
-    const int nrepeat          = std::stoi(argv[9]);
+    const bool time_kernel     = std::stoi(argv[9]);

    const ck::index_t N  = std::stoi(argv[10]);
    const ck::index_t K  = std::stoi(argv[11]);
@@ -94,7 +94,7 @@ int profile_conv_fwd_bias_relu_add(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            N,
            K,
            C,
@@ -111,5 +111,5 @@ int profile_conv_fwd_bias_relu_add(int argc, char* argv[])
        throw std::runtime_error("wrong! data_type & layout for this operator is not implemented");
    }

-    return 1;
+    return 0;
 }
--- a/profiler/src/profile_conv_fwd_bias_relu_atomic_add.cpp
+++ b/profiler/src/profile_conv_fwd_bias_relu_atomic_add.cpp
@@ -43,7 +43,7 @@ int profile_conv_fwd_bias_relu_atomic_add(int argc, char* argv[])
        printf("arg6: verification (0: no; 1: yes)\n");
        printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
        printf("arg8: print tensor value (0: no; 1: yes)\n");
-        printf("arg9: run kernel # of times (>1)\n");
+        printf("arg9: time kernel (0=no, 1=yes)\n");
        printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
               "RightPx\n");
        exit(1);
@@ -56,7 +56,7 @@ int profile_conv_fwd_bias_relu_atomic_add(int argc, char* argv[])
    const bool do_verification = std::stoi(argv[6]);
    const int init_method      = std::stoi(argv[7]);
    const bool do_log          = std::stoi(argv[8]);
-    const int nrepeat          = std::stoi(argv[9]);
+    const bool time_kernel     = std::stoi(argv[9]);

    const ck::index_t N  = std::stoi(argv[10]);
    const ck::index_t K  = std::stoi(argv[11]);
@@ -95,7 +95,7 @@ int profile_conv_fwd_bias_relu_atomic_add(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            N,
            K,
            C,
@@ -112,5 +112,5 @@ int profile_conv_fwd_bias_relu_atomic_add(int argc, char* argv[])
        throw std::runtime_error("wrong! data_type & layout for this operator is not implemented");
    }

-    return 1;
+    return 0;
 }
--- a/profiler/src/profile_convnd_bwd_data.cpp
+++ b/profiler/src/profile_convnd_bwd_data.cpp
@@ -95,7 +95,7 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial)
        printf("arg6: verification (0: no; 1: yes)\n");
        printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
        printf("arg8: print tensor value (0: no; 1: yes)\n");
-        printf("arg9: run kernel # of times (>1)\n");
+        printf("arg9: time kernel (0=no, 1=yes)\n");
        printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
               "RightPx\n");
        return 1;
@@ -108,7 +108,7 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial)
    const bool do_verification = std::stoi(argv[6]);
    const int init_method      = std::stoi(argv[7]);
    const bool do_log          = std::stoi(argv[8]);
-    const int nrepeat          = std::stoi(argv[9]);
+    const bool time_kernel     = std::stoi(argv[9]);

    ck::utils::conv::ConvParams params = parse_conv_params(num_dim_spatial, argv, preParams);

@@ -132,7 +132,7 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial)
                do_verification,
                init_method,
                do_log,
-                nrepeat,
+                time_kernel,
                params.N_,
                params.K_,
                params.C_,
@@ -157,7 +157,7 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial)
                do_verification,
                init_method,
                do_log,
-                nrepeat,
+                time_kernel,
                params.N_,
                params.K_,
                params.C_,
@@ -182,7 +182,7 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial)
                do_verification,
                init_method,
                do_log,
-                nrepeat,
+                time_kernel,
                params.N_,
                params.K_,
                params.C_,

--- a/profiler/src/profile_convnd_fwd.cpp
+++ b/profiler/src/profile_convnd_fwd.cpp
@@ -119,7 +119,7 @@ template <int NDim,
 void profile_convnd_instances_impl(const ck::utils::conv::ConvParams& params,
                                   bool do_verification,
                                   bool do_log,
-                                   int nrepeat,
+                                   bool time_kernel,
                                   int init_method,
                                   ConvLayouts)
 {
@@ -185,7 +185,7 @@ void profile_convnd_instances_impl(const ck::utils::conv::ConvParams& params,
                                                                         reference_conv_fwd_fun);
    auto best_conf = run_engine.Profile(
        conv::ConvolutionFwdInstances<InDataType, WeiDataType, OutDataType>::template Get<NDim>(),
-        nrepeat,
+        time_kernel,
        do_verification,
        do_log);

@@ -201,7 +201,7 @@ void profile_convnd_instances(ConvDataType data_type,
                              const ck::utils::conv::ConvParams& params,
                              bool do_verification,
                              bool do_log,
-                              int nrepeat,
+                              bool time_kernel,
                              int init_method)
 {
    switch(data_layout)
@@ -214,7 +214,7 @@ void profile_convnd_instances(ConvDataType data_type,
                params,
                do_verification,
                do_log,
-                nrepeat,
+                time_kernel,
                init_method,
                ConvolutionLayouts<NDim, ConvDataLayout::NHWC>{});
            break;
@@ -223,7 +223,7 @@ void profile_convnd_instances(ConvDataType data_type,
                params,
                do_verification,
                do_log,
-                nrepeat,
+                time_kernel,
                init_method,
                ConvolutionLayouts<NDim, ConvDataLayout::NHWC>{});
            break;
@@ -232,7 +232,7 @@ void profile_convnd_instances(ConvDataType data_type,
                params,
                do_verification,
                do_log,
-                nrepeat,
+                time_kernel,
                init_method,
                ConvolutionLayouts<NDim, ConvDataLayout::NHWC>{});
            break;
@@ -241,7 +241,7 @@ void profile_convnd_instances(ConvDataType data_type,
                params,
                do_verification,
                do_log,
-                nrepeat,
+                time_kernel,
                init_method,
                ConvolutionLayouts<NDim, ConvDataLayout::NHWC>{});
            break;
@@ -256,7 +256,7 @@ void profile_convnd_instances(ConvDataType data_type,
                params,
                do_verification,
                do_log,
-                nrepeat,
+                time_kernel,
                init_method,
                ConvolutionLayouts<NDim, ConvDataLayout::NCHW>{});
            break;
@@ -265,7 +265,7 @@ void profile_convnd_instances(ConvDataType data_type,
                params,
                do_verification,
                do_log,
-                nrepeat,
+                time_kernel,
                init_method,
                ConvolutionLayouts<NDim, ConvDataLayout::NCHW>{});
            break;
@@ -274,7 +274,7 @@ void profile_convnd_instances(ConvDataType data_type,
                params,
                do_verification,
                do_log,
-                nrepeat,
+                time_kernel,
                init_method,
                ConvolutionLayouts<NDim, ConvDataLayout::NCHW>{});
            break;
@@ -283,7 +283,7 @@ void profile_convnd_instances(ConvDataType data_type,
                params,
                do_verification,
                do_log,
-                nrepeat,
+                time_kernel,
                init_method,
                ConvolutionLayouts<NDim, ConvDataLayout::NCHW>{});
            break;
@@ -304,7 +304,7 @@ int ck::profiler::profile_convnd_fwd(int argc, char* argv[])
    bool do_verification{true};
    int init_method{2};
    bool do_log{false};
-    int nrepeat{100};
+    bool time_kernel{false};
    int num_dim_spatial{2};
    ConvParams params;

@@ -318,7 +318,7 @@ int ck::profiler::profile_convnd_fwd(int argc, char* argv[])
        do_verification = std::stoi(argv[4]);
        init_method     = std::stoi(argv[5]);
        do_log          = std::stoi(argv[6]);
-        nrepeat         = std::stoi(argv[7]);
+        time_kernel     = std::stoi(argv[7]);
        num_dim_spatial = std::stoi(argv[8]);
    }
    if(argc >= 10)
@@ -332,20 +332,20 @@ int ck::profiler::profile_convnd_fwd(int argc, char* argv[])
    {
    case 1:
        profile_convnd_instances<1>(
-            data_type, data_layout, params, do_verification, do_log, nrepeat, init_method);
+            data_type, data_layout, params, do_verification, do_log, time_kernel, init_method);
        break;
    case 2:
        profile_convnd_instances<2>(
-            data_type, data_layout, params, do_verification, do_log, nrepeat, init_method);
+            data_type, data_layout, params, do_verification, do_log, time_kernel, init_method);
        break;
    case 3:
        profile_convnd_instances<3>(
-            data_type, data_layout, params, do_verification, do_log, nrepeat, init_method);
+            data_type, data_layout, params, do_verification, do_log, time_kernel, init_method);
        break;
    default:
        throw std::runtime_error("profile_conv_fwd: unsupported num_dim_spatial value: " +
                                 std::to_string(num_dim_spatial));
    }

-    return 1;
+    return 0;
 }
--- a/profiler/src/profile_gemm.cpp
+++ b/profiler/src/profile_gemm.cpp
@@ -38,8 +38,8 @@ int profile_gemm(int argc, char* argv[])
        printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
        printf("arg4: verification (0: no; 1: yes)\n");
        printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
-        printf("arg8: print tensor value (0: no; 1: yes)\n");
-        printf("arg7: run kernel # of times (>1)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: time kernel (0=no, 1=yes)\n");
        printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
        printf("arg14: split k into  mulitiple batch\n");
        exit(1);
@@ -50,7 +50,7 @@ int profile_gemm(int argc, char* argv[])
    const bool do_verification = std::stoi(argv[4]);
    const int init_method      = std::stoi(argv[5]);
    const bool do_log          = std::stoi(argv[6]);
-    const int nrepeat          = std::stoi(argv[7]);
+    const bool time_kernel     = std::stoi(argv[7]);

    const int M = std::stoi(argv[8]);
    const int N = std::stoi(argv[9]);
@@ -68,13 +68,14 @@ int profile_gemm(int argc, char* argv[])
        ck::profiler::profile_gemm_impl<ck::half_t,
                                        ck::half_t,
                                        ck::half_t,
+                                        float,
                                        ck::tensor_layout::gemm::RowMajor,
                                        ck::tensor_layout::gemm::RowMajor,
                                        ck::tensor_layout::gemm::RowMajor>(
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -88,13 +89,14 @@ int profile_gemm(int argc, char* argv[])
        ck::profiler::profile_gemm_impl<ck::half_t,
                                        ck::half_t,
                                        ck::half_t,
+                                        float,
                                        ck::tensor_layout::gemm::RowMajor,
                                        ck::tensor_layout::gemm::ColumnMajor,
                                        ck::tensor_layout::gemm::RowMajor>(
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -108,13 +110,14 @@ int profile_gemm(int argc, char* argv[])
        ck::profiler::profile_gemm_impl<ck::half_t,
                                        ck::half_t,
                                        ck::half_t,
+                                        float,
                                        ck::tensor_layout::gemm::ColumnMajor,
                                        ck::tensor_layout::gemm::RowMajor,
                                        ck::tensor_layout::gemm::RowMajor>(
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -128,13 +131,14 @@ int profile_gemm(int argc, char* argv[])
        ck::profiler::profile_gemm_impl<ck::half_t,
                                        ck::half_t,
                                        ck::half_t,
+                                        float,
                                        ck::tensor_layout::gemm::ColumnMajor,
                                        ck::tensor_layout::gemm::ColumnMajor,
                                        ck::tensor_layout::gemm::RowMajor>(
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -146,6 +150,7 @@ int profile_gemm(int argc, char* argv[])
    else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN)
    {
        ck::profiler::profile_gemm_impl<float,
+                                        float,
                                        float,
                                        float,
                                        ck::tensor_layout::gemm::RowMajor,
@@ -154,7 +159,7 @@ int profile_gemm(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -166,6 +171,7 @@ int profile_gemm(int argc, char* argv[])
    else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_NK_MN)
    {
        ck::profiler::profile_gemm_impl<float,
+                                        float,
                                        float,
                                        float,
                                        ck::tensor_layout::gemm::RowMajor,
@@ -174,7 +180,7 @@ int profile_gemm(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -186,6 +192,7 @@ int profile_gemm(int argc, char* argv[])
    else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_KN_MN)
    {
        ck::profiler::profile_gemm_impl<float,
+                                        float,
                                        float,
                                        float,
                                        ck::tensor_layout::gemm::ColumnMajor,
@@ -194,7 +201,7 @@ int profile_gemm(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -206,6 +213,7 @@ int profile_gemm(int argc, char* argv[])
    else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_NK_MN)
    {
        ck::profiler::profile_gemm_impl<float,
+                                        float,
                                        float,
                                        float,
                                        ck::tensor_layout::gemm::ColumnMajor,
@@ -214,7 +222,7 @@ int profile_gemm(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -228,13 +236,14 @@ int profile_gemm(int argc, char* argv[])
        ck::profiler::profile_gemm_impl<int8_t,
                                        int8_t,
                                        int8_t,
+                                        int32_t,
                                        ck::tensor_layout::gemm::RowMajor,
                                        ck::tensor_layout::gemm::RowMajor,
                                        ck::tensor_layout::gemm::RowMajor>(
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -248,13 +257,14 @@ int profile_gemm(int argc, char* argv[])
        ck::profiler::profile_gemm_impl<int8_t,
                                        int8_t,
                                        int8_t,
+                                        int32_t,
                                        ck::tensor_layout::gemm::RowMajor,
                                        ck::tensor_layout::gemm::ColumnMajor,
                                        ck::tensor_layout::gemm::RowMajor>(
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -268,13 +278,14 @@ int profile_gemm(int argc, char* argv[])
        ck::profiler::profile_gemm_impl<int8_t,
                                        int8_t,
                                        int8_t,
+                                        int32_t,
                                        ck::tensor_layout::gemm::ColumnMajor,
                                        ck::tensor_layout::gemm::RowMajor,
                                        ck::tensor_layout::gemm::RowMajor>(
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -288,13 +299,14 @@ int profile_gemm(int argc, char* argv[])
        ck::profiler::profile_gemm_impl<int8_t,
                                        int8_t,
                                        int8_t,
+                                        int32_t,
                                        ck::tensor_layout::gemm::ColumnMajor,
                                        ck::tensor_layout::gemm::ColumnMajor,
                                        ck::tensor_layout::gemm::RowMajor>(
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -308,13 +320,14 @@ int profile_gemm(int argc, char* argv[])
        ck::profiler::profile_gemm_impl<ck::bhalf_t,
                                        ck::bhalf_t,
                                        ck::bhalf_t,
+                                        float,
                                        ck::tensor_layout::gemm::RowMajor,
                                        ck::tensor_layout::gemm::RowMajor,
                                        ck::tensor_layout::gemm::RowMajor>(
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -328,13 +341,14 @@ int profile_gemm(int argc, char* argv[])
        ck::profiler::profile_gemm_impl<ck::bhalf_t,
                                        ck::bhalf_t,
                                        ck::bhalf_t,
+                                        float,
                                        ck::tensor_layout::gemm::RowMajor,
                                        ck::tensor_layout::gemm::ColumnMajor,
                                        ck::tensor_layout::gemm::RowMajor>(
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -348,13 +362,14 @@ int profile_gemm(int argc, char* argv[])
        ck::profiler::profile_gemm_impl<ck::bhalf_t,
                                        ck::bhalf_t,
                                        ck::bhalf_t,
+                                        float,
                                        ck::tensor_layout::gemm::ColumnMajor,
                                        ck::tensor_layout::gemm::RowMajor,
                                        ck::tensor_layout::gemm::RowMajor>(
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -368,13 +383,14 @@ int profile_gemm(int argc, char* argv[])
        ck::profiler::profile_gemm_impl<ck::bhalf_t,
                                        ck::bhalf_t,
                                        ck::bhalf_t,
+                                        float,
                                        ck::tensor_layout::gemm::ColumnMajor,
                                        ck::tensor_layout::gemm::ColumnMajor,
                                        ck::tensor_layout::gemm::RowMajor>(
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -388,5 +404,5 @@ int profile_gemm(int argc, char* argv[])
        throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented");
    }

-    return 1;
+    return 0;
 }
--- a/profiler/src/profile_gemm_bias_2d.cpp
+++ b/profiler/src/profile_gemm_bias_2d.cpp
@@ -36,8 +36,8 @@ int profile_gemm_bias_2d(int argc, char* argv[])
        printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
        printf("arg4: verification (0: no; 1: yes)\n");
        printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
-        printf("arg8: print tensor value (0: no; 1: yes)\n");
-        printf("arg7: run kernel # of times (>1)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: time kernel (0=no, 1=yes)\n");
        printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
        printf("arg14: alpha\n");
        printf("arg15: beta\n");
@@ -50,7 +50,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
    const bool do_verification = std::stoi(argv[4]);
    const int init_method      = std::stoi(argv[5]);
    const bool do_log          = std::stoi(argv[6]);
-    const int nrepeat          = std::stoi(argv[7]);
+    const bool time_kernel     = std::stoi(argv[7]);

    const int M = std::stoi(argv[8]);
    const int N = std::stoi(argv[9]);
@@ -76,7 +76,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -99,7 +99,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -122,7 +122,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -145,7 +145,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -168,7 +168,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -191,7 +191,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -214,7 +214,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -237,7 +237,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -252,5 +252,5 @@ int profile_gemm_bias_2d(int argc, char* argv[])
        throw std::runtime_error("wrong! this data_type & layout is not implemented");
    }

-    return 1;
+    return 0;
 }
--- a/profiler/src/profile_gemm_bias_relu.cpp
+++ b/profiler/src/profile_gemm_bias_relu.cpp
@@ -36,8 +36,8 @@ int profile_gemm_bias_relu(int argc, char* argv[])
        printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
        printf("arg4: verification (0: no; 1: yes)\n");
        printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
-        printf("arg8: print tensor value (0: no; 1: yes)\n");
-        printf("arg7: run kernel # of times (>1)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: time kernel (0=no, 1=yes)\n");
        printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
        printf("arg14: split k into  mulitiple batch\n");
        exit(1);
@@ -48,7 +48,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
    const bool do_verification = std::stoi(argv[4]);
    const int init_method      = std::stoi(argv[5]);
    const bool do_log          = std::stoi(argv[6]);
-    const int nrepeat          = std::stoi(argv[7]);
+    const bool time_kernel     = std::stoi(argv[7]);

    const int M = std::stoi(argv[8]);
    const int N = std::stoi(argv[9]);
@@ -69,7 +69,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -88,7 +88,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -107,7 +107,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -126,7 +126,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -139,5 +139,5 @@ int profile_gemm_bias_relu(int argc, char* argv[])
        throw std::runtime_error("wrong! this data_type & layout is not implemented");
    }

-    return 1;
+    return 0;
 }
--- a/profiler/src/profile_gemm_bias_relu_add.cpp
+++ b/profiler/src/profile_gemm_bias_relu_add.cpp
@@ -36,8 +36,8 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
        printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
        printf("arg4: verification (0: no; 1: yes)\n");
        printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
-        printf("arg8: print tensor value (0: no; 1: yes)\n");
-        printf("arg7: run kernel # of times (>1)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: time kernel (0=no, 1=yes)\n");
        printf("arg8 to 14: M, N, K, StrideA, StrideB, StrideC, StrideC1\n");
        printf("arg15: split k into  mulitiple batch\n");
        exit(1);
@@ -48,7 +48,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
    const bool do_verification = std::stoi(argv[4]);
    const int init_method      = std::stoi(argv[5]);
    const bool do_log          = std::stoi(argv[6]);
-    const int nrepeat          = std::stoi(argv[7]);
+    const bool time_kernel     = std::stoi(argv[7]);

    const int M = std::stoi(argv[8]);
    const int N = std::stoi(argv[9]);
@@ -70,7 +70,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -90,7 +90,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -110,7 +110,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -130,7 +130,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -144,5 +144,5 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
        throw std::runtime_error("wrong! this data_type & layout is not implemented");
    }

-    return 1;
+    return 0;
 }
--- a/profiler/src/profile_gemm_reduce.cpp
+++ b/profiler/src/profile_gemm_reduce.cpp
@@ -32,8 +32,8 @@ int profile_gemm_reduce(int argc, char* argv[])
        printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
        printf("arg4: verification (0: no; 1: yes)\n");
        printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
-        printf("arg8: print tensor value (0: no; 1: yes)\n");
-        printf("arg7: run kernel # of times (>1)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: time kernel (0=no, 1=yes)\n");
        printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
        printf("arg14: split k into  mulitiple batch\n");
        exit(1);
@@ -44,7 +44,7 @@ int profile_gemm_reduce(int argc, char* argv[])
    const bool do_verification = std::stoi(argv[4]);
    const int init_method      = std::stoi(argv[5]);
    const bool do_log          = std::stoi(argv[6]);
-    const int nrepeat          = std::stoi(argv[7]);
+    const bool time_kernel     = std::stoi(argv[7]);

    const int M = std::stoi(argv[8]);
    const int N = std::stoi(argv[9]);
@@ -66,7 +66,7 @@ int profile_gemm_reduce(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -87,7 +87,7 @@ int profile_gemm_reduce(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -108,7 +108,7 @@ int profile_gemm_reduce(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -129,7 +129,7 @@ int profile_gemm_reduce(int argc, char* argv[])
            do_verification,
            init_method,
            do_log,
-            nrepeat,
+            time_kernel,
            M,
            N,
            K,
@@ -142,5 +142,5 @@ int profile_gemm_reduce(int argc, char* argv[])
        throw std::runtime_error("wrong! this data_type & layout is not implemented");
    }

-    return 1;
+    return 0;
 }
--- a/profiler/src/profile_grouped_gemm.cpp
+++ b/profiler/src/profile_grouped_gemm.cpp
@@ -54,8 +54,8 @@ int profile_grouped_gemm(int argc, char* argv[])
        printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
        printf("arg4: verification (0: no; 1: yes)\n");
        printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
-        printf("arg8: print tensor value (0: no; 1: yes)\n");
-        printf("arg7: run kernel # of times (>1)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: time kernel (0=no, 1=yes)\n");
        printf("arg8 to 13: Ms, Ns, Ks, StrideAs, StrideBs, StrideCs (e.g., 256,256 128,128 64,64 "
               "64,64 64,64 128,128)\n");
        exit(1);
@@ -66,7 +66,7 @@ int profile_grouped_gemm(int argc, char* argv[])
    const bool do_verification = std::stoi(argv[4]);
    const int init_method      = std::stoi(argv[5]);
    const bool do_log          = std::stoi(argv[6]);
-    const int nrepeat          = std::stoi(argv[7]);
+    const bool time_kernel     = std::stoi(argv[7]);

    const auto Ms = argToIntArray(argv[8]);
    const auto Ns = argToIntArray(argv[9]);
@@ -79,6 +79,7 @@ int profile_grouped_gemm(int argc, char* argv[])
    if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
    {
        ck::profiler::profile_grouped_gemm_impl<ck::half_t,
+                                                ck::half_t,
                                                ck::half_t,
                                                ck::half_t,
                                                ck::tensor_layout::gemm::RowMajor,
@@ -86,7 +87,7 @@ int profile_grouped_gemm(int argc, char* argv[])
                                                ck::tensor_layout::gemm::RowMajor>(do_verification,
                                                                                   init_method,
                                                                                   do_log,
-                                                                                   nrepeat,
+                                                                                   time_kernel,
                                                                                   Ms,
                                                                                   Ns,
                                                                                   Ks,
@@ -97,6 +98,7 @@ int profile_grouped_gemm(int argc, char* argv[])
    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
    {
        ck::profiler::profile_grouped_gemm_impl<ck::half_t,
+                                                ck::half_t,
                                                ck::half_t,
                                                ck::half_t,
                                                ck::tensor_layout::gemm::RowMajor,
@@ -104,7 +106,7 @@ int profile_grouped_gemm(int argc, char* argv[])
                                                ck::tensor_layout::gemm::RowMajor>(do_verification,
                                                                                   init_method,
                                                                                   do_log,
-                                                                                   nrepeat,
+                                                                                   time_kernel,
                                                                                   Ms,
                                                                                   Ns,
                                                                                   Ks,
@@ -115,6 +117,7 @@ int profile_grouped_gemm(int argc, char* argv[])
    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN)
    {
        ck::profiler::profile_grouped_gemm_impl<ck::half_t,
+                                                ck::half_t,
                                                ck::half_t,
                                                ck::half_t,
                                                ck::tensor_layout::gemm::ColumnMajor,
@@ -122,7 +125,7 @@ int profile_grouped_gemm(int argc, char* argv[])
                                                ck::tensor_layout::gemm::RowMajor>(do_verification,
                                                                                   init_method,
                                                                                   do_log,
-                                                                                   nrepeat,
+                                                                                   time_kernel,
                                                                                   Ms,
                                                                                   Ns,
                                                                                   Ks,
@@ -133,6 +136,7 @@ int profile_grouped_gemm(int argc, char* argv[])
    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN)
    {
        ck::profiler::profile_grouped_gemm_impl<ck::half_t,
+                                                ck::half_t,
                                                ck::half_t,
                                                ck::half_t,
                                                ck::tensor_layout::gemm::ColumnMajor,
@@ -140,7 +144,7 @@ int profile_grouped_gemm(int argc, char* argv[])
                                                ck::tensor_layout::gemm::RowMajor>(do_verification,
                                                                                   init_method,
                                                                                   do_log,
-                                                                                   nrepeat,
+                                                                                   time_kernel,
                                                                                   Ms,
                                                                                   Ns,
                                                                                   Ks,
@@ -153,5 +157,5 @@ int profile_grouped_gemm(int argc, char* argv[])
        throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented");
    }

-    return 1;
+    return 0;
 }
--- a/profiler/src/profile_reduce.cpp
+++ b/profiler/src/profile_reduce.cpp
 #include <iostream>
 #include <fstream>
-#include <numeric>
-#include <initializer_list>
 #include <cstdlib>
 #include <vector>
 #include <stdexcept>
 #include <sstream>
 #include <getopt.h>

-#include "config.hpp"
-#include "print.hpp"
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "device_tensor.hpp"
+#include "data_type_enum.hpp"
 #include "reduction_enums.hpp"

+#include "host_common_util.hpp"
 #include "profile_reduce_impl.hpp"

 using namespace std;

-using ck::NanPropagation;
-using ck::ReduceTensorIndices;
 using ck::ReduceTensorOp;

 static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
@@ -38,63 +30,9 @@ static struct option long_options[] = {{"inLengths", required_argument, nullptr,
                                       {"bf16", no_argument, nullptr, '?'},
                                       {"dumpout", required_argument, nullptr, 'o'},
                                       {"verify", required_argument, nullptr, 'v'},
-                                       {"log", required_argument, nullptr, 'l'},
                                       {"help", no_argument, nullptr, '?'},
                                       {nullptr, 0, nullptr, 0}};

-template <typename T>
-static T getSingleValueFromString(const string& valueStr)
-{
-    std::istringstream iss(valueStr);
-
-    T val;
-
-    iss >> val;
-
-    return (val);
-};
-
-template <typename T>
-static std::vector<T> getTypeValuesFromString(const char* cstr_values)
-{
-    std::string valuesStr(cstr_values);
-
-    std::vector<T> values;
-    std::size_t pos = 0;
-    std::size_t new_pos;
-
-    new_pos = valuesStr.find(',', pos);
-    while(new_pos != std::string::npos)
-    {
-        const std::string sliceStr = valuesStr.substr(pos, new_pos - pos);
-
-        T val = getSingleValueFromString<T>(sliceStr);
-
-        values.push_back(val);
-
-        pos     = new_pos + 1;
-        new_pos = valuesStr.find(',', pos);
-    };
-
-    std::string sliceStr = valuesStr.substr(pos);
-    T val                = getSingleValueFromString<T>(sliceStr);
-
-    values.push_back(val);
-
-    return (values);
-}
-
-enum struct AppDataType
-{
-    appHalf     = 0,
-    appFloat    = 1,
-    appInt32    = 2,
-    appInt8     = 3,
-    appInt8x4   = 4,
-    appBFloat16 = 5,
-    appDouble   = 6,
-};
-
 static void check_reduce_dims(const int rank, const std::vector<int>& reduceDims)
 {
    for(auto dim : reduceDims)
@@ -113,7 +51,7 @@ static void check_reduce_dims(const int rank, const std::vector<int>& reduceDims
    };
 };

-class AppArgs
+class ReduceProfilerArgs
 {
    private:
    int option_index = 0;
@@ -130,26 +68,23 @@ class AppArgs

    std::vector<float> scales;

-    ReduceTensorOp reduceOp = ReduceTensorOp::ADD;
-    AppDataType compTypeId  = AppDataType::appFloat;
-    AppDataType outTypeId   = AppDataType::appFloat;
+    ReduceTensorOp reduceOp     = ReduceTensorOp::ADD;
+    ck::DataTypeEnum compTypeId = ck::DataTypeEnum::Float;
+    ck::DataTypeEnum outTypeId  = ck::DataTypeEnum::Float;

    bool compType_assigned = false;
    bool outType_assigned  = false;

-    NanPropagation nanOpt          = NanPropagation::NOT_PROPAGATE_NAN;
-    ReduceTensorIndices indicesOpt = ReduceTensorIndices::NO_INDICES;
-    bool do_log                    = false;
-    bool do_verification           = false;
-    bool do_dumpout                = false;
+    int nanOpt           = 0;
+    int indicesOpt       = 0;
+    bool do_verification = false;
+    bool do_dumpout      = false;

    int init_method;
-    int nrepeat;
+    bool time_kernel;

-    bool need_indices = false;
-
-    AppArgs()  = default;
-    ~AppArgs() = default;
+    ReduceProfilerArgs()  = default;
+    ~ReduceProfilerArgs() = default;

    void show_usage(const char* cmd)
    {
@@ -166,8 +101,11 @@ class AppArgs
        std::cout << "--outType or -W, optional enum value indicating the type of the reduced "
                     "output, which could be float when the input data is half"
                  << std::endl;
-        std::cout << "--nanOpt or -N, enum value indicates the selection for NanOpt" << std::endl;
-        std::cout << "--indicesOpt or -I, enum value indicates the selection for IndicesOpt"
+        std::cout
+            << "--nanOpt or -N, 1/0 value indicates the selection to use or not use Nan-Propagation"
+            << std::endl;
+        std::cout << "--indicesOpt or -I, 1/0 value indicates the selection to use or not use "
+                     "index in reduction"
                  << std::endl;
        std::cout << "--scales or -S, comma separated two float values for alpha and beta"
                  << std::endl;
@@ -181,18 +119,19 @@ class AppArgs
        std::cout << "--dumpout or -o, 1/0 to indicate where to save the reduction result to files "
                     "for further analysis"
                  << std::endl;
-        std::cout << "--log or -l, 1/0 to indicate whether to log some information" << std::endl;
    };

    int processArgs(int argc, char* argv[])
    {
+        using ck::host_common::getTypeValuesFromString;
+
        int ch;

        optind++; // to skip the "reduce" module name

        while(1)
        {
-            ch = getopt_long(argc, argv, "D:R:O:C:W:N:I:S:v:o:l:", long_options, &option_index);
+            ch = getopt_long(argc, argv, "D:R:O:C:W:N:I:S:v:o:", long_options, &option_index);
            if(ch == -1)
                break;
            switch(ch)
@@ -219,27 +158,27 @@ class AppArgs
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");

-                compTypeId        = static_cast<AppDataType>(std::atoi(optarg));
+                compTypeId        = static_cast<ck::DataTypeEnum>(std::atoi(optarg));
                compType_assigned = true;
                break;
            case 'W':
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");

-                outTypeId        = static_cast<AppDataType>(std::atoi(optarg));
+                outTypeId        = static_cast<ck::DataTypeEnum>(std::atoi(optarg));
                outType_assigned = true;
                break;
            case 'N':
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");

-                nanOpt = static_cast<NanPropagation>(std::atoi(optarg));
+                nanOpt = std::atoi(optarg);
                break;
            case 'I':
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");

-                indicesOpt = static_cast<ReduceTensorIndices>(std::atoi(optarg));
+                indicesOpt = std::atoi(optarg);
                break;
            case 'S':
                if(!optarg)
@@ -262,12 +201,6 @@ class AppArgs

                do_dumpout = static_cast<bool>(std::atoi(optarg));
                break;
-            case 'l':
-                if(!optarg)
-                    throw std::runtime_error("Invalid option format!");
-
-                do_log = static_cast<bool>(std::atoi(optarg));
-                break;
            case '?':
                if(std::string(long_options[option_index].name) == "half")
                    use_half = true;
@@ -295,7 +228,7 @@ class AppArgs
            throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");

        init_method = std::atoi(argv[optind++]);
-        nrepeat     = std::atoi(argv[optind]);
+        time_kernel = static_cast<bool>(std::atoi(argv[optind]));

        if(scales.empty())
        {
@@ -306,9 +239,6 @@ class AppArgs
        if(reduceOp == ReduceTensorOp::MIN || reduceOp == ReduceTensorOp::MAX ||
           reduceOp == ReduceTensorOp::AMAX)
        {
-            if(indicesOpt != ReduceTensorIndices::NO_INDICES)
-                need_indices = true;
-
            // for indexable operations, no need to assign compType and outType, just let them be
            // same as inType
            compType_assigned = false;
@@ -322,9 +252,10 @@ class AppArgs

 int profile_reduce(int argc, char* argv[])
 {
-    using namespace ck::profiler;
+    using ck::DataTypeEnum;
+    using ck::profiler::profile_reduce_impl;

-    AppArgs args;
+    ReduceProfilerArgs args;

    if(args.processArgs(argc, argv) < 0)
        return (-1);
@@ -339,42 +270,41 @@ int profile_reduce(int argc, char* argv[])
    if(args.use_half)
    {
        if(!args.compType_assigned)
-            args.compTypeId = AppDataType::appHalf;
+            args.compTypeId = DataTypeEnum::Half;

        if(args.outType_assigned &&
-           (args.outTypeId != AppDataType::appHalf && args.outTypeId != AppDataType::appFloat))
-            args.outTypeId = AppDataType::appFloat;
+           (args.outTypeId != DataTypeEnum::Half && args.outTypeId != DataTypeEnum::Float))
+            args.outTypeId = DataTypeEnum::Float;

        if(!args.outType_assigned)
-            args.outTypeId = AppDataType::appHalf;
+            args.outTypeId = DataTypeEnum::Half;

-        if(args.compTypeId == AppDataType::appHalf)
+        if(args.compTypeId == DataTypeEnum::Half)
        {
-            profile_reduce_impl<ck::half_t, ck::half_t, ck::half_t>(args.do_verification,
-                                                                    args.init_method,
-                                                                    args.do_log,
-                                                                    args.do_dumpout,
-                                                                    args.nrepeat,
-                                                                    args.inLengths,
-                                                                    args.reduceDims,
-                                                                    args.reduceOp,
-                                                                    args.nanOpt,
-                                                                    args.indicesOpt,
-                                                                    args.scales[0],
-                                                                    args.scales[1]);
+            profile_reduce_impl<ck::half_t, ck::half_t, ck::half_t>(
+                args.do_verification,
+                args.init_method,
+                args.do_dumpout,
+                args.time_kernel,
+                args.inLengths,
+                args.reduceDims,
+                args.reduceOp,
+                static_cast<bool>(args.nanOpt),
+                static_cast<bool>(args.indicesOpt),
+                args.scales[0],
+                args.scales[1]);
        }
-        else if(args.compTypeId == AppDataType::appFloat)
+        else if(args.compTypeId == DataTypeEnum::Float)
        {
            profile_reduce_impl<ck::half_t, float, ck::half_t>(args.do_verification,
                                                               args.init_method,
-                                                               args.do_log,
                                                               args.do_dumpout,
-                                                               args.nrepeat,
+                                                               args.time_kernel,
                                                               args.inLengths,
                                                               args.reduceDims,
                                                               args.reduceOp,
-                                                               args.nanOpt,
-                                                               args.indicesOpt,
+                                                               static_cast<bool>(args.nanOpt),
+                                                               static_cast<bool>(args.indicesOpt),
                                                               args.scales[0],
                                                               args.scales[1]);
        }
@@ -385,56 +315,53 @@ int profile_reduce(int argc, char* argv[])
    {
        profile_reduce_impl<double, double, double>(args.do_verification,
                                                    args.init_method,
-                                                    args.do_log,
                                                    args.do_dumpout,
-                                                    args.nrepeat,
+                                                    args.time_kernel,
                                                    args.inLengths,
                                                    args.reduceDims,
                                                    args.reduceOp,
-                                                    args.nanOpt,
-                                                    args.indicesOpt,
+                                                    static_cast<bool>(args.nanOpt),
+                                                    static_cast<bool>(args.indicesOpt),
                                                    args.scales[0],
                                                    args.scales[1]);
    }
    else if(args.use_int8)
    {
        if(!args.compType_assigned)
-            args.compTypeId = AppDataType::appInt8;
+            args.compTypeId = DataTypeEnum::Int8;

        if(args.outType_assigned &&
-           (args.outTypeId != AppDataType::appInt8 && args.outTypeId != AppDataType::appInt32))
-            args.outTypeId = AppDataType::appInt32;
+           (args.outTypeId != DataTypeEnum::Int8 && args.outTypeId != DataTypeEnum::Int32))
+            args.outTypeId = DataTypeEnum::Int32;

        if(!args.outType_assigned)
-            args.outTypeId = AppDataType::appInt8;
+            args.outTypeId = DataTypeEnum::Int8;

-        if(args.compTypeId == AppDataType::appInt8)
+        if(args.compTypeId == DataTypeEnum::Int8)
        {
            profile_reduce_impl<int8_t, int8_t, int8_t>(args.do_verification,
                                                        args.init_method,
-                                                        args.do_log,
                                                        args.do_dumpout,
-                                                        args.nrepeat,
+                                                        args.time_kernel,
                                                        args.inLengths,
                                                        args.reduceDims,
                                                        args.reduceOp,
-                                                        args.nanOpt,
-                                                        args.indicesOpt,
+                                                        static_cast<bool>(args.nanOpt),
+                                                        static_cast<bool>(args.indicesOpt),
                                                        args.scales[0],
                                                        args.scales[1]);
        }
-        else if(args.compTypeId == AppDataType::appInt32)
+        else if(args.compTypeId == DataTypeEnum::Int32)
        {
            profile_reduce_impl<int8_t, int32_t, int8_t>(args.do_verification,
                                                         args.init_method,
-                                                         args.do_log,
                                                         args.do_dumpout,
-                                                         args.nrepeat,
+                                                         args.time_kernel,
                                                         args.inLengths,
                                                         args.reduceDims,
                                                         args.reduceOp,
-                                                         args.nanOpt,
-                                                         args.indicesOpt,
+                                                         static_cast<bool>(args.nanOpt),
+                                                         static_cast<bool>(args.indicesOpt),
                                                         args.scales[0],
                                                         args.scales[1]);
        }
@@ -444,54 +371,51 @@ int profile_reduce(int argc, char* argv[])
    else if(args.use_bf16)
    {
        if(args.outType_assigned &&
-           (args.outTypeId != AppDataType::appBFloat16 && args.outTypeId != AppDataType::appFloat))
-            args.outTypeId = AppDataType::appFloat;
+           (args.outTypeId != DataTypeEnum::BFloat16 && args.outTypeId != DataTypeEnum::Float))
+            args.outTypeId = DataTypeEnum::Float;

        if(!args.outType_assigned)
-            args.outTypeId = AppDataType::appBFloat16;
+            args.outTypeId = DataTypeEnum::BFloat16;

        profile_reduce_impl<ck::bhalf_t, float, ck::bhalf_t>(args.do_verification,
                                                             args.init_method,
-                                                             args.do_log,
                                                             args.do_dumpout,
-                                                             args.nrepeat,
+                                                             args.time_kernel,
                                                             args.inLengths,
                                                             args.reduceDims,
                                                             args.reduceOp,
-                                                             args.nanOpt,
-                                                             args.indicesOpt,
+                                                             static_cast<bool>(args.nanOpt),
+                                                             static_cast<bool>(args.indicesOpt),
                                                             args.scales[0],
                                                             args.scales[1]);
    }
    else
    {
-        if(args.compTypeId == AppDataType::appFloat)
+        if(args.compTypeId == DataTypeEnum::Float)
        {
            profile_reduce_impl<float, float, float>(args.do_verification,
                                                     args.init_method,
-                                                     args.do_log,
                                                     args.do_dumpout,
-                                                     args.nrepeat,
+                                                     args.time_kernel,
                                                     args.inLengths,
                                                     args.reduceDims,
                                                     args.reduceOp,
-                                                     args.nanOpt,
-                                                     args.indicesOpt,
+                                                     static_cast<bool>(args.nanOpt),
+                                                     static_cast<bool>(args.indicesOpt),
                                                     args.scales[0],
                                                     args.scales[1]);
        }
-        else if(args.compTypeId == AppDataType::appDouble)
+        else if(args.compTypeId == DataTypeEnum::Double)
        {
            profile_reduce_impl<float, double, float>(args.do_verification,
                                                      args.init_method,
-                                                      args.do_log,
                                                      args.do_dumpout,
-                                                      args.nrepeat,
+                                                      args.time_kernel,
                                                      args.inLengths,
                                                      args.reduceDims,
                                                      args.reduceOp,
-                                                      args.nanOpt,
-                                                      args.indicesOpt,
+                                                      static_cast<bool>(args.nanOpt),
+                                                      static_cast<bool>(args.indicesOpt),
                                                      args.scales[0],
                                                      args.scales[1]);
        }

--- a/profiler/src/profiler.cpp
+++ b/profiler/src/profiler.cpp
@@ -13,6 +13,7 @@ int profile_gemm_bias_relu_add(int, char*[]);
 int profile_gemm_reduce(int, char*[]);
 int profile_batched_gemm(int, char*[]);
 int profile_grouped_gemm(int, char*[]);
+int profile_conv_fwd(int, char*[]);
 int profile_conv_fwd_bias_relu(int, char*[]);
 int profile_conv_fwd_bias_relu_add(int, char*[]);
 int profile_conv_fwd_bias_relu_atomic_add(int, char*[]);
@@ -53,7 +54,7 @@ int main(int argc, char* argv[])
    }
    else if(strcmp(argv[1], "grouped_gemm") == 0)
    {
-        profile_grouped_gemm(argc, argv);
+        return profile_grouped_gemm(argc, argv);
    }
    else if(strcmp(argv[1], "conv_fwd") == 0)
    {
@@ -107,7 +108,7 @@ int main(int argc, char* argv[])
               "                        conv1d_bwd_data: BackwardConvolution data 1 dim\n"
               "                        conv2d_bwd_data: BackwardConvolution data 2 dim\n"
               "                        conv3d_bwd_data: BackwardConvolution data 3 dim\n"
-               "                        reduce: REDUCE\n"
+               "                        reduce: Reduce\n"
               "                        conv2d_bwd_weight: Backward Weight Convolution 2d\n");
        // clang-format on
    }

--- a/script/parse_perf_data.py
+++ b/script/parse_perf_data.py
-#!/usr/bin/env python3
-import os, io
-import argparse
-
-def print_to_string(*args, **kwargs):
-    output = io.StringIO()
-    print(*args, file=output, **kwargs)
-    contents = output.getvalue()
-    output.close()
-    return contents
-
-def parse_args():
-    parser = argparse.ArgumentParser(description='Parse results from tf benchmark runs')
-    parser.add_argument('filename', type=str, help='Log file to prase or directory containing log files')
-    args = parser.parse_args()
-    files = []
-    if os.path.isdir(args.filename):
-        all_files = os.listdir(args.filename)
-        for name in all_files:
-            if not 'log' in name:
-                continue
-            files.append(os.path.join(args.filename, name))
-    else:
-        files = [args.filename]
-    args.files = files
-    return args
-
-def main():
-    args = parse_args()
-    results = []
-    #parse results
-    glue=""
-    for filename in args.files:
-        for line in open(filename):
-            if 'Best Perf' in line:
-                lst=line.split()
-                results.append(print_to_string(glue.join(lst[8:]),lst[4]))
-                
-    #sort results    
-
-    #read baseline results for the latest develop branch    
-
-    #write new results to the db
-    
-    #compare the results to the baseline
-    
-    #return 0 if performance criteria met, otherwise return 1
-
-    print(results)
-    return 0
-
-if __name__ == '__main__':
+#!/usr/bin/env python3
+import os, io, argparse, datetime, re
+import numpy as np
+import sqlalchemy
+from sqlalchemy.types import NVARCHAR, Float, Integer
+import pymysql
+import pandas as pd
+from sshtunnel import SSHTunnelForwarder
+
+def print_to_string(*args, **kwargs):
+    """Return the text that print(*args, **kwargs) would have written.
+
+    The output is captured in an in-memory text buffer instead of going to
+    stdout; every keyword accepted by print() except 'file' may be passed.
+    """
+    with io.StringIO() as buffer:
+        print(*args, file=buffer, **kwargs)
+        return buffer.getvalue()
+
+def parse_args():
+    """Parse the command line and resolve the list of log files to read.
+
+    The single positional argument 'filename' is either a log file or a
+    directory. For a directory, every entry whose name contains 'log' is
+    selected (in os.listdir order); otherwise the path itself is used.
+
+    Returns:
+        argparse.Namespace with 'filename' (the raw argument) and 'files'
+        (list of paths to parse).
+    """
+    parser = argparse.ArgumentParser(description='Parse results from tf benchmark runs')
+    parser.add_argument('filename', type=str, help='Log file to parse or directory containing log files')
+    args = parser.parse_args()
+    if os.path.isdir(args.filename):
+        # keep only the directory entries that look like log files
+        args.files = [os.path.join(args.filename, name)
+                      for name in os.listdir(args.filename)
+                      if 'log' in name]
+    else:
+        args.files = [args.filename]
+    return args
+
+def main():
+    """Parse ckProfiler logs, store the TFLOPS in the perf database and compare to baseline.
+
+    Steps:
+      1. Read every file in parse_args().files and pick up the run metadata
+         (branch, node, GPU arch, HIP version, ROCm version).
+      2. Collect the TFLOPS from the 'Best Perf' lines. The test family (gemm
+         or resnet50) is selected by a substring match on 'filename', which at
+         that point is the last path in args.files.
+      3. Open an SSH tunnel and a MySQL connection, read the most recent
+         'develop' row as baseline and append the new results.
+      4. Flag a regression when a baseline value exceeds 1.01x the new value.
+
+    Connection settings come from the environment variables dbuser,
+    dbpassword, dbsship, dbsshuser, dbsshport and dbsshpassword; a missing
+    variable raises KeyError.
+
+    Returns:
+        int: 1 if at least one regression was found, otherwise 0.
+    """
+    args = parse_args()
+    tests = []
+    kernels=[]
+    tflops=[]
+    dtype=[]
+    alayout=[]
+    blayout=[]
+    M=[]
+    N=[]
+    K=[]
+    StrideA=[]
+    StrideB=[]
+    StrideC=[]
+    #parse results, get the Tflops value for "Best Perf" kernels
+
+    glue=""
+    #first pass: run metadata (the last matching line across all files wins)
+    for filename in args.files:
+        with open(filename) as log:
+            for line in log:
+                if 'Branch name' in line:
+                    lst=line.split()
+                    branch_name=lst[2]
+                if 'Node name' in line:
+                    lst=line.split()
+                    node_id=lst[2]
+                if 'GPU_arch' in line:
+                    lst=line.split()
+                    gpu_arch=lst[1]
+                if 'HIP version' in line:
+                    lst=line.split()
+                    hip_vers=lst[2]
+                if 'InstalledDir' in line:
+                    lst=line.split()
+                    #the version is the text between '/opt/rocm-' and '/llvm/bin'
+                    path=lst[1]
+                    prefix='/opt/rocm-'
+                    rocm_vers=path[path.find(prefix)+len(prefix):path.rfind('/llvm/bin')]
+    #NOTE(review): these names are only bound if the matching log line was seen
+    print("Branch name:",branch_name)
+    print("Node name:",node_id)
+    print("GPU_arch:",gpu_arch)
+    print("ROCM_version:",rocm_vers)
+    print("HIP_version:",hip_vers)
+
+
+    #parse gemm performance tests:
+    if 'gemm' in filename:
+        for filename in args.files:
+            with open(filename) as log:
+                for line in log:
+                    if 'Best Perf' not in line:
+                        continue
+                    lst=line.split()
+                    if len(lst)>=34: #lst[33] (the tflops) is available
+                        tests.append(glue.join(lst[5:30]))
+                        if len(lst)>=37: #the line is complete
+                            kernels.append(glue.join(lst[37:]))
+                        else: #kernel name is missing
+                            kernels.append("N/A")
+                            print("warning: incomplete line:",lst)
+                        tflops.append(lst[33])
+                        dtype.append(lst[5])
+                        alayout.append(lst[8])
+                        blayout.append(lst[11])
+                        M.append(lst[14])
+                        N.append(lst[17])
+                        K.append(lst[20])
+                        StrideA.append(lst[23])
+                        StrideB.append(lst[26])
+                        StrideC.append(lst[29])
+                    else: #even the tflops are not available
+                        print("Error in ckProfiler output!")
+                        print("warning: incomplete line=",lst)
+        #sort results
+        #sorted_tests = sorted(tests)
+        #print("sorted tests:",sorted_tests)
+        sorted_tflops = [x for _,x in sorted(zip(tests,tflops))]
+        #sorted_kernels = [x for _,x in sorted(zip(tests,kernels))]
+        test_list=list(range(1,len(tests)+1))
+
+    #parse resnet50 performance tests:
+    if 'resnet50' in filename:
+        for filename in args.files:
+            with open(filename) as log:
+                for line in log:
+                    if 'Best Perf' in line:
+                        lst=line.split()
+                        tflops.append(lst[4])
+
+    print("Number of tests:",len(tflops))
+    sql_hostname = '127.0.0.1'
+    sql_username = os.environ["dbuser"]
+    sql_password = os.environ["dbpassword"]
+    sql_main_database = 'miopen_perf'
+    sql_port = 3306
+    ssh_host = os.environ["dbsship"]
+    ssh_user = os.environ["dbsshuser"]
+    ssh_port = int(os.environ["dbsshport"])
+    ssh_pass = os.environ["dbsshpassword"]
+
+    with SSHTunnelForwarder(
+            (ssh_host, ssh_port),
+            ssh_username=ssh_user,
+            ssh_password=ssh_pass,
+            remote_bind_address=(sql_hostname, sql_port)) as tunnel:
+
+        #the database is reached through the local end of the tunnel
+        sqlEngine = sqlalchemy.create_engine('mysql+pymysql://{0}:{1}@{2}:{3}/{4}'.
+            format(sql_username, sql_password, sql_hostname, tunnel.local_bind_port, sql_main_database))
+        conn = sqlEngine.connect()
+
+        #save gemm performance tests:
+        if 'gemm' in filename:
+
+            #write the ck_gemm_test_params table
+            #only needed once the test set changes
+            '''
+            sorted_dtypes = [x for _,x in sorted(zip(tests,dtype))]
+            sorted_alayout = [x for _,x in sorted(zip(tests,alayout))]
+            sorted_blayout = [x for _,x in sorted(zip(tests,blayout))]
+            sorted_M = [x for _,x in sorted(zip(tests,M))]
+            sorted_N = [x for _,x in sorted(zip(tests,N))]
+            sorted_K = [x for _,x in sorted(zip(tests,K))]
+            sorted_StrideA = [x for _,x in sorted(zip(tests,StrideA))]
+            sorted_StrideB = [x for _,x in sorted(zip(tests,StrideB))]
+            sorted_StrideC = [x for _,x in sorted(zip(tests,StrideC))]
+            ck_gemm_params=[test_list,sorted_dtypes,sorted_alayout,sorted_blayout,
+                        sorted_M,sorted_N,sorted_K,sorted_StrideA,sorted_StrideB,
+                        sorted_StrideC]
+            df=pd.DataFrame(np.transpose(ck_gemm_params),columns=['Test_number','Data_type',
+                'Alayout','BLayout','M','N','K', 'StrideA','StrideB','StrideC'])
+            print(df)
+
+            dtypes = {
+                'Test_number': Integer(),
+                'Data_type': NVARCHAR(length=5),
+                'Alayout': NVARCHAR(length=12),
+                'Blayout': NVARCHAR(length=12),
+                'M': Integer(),
+                'N': Integer(),
+                'K': Integer(),
+                'StrideA': Integer(),
+                'StrideB': Integer(),
+                'StrideC': Integer()
+                }
+            df.to_sql("ck_gemm_test_params",conn,if_exists='replace',index=False, dtype=dtypes)
+            '''
+
+            #read baseline results for the latest develop branch
+            query = '''SELECT * from ck_gemm_tflops WHERE Datetime = (SELECT MAX(Datetime) FROM ck_gemm_tflops where Branch_ID='develop' );'''
+            tflops_base = pd.read_sql_query(query, conn)
+
+            #write new results to the db
+            testlist=[]
+            for i in range(1,len(tests)+1):
+                testlist.append("Test%i"%i)
+            ck_gemm_tflops=[str(branch_name),str(node_id),str(gpu_arch),str(rocm_vers),str(hip_vers),str(datetime.datetime.now())]
+            flops=pd.DataFrame(data=[ck_gemm_tflops],columns=['Branch_ID','Node_ID','GPU_arch','ROCM_version','HIP_version','Datetime'])
+            df_add=pd.DataFrame(data=[sorted_tflops],columns=testlist)
+            flops=pd.concat([flops,df_add],axis=1)
+            print("new tflops for gemm tests:",flops)
+            flops.to_sql("ck_gemm_tflops",conn,if_exists='append',index=False)
+
+        #save resnet50 performance tests:
+        if 'resnet50' in filename:
+            #read baseline results for the latest develop branch
+            query = '''SELECT * from ck_resnet50_N256_tflops WHERE Datetime = (SELECT MAX(Datetime) FROM ck_resnet50_N256_tflops where Branch_ID='develop' );'''
+            tflops_base_N256 = pd.read_sql_query(query, conn)
+            query = '''SELECT * from ck_resnet50_N4_tflops WHERE Datetime = (SELECT MAX(Datetime) FROM ck_resnet50_N4_tflops where Branch_ID='develop' );'''
+            tflops_base_N4 = pd.read_sql_query(query, conn)
+
+            #write new results to the db
+            #the first 49 values go to the N=256 table, the next 49 to the N=4 table
+            testlist=[]
+            for i in range(1,50):
+                testlist.append("Layer%i"%i)
+            ck_resnet_tflops=[str(branch_name),str(node_id),str(gpu_arch),str(rocm_vers),str(hip_vers),str(datetime.datetime.now())]
+            flops0=pd.DataFrame(data=[ck_resnet_tflops],columns=['Branch_ID','Node_ID','GPU_arch','ROCM_version','HIP_version','Datetime'])
+            df_add=pd.DataFrame(data=[tflops[0:49]],columns=testlist)
+            flops=pd.concat([flops0,df_add],axis=1)
+            print("new tflops for N=256 resnet50 test:",flops)
+            flops.to_sql("ck_resnet50_N256_tflops",conn,if_exists='append',index=False)
+            df_add=pd.DataFrame(data=[tflops[49:98]],columns=testlist)
+            flops=pd.concat([flops0,df_add],axis=1)
+            print("new tflops for N=4 resnet50 test:",flops)
+            flops.to_sql("ck_resnet50_N4_tflops",conn,if_exists='append',index=False)
+
+        conn.close()
+
+    #compare the results to the baseline if baseline exists
+    regression=0
+    if 'gemm' in filename:
+        if not tflops_base.empty:
+            base=tflops_base[testlist].to_numpy(dtype='float')
+            base_list=base[0]
+            ave_perf=0
+            for i in range(len(base_list)):
+                # success criterion:
+                if base_list[i]>1.01*float(sorted_tflops[i]):
+                    print("test # ",i,"shows regression by {:.3f}%".format(
+                        (float(sorted_tflops[i])-base_list[i])/base_list[i]*100))
+                    regression=1
+                ave_perf=ave_perf+float(sorted_tflops[i])/base_list[i]
+            if regression==0:
+                print("no regressions found")
+            ave_perf=ave_perf/len(base_list)
+            print("average performance relative to baseline:",ave_perf)
+        else:
+            print("could not find a baseline")
+    if 'resnet50' in filename:
+        if not tflops_base_N256.empty:
+            base=tflops_base_N256[testlist].to_numpy(dtype='float')
+            base_list=base[0]
+            ave_perf=0
+            for i in range(len(base_list)):
+                # success criterion:
+                if base_list[i]>1.01*float(tflops[i]):
+                    print("layer # ",i,"shows regression by {:.3f}%".format(
+                        (float(tflops[i])-base_list[i])/base_list[i]*100))
+                    regression=1
+                ave_perf=ave_perf+float(tflops[i])/base_list[i]
+            if regression==0:
+                print("no regressions found")
+            ave_perf=ave_perf/len(base_list)
+            print("average performance relative to baseline:",ave_perf)
+        else:
+            print("could not find a baseline for N=256")
+        if not tflops_base_N4.empty:
+            base=tflops_base_N4[testlist].to_numpy(dtype='float')
+            base_list=base[0]
+            ave_perf=0
+            for i in range(len(base_list)):
+                # success criterion (N=4 results start at offset 49 in tflops):
+                if base_list[i]>1.01*float(tflops[i+49]):
+                    print("layer # ",i,"shows regression by {:.3f}%".format(
+                        (float(tflops[i+49])-base_list[i])/base_list[i]*100))
+                    regression=1
+                ave_perf=ave_perf+float(tflops[i+49])/base_list[i]
+            if regression==0:
+                print("no regressions found")
+            ave_perf=ave_perf/len(base_list)
+            print("average performance relative to baseline:",ave_perf)
+        else:
+            print("could not find a baseline for N=4")
+
+    #return 0 if performance criteria met, otherwise return 1
+    return regression
+
+# Script entry point.
+# NOTE(review): the value returned by main() is discarded here, so it is not
+# passed on as the process exit status - confirm whether that is intended.
+if __name__ == '__main__':
    main()
\ No newline at end of file