Commit 07a673c6 authored by carlushuang's avatar carlushuang
Browse files

Merge remote-tracking branch 'origin/develop' into cpu_avx2

parents c0f698d5 ac0d8066
...@@ -27,7 +27,7 @@ template <ck::index_t BlockSize, ...@@ -27,7 +27,7 @@ template <ck::index_t BlockSize,
ck::index_t ABlockTransferDstScalarPerVector_E2, ck::index_t ABlockTransferDstScalarPerVector_E2,
ck::index_t BThreadTransferSrcScalarPerVector_E2, ck::index_t BThreadTransferSrcScalarPerVector_E2,
ck::index_t CThreadTransferDstScalarPerVector_K, ck::index_t CThreadTransferDstScalarPerVector_K,
ck::ActivTypeEnum_t activ_type> ck::ActivTypeEnum activ_type>
struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_maxpool struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_maxpool
{ {
template <typename... Wei, template <typename... Wei,
...@@ -305,7 +305,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -305,7 +305,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
FloatAB, FloatAB,
FloatAcc, FloatAcc,
FloatC, FloatC,
InMemoryDataOperationEnum_t::Set, InMemoryDataOperationEnum::Set,
decltype(a_e0_e1_k_e2_grid_desc), decltype(a_e0_e1_k_e2_grid_desc),
decltype(b_e0_e1_n_ho_wo_e2_grid_desc), decltype(b_e0_e1_n_ho_wo_e2_grid_desc),
decltype(c_k_n_hop_wop_grid_desc), decltype(c_k_n_hop_wop_grid_desc),
......
...@@ -10,7 +10,7 @@ template <ck::index_t BlockSize, ...@@ -10,7 +10,7 @@ template <ck::index_t BlockSize,
typename FloatAB, typename FloatAB,
typename FloatAcc, typename FloatAcc,
typename FloatC, typename FloatC,
ck::InMemoryDataOperationEnum_t CGlobalMemoryDataOperation, ck::InMemoryDataOperationEnum CGlobalMemoryDataOperation,
typename AKMGridDesc, typename AKMGridDesc,
typename BKNGridDesc, typename BKNGridDesc,
typename CMNGridDesc, typename CMNGridDesc,
......
...@@ -10,7 +10,7 @@ template <ck::index_t BlockSize, ...@@ -10,7 +10,7 @@ template <ck::index_t BlockSize,
typename FloatAB, typename FloatAB,
typename FloatAcc, typename FloatAcc,
typename FloatC, typename FloatC,
ck::InMemoryDataOperationEnum_t CGlobalMemoryDataOperation, ck::InMemoryDataOperationEnum CGlobalMemoryDataOperation,
typename AK0MK1GridDesc, typename AK0MK1GridDesc,
typename BK0NK1GridDesc, typename BK0NK1GridDesc,
typename CMNGridDesc, typename CMNGridDesc,
......
...@@ -11,7 +11,7 @@ template <ck::index_t BlockSize, ...@@ -11,7 +11,7 @@ template <ck::index_t BlockSize,
typename FloatAB, typename FloatAB,
typename FloatAcc, typename FloatAcc,
typename FloatC, typename FloatC,
ck::InMemoryDataOperationEnum_t CGlobalMemoryDataOperation, ck::InMemoryDataOperationEnum CGlobalMemoryDataOperation,
typename AGridDesc_K0_M_K1, typename AGridDesc_K0_M_K1,
typename BGridDesc_K0_N_K, typename BGridDesc_K0_N_K,
typename CMNGridDesc, typename CMNGridDesc,
......
...@@ -10,7 +10,7 @@ template <ck::index_t BlockSize, ...@@ -10,7 +10,7 @@ template <ck::index_t BlockSize,
typename FloatAB, typename FloatAB,
typename FloatAcc, typename FloatAcc,
typename FloatC, typename FloatC,
ck::InMemoryDataOperationEnum_t CGlobalMemoryDataOperation, ck::InMemoryDataOperationEnum CGlobalMemoryDataOperation,
typename ABK0MK1GridDesc, typename ABK0MK1GridDesc,
typename BBK0NK1GridDesc, typename BBK0NK1GridDesc,
typename CMNGridDesc, typename CMNGridDesc,
......
...@@ -17,7 +17,7 @@ template <typename InDataType, ...@@ -17,7 +17,7 @@ template <typename InDataType,
typename InElementwiseOperation, typename InElementwiseOperation,
typename WeiElementwiseOperation, typename WeiElementwiseOperation,
typename OutElementwiseOperation> typename OutElementwiseOperation>
struct ReferenceConvWrw : public device::BaseOperator struct ReferenceConvBwdWeight : public device::BaseOperator
{ {
// Argument // Argument
struct Argument : public device::BaseArgument struct Argument : public device::BaseArgument
...@@ -62,7 +62,7 @@ struct ReferenceConvWrw : public device::BaseOperator ...@@ -62,7 +62,7 @@ struct ReferenceConvWrw : public device::BaseOperator
// Invoker // Invoker
struct Invoker : public device::BaseInvoker struct Invoker : public device::BaseInvoker
{ {
using Argument = ReferenceConvWrw::Argument; using Argument = ReferenceConvBwdWeight::Argument;
float Run(const Argument& arg) float Run(const Argument& arg)
{ {
...@@ -163,7 +163,7 @@ struct ReferenceConvWrw : public device::BaseOperator ...@@ -163,7 +163,7 @@ struct ReferenceConvWrw : public device::BaseOperator
auto str = std::stringstream(); auto str = std::stringstream();
// clang-format off // clang-format off
str << "ReferenceConvFwd" str << "ReferenceConvBwdWeight"
<< std::endl; << std::endl;
// clang-format on // clang-format on
......
...@@ -19,7 +19,7 @@ template <typename InDataType, ...@@ -19,7 +19,7 @@ template <typename InDataType,
typename WeiElementwiseOperation, typename WeiElementwiseOperation,
typename OutElementwiseOperation, typename OutElementwiseOperation,
ck::index_t NumDimSpatial = 2, ck::index_t NumDimSpatial = 2,
typename std::enable_if<NumDimSpatial >= 1 && NumDimSpatial <= 3, bool>::type = false> typename ck::enable_if<NumDimSpatial >= 1 && NumDimSpatial <= 3, bool>::type = false>
struct ReferenceConvBwdData : public device::BaseOperator struct ReferenceConvBwdData : public device::BaseOperator
{ {
// Argument // Argument
...@@ -71,7 +71,7 @@ struct ReferenceConvBwdData : public device::BaseOperator ...@@ -71,7 +71,7 @@ struct ReferenceConvBwdData : public device::BaseOperator
{ {
if constexpr(NumDimSpatial == 1) if constexpr(NumDimSpatial == 1)
{ {
auto f_nchw = [&](auto n, auto c, auto wi) { auto f_ncw = [&](auto n, auto c, auto wi) {
std::size_t K = arg.weight_.mDesc.GetLengths()[0]; std::size_t K = arg.weight_.mDesc.GetLengths()[0];
std::size_t X = arg.weight_.mDesc.GetLengths()[2]; std::size_t X = arg.weight_.mDesc.GetLengths()[2];
std::size_t Wo = arg.output_.mDesc.GetLengths()[2]; std::size_t Wo = arg.output_.mDesc.GetLengths()[2];
...@@ -108,7 +108,7 @@ struct ReferenceConvBwdData : public device::BaseOperator ...@@ -108,7 +108,7 @@ struct ReferenceConvBwdData : public device::BaseOperator
arg.input_(n, c, wi) = ck::type_convert<InDataType>(v_in); arg.input_(n, c, wi) = ck::type_convert<InDataType>(v_in);
}; };
make_ParallelTensorFunctor(f_nchw, make_ParallelTensorFunctor(f_ncw,
arg.input_.mDesc.GetLengths()[0], arg.input_.mDesc.GetLengths()[0],
arg.input_.mDesc.GetLengths()[1], arg.input_.mDesc.GetLengths()[1],
arg.input_.mDesc.GetLengths()[2])( arg.input_.mDesc.GetLengths()[2])(
...@@ -182,7 +182,7 @@ struct ReferenceConvBwdData : public device::BaseOperator ...@@ -182,7 +182,7 @@ struct ReferenceConvBwdData : public device::BaseOperator
} }
else if constexpr(NumDimSpatial == 3) else if constexpr(NumDimSpatial == 3)
{ {
auto f_nchw = [&](auto n, auto c, auto di, auto hi, auto wi) { auto f_ncdhw = [&](auto n, auto c, auto di, auto hi, auto wi) {
std::size_t K = arg.weight_.mDesc.GetLengths()[0]; std::size_t K = arg.weight_.mDesc.GetLengths()[0];
std::size_t Z = arg.weight_.mDesc.GetLengths()[2]; std::size_t Z = arg.weight_.mDesc.GetLengths()[2];
std::size_t Y = arg.weight_.mDesc.GetLengths()[3]; std::size_t Y = arg.weight_.mDesc.GetLengths()[3];
...@@ -252,7 +252,7 @@ struct ReferenceConvBwdData : public device::BaseOperator ...@@ -252,7 +252,7 @@ struct ReferenceConvBwdData : public device::BaseOperator
arg.input_(n, c, di, hi, wi) = ck::type_convert<InDataType>(v_in); arg.input_(n, c, di, hi, wi) = ck::type_convert<InDataType>(v_in);
}; };
make_ParallelTensorFunctor(f_nchw, make_ParallelTensorFunctor(f_ncdhw,
arg.input_.mDesc.GetLengths()[0], arg.input_.mDesc.GetLengths()[0],
arg.input_.mDesc.GetLengths()[1], arg.input_.mDesc.GetLengths()[1],
arg.input_.mDesc.GetLengths()[2], arg.input_.mDesc.GetLengths()[2],
......
...@@ -47,7 +47,7 @@ using reduce_configuration_2_instances_blockwise = std::tuple< ...@@ -47,7 +47,7 @@ using reduce_configuration_2_instances_blockwise = std::tuple<
>; >;
#endif #endif
template <typename AccDataType, ReduceTensorOp_t ReduceOpId> template <typename AccDataType, ReduceTensorOp ReduceOpId>
using deviceReduceBlockWisePtrType = DeviceReducePtr< using deviceReduceBlockWisePtrType = DeviceReducePtr<
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation, typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation,
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation>; typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation>;
...@@ -57,9 +57,9 @@ template <typename InDataType, ...@@ -57,9 +57,9 @@ template <typename InDataType,
typename OutDataType, typename OutDataType,
int Rank, int Rank,
int NumReduceDim, int NumReduceDim,
ReduceTensorOp_t ReduceOpId, ReduceTensorOp ReduceOpId,
NanPropagation_t NanOpt, NanPropagation NanOpt,
ReduceTensorIndices_t IndicesOpt> ReduceTensorIndices IndicesOpt>
void add_device_reduce_instance_blockwise( void add_device_reduce_instance_blockwise(
std::vector<deviceReduceBlockWisePtrType<AccDataType, ReduceOpId>>& device_op_instances) std::vector<deviceReduceBlockWisePtrType<AccDataType, ReduceOpId>>& device_op_instances)
{ {
...@@ -71,11 +71,11 @@ void add_device_reduce_instance_blockwise( ...@@ -71,11 +71,11 @@ void add_device_reduce_instance_blockwise(
AccElementwiseOperation; AccElementwiseOperation;
constexpr bool Indexable = constexpr bool Indexable =
(ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX || (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
ReduceOpId == ReduceTensorOp_t::AMAX); ReduceOpId == ReduceTensorOp::AMAX);
constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES); constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES);
constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true; constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;
static_for<0, std::tuple_size<reduce_configuration_1_instances>::value, 1>{}([&](auto i) { static_for<0, std::tuple_size<reduce_configuration_1_instances>::value, 1>{}([&](auto i) {
using cfg1 = using cfg1 =
...@@ -128,9 +128,9 @@ void add_device_reduce_instance_blockwise( ...@@ -128,9 +128,9 @@ void add_device_reduce_instance_blockwise(
ADD_BLOCKWISE_INST_BY_TYPE(inT, \ ADD_BLOCKWISE_INST_BY_TYPE(inT, \
compT, \ compT, \
outT, \ outT, \
static_cast<ReduceTensorOp_t>(ReduceOpId), \ static_cast<ReduceTensorOp>(ReduceOpId), \
static_cast<NanPropagation_t>(NanOpt), \ static_cast<NanPropagation>(NanOpt), \
static_cast<ReduceTensorIndices_t>(IndicesOpt), \ static_cast<ReduceTensorIndices>(IndicesOpt), \
Rank, \ Rank, \
NumReduceDim) NumReduceDim)
...@@ -155,9 +155,9 @@ void add_device_reduce_instance_blockwise( ...@@ -155,9 +155,9 @@ void add_device_reduce_instance_blockwise(
ADD_BLOCKWISE_INST_REF_BY_TYPE(inT, \ ADD_BLOCKWISE_INST_REF_BY_TYPE(inT, \
compT, \ compT, \
outT, \ outT, \
static_cast<ReduceTensorOp_t>(ReduceOpId), \ static_cast<ReduceTensorOp>(ReduceOpId), \
static_cast<NanPropagation_t>(NanOpt), \ static_cast<NanPropagation>(NanOpt), \
static_cast<ReduceTensorIndices_t>(IndicesOpt), \ static_cast<ReduceTensorIndices>(IndicesOpt), \
Rank, \ Rank, \
NumReduceDim) NumReduceDim)
......
...@@ -34,7 +34,7 @@ using reduce_configuration_2_instances_blockwise_second_call = std::tuple< ...@@ -34,7 +34,7 @@ using reduce_configuration_2_instances_blockwise_second_call = std::tuple<
>; >;
#endif #endif
template <typename AccDataType, ReduceTensorOp_t ReduceOpId> template <typename AccDataType, ReduceTensorOp ReduceOpId>
using deviceReduceBlockWiseSecondCallPtrType = DeviceReducePtr< using deviceReduceBlockWiseSecondCallPtrType = DeviceReducePtr<
typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::InElementwiseOperation, typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::InElementwiseOperation,
typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::AccElementwiseOperation>; typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::AccElementwiseOperation>;
...@@ -44,9 +44,9 @@ template <typename InDataType, ...@@ -44,9 +44,9 @@ template <typename InDataType,
typename OutDataType, typename OutDataType,
int Rank, int Rank,
int NumReduceDim, int NumReduceDim,
ReduceTensorOp_t ReduceOpId, ReduceTensorOp ReduceOpId,
NanPropagation_t NanOpt, NanPropagation NanOpt,
ReduceTensorIndices_t IndicesOpt> ReduceTensorIndices IndicesOpt>
void add_device_reduce_instance_blockwise_second_call( void add_device_reduce_instance_blockwise_second_call(
std::vector<deviceReduceBlockWiseSecondCallPtrType<AccDataType, ReduceOpId>>& std::vector<deviceReduceBlockWiseSecondCallPtrType<AccDataType, ReduceOpId>>&
device_op_instances) device_op_instances)
...@@ -60,11 +60,11 @@ void add_device_reduce_instance_blockwise_second_call( ...@@ -60,11 +60,11 @@ void add_device_reduce_instance_blockwise_second_call(
AccElementwiseOperation; AccElementwiseOperation;
constexpr bool Indexable = constexpr bool Indexable =
(ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX || (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
ReduceOpId == ReduceTensorOp_t::AMAX); ReduceOpId == ReduceTensorOp::AMAX);
constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES); constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES);
constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true; constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;
static_assert(std::is_same<InDataType, AccDataType>::value, static_assert(std::is_same<InDataType, AccDataType>::value,
"InDataType and AccDataType should be the same to use " "InDataType and AccDataType should be the same to use "
...@@ -122,9 +122,9 @@ void add_device_reduce_instance_blockwise_second_call( ...@@ -122,9 +122,9 @@ void add_device_reduce_instance_blockwise_second_call(
ADD_BLOCKWISE_SECOND_CALL_INST_BY_TYPE(inT, \ ADD_BLOCKWISE_SECOND_CALL_INST_BY_TYPE(inT, \
compT, \ compT, \
outT, \ outT, \
static_cast<ReduceTensorOp_t>(ReduceOpId), \ static_cast<ReduceTensorOp>(ReduceOpId), \
static_cast<NanPropagation_t>(NanOpt), \ static_cast<NanPropagation>(NanOpt), \
static_cast<ReduceTensorIndices_t>(IndicesOpt), \ static_cast<ReduceTensorIndices>(IndicesOpt), \
Rank, \ Rank, \
NumReduceDim) NumReduceDim)
...@@ -150,9 +150,9 @@ void add_device_reduce_instance_blockwise_second_call( ...@@ -150,9 +150,9 @@ void add_device_reduce_instance_blockwise_second_call(
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_TYPE(inT, \ ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_TYPE(inT, \
compT, \ compT, \
outT, \ outT, \
static_cast<ReduceTensorOp_t>(ReduceOpId), \ static_cast<ReduceTensorOp>(ReduceOpId), \
static_cast<NanPropagation_t>(NanOpt), \ static_cast<NanPropagation>(NanOpt), \
static_cast<ReduceTensorIndices_t>(IndicesOpt), \ static_cast<ReduceTensorIndices>(IndicesOpt), \
Rank, \ Rank, \
NumReduceDim) NumReduceDim)
......
...@@ -47,7 +47,7 @@ using reduce_configuration_2_instances_multiblock_atomic_add = std::tuple< ...@@ -47,7 +47,7 @@ using reduce_configuration_2_instances_multiblock_atomic_add = std::tuple<
>; >;
#endif #endif
template <typename AccDataType, ReduceTensorOp_t ReduceOperation> template <typename AccDataType, ReduceTensorOp ReduceOperation>
using deviceReduceMultiBlockAtomicAddPtrType = using deviceReduceMultiBlockAtomicAddPtrType =
DeviceReducePtr<typename reduce_unary_operator<AccDataType, ReduceOperation, true, true>:: DeviceReducePtr<typename reduce_unary_operator<AccDataType, ReduceOperation, true, true>::
InElementwiseOperation, InElementwiseOperation,
...@@ -59,9 +59,9 @@ template <typename InDataType, ...@@ -59,9 +59,9 @@ template <typename InDataType,
typename OutDataType, typename OutDataType,
int Rank, int Rank,
int NumReduceDim, int NumReduceDim,
ReduceTensorOp_t ReduceOpId, ReduceTensorOp ReduceOpId,
NanPropagation_t NanOpt, NanPropagation NanOpt,
ReduceTensorIndices_t IndicesOpt> ReduceTensorIndices IndicesOpt>
void add_device_reduce_instance_multiblock_atomic_add( void add_device_reduce_instance_multiblock_atomic_add(
std::vector<deviceReduceMultiBlockAtomicAddPtrType<AccDataType, ReduceOpId>>& std::vector<deviceReduceMultiBlockAtomicAddPtrType<AccDataType, ReduceOpId>>&
device_op_instances) device_op_instances)
...@@ -74,18 +74,18 @@ void add_device_reduce_instance_multiblock_atomic_add( ...@@ -74,18 +74,18 @@ void add_device_reduce_instance_multiblock_atomic_add(
AccElementwiseOperation; AccElementwiseOperation;
constexpr bool Indexable = constexpr bool Indexable =
(ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX || (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
ReduceOpId == ReduceTensorOp_t::AMAX); ReduceOpId == ReduceTensorOp::AMAX);
constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES); constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES);
constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true; constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;
static_assert(IndicesOpt == ReduceTensorIndices_t::NO_INDICES, static_assert(IndicesOpt == ReduceTensorIndices::NO_INDICES,
"AtomicAdd can only be used with reduction operations without indices!"); "AtomicAdd can only be used with reduction operations without indices!");
constexpr bool op_acceptable = constexpr bool op_acceptable =
(ReduceOpId == ReduceTensorOp_t::ADD || ReduceOpId == ReduceTensorOp_t::MUL || (ReduceOpId == ReduceTensorOp::ADD || ReduceOpId == ReduceTensorOp::MUL ||
ReduceOpId == ReduceTensorOp_t::AVG || ReduceOpId == ReduceTensorOp_t::NORM1); ReduceOpId == ReduceTensorOp::AVG || ReduceOpId == ReduceTensorOp::NORM1);
constexpr bool out_type_acceptable = constexpr bool out_type_acceptable =
(std::is_same<OutDataType, float>::value || std::is_same<OutDataType, double>::value); (std::is_same<OutDataType, float>::value || std::is_same<OutDataType, double>::value);
...@@ -149,9 +149,9 @@ void add_device_reduce_instance_multiblock_atomic_add( ...@@ -149,9 +149,9 @@ void add_device_reduce_instance_multiblock_atomic_add(
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_TYPE(inT, \ ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_TYPE(inT, \
compT, \ compT, \
outT, \ outT, \
static_cast<ReduceTensorOp_t>(ReduceOpId), \ static_cast<ReduceTensorOp>(ReduceOpId), \
static_cast<NanPropagation_t>(NanOpt), \ static_cast<NanPropagation>(NanOpt), \
static_cast<ReduceTensorIndices_t>(IndicesOpt), \ static_cast<ReduceTensorIndices>(IndicesOpt), \
Rank, \ Rank, \
NumReduceDim) NumReduceDim)
...@@ -176,9 +176,9 @@ void add_device_reduce_instance_multiblock_atomic_add( ...@@ -176,9 +176,9 @@ void add_device_reduce_instance_multiblock_atomic_add(
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE(inT, \ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE(inT, \
compT, \ compT, \
outT, \ outT, \
static_cast<ReduceTensorOp_t>(ReduceOpId), \ static_cast<ReduceTensorOp>(ReduceOpId), \
static_cast<NanPropagation_t>(NanOpt), \ static_cast<NanPropagation>(NanOpt), \
static_cast<ReduceTensorIndices_t>(IndicesOpt), \ static_cast<ReduceTensorIndices>(IndicesOpt), \
Rank, \ Rank, \
NumReduceDim) NumReduceDim)
......
...@@ -46,7 +46,7 @@ using reduce_configuration_2_instances_multiblock_partial_reduce = std::tuple< ...@@ -46,7 +46,7 @@ using reduce_configuration_2_instances_multiblock_partial_reduce = std::tuple<
>; >;
#endif #endif
template <typename AccDataType, ReduceTensorOp_t ReduceOpId> template <typename AccDataType, ReduceTensorOp ReduceOpId>
using deviceReduceMultiBlockPartialReducePtrType = DeviceReducePtr< using deviceReduceMultiBlockPartialReducePtrType = DeviceReducePtr<
typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::InElementwiseOperation, typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::InElementwiseOperation,
typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::AccElementwiseOperation>; typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::AccElementwiseOperation>;
...@@ -56,9 +56,9 @@ template <typename InDataType, ...@@ -56,9 +56,9 @@ template <typename InDataType,
typename OutDataType, typename OutDataType,
int Rank, int Rank,
int NumReduceDim, int NumReduceDim,
ReduceTensorOp_t ReduceOpId, ReduceTensorOp ReduceOpId,
NanPropagation_t NanOpt, NanPropagation NanOpt,
ReduceTensorIndices_t IndicesOpt> ReduceTensorIndices IndicesOpt>
void add_device_reduce_instance_multiblock_partial_reduce( void add_device_reduce_instance_multiblock_partial_reduce(
std::vector<deviceReduceMultiBlockPartialReducePtrType<AccDataType, ReduceOpId>>& std::vector<deviceReduceMultiBlockPartialReducePtrType<AccDataType, ReduceOpId>>&
device_op_instances) device_op_instances)
...@@ -72,11 +72,11 @@ void add_device_reduce_instance_multiblock_partial_reduce( ...@@ -72,11 +72,11 @@ void add_device_reduce_instance_multiblock_partial_reduce(
AccElementwiseOperation; AccElementwiseOperation;
constexpr bool Indexable = constexpr bool Indexable =
(ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX || (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
ReduceOpId == ReduceTensorOp_t::AMAX); ReduceOpId == ReduceTensorOp::AMAX);
constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES); constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES);
constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true; constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;
static_for<0, std::tuple_size<reduce_configuration_1_instances>::value, 1>{}([&](auto i) { static_for<0, std::tuple_size<reduce_configuration_1_instances>::value, 1>{}([&](auto i) {
using cfg1 = using cfg1 =
...@@ -131,9 +131,9 @@ void add_device_reduce_instance_multiblock_partial_reduce( ...@@ -131,9 +131,9 @@ void add_device_reduce_instance_multiblock_partial_reduce(
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_TYPE(inT, \ ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_TYPE(inT, \
compT, \ compT, \
outT, \ outT, \
static_cast<ReduceTensorOp_t>(ReduceOpId), \ static_cast<ReduceTensorOp>(ReduceOpId), \
static_cast<NanPropagation_t>(NanOpt), \ static_cast<NanPropagation>(NanOpt), \
static_cast<ReduceTensorIndices_t>(IndicesOpt), \ static_cast<ReduceTensorIndices>(IndicesOpt), \
Rank, \ Rank, \
NumReduceDim) NumReduceDim)
...@@ -159,9 +159,9 @@ void add_device_reduce_instance_multiblock_partial_reduce( ...@@ -159,9 +159,9 @@ void add_device_reduce_instance_multiblock_partial_reduce(
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_TYPE(inT, \ ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_TYPE(inT, \
compT, \ compT, \
outT, \ outT, \
static_cast<ReduceTensorOp_t>(ReduceOpId), \ static_cast<ReduceTensorOp>(ReduceOpId), \
static_cast<NanPropagation_t>(NanOpt), \ static_cast<NanPropagation>(NanOpt), \
static_cast<ReduceTensorIndices_t>(IndicesOpt), \ static_cast<ReduceTensorIndices>(IndicesOpt), \
Rank, \ Rank, \
NumReduceDim) NumReduceDim)
......
...@@ -47,7 +47,7 @@ using reduce_configuration_2_instances_threadwise = std::tuple< ...@@ -47,7 +47,7 @@ using reduce_configuration_2_instances_threadwise = std::tuple<
>; >;
#endif #endif
template <typename AccDataType, ReduceTensorOp_t ReduceOpId> template <typename AccDataType, ReduceTensorOp ReduceOpId>
using deviceReduceThreadWisePtrType = DeviceReducePtr< using deviceReduceThreadWisePtrType = DeviceReducePtr<
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation, typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation,
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation>; typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation>;
...@@ -57,9 +57,9 @@ template <typename InDataType, ...@@ -57,9 +57,9 @@ template <typename InDataType,
typename OutDataType, typename OutDataType,
int Rank, int Rank,
int NumReduceDim, int NumReduceDim,
ReduceTensorOp_t ReduceOpId, ReduceTensorOp ReduceOpId,
NanPropagation_t NanOpt, NanPropagation NanOpt,
ReduceTensorIndices_t IndicesOpt> ReduceTensorIndices IndicesOpt>
void add_device_reduce_instance_threadwise( void add_device_reduce_instance_threadwise(
std::vector<deviceReduceThreadWisePtrType<AccDataType, ReduceOpId>>& device_op_instances) std::vector<deviceReduceThreadWisePtrType<AccDataType, ReduceOpId>>& device_op_instances)
{ {
...@@ -71,11 +71,11 @@ void add_device_reduce_instance_threadwise( ...@@ -71,11 +71,11 @@ void add_device_reduce_instance_threadwise(
AccElementwiseOperation; AccElementwiseOperation;
constexpr bool Indexable = constexpr bool Indexable =
(ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX || (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
ReduceOpId == ReduceTensorOp_t::AMAX); ReduceOpId == ReduceTensorOp::AMAX);
constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES); constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES);
constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true; constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;
using cfg1 = ReductionConfiguration_1<256, 256, 1>; using cfg1 = ReductionConfiguration_1<256, 256, 1>;
...@@ -124,9 +124,9 @@ void add_device_reduce_instance_threadwise( ...@@ -124,9 +124,9 @@ void add_device_reduce_instance_threadwise(
ADD_THREADWISE_INST_BY_TYPE(inT, \ ADD_THREADWISE_INST_BY_TYPE(inT, \
compT, \ compT, \
outT, \ outT, \
static_cast<ReduceTensorOp_t>(ReduceOpId), \ static_cast<ReduceTensorOp>(ReduceOpId), \
static_cast<NanPropagation_t>(NanOpt), \ static_cast<NanPropagation>(NanOpt), \
static_cast<ReduceTensorIndices_t>(IndicesOpt), \ static_cast<ReduceTensorIndices>(IndicesOpt), \
Rank, \ Rank, \
NumReduceDim) NumReduceDim)
...@@ -151,9 +151,9 @@ void add_device_reduce_instance_threadwise( ...@@ -151,9 +151,9 @@ void add_device_reduce_instance_threadwise(
ADD_THREADWISE_INST_REF_BY_TYPE(inT, \ ADD_THREADWISE_INST_REF_BY_TYPE(inT, \
compT, \ compT, \
outT, \ outT, \
static_cast<ReduceTensorOp_t>(ReduceOpId), \ static_cast<ReduceTensorOp>(ReduceOpId), \
static_cast<NanPropagation_t>(NanOpt), \ static_cast<NanPropagation>(NanOpt), \
static_cast<ReduceTensorIndices_t>(IndicesOpt), \ static_cast<ReduceTensorIndices>(IndicesOpt), \
Rank, \ Rank, \
NumReduceDim) NumReduceDim)
......
#ifndef TEST_UTIL_HPP #ifndef CHECK_ERR_HPP
#define TEST_UTIL_HPP #define CHECK_ERR_HPP
#include <algorithm> #include <algorithm>
#include <cmath> #include <cmath>
#include <cstdlib> #include <cstdlib>
#include <half.hpp>
#include <iostream> #include <iostream>
#include <iomanip> #include <iomanip>
#include <iterator> #include <iterator>
...@@ -13,14 +14,15 @@ ...@@ -13,14 +14,15 @@
#include "data_type.hpp" #include "data_type.hpp"
namespace test { namespace ck {
namespace utils {
template <typename T> template <typename T>
typename std::enable_if<std::is_floating_point<T>::value && !std::is_same<T, ck::half_t>::value, typename std::enable_if<std::is_floating_point<T>::value && !std::is_same<T, half_t>::value,
bool>::type bool>::type
check_err(const std::vector<T>& out, check_err(const std::vector<T>& out,
const std::vector<T>& ref, const std::vector<T>& ref,
const std::string& msg, const std::string& msg = "Error: Incorrect results!",
double rtol = 1e-5, double rtol = 1e-5,
double atol = 1e-8) double atol = 1e-8)
{ {
...@@ -60,13 +62,12 @@ check_err(const std::vector<T>& out, ...@@ -60,13 +62,12 @@ check_err(const std::vector<T>& out,
} }
template <typename T> template <typename T>
typename std::enable_if<std::is_same<T, ck::bhalf_t>::value || std::is_same<T, ck::half_t>::value, typename std::enable_if<std::is_same<T, bhalf_t>::value, bool>::type
bool>::type
check_err(const std::vector<T>& out, check_err(const std::vector<T>& out,
const std::vector<T>& ref, const std::vector<T>& ref,
const std::string& msg, const std::string& msg = "Error: Incorrect results!",
double rtol = 1e-5, double rtol = 1e-3,
double atol = 1e-8) double atol = 1e-3)
{ {
if(out.size() != ref.size()) if(out.size() != ref.size())
{ {
...@@ -79,11 +80,12 @@ check_err(const std::vector<T>& out, ...@@ -79,11 +80,12 @@ check_err(const std::vector<T>& out,
bool res{true}; bool res{true};
int err_count = 0; int err_count = 0;
double err = 0; double err = 0;
double max_err = ck::type_convert<float>(ck::NumericLimits<T>::Min()); // TODO: This is a hack. We should have proper specialization for bhalf_t data type.
double max_err = std::numeric_limits<float>::min();
for(std::size_t i = 0; i < ref.size(); ++i) for(std::size_t i = 0; i < ref.size(); ++i)
{ {
float o = ck::type_convert<float>(out[i]); double o = type_convert<float>(out[i]);
float r = ck::type_convert<float>(ref[i]); double r = type_convert<float>(ref[i]);
err = std::abs(o - r); err = std::abs(o - r);
if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r)) if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
{ {
...@@ -105,11 +107,14 @@ check_err(const std::vector<T>& out, ...@@ -105,11 +107,14 @@ check_err(const std::vector<T>& out,
return res; return res;
} }
bool check_err(const std::vector<ck::half_t>& out, template <typename T>
const std::vector<ck::half_t>& ref, typename std::enable_if<std::is_same<T, half_t>::value || std::is_same<T, half_float::half>::value,
const std::string& msg, bool>::type
ck::half_t rtol = static_cast<ck::half_t>(1e-3f), check_err(const std::vector<T>& out,
ck::half_t atol = static_cast<ck::half_t>(1e-3f)) const std::vector<T>& ref,
const std::string& msg = "Error: Incorrect results!",
double rtol = 1e-3,
double atol = 1e-3)
{ {
if(out.size() != ref.size()) if(out.size() != ref.size())
{ {
...@@ -122,20 +127,20 @@ bool check_err(const std::vector<ck::half_t>& out, ...@@ -122,20 +127,20 @@ bool check_err(const std::vector<ck::half_t>& out,
bool res{true}; bool res{true};
int err_count = 0; int err_count = 0;
double err = 0; double err = 0;
double max_err = std::numeric_limits<ck::half_t>::min(); double max_err = std::numeric_limits<T>::min();
for(std::size_t i = 0; i < ref.size(); ++i) for(std::size_t i = 0; i < ref.size(); ++i)
{ {
double out_ = double(out[i]); double o = type_convert<float>(out[i]);
double ref_ = double(ref[i]); double r = type_convert<float>(ref[i]);
err = std::abs(out_ - ref_); err = std::abs(o - r);
if(err > atol + rtol * std::abs(ref_) || !std::isfinite(out_) || !std::isfinite(ref_)) if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
{ {
max_err = err > max_err ? err : max_err; max_err = err > max_err ? err : max_err;
err_count++; err_count++;
if(err_count < 5) if(err_count < 5)
{ {
std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref[" std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
<< i << "]: " << out_ << "!=" << ref_ << std::endl << i << "]: " << o << " != " << r << std::endl
<< msg << std::endl; << msg << std::endl;
} }
res = false; res = false;
...@@ -149,11 +154,10 @@ bool check_err(const std::vector<ck::half_t>& out, ...@@ -149,11 +154,10 @@ bool check_err(const std::vector<ck::half_t>& out,
} }
template <typename T> template <typename T>
typename std::enable_if<std::is_integral<T>::value && !std::is_same<T, ck::bhalf_t>::value, typename std::enable_if<std::is_integral<T>::value && !std::is_same<T, bhalf_t>::value, bool>::type
bool>::type
check_err(const std::vector<T>& out, check_err(const std::vector<T>& out,
const std::vector<T>& ref, const std::vector<T>& ref,
const std::string& msg, const std::string& msg = "Error: Incorrect results!",
double = 0, double = 0,
double = 0) double = 0)
{ {
...@@ -178,7 +182,8 @@ check_err(const std::vector<T>& out, ...@@ -178,7 +182,8 @@ check_err(const std::vector<T>& out,
return true; return true;
} }
} // namespace test } // namespace utils
} // namespace ck
template <typename T> template <typename T>
std::ostream& operator<<(std::ostream& os, const std::vector<T>& v) std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
......
#ifndef CONV_UTILS_HPP #ifndef CONV_FWD_UTIL_HPP
#define CONV_UTILS_HPP #define CONV_FWD_UTIL_HPP
#include <algorithm>
#include <cstdlib> #include <cstdlib>
#include <functional> #include <functional>
#include <iterator> #include <iterator>
#include <numeric> #include <numeric>
#include <sstream> #include <sstream>
#include <random>
#include <tuple>
#include <type_traits> #include <type_traits>
#include <vector> #include <vector>
#include "check_err.hpp"
#include "config.hpp" #include "config.hpp"
#include "device.hpp"
#include "device_conv_fwd.hpp"
#include "device_tensor.hpp"
#include "element_wise_operation.hpp"
#include "host_tensor.hpp" #include "host_tensor.hpp"
#include "reference_conv_fwd.hpp"
#include "tensor_layout.hpp" #include "tensor_layout.hpp"
namespace ck { namespace ck {
namespace conv_util { namespace utils {
namespace conv {
using DeviceConvFwdNoOpPtr =
ck::tensor_operation::device::DeviceConvFwdPtr<ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough>;
/** /**
* @brief Calculate number of FLOPs for Convolution * @brief Calculate number of FLOPs for Convolution
...@@ -28,7 +43,7 @@ namespace conv_util { ...@@ -28,7 +43,7 @@ namespace conv_util {
* *
* @return The number of flops. * @return The number of flops.
*/ */
std::size_t GetFlops(ck::index_t N, std::size_t get_flops(ck::index_t N,
ck::index_t C, ck::index_t C,
ck::index_t K, ck::index_t K,
const std::vector<ck::index_t>& filter_spatial_lengths, const std::vector<ck::index_t>& filter_spatial_lengths,
...@@ -66,7 +81,7 @@ std::size_t GetFlops(ck::index_t N, ...@@ -66,7 +81,7 @@ std::size_t GetFlops(ck::index_t N,
template <typename InDataType = float, template <typename InDataType = float,
typename WeiDataType = InDataType, typename WeiDataType = InDataType,
typename OutDataType = InDataType> typename OutDataType = InDataType>
std::size_t GetBtype(ck::index_t N, std::size_t get_btype(ck::index_t N,
ck::index_t C, ck::index_t C,
ck::index_t K, ck::index_t K,
const std::vector<ck::index_t>& input_spatial_lengths, const std::vector<ck::index_t>& input_spatial_lengths,
...@@ -108,27 +123,38 @@ struct ConvParams ...@@ -108,27 +123,38 @@ struct ConvParams
input_right_pads(2, 1) input_right_pads(2, 1)
{ {
} }
ConvParams(ck::index_t n_dim_spatial,
ck::index_t n, ConvParams(ck::index_t n_dim,
ck::index_t k, ck::index_t n_batch,
ck::index_t c, ck::index_t n_out_channels,
std::vector<ck::index_t> filter_lengths, ck::index_t n_in_channels,
std::vector<ck::index_t> input_lengths, const std::vector<ck::index_t>& filters_len,
std::vector<ck::index_t> conv_strides, const std::vector<ck::index_t>& input_len,
std::vector<ck::index_t> conv_dilations, const std::vector<ck::index_t>& strides,
std::vector<ck::index_t> left_pads, const std::vector<ck::index_t>& dilations,
std::vector<ck::index_t> right_pads) const std::vector<ck::index_t>& left_pads,
: num_dim_spatial(n_dim_spatial), const std::vector<ck::index_t>& right_pads)
N(n), : num_dim_spatial(n_dim),
K(k), N(n_batch),
C(c), K(n_out_channels),
filter_spatial_lengths(filter_lengths), C(n_in_channels),
input_spatial_lengths(input_lengths), filter_spatial_lengths(filters_len),
conv_filter_strides(conv_strides), input_spatial_lengths(input_len),
conv_filter_dilations(conv_dilations), conv_filter_strides(strides),
conv_filter_dilations(dilations),
input_left_pads(left_pads), input_left_pads(left_pads),
input_right_pads(right_pads) input_right_pads(right_pads)
{ {
if(filter_spatial_lengths.size() != num_dim_spatial ||
input_spatial_lengths.size() != num_dim_spatial ||
conv_filter_strides.size() != num_dim_spatial ||
conv_filter_dilations.size() != num_dim_spatial ||
input_left_pads.size() != num_dim_spatial || input_right_pads.size() != num_dim_spatial)
{
throw(std::runtime_error(
"ConvParams::GetOutputSpatialLengths: "
"parameter size is different from number of declared dimensions!"));
}
} }
ck::index_t num_dim_spatial; ck::index_t num_dim_spatial;
...@@ -147,6 +173,17 @@ struct ConvParams ...@@ -147,6 +173,17 @@ struct ConvParams
std::vector<ck::index_t> GetOutputSpatialLengths() const std::vector<ck::index_t> GetOutputSpatialLengths() const
{ {
if(filter_spatial_lengths.size() != num_dim_spatial ||
input_spatial_lengths.size() != num_dim_spatial ||
conv_filter_strides.size() != num_dim_spatial ||
conv_filter_dilations.size() != num_dim_spatial ||
input_left_pads.size() != num_dim_spatial || input_right_pads.size() != num_dim_spatial)
{
throw(std::runtime_error(
"ConvParams::GetOutputSpatialLengths: "
"parameter size is different from number of declared dimensions!"));
}
std::vector<ck::index_t> out_spatial_len(num_dim_spatial, 0); std::vector<ck::index_t> out_spatial_len(num_dim_spatial, 0);
for(ck::index_t i = 0; i < num_dim_spatial; ++i) for(ck::index_t i = 0; i < num_dim_spatial; ++i)
{ {
...@@ -174,7 +211,7 @@ struct ConvParams ...@@ -174,7 +211,7 @@ struct ConvParams
* @return The host tensor descriptor object. * @return The host tensor descriptor object.
*/ */
template <typename TensorLayout> template <typename TensorLayout>
HostTensorDescriptor GetHostTensorDescriptor(const std::vector<std::size_t>& dims, HostTensorDescriptor get_host_tensor_descriptor(const std::vector<std::size_t>& dims,
const TensorLayout& layout) const TensorLayout& layout)
{ {
std::size_t C = dims[1]; std::size_t C = dims[1];
...@@ -228,7 +265,7 @@ HostTensorDescriptor GetHostTensorDescriptor(const std::vector<std::size_t>& dim ...@@ -228,7 +265,7 @@ HostTensorDescriptor GetHostTensorDescriptor(const std::vector<std::size_t>& dim
return HostTensorDescriptor( return HostTensorDescriptor(
dims, dims,
std::vector<std::size_t>{ std::vector<std::size_t>{
C * dims[2] * dims[3] * dims[4], 1, dims[3] * dims[4] * C, dims[4] * C, C}); C * dims[2] * dims[3] * dims[4], 1, C * dims[3] * dims[4], C * dims[4], C});
} }
std::stringstream err_msg; std::stringstream err_msg;
...@@ -236,7 +273,282 @@ HostTensorDescriptor GetHostTensorDescriptor(const std::vector<std::size_t>& dim ...@@ -236,7 +273,282 @@ HostTensorDescriptor GetHostTensorDescriptor(const std::vector<std::size_t>& dim
throw std::runtime_error(err_msg.str()); throw std::runtime_error(err_msg.str());
} }
} // namespace conv_util template <typename InDataType = float,
typename WeiDataType = float,
typename OutDataType = float,
typename InLayout = ck::tensor_layout::convolution::NHWC,
typename WeiLayout = ck::tensor_layout::convolution::KYXC,
typename OutLayout = ck::tensor_layout::convolution::NHWK>
auto get_host_tensors(const ConvParams& params, bool init = true)
{
std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N),
static_cast<std::size_t>(params.C)};
input_dims.insert(std::end(input_dims),
std::begin(params.input_spatial_lengths),
std::end(params.input_spatial_lengths));
std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K),
static_cast<std::size_t>(params.C)};
filter_dims.insert(std::end(filter_dims),
std::begin(params.filter_spatial_lengths),
std::end(params.filter_spatial_lengths));
const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N),
static_cast<std::size_t>(params.K)};
output_dims.insert(std::end(output_dims),
std::begin(output_spatial_lengths),
std::end(output_spatial_lengths));
Tensor<InDataType> input(ck::utils::conv::get_host_tensor_descriptor(input_dims, InLayout{}));
Tensor<WeiDataType> weights(
ck::utils::conv::get_host_tensor_descriptor(filter_dims, WeiLayout{}));
Tensor<OutDataType> host_output(
ck::utils::conv::get_host_tensor_descriptor(output_dims, OutLayout{}));
Tensor<OutDataType> device_output(
ck::utils::conv::get_host_tensor_descriptor(output_dims, OutLayout{}));
if(init)
{
std::mt19937 gen(11939);
if constexpr(std::is_same<InDataType, uint8_t>::value)
{
std::uniform_int_distribution<> dis(-5, 5);
std::generate(
input.begin(), input.end(), [&dis, &gen]() { return InDataType(dis(gen)); });
std::generate(
weights.begin(), weights.end(), [&dis, &gen]() { return WeiDataType(dis(gen)); });
}
else
{
std::uniform_real_distribution<> dis(0.f, 1.f);
std::generate(
input.begin(), input.end(), [&dis, &gen]() { return InDataType(dis(gen)); });
std::generate(
weights.begin(), weights.end(), [&dis, &gen]() { return WeiDataType(dis(gen)); });
}
std::fill(host_output.begin(), host_output.end(), OutDataType(0.f));
std::fill(device_output.begin(), device_output.end(), OutDataType(0.f));
}
return std::make_tuple(input, weights, host_output, device_output);
}
/**
 * @brief Build a HostTensorDescriptor for a convolution *output* tensor using the
 *        channel-last layout matching the number of spatial dimensions
 *        (1D: NWK, 2D: NHWK, 3D: NDHWK).
 *
 * @param dims            Output dimensions ordered as N, K, <spatial...>.
 * @param num_dim_spatial Number of spatial dimensions (1, 2 or 3).
 * @return The descriptor for the chosen layout.
 * @throws std::runtime_error for an unsupported spatial dimensionality.
 */
HostTensorDescriptor get_output_host_tensor_descriptor(const std::vector<std::size_t>& dims,
                                                       int num_dim_spatial = 2)
{
    namespace tl = ck::tensor_layout::convolution;

    if(num_dim_spatial == 3)
    {
        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NDHWK{});
    }
    if(num_dim_spatial == 2)
    {
        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NHWK{});
    }
    if(num_dim_spatial == 1)
    {
        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NWK{});
    }
    throw std::runtime_error("Unsupported number of spatial dimensions provided!");
}
/**
 * @brief Build a HostTensorDescriptor for a convolution *weights* tensor using the
 *        channel-last layout matching the number of spatial dimensions
 *        (1D: KXC, 2D: KYXC, 3D: KZYXC).
 *
 * @param dims            Filter dimensions ordered as K, C, <spatial...>.
 * @param num_dim_spatial Number of spatial dimensions (1, 2 or 3).
 * @return The descriptor for the chosen layout.
 * @throws std::runtime_error for an unsupported spatial dimensionality.
 */
HostTensorDescriptor get_filters_host_tensor_descriptor(const std::vector<std::size_t>& dims,
                                                        int num_dim_spatial = 2)
{
    namespace tl = ck::tensor_layout::convolution;

    if(num_dim_spatial == 3)
    {
        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::KZYXC{});
    }
    if(num_dim_spatial == 2)
    {
        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::KYXC{});
    }
    if(num_dim_spatial == 1)
    {
        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::KXC{});
    }
    throw std::runtime_error("Unsupported number of spatial dimensions provided!");
}
/**
 * @brief Build a HostTensorDescriptor for a convolution *input* tensor using the
 *        channel-last layout matching the number of spatial dimensions
 *        (1D: NWC, 2D: NHWC, 3D: NDHWC).
 *
 * @param dims            Input dimensions ordered as N, C, <spatial...>.
 * @param num_dim_spatial Number of spatial dimensions (1, 2 or 3).
 * @return The descriptor for the chosen layout.
 * @throws std::runtime_error for an unsupported spatial dimensionality.
 */
HostTensorDescriptor get_input_host_tensor_descriptor(const std::vector<std::size_t>& dims,
                                                      int num_dim_spatial = 2)
{
    namespace tl = ck::tensor_layout::convolution;

    if(num_dim_spatial == 3)
    {
        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NDHWC{});
    }
    if(num_dim_spatial == 2)
    {
        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NHWC{});
    }
    if(num_dim_spatial == 1)
    {
        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NWC{});
    }
    throw std::runtime_error("Unsupported number of spatial dimensions provided!");
}
/**
 * @brief Run the host-side reference convolution forward pass.
 *
 * Uses pass-through element-wise operations on all three tensors; @p output is
 * overwritten with the reference result.
 *
 * @tparam NDim   Number of spatial dimensions.
 * @param params  Convolution problem description (strides, dilations, pads).
 * @param input   Input activation tensor.
 * @param weights Filter tensor.
 * @param output  Destination tensor for the reference result.
 */
template <ck::index_t NDim,
          typename InDataType  = float,
          typename WeiDataType = float,
          typename OutDataType = float>
void run_reference_convolution_forward(const ConvParams& params,
                                       const Tensor<InDataType>& input,
                                       const Tensor<WeiDataType>& weights,
                                       Tensor<OutDataType>& output)
{
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;

    auto host_conv = ck::tensor_operation::host::ReferenceConvFwd<InDataType,
                                                                  WeiDataType,
                                                                  OutDataType,
                                                                  PassThrough,
                                                                  PassThrough,
                                                                  PassThrough,
                                                                  NDim>{};

    auto host_invoker = host_conv.MakeInvoker();
    auto host_arg     = host_conv.MakeArgument(input,
                                           weights,
                                           output,
                                           params.conv_filter_strides,
                                           params.conv_filter_dilations,
                                           params.input_left_pads,
                                           params.input_right_pads,
                                           PassThrough{},
                                           PassThrough{},
                                           PassThrough{});
    host_invoker.Run(host_arg);
}
template <ck::index_t NDim,
typename InDataType = float,
typename WeiDataType = float,
typename OutDataType = float,
template <ck::index_t, typename, typename, typename>
class DeviceConvNDFwdInstance>
void run_convolution_forward(const ConvParams& params,
const Tensor<InDataType>& input,
const Tensor<WeiDataType>& weights,
Tensor<OutDataType>& output)
{
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());
DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace());
DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpace());
in_device_buf.ToDevice(input.mData.data());
wei_device_buf.ToDevice(weights.mData.data());
const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
auto conv = DeviceConvNDFwdInstance<NDim, InDataType, WeiDataType, OutDataType>();
auto invoker = conv.MakeInvoker();
auto argument = conv.MakeArgument(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
params.N,
params.K,
params.C,
params.input_spatial_lengths,
params.filter_spatial_lengths,
output_spatial_lengths,
params.conv_filter_strides,
params.conv_filter_dilations,
params.input_left_pads,
params.input_right_pads,
PassThrough{},
PassThrough{},
PassThrough{});
if(!conv.IsSupportedArgument(argument))
{
throw std::runtime_error(
"Error! device_conv with the specified compilation parameters does "
"not support this Conv problem");
}
invoker.Run(argument);
out_device_buf.FromDevice(output.mData.data());
}
template <ck::index_t NDim,
typename InDataType = float,
typename WeiDataType = float,
typename OutDataType = float>
bool run_convolution_forward_instances(const ConvParams& params,
const std::vector<DeviceConvFwdNoOpPtr>& conv_ptrs,
const Tensor<InDataType>& input,
const Tensor<WeiDataType>& weights,
Tensor<OutDataType>& output,
const Tensor<OutDataType>& host_output)
{
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());
DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace());
DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpace());
in_device_buf.ToDevice(input.mData.data());
wei_device_buf.ToDevice(weights.mData.data());
const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
bool res{true};
for(auto& conv_ptr : conv_ptrs)
{
auto invoker = conv_ptr->MakeInvokerPointer();
auto argument = conv_ptr->MakeArgumentPointer(
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
params.N,
params.K,
params.C,
params.input_spatial_lengths,
params.filter_spatial_lengths,
output_spatial_lengths,
params.conv_filter_strides,
params.conv_filter_dilations,
params.input_left_pads,
params.input_right_pads,
PassThrough{},
PassThrough{},
PassThrough{});
if(conv_ptr->IsSupportedArgument(argument.get()))
{
float atol{1e-5f};
float rtol{1e-4f};
if constexpr(std::is_same_v<InDataType, ck::half_t>)
{
atol = 1e-4f;
rtol = 2.5e-3f;
}
invoker->Run(argument.get());
out_device_buf.FromDevice(output.mData.data());
res = res &&
ck::utils::check_err(
output.mData, host_output.mData, "Error: incorrect results!", atol, rtol);
hipGetErrorString(
hipMemset(out_device_buf.GetDeviceBuffer(), 0, out_device_buf.mMemSize));
}
}
return res;
}
} // namespace conv
} // namespace utils
} // namespace ck } // namespace ck
#endif #endif
...@@ -65,21 +65,10 @@ void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream ...@@ -65,21 +65,10 @@ void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream
} }
#if 1 #if 1
// FIXME: remove
float bf16_to_f32_(ck::bhalf_t src_val)
{
union
{
uint32_t int32;
float fp32;
} u = {uint32_t(src_val) << 16};
return u.fp32;
}
// FIXME: remove // FIXME: remove
void bf16_to_f32_(const Tensor<ck::bhalf_t>& src, Tensor<float>& dst) void bf16_to_f32_(const Tensor<ck::bhalf_t>& src, Tensor<float>& dst)
{ {
for(int i = 0; i < src.mData.size(); ++i) for(int i = 0; i < src.mData.size(); ++i)
dst.mData[i] = bf16_to_f32_(src.mData[i]); dst.mData[i] = ck::type_convert<float>(src.mData[i]);
} }
#endif #endif
...@@ -4,6 +4,8 @@ ...@@ -4,6 +4,8 @@
#include <cstdlib> #include <cstdlib>
#include <stdlib.h> #include <stdlib.h>
#include <half.hpp> #include <half.hpp>
#include "check_err.hpp"
#include "config.hpp" #include "config.hpp"
#include "debug.hpp" #include "debug.hpp"
#include "print.hpp" #include "print.hpp"
...@@ -39,7 +41,7 @@ void host_direct_convolution_add_nchwc(const Tensor<TIn>& in, ...@@ -39,7 +41,7 @@ void host_direct_convolution_add_nchwc(const Tensor<TIn>& in,
const ConvDilations& conv_dilations, const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads, const InLeftPads& in_left_pads,
const InRightPads&, const InRightPads&,
const ck::ActivTypeEnum_t activ_type) const ck::ActivTypeEnum activ_type)
{ {
using namespace ck; using namespace ck;
...@@ -117,7 +119,7 @@ int main(int argc, char* argv[]) ...@@ -117,7 +119,7 @@ int main(int argc, char* argv[])
exit(1); exit(1);
} }
constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu; constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1])); const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
const bool do_verification = std::stoi(argv[2]); const bool do_verification = std::stoi(argv[2]);
...@@ -167,7 +169,7 @@ int main(int argc, char* argv[]) ...@@ -167,7 +169,7 @@ int main(int argc, char* argv[])
const bool do_log = std::stoi(argv[4]); const bool do_log = std::stoi(argv[4]);
const int nrepeat = std::stoi(argv[5]); const int nrepeat = std::stoi(argv[5]);
constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu; constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;
#if 0 #if 0
constexpr auto N = Number<1>{}; constexpr auto N = Number<1>{};
...@@ -401,7 +403,7 @@ int main(int argc, char* argv[]) ...@@ -401,7 +403,7 @@ int main(int argc, char* argv[])
make_tuple(in_right_pad_h, in_right_pad_w), make_tuple(in_right_pad_h, in_right_pad_w),
activ_type); activ_type);
check_error(add_host, add_device); ck::utils::check_err(add_device.mData, add_host.mData);
if(do_log) if(do_log)
{ {
......
...@@ -4,6 +4,8 @@ ...@@ -4,6 +4,8 @@
#include <cstdlib> #include <cstdlib>
#include <stdlib.h> #include <stdlib.h>
#include <half.hpp> #include <half.hpp>
#include "check_err.hpp"
#include "config.hpp" #include "config.hpp"
#include "debug.hpp" #include "debug.hpp"
#include "print.hpp" #include "print.hpp"
...@@ -473,7 +475,7 @@ int main(int argc, char* argv[]) ...@@ -473,7 +475,7 @@ int main(int argc, char* argv[])
make_tuple(in_right_pad_h, in_right_pad_w), make_tuple(in_right_pad_h, in_right_pad_w),
layout); layout);
check_error(in_host, in_device); ck::utils::check_err(in_device.mData, in_host.mData);
if(do_log) if(do_log)
{ {
......
...@@ -4,6 +4,8 @@ ...@@ -4,6 +4,8 @@
#include <cstdlib> #include <cstdlib>
#include <stdlib.h> #include <stdlib.h>
#include <half.hpp> #include <half.hpp>
#include "check_err.hpp"
#include "config.hpp" #include "config.hpp"
#include "debug.hpp" #include "debug.hpp"
#include "print.hpp" #include "print.hpp"
...@@ -534,7 +536,7 @@ int main(int argc, char* argv[]) ...@@ -534,7 +536,7 @@ int main(int argc, char* argv[])
make_tuple(in_right_pad_h, in_right_pad_w), make_tuple(in_right_pad_h, in_right_pad_w),
layout); layout);
check_error(out_host, out_device); ck::utils::check_err(out_device.mData, out_host.mData);
if(do_log) if(do_log)
{ {
......
...@@ -4,6 +4,8 @@ ...@@ -4,6 +4,8 @@
#include <cstdlib> #include <cstdlib>
#include <stdlib.h> #include <stdlib.h>
#include <half.hpp> #include <half.hpp>
#include "check_err.hpp"
#include "config.hpp" #include "config.hpp"
#include "debug.hpp" #include "debug.hpp"
#include "print.hpp" #include "print.hpp"
...@@ -37,7 +39,7 @@ void host_direct_convolution_nchwc(const Tensor<TIn>& in, ...@@ -37,7 +39,7 @@ void host_direct_convolution_nchwc(const Tensor<TIn>& in,
const ConvDilations& conv_dilations, const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads, const InLeftPads& in_left_pads,
const InRightPads&, const InRightPads&,
const ck::ActivTypeEnum_t activ_type) const ck::ActivTypeEnum activ_type)
{ {
using namespace ck; using namespace ck;
...@@ -102,7 +104,7 @@ int main(int argc, char* argv[]) ...@@ -102,7 +104,7 @@ int main(int argc, char* argv[])
exit(1); exit(1);
} }
constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu; constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1])); const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
const bool do_verification = std::stoi(argv[2]); const bool do_verification = std::stoi(argv[2]);
...@@ -149,8 +151,8 @@ int main(int argc, char* argv[]) ...@@ -149,8 +151,8 @@ int main(int argc, char* argv[])
const bool do_log = std::stoi(argv[4]); const bool do_log = std::stoi(argv[4]);
const int nrepeat = std::stoi(argv[5]); const int nrepeat = std::stoi(argv[5]);
// constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::Sigmoid; // constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::Sigmoid;
constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu; constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;
#if 0 #if 0
constexpr auto N = Number<1>{}; constexpr auto N = Number<1>{};
...@@ -377,7 +379,7 @@ int main(int argc, char* argv[]) ...@@ -377,7 +379,7 @@ int main(int argc, char* argv[])
make_tuple(in_right_pad_h, in_right_pad_w), make_tuple(in_right_pad_h, in_right_pad_w),
activ_type); activ_type);
check_error(out_host, out_device); ck::utils::check_err(out_device.mData, out_host.mData);
if(do_log) if(do_log)
{ {
......
...@@ -4,6 +4,8 @@ ...@@ -4,6 +4,8 @@
#include <cstdlib> #include <cstdlib>
#include <stdlib.h> #include <stdlib.h>
#include <half.hpp> #include <half.hpp>
#include "check_err.hpp"
#include "config.hpp" #include "config.hpp"
#include "debug.hpp" #include "debug.hpp"
#include "print.hpp" #include "print.hpp"
...@@ -38,7 +40,7 @@ void host_direct_convolution_maxpool_nchwc(const Tensor<TIn>& in, ...@@ -38,7 +40,7 @@ void host_direct_convolution_maxpool_nchwc(const Tensor<TIn>& in,
const ConvDilations& conv_dilations, const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads, const InLeftPads& in_left_pads,
const InRightPads&, const InRightPads&,
const ck::ActivTypeEnum_t activ_type) const ck::ActivTypeEnum activ_type)
{ {
using namespace ck; using namespace ck;
...@@ -126,7 +128,7 @@ int main(int argc, char* argv[]) ...@@ -126,7 +128,7 @@ int main(int argc, char* argv[])
exit(1); exit(1);
} }
constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu; constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1])); const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
const bool do_verification = std::stoi(argv[2]); const bool do_verification = std::stoi(argv[2]);
...@@ -176,7 +178,7 @@ int main(int argc, char* argv[]) ...@@ -176,7 +178,7 @@ int main(int argc, char* argv[])
const bool do_log = std::stoi(argv[4]); const bool do_log = std::stoi(argv[4]);
const int nrepeat = std::stoi(argv[5]); const int nrepeat = std::stoi(argv[5]);
constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu; constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;
#if 1 #if 1
constexpr auto N = Number<1>{}; constexpr auto N = Number<1>{};
...@@ -397,8 +399,8 @@ int main(int argc, char* argv[]) ...@@ -397,8 +399,8 @@ int main(int argc, char* argv[])
make_tuple(in_right_pad_h, in_right_pad_w), make_tuple(in_right_pad_h, in_right_pad_w),
activ_type); activ_type);
check_error(out_host, out_device); ck::utils::check_err(out_device.mData, out_host.mData);
check_error(max_host, max_device); ck::utils::check_err(max_device.mData, max_host.mData);
if(do_log) if(do_log)
{ {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment