Commit b79df771 authored by carlushuang

Merge remote-tracking branch 'origin/develop' into cpu_avx2

parents 05d38218 63914743
-#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F32_F32_HPP
-#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F32_F32_HPP
-#include "device_reduce_instance_multiblock_atomic_add.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_reduce_instance {
+namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
@@ -20,10 +24,7 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1);
 // clang-format on
-} // namespace device_reduce_instance
+} // namespace instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif

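The registration macros above are positional; reading one of the lines kept by this hunk against the column header (the numeric ReduceOpId/NanPropaOpt/IndicesOpt values are opaque ids, exactly as they appear throughout these instance files):

// ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1);
//   InDataType  = float       AccDataType  = float   OutDataType = float
//   ReduceOpId  = 5 (id)      NanPropaOpt  = 0       IndicesOpt  = 0
//   Rank        = 2           NumReduceDim = 1
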
-#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F64_F32_HPP
-#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F64_F32_HPP
-#include "device_reduce_instance_multiblock_atomic_add.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_reduce_instance {
+namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
@@ -20,10 +24,7 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 5, 0, 0, 2, 1);
 // clang-format on
-} // namespace device_reduce_instance
+} // namespace instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif

-#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F64_F64_F64_HPP
-#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F64_F64_F64_HPP
-#include "device_reduce_instance_multiblock_atomic_add.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_reduce_instance {
+namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
@@ -20,10 +24,7 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1);
 // clang-format on
-} // namespace device_reduce_instance
+} // namespace instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif

-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_HPP
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_HPP
-#include "reduction_operator_mapping.hpp"
-#include "device_reduce_instance_impl_common.hpp"
-#include "device_reduce_threadwise.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
+#include "ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_reduce_instance {
+namespace instance {
 #ifdef QUICK_REDUCE_TEST
 using reduce_configuration_2_instances_threadwise = std::tuple<
@@ -47,10 +49,10 @@ using reduce_configuration_2_instances_threadwise = std::tuple<
     >;
 #endif
-template <typename AccDataType, ReduceTensorOp ReduceOpId>
+template <ReduceTensorOp ReduceOpId>
 using deviceReduceThreadWisePtrType = DeviceReducePtr<
-    typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation,
-    typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation>;
+    typename reduce_unary_operator<ReduceOpId, true, true>::InElementwiseOperation,
+    typename reduce_unary_operator<ReduceOpId, true, true>::AccElementwiseOperation>;
 template <typename InDataType,
           typename AccDataType,
@@ -61,14 +63,13 @@ template <typename InDataType,
           bool PropagateNan,
           bool UseIndex>
 void add_device_reduce_instance_threadwise(
-    std::vector<deviceReduceThreadWisePtrType<AccDataType, ReduceOpId>>& device_op_instances)
+    std::vector<deviceReduceThreadWisePtrType<ReduceOpId>>& device_op_instances)
 {
-    using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;
+    using ReduceOperation = typename reduce_binary_operator<ReduceOpId>::opType;
     using InElementwiseOperation =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation;
+        typename reduce_unary_operator<ReduceOpId, true, true>::InElementwiseOperation;
     using AccElementwiseOperation =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::
-            AccElementwiseOperation;
+        typename reduce_unary_operator<ReduceOpId, true, true>::AccElementwiseOperation;
     constexpr bool Indexable =
         (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
@@ -114,7 +115,7 @@ void add_device_reduce_instance_threadwise(
                                             ReduceOpId,   \
                                             PropagateNan, \
                                             UseIndex>(    \
-        std::vector<deviceReduceThreadWisePtrType<compT, ReduceOpId>> & device_op_instances)
+        std::vector<deviceReduceThreadWisePtrType<ReduceOpId>> & device_op_instances)
 #define ADD_THREADWISE_INST_BY_ID(                                        \
     inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
@@ -127,21 +128,17 @@ void add_device_reduce_instance_threadwise(
                              Rank,         \
                              NumReduceDim)
 #define ADD_THREADWISE_INST_REF_BY_TYPE(                                      \
     inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \
     extern template void add_device_reduce_instance_threadwise<inT,           \
                                                                 compT,        \
                                                                 outT,         \
                                                                 Rank,         \
                                                                 NumReduceDim, \
                                                                 ReduceOpId,   \
                                                                 PropagateNan, \
                                                                 UseIndex>(    \
-        std::vector<DeviceReducePtr<                                                                \
-            typename reduce_unary_operator<compT, ReduceOpId, true, true>::InElementwiseOperation, \
-            typename reduce_unary_operator<compT, ReduceOpId, true, true>::                        \
-                AccElementwiseOperation>> &                                                         \
-            device_op_instances)
+        std::vector<deviceReduceThreadWisePtrType<ReduceOpId>> & device_op_instances)
 #define ADD_THREADWISE_INST_REF_BY_ID(                                        \
     inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)     \
@@ -154,10 +151,7 @@ void add_device_reduce_instance_threadwise(
                              Rank,         \
                              NumReduceDim)
-} // namespace device_reduce_instance
+} // namespace instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif

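Since the AccDataType template parameter has been dropped from deviceReduceThreadWisePtrType and reduce_unary_operator, callers now key the pointer type only on the reduce operation. A minimal caller-side sketch under stated assumptions: a matching explicit instantiation for float / rank 4 / one reduce dim / MIN has been compiled into the instance library, and ReduceTensorOp is reachable as ck::ReduceTensorOp (its MIN enumerator is the one used in the Indexable check above).

#include <vector>
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"

namespace inst = ck::tensor_operation::device::instance;

inline auto get_threadwise_min_instances()
{
    // The pointer type is now parameterised by the reduce op only.
    std::vector<inst::deviceReduceThreadWisePtrType<ck::ReduceTensorOp::MIN>> instances;

    // Template arguments mirror the macro parameters:
    // InDataType, AccDataType, OutDataType, Rank, NumReduceDim, ReduceOpId, PropagateNan, UseIndex
    inst::add_device_reduce_instance_threadwise<float, float, float, 4, 1,
                                                ck::ReduceTensorOp::MIN, false, false>(instances);
    return instances;
}
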
-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_B16_F32_B16_HPP
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_B16_F32_B16_HPP
-#include "data_type.hpp"
-#include "device_reduce_instance_threadwise.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_reduce_instance {
+namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
@@ -50,10 +53,7 @@ ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1);
 // clang-format on
-} // namespace device_reduce_instance
+} // namespace instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif

-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F16_F16_HPP
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F16_F16_HPP
-#include "data_type.hpp"
-#include "device_reduce_instance_threadwise.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_reduce_instance {
+namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
@@ -37,10 +40,7 @@ ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
 // clang-format on
-} // namespace device_reduce_instance
+} // namespace instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif

-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F32_F16_HPP
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F32_F16_HPP
-#include "data_type.hpp"
-#include "device_reduce_instance_threadwise.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_reduce_instance {
+namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
@@ -25,10 +28,7 @@ ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1);
 // clang-format on
-} // namespace device_reduce_instance
+} // namespace instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif

-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F32_F32_HPP
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F32_F32_HPP
-#include "device_reduce_instance_threadwise.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_reduce_instance {
+namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
@@ -48,10 +52,7 @@ ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1);
 // clang-format on
-} // namespace device_reduce_instance
+} // namespace instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif

-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F64_F32_HPP
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F64_F32_HPP
-#include "device_reduce_instance_threadwise.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_reduce_instance {
+namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
@@ -24,10 +28,7 @@ ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 2, 1);
 // clang-format on
-} // namespace device_reduce_instance
+} // namespace instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif

-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F64_F64_F64_HPP
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_F64_F64_F64_HPP
-#include "device_reduce_instance_threadwise.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_reduce_instance {
+namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
@@ -48,10 +52,7 @@ ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1);
 // clang-format on
-} // namespace device_reduce_instance
+} // namespace instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif

-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I32_I8_HPP
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I32_I8_HPP
-#include "device_reduce_instance_threadwise.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_reduce_instance {
+namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
@@ -20,10 +24,7 @@ ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1);
 // clang-format on
-} // namespace device_reduce_instance
+} // namespace instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif

-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I8_I8_HPP
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I8_I8_HPP
-#include "device_reduce_instance_threadwise.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_reduce_instance {
+namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
@@ -36,10 +40,7 @@ ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1);
 // clang-format on
-} // namespace device_reduce_instance
+} // namespace instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif

-#ifndef CHECK_ERR_HPP
-#define CHECK_ERR_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once

 #include <algorithm>
 #include <cmath>
 #include <cstdlib>
-#include <half.hpp>
 #include <iostream>
 #include <iomanip>
 #include <iterator>
 #include <limits>
 #include <type_traits>
 #include <vector>

-#include "data_type.hpp"
+#include "ck/utility/data_type.hpp"

 namespace ck {
 namespace utils {

 template <typename T>
 typename std::enable_if<std::is_floating_point<T>::value && !std::is_same<T, half_t>::value,
                         bool>::type
 check_err(const std::vector<T>& out,
           const std::vector<T>& ref,
           const std::string& msg = "Error: Incorrect results!",
           double rtol = 1e-5,
           double atol = 3e-6)
 {
     if(out.size() != ref.size())
     {
         std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
                   << std::endl
                   << msg << std::endl;
         return false;
     }

     bool res{true};
     int err_count = 0;
     double err = 0;
     double max_err = std::numeric_limits<double>::min();
     for(std::size_t i = 0; i < ref.size(); ++i)
     {
         err = std::abs(out[i] - ref[i]);
         if(err > atol + rtol * std::abs(ref[i]) || !std::isfinite(out[i]) || !std::isfinite(ref[i]))
         {
             max_err = err > max_err ? err : max_err;
             err_count++;
             if(err_count < 5)
             {
                 std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
                           << i << "]: " << out[i] << " != " << ref[i] << std::endl
                           << msg << std::endl;
             }
             res = false;
         }
     }
     if(!res)
     {
         std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
     }
     return res;
 }

 template <typename T>
 typename std::enable_if<std::is_same<T, bhalf_t>::value, bool>::type
 check_err(const std::vector<T>& out,
           const std::vector<T>& ref,
           const std::string& msg = "Error: Incorrect results!",
           double rtol = 1e-3,
           double atol = 1e-3)
 {
     if(out.size() != ref.size())
     {
         std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
                   << std::endl
                   << msg << std::endl;
         return false;
     }

     bool res{true};
     int err_count = 0;
     double err = 0;
     // TODO: This is a hack. We should have proper specialization for bhalf_t data type.
     double max_err = std::numeric_limits<float>::min();
     for(std::size_t i = 0; i < ref.size(); ++i)
     {
         double o = type_convert<float>(out[i]);
         double r = type_convert<float>(ref[i]);
         err      = std::abs(o - r);
         if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
         {
             max_err = err > max_err ? err : max_err;
             err_count++;
             if(err_count < 5)
             {
                 std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
                           << i << "]: " << o << " != " << r << std::endl
                           << msg << std::endl;
             }
             res = false;
         }
     }
     if(!res)
     {
         std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
     }
     return res;
 }

 template <typename T>
-typename std::enable_if<std::is_same<T, half_t>::value || std::is_same<T, half_float::half>::value,
-                        bool>::type
+typename std::enable_if<std::is_same<T, half_t>::value, bool>::type
 check_err(const std::vector<T>& out,
           const std::vector<T>& ref,
           const std::string& msg = "Error: Incorrect results!",
           double rtol = 1e-3,
           double atol = 1e-3)
 {
     if(out.size() != ref.size())
     {
         std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
                   << std::endl
                   << msg << std::endl;
         return false;
     }

     bool res{true};
     int err_count = 0;
     double err = 0;
     double max_err = std::numeric_limits<T>::min();
     for(std::size_t i = 0; i < ref.size(); ++i)
     {
         double o = type_convert<float>(out[i]);
         double r = type_convert<float>(ref[i]);
         err      = std::abs(o - r);
         if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
         {
             max_err = err > max_err ? err : max_err;
             err_count++;
             if(err_count < 5)
             {
                 std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
                           << i << "]: " << o << " != " << r << std::endl
                           << msg << std::endl;
             }
             res = false;
         }
     }
     if(!res)
     {
         std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
     }
     return res;
 }

 template <typename T>
 typename std::enable_if<std::is_integral<T>::value && !std::is_same<T, bhalf_t>::value, bool>::type
 check_err(const std::vector<T>& out,
           const std::vector<T>& ref,
           const std::string& msg = "Error: Incorrect results!",
           double = 0,
-          double = 0)
+          double atol = 0)
 {
     if(out.size() != ref.size())
     {
         std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
                   << std::endl
                   << msg << std::endl;
         return false;
     }

-    for(std::size_t i = 0; i < ref.size(); ++i)
-    {
-        if(out[i] != ref[i])
-        {
-            std::cout << "out[" << i << "] != ref[" << i << "]: " << static_cast<int>(out[i])
-                      << " != " << static_cast<int>(ref[i]) << std::endl
-                      << msg << std::endl;
-            return false;
-        }
-    }
-    return true;
+    bool res{true};
+    int err_count   = 0;
+    int64_t err     = 0;
+    int64_t max_err = std::numeric_limits<int64_t>::min();
+    for(std::size_t i = 0; i < ref.size(); ++i)
+    {
+        int64_t o = out[i];
+        int64_t r = ref[i];
+        err       = std::abs(o - r);
+
+        if(err > atol)
+        {
+            max_err = err > max_err ? err : max_err;
+            err_count++;
+            if(err_count < 5)
+            {
+                std::cout << "out[" << i << "] != ref[" << i << "]: " << static_cast<int>(out[i])
+                          << " != " << static_cast<int>(ref[i]) << std::endl
+                          << msg << std::endl;
+            }
+            res = false;
+        }
+    }
+    if(!res)
+    {
+        std::cout << "max err: " << max_err << std::endl;
+    }
+    return res;
 }

 } // namespace utils
 } // namespace ck

 template <typename T>
 std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
 {
     std::copy(std::begin(v), std::end(v), std::ostream_iterator<T>(os, " "));
     return os;
 }
-
-#endif

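A small usage sketch of the comparison helper defined above (the include path is the one this commit introduces elsewhere in the library; the tolerances shown are simply the overload's own defaults):

#include <vector>
#include "ck/library/utility/check_err.hpp"

bool verify(const std::vector<float>& gpu_out, const std::vector<float>& cpu_ref)
{
    // Floating-point overload: every element must satisfy |out - ref| <= atol + rtol * |ref|
    // and both values must be finite.
    return ck::utils::check_err(gpu_out, cpu_ref, "Error: Incorrect results!", 1e-5, 3e-6);
}
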
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once

 #include <cstdlib>
@@ -9,17 +12,17 @@
 #include <type_traits>
 #include <vector>

-#include "check_err.hpp"
-#include "config.hpp"
-#include "device.hpp"
-#include "device_conv_fwd.hpp"
-#include "device_tensor.hpp"
-#include "element_wise_operation.hpp"
-#include "fill.hpp"
-#include "host_tensor.hpp"
-#include "op_instance_engine.hpp"
-#include "reference_conv_fwd.hpp"
-#include "tensor_layout.hpp"
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/fill.hpp"
+#include "ck/library/utility/op_instance_engine.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -28,15 +31,15 @@ namespace device {
 using DeviceConvFwdNoOpPtr = DeviceConvFwdPtr<element_wise::PassThrough,
                                               element_wise::PassThrough,
                                               element_wise::PassThrough>;
-namespace device_conv1d_fwd_instance {
+namespace instance {
 void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_bf16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
 void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
 void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f32_instances(std::vector<DeviceConvFwdNoOpPtr>&);
 void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instances(std::vector<DeviceConvFwdNoOpPtr>&);
-} // namespace device_conv1d_fwd_instance
+} // namespace instance

-namespace device_conv2d_fwd_instance {
+namespace instance {
 void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
 void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
@@ -45,15 +48,15 @@ void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(
 void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(std::vector<DeviceConvFwdNoOpPtr>&);
 void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(std::vector<DeviceConvFwdNoOpPtr>&);
-} // namespace device_conv2d_fwd_instance
+} // namespace instance

-namespace device_conv3d_fwd_instance {
+namespace instance {
 void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
 void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
 void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f32_instances(std::vector<DeviceConvFwdNoOpPtr>&);
 void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instances(std::vector<DeviceConvFwdNoOpPtr>&);
-} // namespace device_conv3d_fwd_instance
+} // namespace instance

 } // namespace device
 } // namespace tensor_operation
@@ -292,17 +295,17 @@ struct ConvolutionFwdInstances<float, float, float>
         std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
         if constexpr(NumDimSpatial == 1)
         {
-            ck::tensor_operation::device::device_conv1d_fwd_instance::
+            ck::tensor_operation::device::instance::
                 add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f32_instances(conv_ptrs);
         }
         else if constexpr(NumDimSpatial == 2)
         {
-            ck::tensor_operation::device::device_conv2d_fwd_instance::
+            ck::tensor_operation::device::instance::
                 add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs);
         }
         else if constexpr(NumDimSpatial == 3)
         {
-            ck::tensor_operation::device::device_conv3d_fwd_instance::
+            ck::tensor_operation::device::instance::
                 add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f32_instances(conv_ptrs);
         }
         return conv_ptrs;
@@ -319,20 +322,20 @@ struct ConvolutionFwdInstances<half_t, half_t, half_t>
         std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
         if constexpr(NumDimSpatial == 1)
         {
-            ck::tensor_operation::device::device_conv1d_fwd_instance::
+            ck::tensor_operation::device::instance::
                 add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f16_instances(conv_ptrs);
             return conv_ptrs;
         }
         else if constexpr(NumDimSpatial == 2)
         {
-            ck::tensor_operation::device::device_conv2d_fwd_instance::
+            ck::tensor_operation::device::instance::
                 add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
-            ck::tensor_operation::device::device_conv2d_fwd_instance::
+            ck::tensor_operation::device::instance::
                 add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
         }
         else if constexpr(NumDimSpatial == 3)
         {
-            ck::tensor_operation::device::device_conv3d_fwd_instance::
+            ck::tensor_operation::device::instance::
                 add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f16_instances(conv_ptrs);
         }
         return conv_ptrs;
@@ -349,17 +352,17 @@ struct ConvolutionFwdInstances<bhalf_t, bhalf_t, bhalf_t>
         std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
         if constexpr(NumDimSpatial == 1)
         {
-            ck::tensor_operation::device::device_conv1d_fwd_instance::
+            ck::tensor_operation::device::instance::
                 add_device_conv1d_fwd_xdl_nwc_kxc_nwk_bf16_instances(conv_ptrs);
         }
         else if constexpr(NumDimSpatial == 2)
         {
-            ck::tensor_operation::device::device_conv2d_fwd_instance::
+            ck::tensor_operation::device::instance::
                 add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs);
         }
         else if constexpr(NumDimSpatial == 3)
         {
-            ck::tensor_operation::device::device_conv3d_fwd_instance::
+            ck::tensor_operation::device::instance::
                 add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(conv_ptrs);
         }
         return conv_ptrs;
@@ -376,17 +379,17 @@ struct ConvolutionFwdInstances<int8_t, int8_t, int8_t>
         std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
         if constexpr(NumDimSpatial == 1)
         {
-            ck::tensor_operation::device::device_conv1d_fwd_instance::
+            ck::tensor_operation::device::instance::
                 add_device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instances(conv_ptrs);
         }
         else if constexpr(NumDimSpatial == 2)
         {
-            ck::tensor_operation::device::device_conv2d_fwd_instance::
+            ck::tensor_operation::device::instance::
                 add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(conv_ptrs);
         }
         else if constexpr(NumDimSpatial == 3)
         {
-            ck::tensor_operation::device::device_conv3d_fwd_instance::
+            ck::tensor_operation::device::instance::
                 add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instances(conv_ptrs);
         }
         return conv_ptrs;
@@ -402,8 +405,8 @@ template <typename InDataType,
           typename InElementwiseOp  = ck::tensor_operation::element_wise::PassThrough,
           typename WeiElementwiseOp = ck::tensor_operation::element_wise::PassThrough,
           typename OutElementwiseOp = ck::tensor_operation::element_wise::PassThrough,
-          typename InputInitFun     = FillUniform<InDataType>,
-          typename WeightsInitFun   = FillUniform<WeiDataType>>
+          typename InputInitFun     = FillUniformDistribution<InDataType>,
+          typename WeightsInitFun   = FillUniformDistribution<WeiDataType>>
 class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType, WeiDataType>
 {
     using DeviceConvFwdOp = tensor_operation::device::
@@ -422,8 +425,8 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType,
     ConvFwdOpInstance(const ConvParams& params,
                       bool do_init = true,
-                      const InputInitFun& input_init_f     = InputInitFun{},
-                      const WeightsInitFun& weights_init_f = WeightsInitFun{})
+                      const InputInitFun& input_init_f     = InputInitFun(),
+                      const WeightsInitFun& weights_init_f = WeightsInitFun())
         : BaseType(),
           params_{params},
          output_spatial_lengths_{params.GetOutputSpatialLengths()},
@@ -560,8 +563,8 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType,
     const ConvParams& params_;
     const std::vector<ck::index_t> output_spatial_lengths_;
     const bool do_init_;
-    const InputInitFun& input_init_f_;
-    const WeightsInitFun& weights_init_f_;
+    InputInitFun input_init_f_;
+    WeightsInitFun weights_init_f_;
 };

 } // namespace conv
...

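A hedged consumer-side sketch of the renamed instance factories; only declarations that actually appear in this header are used, and it assumes the header itself is already on the include path:

#include <vector>

std::vector<ck::tensor_operation::device::DeviceConvFwdNoOpPtr> get_conv2d_fwd_f32_instances()
{
    std::vector<ck::tensor_operation::device::DeviceConvFwdNoOpPtr> conv_ptrs;
    ck::tensor_operation::device::instance::
        add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs);
    return conv_ptrs;
}
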
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once

 #include <algorithm>
+#include <cmath>
 #include <random>

-#include "data_type.hpp"
+#include "ck/utility/data_type.hpp"

 namespace ck {
 namespace utils {

-// template <typename T, class Enable = void>
-// struct FillUniform;
-
-// TODO: what's wrong with this specialization???
-// err: segmentation fault in mt19937 - infinite loop like.
-// template <typename T>
-// struct FillUniform<T, typename std::enable_if<std::is_integral<T>::value &&
-//                                               !std::is_same<T, bhalf_t>::value>::type>
-// {
-//     int a_{0};
-//     int b_{5};
-//     // T a_ = T{0};
-//     // T b_ = T{5};
-
-//     template <typename ForwardIter>
-//     void operator()(ForwardIter first, ForwardIter last) const
-//     {
-//         std::mt19937 gen{11939};
-//         std::uniform_int_distribution<int> dis(a_, b_);
-//         std::generate(first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
-//     }
-// };
-
-// struct FillUniform<T, typename std::enable_if<std::is_floating_point<T>::value ||
-//                                               std::is_same<T, bhalf_t>::value>::type>
 template <typename T>
-struct FillUniform
+struct FillUniformDistribution
 {
-    float a_{0};
-    float b_{5};
+    float a_{-5.f};
+    float b_{5.f};

     template <typename ForwardIter>
     void operator()(ForwardIter first, ForwardIter last) const
     {
-        std::mt19937 gen{11939};
-        std::uniform_real_distribution<> dis(a_, b_);
+        std::mt19937 gen(11939);
+        std::uniform_real_distribution<float> dis(a_, b_);
         std::generate(first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
     }
 };

+// Normally FillUniformDistributionIntegerValue should use std::uniform_int_distribution as below.
+// However this produces segfaults in std::mt19937 which look like an infinite loop.
+// template <typename T>
+// struct FillUniformDistributionIntegerValue
+// {
+//     int a_{-5};
+//     int b_{5};
+//
+//     template <typename ForwardIter>
+//     void operator()(ForwardIter first, ForwardIter last) const
+//     {
+//         std::mt19937 gen(11939);
+//         std::uniform_int_distribution<int> dis(a_, b_);
+//         std::generate(
+//             first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
+//     }
+// };
+
+// Workaround for uniform_int_distribution not working as expected. See note above.
+template <typename T>
+struct FillUniformDistributionIntegerValue
+{
+    float a_{-5.f};
+    float b_{5.f};
+
+    template <typename ForwardIter>
+    void operator()(ForwardIter first, ForwardIter last) const
+    {
+        std::mt19937 gen(11939);
+        std::uniform_real_distribution<float> dis(a_, b_);
+        std::generate(
+            first, last, [&dis, &gen]() { return ck::type_convert<T>(std::round(dis(gen))); });
+    }
+};
...

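A short usage sketch for the fillers above. Using a std::vector here is an assumption; any forward-iterator range works, and FillUniformDistributionIntegerValue simply rounds the same real distribution to integer-valued entries:

#include <vector>
#include "ck/library/utility/fill.hpp"

int main()
{
    std::vector<float> input(1024);
    // Aggregate-init sets a_ and b_, i.e. uniform values in [-1, 1].
    ck::utils::FillUniformDistribution<float>{-1.f, 1.f}(input.begin(), input.end());

    std::vector<float> weights(256);
    // Integer-valued floats (rounded), handy for exact reference checks.
    ck::utils::FillUniformDistributionIntegerValue<float>{}(weights.begin(), weights.end());
}
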
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once

 #include <cstdlib>
+#include <iostream>
 #include <limits>
 #include <memory>
 #include <stdexcept>
@@ -8,9 +12,12 @@
 #include <utility>
 #include <vector>

-#include "check_err.hpp"
-#include "device_base.hpp"
-#include "functional2.hpp"
+#include "ck/utility/functional2.hpp"
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"

 namespace ck {
 namespace utils {
@@ -78,7 +85,8 @@ class OpInstanceRunEngine
     template <typename ReferenceOp = std::function<void()>>
     OpInstanceRunEngine(const OpInstanceT& op_instance,
-                        const ReferenceOp& reference_op = ReferenceOp{})
+                        const ReferenceOp& reference_op = ReferenceOp{},
+                        bool do_verification            = true)
         : op_instance_{op_instance}
     {
         in_tensors_ = op_instance_.GetInputTensors();
@@ -88,8 +96,11 @@ class OpInstanceRunEngine
                                              const Tensor<InArgTypes>&...,
                                              Tensor<OutDataType>&>)
         {
-            ref_output_ = op_instance_.GetOutputTensor();
-            CallRefOpUnpackArgs(reference_op, std::make_index_sequence<kNInArgs_>{});
+            if(do_verification)
+            {
+                ref_output_ = op_instance_.GetOutputTensor();
+                CallRefOpUnpackArgs(reference_op, std::make_index_sequence<kNInArgs_>{});
+            }
         }
         AllocateDeviceInputTensors(std::make_index_sequence<kNInArgs_>{});
         out_device_buffer_ =
@@ -110,6 +121,7 @@ class OpInstanceRunEngine
                 op_ptr.get(), in_device_buffers_, out_device_buffer_);
             if(op_ptr->IsSupportedArgument(argument.get()))
             {
+                std::cout << "Testing instance: " << op_ptr->GetTypeString() << std::endl;
                 invoker->Run(argument.get());
                 out_device_buffer_->FromDevice(out_tensor_->mData.data());
                 if(!ref_output_)
@@ -119,9 +131,16 @@ class OpInstanceRunEngine
                                              " You have to provide reference function.");
                 }
                 // TODO: enable flexible use of custom check_error functions
-                res = res && check_err(out_tensor_->mData, ref_output_->mData);
+                bool inst_res = CheckErr(out_tensor_->mData, ref_output_->mData);
+                std::cout << (inst_res ? "SUCCESS" : "FAILURE") << std::endl;
+                res = res && inst_res;
                 out_device_buffer_->SetZero();
             }
+            else
+            {
+                std::cout << "Given conv problem is not supported by instance: \n\t>>>>"
+                          << op_ptr->GetTypeString() << std::endl;
+            }
         }
         return res;
     }
@@ -132,7 +151,6 @@ class OpInstanceRunEngine
                  bool do_verification = false,
                  bool do_log          = false)
     {
-        bool res{true};
         ProfileBestConfig best_config;

         for(auto& op_ptr : op_ptrs)
@@ -153,7 +171,7 @@ class OpInstanceRunEngine
             std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec
                       << " GB/s, " << op_name << std::endl;

-            if(tflops < best_config.best_tflops)
+            if(avg_time < best_config.best_avg_time)
             {
                 best_config.best_op_name = op_name;
                 best_config.best_tflops  = tflops;
@@ -171,7 +189,7 @@ class OpInstanceRunEngine
                                              " You have to provide reference function.");
                 }
                 // TODO: enable flexible use of custom check_error functions
-                res = res && CheckErr(out_tensor_->mData, ref_output_->mData);
+                CheckErr(out_tensor_->mData, ref_output_->mData);
                 if(do_log) {}
             }
@@ -223,7 +241,7 @@ class OpInstanceRunEngine
     template <typename T>
     bool CheckErr(const std::vector<T>& dev_out, const std::vector<T>& ref_out) const
     {
-        return ck::utils::check_err(dev_out, ref_out, "Error: incorrect results!", atol_, rtol_);
+        return ck::utils::check_err(dev_out, ref_out, "Error: incorrect results!", rtol_, atol_);
     }
 };
...

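The last hunk fixes an argument-order bug: check_err takes (out, ref, msg, rtol, atol), so passing atol_ before rtol_ silently used the absolute tolerance as the relative one and vice versa. A worked illustration with made-up numbers (the engine's real rtol_/atol_ defaults are not shown in this diff):

// check_err accepts an element when |out - ref| <= atol + rtol * |ref|.
constexpr double rtol = 1e-3, atol = 1e-5, ref = 100.0;
constexpr double intended_bound = atol + rtol * ref; // 0.10001
constexpr double swapped_bound  = rtol + atol * ref; // 0.002 -- what the old argument order produced
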
 ## host_tensor
-include_directories(BEFORE
-    ${PROJECT_SOURCE_DIR}/include/ck
-    ${PROJECT_SOURCE_DIR}/include/ck/utility
-    ${PROJECT_SOURCE_DIR}/library/include/ck/library/host_tensor
-)
 set(HOST_TENSOR_SOURCE
-    device.cpp
+    device_memory.cpp
     host_tensor.cpp
 )
@@ -17,22 +11,20 @@ target_compile_features(host_tensor PUBLIC)
 set_target_properties(host_tensor PROPERTIES POSITION_INDEPENDENT_CODE ON)
 target_include_directories(host_tensor SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
 target_include_directories(host_tensor PUBLIC
     "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck>"
     "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/utility>"
     "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/host_tensor>"
 )
-install(TARGETS host_tensor
-    EXPORT host_tensorTargets
-    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
-    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
-    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
-    INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
-)
+rocm_install(
+    TARGETS host_tensor
+    EXPORT host_tensorTargets
+)
-install(EXPORT host_tensorTargets
+rocm_install(
+    EXPORT host_tensorTargets
     FILE composable_kernelhost_tensorTargets.cmake
     NAMESPACE composable_kernel::
     DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
 )
...

#include <chrono>
#include <assert.h>
#include <string.h>
#include <stdlib.h>
#include "device.hpp"
#ifndef CK_NOGPU
DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
{
hip_check_error(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
}
void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; }
std::size_t DeviceMem::GetBufferSize() { return mMemSize; }
void DeviceMem::ToDevice(const void* p)
{
hip_check_error(hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
}
void DeviceMem::FromDevice(void* p)
{
hip_check_error(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
}
void DeviceMem::SetZero() { hip_check_error(hipMemset(mpDeviceBuf, 0, mMemSize)); }
DeviceMem::~DeviceMem() { hip_check_error(hipFree(mpDeviceBuf)); }
struct KernelTimerImpl
{
KernelTimerImpl()
{
hip_check_error(hipEventCreate(&mStart));
hip_check_error(hipEventCreate(&mEnd));
}
~KernelTimerImpl()
{
hip_check_error(hipEventDestroy(mStart));
hip_check_error(hipEventDestroy(mEnd));
}
void Start()
{
hip_check_error(hipDeviceSynchronize());
hip_check_error(hipEventRecord(mStart, nullptr));
}
void End()
{
hip_check_error(hipEventRecord(mEnd, nullptr));
hip_check_error(hipEventSynchronize(mEnd));
}
float GetElapsedTime() const
{
float time;
hip_check_error(hipEventElapsedTime(&time, mStart, mEnd));
return time;
}
hipEvent_t mStart, mEnd;
};
KernelTimer::KernelTimer() : impl(new KernelTimerImpl()) {}
KernelTimer::~KernelTimer() {}
void KernelTimer::Start() { impl->Start(); }
void KernelTimer::End() { impl->End(); }
float KernelTimer::GetElapsedTime() const { return impl->GetElapsedTime(); }
#endif
DeviceAlignedMemCPU::DeviceAlignedMemCPU(std::size_t mem_size, std::size_t alignment)
: mMemSize(mem_size), mAlignment(alignment)
{
if(mem_size == 0)
{
mpDeviceBuf = nullptr;
}
else
{
assert(!(alignment == 0 || (alignment & (alignment - 1)))); // check pow of 2
// TODO: posix only
int rtn = posix_memalign(&mpDeviceBuf, alignment, mem_size);
assert(rtn == 0);
}
}
void* DeviceAlignedMemCPU::GetDeviceBuffer() { return mpDeviceBuf; }
std::size_t DeviceAlignedMemCPU::GetBufferSize() { return mMemSize; }
void DeviceAlignedMemCPU::ToDevice(const void* p) { memcpy(mpDeviceBuf, p, mMemSize); }
void DeviceAlignedMemCPU::FromDevice(void* p) { memcpy(p, mpDeviceBuf, mMemSize); }
void DeviceAlignedMemCPU::SetZero() { memset(mpDeviceBuf, 0, mMemSize); }
DeviceAlignedMemCPU::~DeviceAlignedMemCPU()
{
if(mpDeviceBuf != nullptr)
free(mpDeviceBuf);
}
struct WallTimerImpl
{
void Start() { mStart = std::chrono::high_resolution_clock::now(); }
void End() { mStop = std::chrono::high_resolution_clock::now(); }
float GetElapsedTime() const
{
return static_cast<float>(
std::chrono::duration_cast<std::chrono::microseconds>(mStop - mStart).count()) *
1e-3;
}
std::chrono::time_point<std::chrono::high_resolution_clock> mStart;
std::chrono::time_point<std::chrono::high_resolution_clock> mStop;
};
WallTimer::WallTimer() : impl(new WallTimerImpl()) {}
WallTimer::~WallTimer() {}
void WallTimer::Start() { impl->Start(); }
void WallTimer::End() { impl->End(); }
float WallTimer::GetElapsedTime() const { return impl->GetElapsedTime(); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/device_utility/hip_check_error.hpp"
#include "ck/library/host_tensor/device_memory.hpp"
DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
{
hip_check_error(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
}
void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; }
std::size_t DeviceMem::GetBufferSize() { return mMemSize; }
void DeviceMem::ToDevice(const void* p)
{
hip_check_error(hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
}
void DeviceMem::FromDevice(void* p)
{
hip_check_error(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
}
void DeviceMem::SetZero() { hip_check_error(hipMemset(mpDeviceBuf, 0, mMemSize)); }
DeviceMem::~DeviceMem() { hip_check_error(hipFree(mpDeviceBuf)); }
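For reference, a minimal host-side round trip through the DeviceMem wrapper defined above, using only the member functions shown (the include path is the one this commit introduces):

#include <vector>
#include "ck/library/host_tensor/device_memory.hpp"

int main()
{
    std::vector<float> host(256, 1.0f);

    DeviceMem buf(host.size() * sizeof(float)); // hipMalloc in the constructor
    buf.ToDevice(host.data());                  // hipMemcpyHostToDevice
    // ... launch a kernel that reads/writes buf.GetDeviceBuffer() ...
    buf.FromDevice(host.data());                // hipMemcpyDeviceToHost
    return 0;
} // ~DeviceMem() calls hipFree
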
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #include <cassert>

-#include "host_tensor.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"

 void HostTensorDescriptor::CalculateStrides()
 {
@@ -50,25 +54,3 @@ std::ostream& operator<<(std::ostream& os, const HostTensorDescriptor& desc)
     return os;
 }
-
-void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream& os)
-{
-    os << "dim " << desc.GetNumOfDimension() << ", ";
-    os << "lengths {";
-    LogRange(os, desc.GetLengths(), ", ");
-    os << "}, ";
-    os << "strides {";
-    LogRange(os, desc.GetStrides(), ", ");
-    os << "}" << std::endl;
-}
-
-#if 1
-// FIXME: remove
-void bf16_to_f32_(const Tensor<ck::bhalf_t>& src, Tensor<float>& dst)
-{
-    for(std::size_t i = 0; i < src.mData.size(); ++i)
-        dst.mData[i] = ck::type_convert<float>(src.mData[i]);
-}
-#endif