Merge remote-tracking branch 'upstream/develop' into gemm-layernorm-4

cba8f7f2 · Anthony Chang · cc50b687 · b653c5eb · cba8f7f2 · cba8f7f2
Commit cba8f7f2 authored Jun 26, 2022 by Anthony Chang
20 changed files
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_IMPL_COMMON_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_IMPL_COMMON_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
 namespace ck {
 namespace tensor_operation {
@@ -35,7 +37,4 @@ struct ReductionConfiguration_2
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "reduction_operator_mapping.hpp"
+#pragma once
-#include "device_reduce_instance_impl_common.hpp"
-#include "device_reduce_multiblock.hpp"
+#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
+#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -193,7 +196,4 @@ void add_device_reduce_instance_multiblock_atomic_add(
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_B16_F32_F32_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_B16_F32_F32_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "data_type.hpp"
+#pragma once
-#include "device_reduce_instance_multiblock_atomic_add.hpp"
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -24,7 +27,4 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 5, 0, 0, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F16_F32_F32_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F16_F32_F32_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "data_type.hpp"
+#pragma once
-#include "device_reduce_instance_multiblock_atomic_add.hpp"
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -24,7 +27,4 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 5, 0, 0, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F32_F32_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F32_F32_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "device_reduce_instance_multiblock_atomic_add.hpp"
+#pragma once
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -23,7 +27,4 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F64_F32_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F64_F32_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "device_reduce_instance_multiblock_atomic_add.hpp"
+#pragma once
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -23,7 +27,4 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 5, 0, 0, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F64_F64_F64_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F64_F64_F64_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "device_reduce_instance_multiblock_atomic_add.hpp"
+#pragma once
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -23,7 +27,4 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "reduction_operator_mapping.hpp"
+#pragma once
-#include "device_reduce_instance_impl_common.hpp"
-#include "device_reduce_threadwise.hpp"
+#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
+#include "ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -152,7 +154,4 @@ void add_device_reduce_instance_threadwise(
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_B16_F32_B16_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_B16_F32_B16_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "data_type.hpp"
+#pragma once
-#include "device_reduce_instance_threadwise.hpp"
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -53,7 +56,4 @@ ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F16_F16_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F16_F16_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "data_type.hpp"
+#pragma once
-#include "device_reduce_instance_threadwise.hpp"
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -40,7 +43,4 @@ ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F32_F16_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F32_F16_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "data_type.hpp"
+#pragma once
-#include "device_reduce_instance_threadwise.hpp"
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -28,7 +31,4 @@ ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F32_F32_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F32_F32_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "device_reduce_instance_threadwise.hpp"
+#pragma once
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -51,7 +55,4 @@ ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F64_F32_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F64_F32_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "device_reduce_instance_threadwise.hpp"
+#pragma once
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -27,7 +31,4 @@ ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F64_F64_F64_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_F64_F64_F64_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "device_reduce_instance_threadwise.hpp"
+#pragma once
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -51,7 +55,4 @@ ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I32_I8_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I32_I8_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "device_reduce_instance_threadwise.hpp"
+#pragma once
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -23,7 +27,4 @@ ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I8_I8_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I8_I8_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "device_reduce_instance_threadwise.hpp"
+#pragma once
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -39,7 +43,4 @@ ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/utility/check_err.hpp
+++ b/library/include/ck/library/utility/check_err.hpp
-#ifndef CHECK_ERR_HPP
+// SPDX-License-Identifier: MIT
-#define CHECK_ERR_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include <algorithm>
+#pragma once
-#include <cmath>
-#include <cstdlib>
+#include <algorithm>
-#include <half.hpp>
+#include <cmath>
-#include <iostream>
+#include <cstdlib>
-#include <iomanip>
+#include <iostream>
-#include <iterator>
+#include <iomanip>
-#include <limits>
+#include <iterator>
-#include <type_traits>
+#include <limits>
-#include <vector>
+#include <type_traits>
+#include <vector>
-#include "data_type.hpp"
+#include "ck/utility/data_type.hpp"
-namespace ck {
-namespace utils {
+namespace ck {
+namespace utils {
-template <typename T>
-typename std::enable_if<std::is_floating_point<T>::value && !std::is_same<T, half_t>::value,
+template <typename T>
-                        bool>::type
+typename std::enable_if<std::is_floating_point<T>::value && !std::is_same<T, half_t>::value,
-check_err(const std::vector<T>& out,
+                        bool>::type
-          const std::vector<T>& ref,
+check_err(const std::vector<T>& out,
-          const std::string& msg = "Error: Incorrect results!",
+          const std::vector<T>& ref,
-          double rtol            = 1e-5,
+          const std::string& msg = "Error: Incorrect results!",
-          double atol            = 3e-6)
+          double rtol            = 1e-5,
-{
+          double atol            = 3e-6)
-    if(out.size() != ref.size())
+{
-    {
+    if(out.size() != ref.size())
-        std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
+    {
-                  << std::endl
+        std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
-                  << msg << std::endl;
+                  << std::endl
-        return false;
+                  << msg << std::endl;
-    }
+        return false;
+    }
-    bool res{true};
-    int err_count  = 0;
+    bool res{true};
-    double err     = 0;
+    int err_count  = 0;
-    double max_err = std::numeric_limits<double>::min();
+    double err     = 0;
-    for(std::size_t i = 0; i < ref.size(); ++i)
+    double max_err = std::numeric_limits<double>::min();
-    {
+    for(std::size_t i = 0; i < ref.size(); ++i)
-        err = std::abs(out[i] - ref[i]);
+    {
-        if(err > atol + rtol * std::abs(ref[i]) || !std::isfinite(out[i]) || !std::isfinite(ref[i]))
+        err = std::abs(out[i] - ref[i]);
-        {
+        if(err > atol + rtol * std::abs(ref[i]) || !std::isfinite(out[i]) || !std::isfinite(ref[i]))
-            max_err = err > max_err ? err : max_err;
+        {
-            err_count++;
+            max_err = err > max_err ? err : max_err;
-            if(err_count < 5)
+            err_count++;
-            {
+            if(err_count < 5)
-                std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
+            {
-                          << i << "]: " << out[i] << " != " << ref[i] << std::endl
+                std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
-                          << msg << std::endl;
+                          << i << "]: " << out[i] << " != " << ref[i] << std::endl
-            }
+                          << msg << std::endl;
-            res = false;
+            }
-        }
+            res = false;
-    }
+        }
-    if(!res)
+    }
-    {
+    if(!res)
-        std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
+    {
-    }
+        std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
-    return res;
+    }
-}
+    return res;
+}
-template <typename T>
-typename std::enable_if<std::is_same<T, bhalf_t>::value, bool>::type
+template <typename T>
-check_err(const std::vector<T>& out,
+typename std::enable_if<std::is_same<T, bhalf_t>::value, bool>::type
-          const std::vector<T>& ref,
+check_err(const std::vector<T>& out,
-          const std::string& msg = "Error: Incorrect results!",
+          const std::vector<T>& ref,
-          double rtol            = 1e-3,
+          const std::string& msg = "Error: Incorrect results!",
-          double atol            = 1e-3)
+          double rtol            = 1e-3,
-{
+          double atol            = 1e-3)
-    if(out.size() != ref.size())
+{
-    {
+    if(out.size() != ref.size())
-        std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
+    {
-                  << std::endl
+        std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
-                  << msg << std::endl;
+                  << std::endl
-        return false;
+                  << msg << std::endl;
-    }
+        return false;
+    }
-    bool res{true};
-    int err_count = 0;
+    bool res{true};
-    double err    = 0;
+    int err_count = 0;
-    // TODO: This is a hack. We should have proper specialization for bhalf_t data type.
+    double err    = 0;
-    double max_err = std::numeric_limits<float>::min();
+    // TODO: This is a hack. We should have proper specialization for bhalf_t data type.
-    for(std::size_t i = 0; i < ref.size(); ++i)
+    double max_err = std::numeric_limits<float>::min();
-    {
+    for(std::size_t i = 0; i < ref.size(); ++i)
-        double o = type_convert<float>(out[i]);
+    {
-        double r = type_convert<float>(ref[i]);
+        double o = type_convert<float>(out[i]);
-        err      = std::abs(o - r);
+        double r = type_convert<float>(ref[i]);
-        if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
+        err      = std::abs(o - r);
-        {
+        if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
-            max_err = err > max_err ? err : max_err;
+        {
-            err_count++;
+            max_err = err > max_err ? err : max_err;
-            if(err_count < 5)
+            err_count++;
-            {
+            if(err_count < 5)
-                std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
+            {
-                          << i << "]: " << o << " != " << r << std::endl
+                std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
-                          << msg << std::endl;
+                          << i << "]: " << o << " != " << r << std::endl
-            }
+                          << msg << std::endl;
-            res = false;
+            }
-        }
+            res = false;
-    }
+        }
-    if(!res)
+    }
-    {
+    if(!res)
-        std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
+    {
-    }
+        std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
-    return res;
+    }
-}
+    return res;
+}
-template <typename T>
-typename std::enable_if<std::is_same<T, half_t>::value || std::is_same<T, half_float::half>::value,
+template <typename T>
-                        bool>::type
+typename std::enable_if<std::is_same<T, half_t>::value, bool>::type
 check_err(const std::vector<T>& out,
          const std::vector<T>& ref,
          const std::string& msg = "Error: Incorrect results!",
          double rtol            = 1e-3,
          double atol            = 1e-3)
 {
    if(out.size() != ref.size())
    {
        std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
                  << std::endl
                  << msg << std::endl;
        return false;
    }
    bool res{true};
    int err_count  = 0;
    double err     = 0;
    double max_err = std::numeric_limits<T>::min();
    for(std::size_t i = 0; i < ref.size(); ++i)
    {
        double o = type_convert<float>(out[i]);
        double r = type_convert<float>(ref[i]);
        err      = std::abs(o - r);
        if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
        {
            max_err = err > max_err ? err : max_err;
            err_count++;
            if(err_count < 5)
            {
                std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
                          << i << "]: " << o << " != " << r << std::endl
                          << msg << std::endl;
            }
            res = false;
        }
    }
    if(!res)
    {
        std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
    }
    return res;
 }
 template <typename T>
 typename std::enable_if<std::is_integral<T>::value && !std::is_same<T, bhalf_t>::value, bool>::type
 check_err(const std::vector<T>& out,
          const std::vector<T>& ref,
          const std::string& msg = "Error: Incorrect results!",
          double                 = 0,
          double                 = 0)
 {
    if(out.size() != ref.size())
    {
        std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
                  << std::endl
                  << msg << std::endl;
        return false;
    }
-    for(std::size_t i = 0; i < ref.size(); ++i)
+    bool res{true};
-    {
+    int err_count   = 0;
-        if(out[i] != ref[i])
+    int64_t err     = 0;
-        {
+    int64_t max_err = std::numeric_limits<int64_t>::min();
-            std::cout << "out[" << i << "] != ref[" << i << "]: " << static_cast<int>(out[i])
+    for(std::size_t i = 0; i < ref.size(); ++i)
-                      << " != " << static_cast<int>(ref[i]) << std::endl
+    {
-                      << msg << std::endl;
+        int64_t o = out[i];
-            return false;
+        int64_t r = ref[i];
-        }
+        err       = std::abs(o - r);
-    }
-    return true;
+        if(err > 0)
-}
+        {
+            max_err = err > max_err ? err : max_err;
-} // namespace utils
+            err_count++;
-} // namespace ck
+            if(err_count < 5)
+            {
-template <typename T>
+                std::cout << "out[" << i << "] != ref[" << i << "]: " << static_cast<int>(out[i])
-std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
+                          << " != " << static_cast<int>(ref[i]) << std::endl
-{
+                          << msg << std::endl;
-    std::copy(std::begin(v), std::end(v), std::ostream_iterator<T>(os, " "));
+            }
-    return os;
+            res = false;
-}
+        }
+    }
-#endif
+    if(!res)
+    {
+        std::cout << "max err: " << max_err << std::endl;
+    }
+    return res;
+}
+} // namespace utils
+} // namespace ck
+template <typename T>
+std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
+{
+    std::copy(std::begin(v), std::end(v), std::ostream_iterator<T>(os, " "));
+    return os;
+}
--- a/library/include/ck/library/utility/conv_util.hpp
+++ b/library/include/ck/library/utility/conv_util.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #include <cstdlib>
@@ -9,17 +12,17 @@
 #include <type_traits>
 #include <vector>
-#include "check_err.hpp"
+#include "ck/ck.hpp"
-#include "config.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "device.hpp"
+#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
-#include "device_conv_fwd.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "device_tensor.hpp"
-#include "element_wise_operation.hpp"
+#include "ck/library/utility/check_err.hpp"
-#include "fill.hpp"
+#include "ck/library/utility/fill.hpp"
-#include "host_tensor.hpp"
+#include "ck/library/utility/op_instance_engine.hpp"
-#include "op_instance_engine.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
-#include "reference_conv_fwd.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
-#include "tensor_layout.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -402,8 +405,8 @@ template <typename InDataType,
          typename InElementwiseOp  = ck::tensor_operation::element_wise::PassThrough,
          typename WeiElementwiseOp = ck::tensor_operation::element_wise::PassThrough,
          typename OutElementwiseOp = ck::tensor_operation::element_wise::PassThrough,
-          typename InputInitFun     = FillUniform<InDataType>,
+          typename InputInitFun     = FillUniformDistribution<InDataType>,
-          typename WeightsInitFun   = FillUniform<WeiDataType>>
+          typename WeightsInitFun   = FillUniformDistribution<WeiDataType>>
 class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType, WeiDataType>
 {
    using DeviceConvFwdOp = tensor_operation::device::
@@ -422,8 +425,8 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType,
    ConvFwdOpInstance(const ConvParams& params,
                      bool do_init                         = true,
-                      const InputInitFun& input_init_f     = InputInitFun{},
+                      const InputInitFun& input_init_f     = InputInitFun(),
-                      const WeightsInitFun& weights_init_f = WeightsInitFun{})
+                      const WeightsInitFun& weights_init_f = WeightsInitFun())
        : BaseType(),
          params_{params},
          output_spatial_lengths_{params.GetOutputSpatialLengths()},
@@ -560,8 +563,8 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType,
    const ConvParams& params_;
    const std::vector<ck::index_t> output_spatial_lengths_;
    const bool do_init_;
-    const InputInitFun& input_init_f_;
+    InputInitFun input_init_f_;
-    const WeightsInitFun& weights_init_f_;
+    WeightsInitFun weights_init_f_;
 };
 } // namespace conv

--- a/library/include/ck/library/utility/fill.hpp
+++ b/library/include/ck/library/utility/fill.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #include <algorithm>
+#include <cmath>
 #include <random>
-#include "data_type.hpp"
+#include "ck/utility/data_type.hpp"
 namespace ck {
 namespace utils {
-// template <typename T, class Enable = void>
+template <typename T>
-// struct FillUniform;
+struct FillUniformDistribution
+{
+    float a_{-5.f};
+    float b_{5.f};
-// TODO: what's wrong with this specialization???
+    template <typename ForwardIter>
-// err: segmentation fault in mt19937 - infinite loop like.
+    void operator()(ForwardIter first, ForwardIter last) const
-// template <typename T>
+    {
-// struct FillUniform<T, typename std::enable_if<std::is_integral<T>::value &&
+        std::mt19937 gen(11939);
-//                                               !std::is_same<T, bhalf_t>::value>::type>
+        std::uniform_real_distribution<float> dis(a_, b_);
-// {
+        std::generate(first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
-//     int a_{0};
+    }
-//     int b_{5};
+};
-//     // T a_ = T{0};
-//     // T b_ = T{5};
-//     template <typename ForwardIter>
+// Normally FillUniformDistributionIntegerValue should use std::uniform_int_distribution as below.
-//     void operator()(ForwardIter first, ForwardIter last) const
+// However this produces segfaults in std::mt19937 which look like an infinite loop.
-//     {
+//      template <typename T>
-//         std::mt19937 gen{11939};
+//      struct FillUniformDistributionIntegerValue
-//         std::uniform_int_distribution<int> dis(a_, b_);
+//      {
-//         std::generate(first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
+//          int a_{-5};
-//     }
+//          int b_{5};
-// };
+//
+//          template <typename ForwardIter>
+//          void operator()(ForwardIter first, ForwardIter last) const
+//          {
+//              std::mt19937 gen(11939);
+//              std::uniform_int_distribution<int> dis(a_, b_);
+//              std::generate(
+//                  first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
+//          }
+//      };
-// struct FillUniform<T, typename std::enable_if<std::is_floating_point<T>::value ||
+// Workaround for uniform_int_distribution not working as expected. See note above.
-//                                               std::is_same<T, bhalf_t>::value>::type>
 template <typename T>
-struct FillUniform
+struct FillUniformDistributionIntegerValue
 {
-    float a_{0};
+    float a_{-5.f};
-    float b_{5};
+    float b_{5.f};
    template <typename ForwardIter>
    void operator()(ForwardIter first, ForwardIter last) const
    {
-        std::mt19937 gen{11939};
+        std::mt19937 gen(11939);
-        std::uniform_real_distribution<> dis(a_, b_);
+        std::uniform_real_distribution<float> dis(a_, b_);
-        std::generate(first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
+        std::generate(
+            first, last, [&dis, &gen]() { return ck::type_convert<T>(std::round(dis(gen))); });
    }
 };

--- a/library/include/ck/library/utility/op_instance_engine.hpp
+++ b/library/include/ck/library/utility/op_instance_engine.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #include <cstdlib>
+#include <iostream>
 #include <limits>
 #include <memory>
 #include <stdexcept>
@@ -8,9 +12,12 @@
 #include <utility>
 #include <vector>
-#include "check_err.hpp"
+#include "ck/utility/functional2.hpp"
-#include "device_base.hpp"
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
-#include "functional2.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
 namespace ck {
 namespace utils {
@@ -78,7 +85,8 @@ class OpInstanceRunEngine
    template <typename ReferenceOp = std::function<void()>>
    OpInstanceRunEngine(const OpInstanceT& op_instance,
-                        const ReferenceOp& reference_op = ReferenceOp{})
+                        const ReferenceOp& reference_op = ReferenceOp{},
+                        bool do_verification            = true)
        : op_instance_{op_instance}
    {
        in_tensors_ = op_instance_.GetInputTensors();
@@ -88,8 +96,11 @@ class OpInstanceRunEngine
                                         const Tensor<InArgTypes>&...,
                                         Tensor<OutDataType>&>)
        {
-            ref_output_ = op_instance_.GetOutputTensor();
+            if(do_verification)
-            CallRefOpUnpackArgs(reference_op, std::make_index_sequence<kNInArgs_>{});
+            {
+                ref_output_ = op_instance_.GetOutputTensor();
+                CallRefOpUnpackArgs(reference_op, std::make_index_sequence<kNInArgs_>{});
+            }
        }
        AllocateDeviceInputTensors(std::make_index_sequence<kNInArgs_>{});
        out_device_buffer_ =
@@ -110,6 +121,7 @@ class OpInstanceRunEngine
                op_ptr.get(), in_device_buffers_, out_device_buffer_);
            if(op_ptr->IsSupportedArgument(argument.get()))
            {
+                std::cout << "Testing instance: " << op_ptr->GetTypeString() << std::endl;
                invoker->Run(argument.get());
                out_device_buffer_->FromDevice(out_tensor_->mData.data());
                if(!ref_output_)
@@ -119,9 +131,16 @@ class OpInstanceRunEngine
                        " You have to provide reference function.");
                }
                // TODO: enable flexible use of custom check_error functions
-                res = res && check_err(out_tensor_->mData, ref_output_->mData);
+                bool inst_res = CheckErr(out_tensor_->mData, ref_output_->mData);
+                std::cout << (inst_res ? "SUCCESS" : "FAILURE") << std::endl;
+                res = res && inst_res;
                out_device_buffer_->SetZero();
            }
+            else
+            {
+                std::cout << "Given conv problem is not supported by instance: \n\t>>>>"
+                          << op_ptr->GetTypeString() << std::endl;
+            }
        }
        return res;
    }
@@ -132,7 +151,6 @@ class OpInstanceRunEngine
                              bool do_verification = false,
                              bool do_log          = false)
    {
-        bool res{true};
        ProfileBestConfig best_config;
        for(auto& op_ptr : op_ptrs)
@@ -153,7 +171,7 @@ class OpInstanceRunEngine
                std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec
                          << " GB/s, " << op_name << std::endl;
-                if(tflops < best_config.best_tflops)
+                if(avg_time < best_config.best_avg_time)
                {
                    best_config.best_op_name    = op_name;
                    best_config.best_tflops     = tflops;
@@ -171,7 +189,7 @@ class OpInstanceRunEngine
                            " You have to provide reference function.");
                    }
                    // TODO: enable flexible use of custom check_error functions
-                    res = res && CheckErr(out_tensor_->mData, ref_output_->mData);
+                    CheckErr(out_tensor_->mData, ref_output_->mData);
                    if(do_log) {}
                }
@@ -223,7 +241,7 @@ class OpInstanceRunEngine
    template <typename T>
    bool CheckErr(const std::vector<T>& dev_out, const std::vector<T>& ref_out) const
    {
-        return ck::utils::check_err(dev_out, ref_out, "Error: incorrect results!", atol_, rtol_);
+        return ck::utils::check_err(dev_out, ref_out, "Error: incorrect results!", rtol_, atol_);
    }
 };