Merge remote-tracking branch 'upstream/develop' into gemm-layernorm-4

cba8f7f2 · Anthony Chang · cc50b687 · b653c5eb · cba8f7f2 · cba8f7f2
Commit cba8f7f2 authored Jun 26, 2022 by Anthony Chang
20 changed files
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_IMPL_COMMON_HPP
-#define DEVICE_REDUCE_INSTANCE_IMPL_COMMON_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once

 namespace ck {
 namespace tensor_operation {
@@ -35,7 +37,4 @@ struct ReductionConfiguration_2
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
-
 } // namespace ck
-
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_HPP
-#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

-#include "reduction_operator_mapping.hpp"
-#include "device_reduce_instance_impl_common.hpp"
-#include "device_reduce_multiblock.hpp"
+#pragma once
+
+#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
+#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -193,7 +196,4 @@ void add_device_reduce_instance_multiblock_atomic_add(
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
-
 } // namespace ck
-
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_B16_F32_F32_HPP
-#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_B16_F32_F32_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

-#include "data_type.hpp"
-#include "device_reduce_instance_multiblock_atomic_add.hpp"
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -24,7 +27,4 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 5, 0, 0, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
-
 } // namespace ck
-
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F16_F32_F32_HPP
-#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F16_F32_F32_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

-#include "data_type.hpp"
-#include "device_reduce_instance_multiblock_atomic_add.hpp"
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -24,7 +27,4 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 5, 0, 0, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
-
 } // namespace ck
-
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F32_F32_HPP
-#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F32_F32_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

-#include "device_reduce_instance_multiblock_atomic_add.hpp"
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -23,7 +27,4 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
-
 } // namespace ck
-
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F64_F32_HPP
-#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F64_F32_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

-#include "device_reduce_instance_multiblock_atomic_add.hpp"
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -23,7 +27,4 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 5, 0, 0, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
-
 } // namespace ck
-
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F64_F64_F64_HPP
-#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F64_F64_F64_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

-#include "device_reduce_instance_multiblock_atomic_add.hpp"
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -23,7 +27,4 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
-
 } // namespace ck
-
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_HPP
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

-#include "reduction_operator_mapping.hpp"
-#include "device_reduce_instance_impl_common.hpp"
-#include "device_reduce_threadwise.hpp"
+#pragma once
+
+#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
+#include "ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -152,7 +154,4 @@ void add_device_reduce_instance_threadwise(
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
-
 } // namespace ck
-
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_B16_F32_B16_HPP
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_B16_F32_B16_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

-#include "data_type.hpp"
-#include "device_reduce_instance_threadwise.hpp"
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -53,7 +56,4 @@ ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
-
 } // namespace ck
-
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F16_F16_HPP
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F16_F16_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

-#include "data_type.hpp"
-#include "device_reduce_instance_threadwise.hpp"
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -40,7 +43,4 @@ ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
-
 } // namespace ck
-
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F32_F16_HPP
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F32_F16_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

-#include "data_type.hpp"
-#include "device_reduce_instance_threadwise.hpp"
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -28,7 +31,4 @@ ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
-
 } // namespace ck
-
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F32_F32_HPP
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F32_F32_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

-#include "device_reduce_instance_threadwise.hpp"
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -51,7 +55,4 @@ ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
-
 } // namespace ck
-
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F64_F32_HPP
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F64_F32_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

-#include "device_reduce_instance_threadwise.hpp"
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -27,7 +31,4 @@ ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
-
 } // namespace ck
-
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F64_F64_F64_HPP
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_F64_F64_F64_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

-#include "device_reduce_instance_threadwise.hpp"
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -51,7 +55,4 @@ ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
-
 } // namespace ck
-
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I32_I8_HPP
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I32_I8_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

-#include "device_reduce_instance_threadwise.hpp"
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -23,7 +27,4 @@ ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
-
 } // namespace ck
-
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I8_I8_HPP
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I8_I8_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

-#include "device_reduce_instance_threadwise.hpp"
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -39,7 +43,4 @@ ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
-
 } // namespace ck
-
-#endif
--- a/library/include/ck/library/utility/check_err.hpp
+++ b/library/include/ck/library/utility/check_err.hpp
-#ifndef CHECK_ERR_HPP
-#define CHECK_ERR_HPP
-
-#include <algorithm>
-#include <cmath>
-#include <cstdlib>
-#include <half.hpp>
-#include <iostream>
-#include <iomanip>
-#include <iterator>
-#include <limits>
-#include <type_traits>
-#include <vector>
-
-#include "data_type.hpp"
-
-namespace ck {
-namespace utils {
-
-template <typename T>
-typename std::enable_if<std::is_floating_point<T>::value && !std::is_same<T, half_t>::value,
-                        bool>::type
-check_err(const std::vector<T>& out,
-          const std::vector<T>& ref,
-          const std::string& msg = "Error: Incorrect results!",
-          double rtol            = 1e-5,
-          double atol            = 3e-6)
-{
-    if(out.size() != ref.size())
-    {
-        std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
-                  << std::endl
-                  << msg << std::endl;
-        return false;
-    }
-
-    bool res{true};
-    int err_count  = 0;
-    double err     = 0;
-    double max_err = std::numeric_limits<double>::min();
-    for(std::size_t i = 0; i < ref.size(); ++i)
-    {
-        err = std::abs(out[i] - ref[i]);
-        if(err > atol + rtol * std::abs(ref[i]) || !std::isfinite(out[i]) || !std::isfinite(ref[i]))
-        {
-            max_err = err > max_err ? err : max_err;
-            err_count++;
-            if(err_count < 5)
-            {
-                std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
-                          << i << "]: " << out[i] << " != " << ref[i] << std::endl
-                          << msg << std::endl;
-            }
-            res = false;
-        }
-    }
-    if(!res)
-    {
-        std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
-    }
-    return res;
-}
-
-template <typename T>
-typename std::enable_if<std::is_same<T, bhalf_t>::value, bool>::type
-check_err(const std::vector<T>& out,
-          const std::vector<T>& ref,
-          const std::string& msg = "Error: Incorrect results!",
-          double rtol            = 1e-3,
-          double atol            = 1e-3)
-{
-    if(out.size() != ref.size())
-    {
-        std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
-                  << std::endl
-                  << msg << std::endl;
-        return false;
-    }
-
-    bool res{true};
-    int err_count = 0;
-    double err    = 0;
-    // TODO: This is a hack. We should have proper specialization for bhalf_t data type.
-    double max_err = std::numeric_limits<float>::min();
-    for(std::size_t i = 0; i < ref.size(); ++i)
-    {
-        double o = type_convert<float>(out[i]);
-        double r = type_convert<float>(ref[i]);
-        err      = std::abs(o - r);
-        if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
-        {
-            max_err = err > max_err ? err : max_err;
-            err_count++;
-            if(err_count < 5)
-            {
-                std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
-                          << i << "]: " << o << " != " << r << std::endl
-                          << msg << std::endl;
-            }
-            res = false;
-        }
-    }
-    if(!res)
-    {
-        std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
-    }
-    return res;
-}
-
-template <typename T>
-typename std::enable_if<std::is_same<T, half_t>::value || std::is_same<T, half_float::half>::value,
-                        bool>::type
-check_err(const std::vector<T>& out,
-          const std::vector<T>& ref,
-          const std::string& msg = "Error: Incorrect results!",
-          double rtol            = 1e-3,
-          double atol            = 1e-3)
-{
-    if(out.size() != ref.size())
-    {
-        std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
-                  << std::endl
-                  << msg << std::endl;
-        return false;
-    }
-
-    bool res{true};
-    int err_count  = 0;
-    double err     = 0;
-    double max_err = std::numeric_limits<T>::min();
-    for(std::size_t i = 0; i < ref.size(); ++i)
-    {
-        double o = type_convert<float>(out[i]);
-        double r = type_convert<float>(ref[i]);
-        err      = std::abs(o - r);
-        if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
-        {
-            max_err = err > max_err ? err : max_err;
-            err_count++;
-            if(err_count < 5)
-            {
-                std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
-                          << i << "]: " << o << " != " << r << std::endl
-                          << msg << std::endl;
-            }
-            res = false;
-        }
-    }
-    if(!res)
-    {
-        std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
-    }
-    return res;
-}
-
-template <typename T>
-typename std::enable_if<std::is_integral<T>::value && !std::is_same<T, bhalf_t>::value, bool>::type
-check_err(const std::vector<T>& out,
-          const std::vector<T>& ref,
-          const std::string& msg = "Error: Incorrect results!",
-          double                 = 0,
-          double                 = 0)
-{
-    if(out.size() != ref.size())
-    {
-        std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
-                  << std::endl
-                  << msg << std::endl;
-        return false;
-    }
-
-    for(std::size_t i = 0; i < ref.size(); ++i)
-    {
-        if(out[i] != ref[i])
-        {
-            std::cout << "out[" << i << "] != ref[" << i << "]: " << static_cast<int>(out[i])
-                      << " != " << static_cast<int>(ref[i]) << std::endl
-                      << msg << std::endl;
-            return false;
-        }
-    }
-    return true;
-}
-
-} // namespace utils
-} // namespace ck
-
-template <typename T>
-std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
-{
-    std::copy(std::begin(v), std::end(v), std::ostream_iterator<T>(os, " "));
-    return os;
-}
-
-#endif
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+#include <iostream>
+#include <iomanip>
+#include <iterator>
+#include <limits>
+#include <type_traits>
+#include <vector>
+
+#include "ck/utility/data_type.hpp"
+
+namespace ck {
+namespace utils {
+
+// Element-wise comparison of two floating-point vectors (any floating-point T except half_t).
+//
+// An element fails when |out - ref| > atol + rtol * |ref|, or when either value is not finite.
+// The first four failing elements are printed to std::cout, each followed by `msg`; after the
+// scan the largest error seen among failing elements is printed as "max err".
+//
+// out  : values under test
+// ref  : reference values; must have the same size as `out`
+// msg  : text printed after every reported problem
+// rtol : relative tolerance
+// atol : absolute tolerance
+//
+// Returns true only when the sizes match and no element fails.
+template <typename T>
+typename std::enable_if<std::is_floating_point<T>::value && !std::is_same<T, half_t>::value,
+                        bool>::type
+check_err(const std::vector<T>& out,
+          const std::vector<T>& ref,
+          const std::string& msg = "Error: Incorrect results!",
+          double rtol            = 1e-5,
+          double atol            = 3e-6)
+{
+    // A size mismatch is reported immediately; no element is compared.
+    if(out.size() != ref.size())
+    {
+        std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
+                  << std::endl
+                  << msg << std::endl;
+        return false;
+    }
+
+    bool res{true};
+    int err_count = 0;
+    double err    = 0;
+    // Errors are absolute values, so 0 is the neutral seed for the running maximum.
+    // (numeric_limits<double>::min() is the smallest positive normal value, not a lower bound.)
+    double max_err = 0;
+    for(std::size_t i = 0; i < ref.size(); ++i)
+    {
+        err = std::abs(out[i] - ref[i]);
+        if(err > atol + rtol * std::abs(ref[i]) || !std::isfinite(out[i]) || !std::isfinite(ref[i]))
+        {
+            // A NaN error compares false here and therefore never replaces the maximum.
+            max_err = err > max_err ? err : max_err;
+            err_count++;
+            // Only the first four failures are printed to keep the log short.
+            if(err_count < 5)
+            {
+                std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
+                          << i << "]: " << out[i] << " != " << ref[i] << std::endl
+                          << msg << std::endl;
+            }
+            res = false;
+        }
+    }
+    if(!res)
+    {
+        std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
+    }
+    return res;
+}
+
+// Element-wise comparison of two bhalf_t vectors.
+//
+// Every element is converted with type_convert<float> and compared in double precision.
+// An element fails when |o - r| > atol + rtol * |r|, or when either converted value is not
+// finite. The first four failing elements are printed to std::cout, each followed by `msg`;
+// after the scan the largest error seen among failing elements is printed as "max err".
+//
+// out  : values under test
+// ref  : reference values; must have the same size as `out`
+// msg  : text printed after every reported problem
+// rtol : relative tolerance
+// atol : absolute tolerance
+//
+// Returns true only when the sizes match and no element fails.
+template <typename T>
+typename std::enable_if<std::is_same<T, bhalf_t>::value, bool>::type
+check_err(const std::vector<T>& out,
+          const std::vector<T>& ref,
+          const std::string& msg = "Error: Incorrect results!",
+          double rtol            = 1e-3,
+          double atol            = 1e-3)
+{
+    // A size mismatch is reported immediately; no element is compared.
+    if(out.size() != ref.size())
+    {
+        std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
+                  << std::endl
+                  << msg << std::endl;
+        return false;
+    }
+
+    bool res{true};
+    int err_count = 0;
+    double err    = 0;
+    // Errors are absolute values, so 0 is the neutral seed for the running maximum; this
+    // needs no numeric_limits specialization for bhalf_t.
+    double max_err = 0;
+    for(std::size_t i = 0; i < ref.size(); ++i)
+    {
+        double o = type_convert<float>(out[i]);
+        double r = type_convert<float>(ref[i]);
+        err      = std::abs(o - r);
+        if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
+        {
+            // A NaN error compares false here and therefore never replaces the maximum.
+            max_err = err > max_err ? err : max_err;
+            err_count++;
+            // Only the first four failures are printed to keep the log short.
+            if(err_count < 5)
+            {
+                std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
+                          << i << "]: " << o << " != " << r << std::endl
+                          << msg << std::endl;
+            }
+            res = false;
+        }
+    }
+    if(!res)
+    {
+        std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
+    }
+    return res;
+}
+
+// Element-wise comparison of two half_t vectors.
+//
+// Every element is converted with type_convert<float> and compared in double precision.
+// An element fails when |o - r| > atol + rtol * |r|, or when either converted value is not
+// finite. The first four failing elements are printed to std::cout, each followed by `msg`;
+// after the scan the largest error seen among failing elements is printed as "max err".
+//
+// out  : values under test
+// ref  : reference values; must have the same size as `out`
+// msg  : text printed after every reported problem
+// rtol : relative tolerance
+// atol : absolute tolerance
+//
+// Returns true only when the sizes match and no element fails.
+template <typename T>
+typename std::enable_if<std::is_same<T, half_t>::value, bool>::type
+check_err(const std::vector<T>& out,
+          const std::vector<T>& ref,
+          const std::string& msg = "Error: Incorrect results!",
+          double rtol            = 1e-3,
+          double atol            = 1e-3)
+{
+    // A size mismatch is reported immediately; no element is compared.
+    if(out.size() != ref.size())
+    {
+        std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
+                  << std::endl
+                  << msg << std::endl;
+        return false;
+    }
+
+    bool res{true};
+    int err_count = 0;
+    double err    = 0;
+    // Errors are absolute values, so 0 is the neutral seed for the running maximum; this
+    // does not depend on std::numeric_limits being specialized for half_t.
+    double max_err = 0;
+    for(std::size_t i = 0; i < ref.size(); ++i)
+    {
+        double o = type_convert<float>(out[i]);
+        double r = type_convert<float>(ref[i]);
+        err      = std::abs(o - r);
+        if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
+        {
+            // A NaN error compares false here and therefore never replaces the maximum.
+            max_err = err > max_err ? err : max_err;
+            err_count++;
+            // Only the first four failures are printed to keep the log short.
+            if(err_count < 5)
+            {
+                std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
+                          << i << "]: " << o << " != " << r << std::endl
+                          << msg << std::endl;
+            }
+            res = false;
+        }
+    }
+    if(!res)
+    {
+        std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
+    }
+    return res;
+}
+
+// Element-wise exact comparison of two integral vectors (any integral T except bhalf_t).
+//
+// Each pair is widened to int64_t and fails when the two values differ. The first four
+// failing elements are printed to std::cout, each followed by `msg`; after the scan the
+// largest absolute difference is printed as "max err". The two unnamed double parameters
+// are unused and only keep the signature parallel to the floating-point overloads.
+//
+// out : values under test
+// ref : reference values; must have the same size as `out`
+// msg : text printed after every reported problem
+//
+// Returns true only when the sizes match and every element is equal.
+template <typename T>
+typename std::enable_if<std::is_integral<T>::value && !std::is_same<T, bhalf_t>::value, bool>::type
+check_err(const std::vector<T>& out,
+          const std::vector<T>& ref,
+          const std::string& msg = "Error: Incorrect results!",
+          double                 = 0,
+          double                 = 0)
+{
+    // A size mismatch is reported immediately; no element is compared.
+    if(out.size() != ref.size())
+    {
+        std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
+                  << std::endl
+                  << msg << std::endl;
+        return false;
+    }
+
+    bool res{true};
+    int err_count = 0;
+    int64_t err   = 0;
+    // Differences are absolute values, so 0 is the neutral seed for the running maximum.
+    int64_t max_err = 0;
+    for(std::size_t i = 0; i < ref.size(); ++i)
+    {
+        // NOTE(review): values of a 64-bit unsigned T above INT64_MAX do not fit in int64_t.
+        int64_t o = out[i];
+        int64_t r = ref[i];
+        err       = std::abs(o - r);
+
+        if(err > 0)
+        {
+            max_err = err > max_err ? err : max_err;
+            err_count++;
+            // Only the first four failures are printed to keep the log short.
+            if(err_count < 5)
+            {
+                // Print the widened copies: they show character-sized types as numbers and,
+                // unlike a cast to int, do not truncate element types wider than int.
+                std::cout << "out[" << i << "] != ref[" << i << "]: " << o << " != " << r
+                          << std::endl
+                          << msg << std::endl;
+            }
+            res = false;
+        }
+    }
+    if(!res)
+    {
+        std::cout << "max err: " << max_err << std::endl;
+    }
+    return res;
+}
+
+} // namespace utils
+} // namespace ck
+
+// Stream insertion for std::vector<T>: writes every element of `v` to `os` through a
+// std::ostream_iterator, emitting a single space after each element (the last one
+// included), and returns `os` so insertions can be chained.
+template <typename T>
+std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
+{
+    std::copy(std::begin(v), std::end(v), std::ostream_iterator<T>(os, " "));
+    return os;
+}
--- a/library/include/ck/library/utility/conv_util.hpp
+++ b/library/include/ck/library/utility/conv_util.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
 #pragma once

 #include <cstdlib>
@@ -9,17 +12,17 @@
 #include <type_traits>
 #include <vector>

-#include "check_err.hpp"
-#include "config.hpp"
-#include "device.hpp"
-#include "device_conv_fwd.hpp"
-#include "device_tensor.hpp"
-#include "element_wise_operation.hpp"
-#include "fill.hpp"
-#include "host_tensor.hpp"
-#include "op_instance_engine.hpp"
-#include "reference_conv_fwd.hpp"
-#include "tensor_layout.hpp"
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/fill.hpp"
+#include "ck/library/utility/op_instance_engine.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -402,8 +405,8 @@ template <typename InDataType,
          typename InElementwiseOp  = ck::tensor_operation::element_wise::PassThrough,
          typename WeiElementwiseOp = ck::tensor_operation::element_wise::PassThrough,
          typename OutElementwiseOp = ck::tensor_operation::element_wise::PassThrough,
-          typename InputInitFun     = FillUniform<InDataType>,
-          typename WeightsInitFun   = FillUniform<WeiDataType>>
+          typename InputInitFun     = FillUniformDistribution<InDataType>,
+          typename WeightsInitFun   = FillUniformDistribution<WeiDataType>>
 class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType, WeiDataType>
 {
    using DeviceConvFwdOp = tensor_operation::device::
@@ -422,8 +425,8 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType,

    ConvFwdOpInstance(const ConvParams& params,
                      bool do_init                         = true,
-                      const InputInitFun& input_init_f     = InputInitFun{},
-                      const WeightsInitFun& weights_init_f = WeightsInitFun{})
+                      const InputInitFun& input_init_f     = InputInitFun(),
+                      const WeightsInitFun& weights_init_f = WeightsInitFun())
        : BaseType(),
          params_{params},
          output_spatial_lengths_{params.GetOutputSpatialLengths()},
@@ -560,8 +563,8 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType,
    const ConvParams& params_;
    const std::vector<ck::index_t> output_spatial_lengths_;
    const bool do_init_;
-    const InputInitFun& input_init_f_;
-    const WeightsInitFun& weights_init_f_;
+    InputInitFun input_init_f_;
+    WeightsInitFun weights_init_f_;
 };

 } // namespace conv

--- a/library/include/ck/library/utility/fill.hpp
+++ b/library/include/ck/library/utility/fill.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
 #pragma once

 #include <algorithm>
+#include <cmath>
 #include <random>

-#include "data_type.hpp"
+#include "ck/utility/data_type.hpp"

 namespace ck {
 namespace utils {

-// template <typename T, class Enable = void>
-// struct FillUniform;
+template <typename T>
+struct FillUniformDistribution
+{
+    float a_{-5.f};
+    float b_{5.f};

-// TODO: what's wrong with this specialization???
-// err: segmentation fault in mt19937 - infinite loop like.
-// template <typename T>
-// struct FillUniform<T, typename std::enable_if<std::is_integral<T>::value &&
-//                                               !std::is_same<T, bhalf_t>::value>::type>
-// {
-//     int a_{0};
-//     int b_{5};
-//     // T a_ = T{0};
-//     // T b_ = T{5};
+    template <typename ForwardIter>
+    void operator()(ForwardIter first, ForwardIter last) const
+    {
+        std::mt19937 gen(11939);
+        std::uniform_real_distribution<float> dis(a_, b_);
+        std::generate(first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
+    }
+};

-//     template <typename ForwardIter>
-//     void operator()(ForwardIter first, ForwardIter last) const
-//     {
-//         std::mt19937 gen{11939};
-//         std::uniform_int_distribution<int> dis(a_, b_);
-//         std::generate(first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
-//     }
-// };
+// Normally FillUniformDistributionIntegerValue should use std::uniform_int_distribution as below.
+// However this produces segfaults in std::mt19937 which look like an infinite loop.
+//      template <typename T>
+//      struct FillUniformDistributionIntegerValue
+//      {
+//          int a_{-5};
+//          int b_{5};
+//
+//          template <typename ForwardIter>
+//          void operator()(ForwardIter first, ForwardIter last) const
+//          {
+//              std::mt19937 gen(11939);
+//              std::uniform_int_distribution<int> dis(a_, b_);
+//              std::generate(
+//                  first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
+//          }
+//      };

-// struct FillUniform<T, typename std::enable_if<std::is_floating_point<T>::value ||
-//                                               std::is_same<T, bhalf_t>::value>::type>
+// Workaround for uniform_int_distribution not working as expected. See note above.
 template <typename T>
-struct FillUniform
+struct FillUniformDistributionIntegerValue
 {
-    float a_{0};
-    float b_{5};
+    float a_{-5.f};
+    float b_{5.f};

    template <typename ForwardIter>
    void operator()(ForwardIter first, ForwardIter last) const
    {
-        std::mt19937 gen{11939};
-        std::uniform_real_distribution<> dis(a_, b_);
-        std::generate(first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
+        std::mt19937 gen(11939);
+        std::uniform_real_distribution<float> dis(a_, b_);
+        std::generate(
+            first, last, [&dis, &gen]() { return ck::type_convert<T>(std::round(dis(gen))); });
    }
 };


--- a/library/include/ck/library/utility/op_instance_engine.hpp
+++ b/library/include/ck/library/utility/op_instance_engine.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
 #pragma once

 #include <cstdlib>
+#include <iostream>
 #include <limits>
 #include <memory>
 #include <stdexcept>
@@ -8,9 +12,12 @@
 #include <utility>
 #include <vector>

-#include "check_err.hpp"
-#include "device_base.hpp"
-#include "functional2.hpp"
+#include "ck/utility/functional2.hpp"
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"

 namespace ck {
 namespace utils {
@@ -78,7 +85,8 @@ class OpInstanceRunEngine

    template <typename ReferenceOp = std::function<void()>>
    OpInstanceRunEngine(const OpInstanceT& op_instance,
-                        const ReferenceOp& reference_op = ReferenceOp{})
+                        const ReferenceOp& reference_op = ReferenceOp{},
+                        bool do_verification            = true)
        : op_instance_{op_instance}
    {
        in_tensors_ = op_instance_.GetInputTensors();
@@ -88,8 +96,11 @@ class OpInstanceRunEngine
                                         const Tensor<InArgTypes>&...,
                                         Tensor<OutDataType>&>)
        {
-            ref_output_ = op_instance_.GetOutputTensor();
-            CallRefOpUnpackArgs(reference_op, std::make_index_sequence<kNInArgs_>{});
+            if(do_verification)
+            {
+                ref_output_ = op_instance_.GetOutputTensor();
+                CallRefOpUnpackArgs(reference_op, std::make_index_sequence<kNInArgs_>{});
+            }
        }
        AllocateDeviceInputTensors(std::make_index_sequence<kNInArgs_>{});
        out_device_buffer_ =
@@ -110,6 +121,7 @@ class OpInstanceRunEngine
                op_ptr.get(), in_device_buffers_, out_device_buffer_);
            if(op_ptr->IsSupportedArgument(argument.get()))
            {
+                std::cout << "Testing instance: " << op_ptr->GetTypeString() << std::endl;
                invoker->Run(argument.get());
                out_device_buffer_->FromDevice(out_tensor_->mData.data());
                if(!ref_output_)
@@ -119,9 +131,16 @@ class OpInstanceRunEngine
                        " You have to provide reference function.");
                }
                // TODO: enable flexible use of custom check_error functions
-                res = res && check_err(out_tensor_->mData, ref_output_->mData);
+                bool inst_res = CheckErr(out_tensor_->mData, ref_output_->mData);
+                std::cout << (inst_res ? "SUCCESS" : "FAILURE") << std::endl;
+                res = res && inst_res;
                out_device_buffer_->SetZero();
            }
+            else
+            {
+                std::cout << "Given conv problem is not supported by instance: \n\t>>>>"
+                          << op_ptr->GetTypeString() << std::endl;
+            }
        }
        return res;
    }
@@ -132,7 +151,6 @@ class OpInstanceRunEngine
                              bool do_verification = false,
                              bool do_log          = false)
    {
-        bool res{true};
        ProfileBestConfig best_config;

        for(auto& op_ptr : op_ptrs)
@@ -153,7 +171,7 @@ class OpInstanceRunEngine
                std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec
                          << " GB/s, " << op_name << std::endl;

-                if(tflops < best_config.best_tflops)
+                if(avg_time < best_config.best_avg_time)
                {
                    best_config.best_op_name    = op_name;
                    best_config.best_tflops     = tflops;
@@ -171,7 +189,7 @@ class OpInstanceRunEngine
                            " You have to provide reference function.");
                    }
                    // TODO: enable flexible use of custom check_error functions
-                    res = res && CheckErr(out_tensor_->mData, ref_output_->mData);
+                    CheckErr(out_tensor_->mData, ref_output_->mData);

                    if(do_log) {}
                }
@@ -223,7 +241,7 @@ class OpInstanceRunEngine
    template <typename T>
    bool CheckErr(const std::vector<T>& dev_out, const std::vector<T>& ref_out) const
    {
-        return ck::utils::check_err(dev_out, ref_out, "Error: incorrect results!", atol_, rtol_);
+        return ck::utils::check_err(dev_out, ref_out, "Error: incorrect results!", rtol_, atol_);
    }
 };