Commit cba8f7f2 authored by Anthony Chang
Browse files

Merge remote-tracking branch 'upstream/develop' into gemm-layernorm-4

parents cc50b687 b653c5eb
#ifndef DEVICE_REDUCE_INSTANCE_IMPL_COMMON_HPP // SPDX-License-Identifier: MIT
#define DEVICE_REDUCE_INSTANCE_IMPL_COMMON_HPP // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -35,7 +37,4 @@ struct ReductionConfiguration_2 ...@@ -35,7 +37,4 @@ struct ReductionConfiguration_2
} // namespace device_reduce_instance } // namespace device_reduce_instance
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_HPP // SPDX-License-Identifier: MIT
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_HPP // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "reduction_operator_mapping.hpp" #pragma once
#include "device_reduce_instance_impl_common.hpp"
#include "device_reduce_multiblock.hpp" #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -193,7 +196,4 @@ void add_device_reduce_instance_multiblock_atomic_add( ...@@ -193,7 +196,4 @@ void add_device_reduce_instance_multiblock_atomic_add(
} // namespace device_reduce_instance } // namespace device_reduce_instance
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_B16_F32_F32_HPP // SPDX-License-Identifier: MIT
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_B16_F32_F32_HPP // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "data_type.hpp" #pragma once
#include "device_reduce_instance_multiblock_atomic_add.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -24,7 +27,4 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 5, 0, 0, 2, 1); ...@@ -24,7 +27,4 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 5, 0, 0, 2, 1);
} // namespace device_reduce_instance } // namespace device_reduce_instance
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F16_F32_F32_HPP // SPDX-License-Identifier: MIT
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F16_F32_F32_HPP // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "data_type.hpp" #pragma once
#include "device_reduce_instance_multiblock_atomic_add.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -24,7 +27,4 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 5, 0, 0, 2, 1); ...@@ -24,7 +27,4 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 5, 0, 0, 2, 1);
} // namespace device_reduce_instance } // namespace device_reduce_instance
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F32_F32_HPP // SPDX-License-Identifier: MIT
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F32_F32_HPP // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "device_reduce_instance_multiblock_atomic_add.hpp" #pragma once
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -23,7 +27,4 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1); ...@@ -23,7 +27,4 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1);
} // namespace device_reduce_instance } // namespace device_reduce_instance
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F64_F32_HPP // SPDX-License-Identifier: MIT
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F64_F32_HPP // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "device_reduce_instance_multiblock_atomic_add.hpp" #pragma once
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -23,7 +27,4 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 5, 0, 0, 2, 1); ...@@ -23,7 +27,4 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 5, 0, 0, 2, 1);
} // namespace device_reduce_instance } // namespace device_reduce_instance
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F64_F64_F64_HPP // SPDX-License-Identifier: MIT
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F64_F64_F64_HPP // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "device_reduce_instance_multiblock_atomic_add.hpp" #pragma once
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -23,7 +27,4 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1); ...@@ -23,7 +27,4 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1);
} // namespace device_reduce_instance } // namespace device_reduce_instance
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_HPP // SPDX-License-Identifier: MIT
#define DEVICE_REDUCE_INSTANCE_THREADWISE_HPP // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "reduction_operator_mapping.hpp" #pragma once
#include "device_reduce_instance_impl_common.hpp"
#include "device_reduce_threadwise.hpp" #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -152,7 +154,4 @@ void add_device_reduce_instance_threadwise( ...@@ -152,7 +154,4 @@ void add_device_reduce_instance_threadwise(
} // namespace device_reduce_instance } // namespace device_reduce_instance
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_B16_F32_B16_HPP // SPDX-License-Identifier: MIT
#define DEVICE_REDUCE_INSTANCE_THREADWISE_B16_F32_B16_HPP // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "data_type.hpp" #pragma once
#include "device_reduce_instance_threadwise.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -53,7 +56,4 @@ ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1); ...@@ -53,7 +56,4 @@ ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1);
} // namespace device_reduce_instance } // namespace device_reduce_instance
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F16_F16_HPP // SPDX-License-Identifier: MIT
#define DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F16_F16_HPP // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "data_type.hpp" #pragma once
#include "device_reduce_instance_threadwise.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -40,7 +43,4 @@ ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); ...@@ -40,7 +43,4 @@ ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
} // namespace device_reduce_instance } // namespace device_reduce_instance
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F32_F16_HPP // SPDX-License-Identifier: MIT
#define DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F32_F16_HPP // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "data_type.hpp" #pragma once
#include "device_reduce_instance_threadwise.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -28,7 +31,4 @@ ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1); ...@@ -28,7 +31,4 @@ ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1);
} // namespace device_reduce_instance } // namespace device_reduce_instance
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F32_F32_HPP // SPDX-License-Identifier: MIT
#define DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F32_F32_HPP // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "device_reduce_instance_threadwise.hpp" #pragma once
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -51,7 +55,4 @@ ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1); ...@@ -51,7 +55,4 @@ ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1);
} // namespace device_reduce_instance } // namespace device_reduce_instance
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F64_F32_HPP // SPDX-License-Identifier: MIT
#define DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F64_F32_HPP // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "device_reduce_instance_threadwise.hpp" #pragma once
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -27,7 +31,4 @@ ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 2, 1); ...@@ -27,7 +31,4 @@ ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 2, 1);
} // namespace device_reduce_instance } // namespace device_reduce_instance
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F64_F64_F64_HPP // SPDX-License-Identifier: MIT
#define DEVICE_REDUCE_INSTANCE_THREADWISE_F64_F64_F64_HPP // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "device_reduce_instance_threadwise.hpp" #pragma once
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -51,7 +55,4 @@ ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1); ...@@ -51,7 +55,4 @@ ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1);
} // namespace device_reduce_instance } // namespace device_reduce_instance
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I32_I8_HPP // SPDX-License-Identifier: MIT
#define DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I32_I8_HPP // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "device_reduce_instance_threadwise.hpp" #pragma once
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -23,7 +27,4 @@ ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1); ...@@ -23,7 +27,4 @@ ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1);
} // namespace device_reduce_instance } // namespace device_reduce_instance
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I8_I8_HPP // SPDX-License-Identifier: MIT
#define DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I8_I8_HPP // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "device_reduce_instance_threadwise.hpp" #pragma once
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -39,7 +43,4 @@ ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1); ...@@ -39,7 +43,4 @@ ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1);
} // namespace device_reduce_instance } // namespace device_reduce_instance
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef CHECK_ERR_HPP // SPDX-License-Identifier: MIT
#define CHECK_ERR_HPP // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <algorithm> #pragma once
#include <cmath>
#include <cstdlib> #include <algorithm>
#include <half.hpp> #include <cmath>
#include <iostream> #include <cstdlib>
#include <iomanip> #include <iostream>
#include <iterator> #include <iomanip>
#include <limits> #include <iterator>
#include <type_traits> #include <limits>
#include <vector> #include <type_traits>
#include <vector>
#include "data_type.hpp"
#include "ck/utility/data_type.hpp"
namespace ck {
namespace utils { namespace ck {
namespace utils {
template <typename T>
typename std::enable_if<std::is_floating_point<T>::value && !std::is_same<T, half_t>::value, template <typename T>
bool>::type typename std::enable_if<std::is_floating_point<T>::value && !std::is_same<T, half_t>::value,
check_err(const std::vector<T>& out, bool>::type
const std::vector<T>& ref, check_err(const std::vector<T>& out,
const std::string& msg = "Error: Incorrect results!", const std::vector<T>& ref,
double rtol = 1e-5, const std::string& msg = "Error: Incorrect results!",
double atol = 3e-6) double rtol = 1e-5,
{ double atol = 3e-6)
if(out.size() != ref.size()) {
{ if(out.size() != ref.size())
std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size() {
<< std::endl std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
<< msg << std::endl; << std::endl
return false; << msg << std::endl;
} return false;
}
bool res{true};
int err_count = 0; bool res{true};
double err = 0; int err_count = 0;
double max_err = std::numeric_limits<double>::min(); double err = 0;
for(std::size_t i = 0; i < ref.size(); ++i) double max_err = std::numeric_limits<double>::min();
{ for(std::size_t i = 0; i < ref.size(); ++i)
err = std::abs(out[i] - ref[i]); {
if(err > atol + rtol * std::abs(ref[i]) || !std::isfinite(out[i]) || !std::isfinite(ref[i])) err = std::abs(out[i] - ref[i]);
{ if(err > atol + rtol * std::abs(ref[i]) || !std::isfinite(out[i]) || !std::isfinite(ref[i]))
max_err = err > max_err ? err : max_err; {
err_count++; max_err = err > max_err ? err : max_err;
if(err_count < 5) err_count++;
{ if(err_count < 5)
std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref[" {
<< i << "]: " << out[i] << " != " << ref[i] << std::endl std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
<< msg << std::endl; << i << "]: " << out[i] << " != " << ref[i] << std::endl
} << msg << std::endl;
res = false; }
} res = false;
} }
if(!res) }
{ if(!res)
std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl; {
} std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
return res; }
} return res;
}
template <typename T>
typename std::enable_if<std::is_same<T, bhalf_t>::value, bool>::type template <typename T>
check_err(const std::vector<T>& out, typename std::enable_if<std::is_same<T, bhalf_t>::value, bool>::type
const std::vector<T>& ref, check_err(const std::vector<T>& out,
const std::string& msg = "Error: Incorrect results!", const std::vector<T>& ref,
double rtol = 1e-3, const std::string& msg = "Error: Incorrect results!",
double atol = 1e-3) double rtol = 1e-3,
{ double atol = 1e-3)
if(out.size() != ref.size()) {
{ if(out.size() != ref.size())
std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size() {
<< std::endl std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
<< msg << std::endl; << std::endl
return false; << msg << std::endl;
} return false;
}
bool res{true};
int err_count = 0; bool res{true};
double err = 0; int err_count = 0;
// TODO: This is a hack. We should have proper specialization for bhalf_t data type. double err = 0;
double max_err = std::numeric_limits<float>::min(); // TODO: This is a hack. We should have proper specialization for bhalf_t data type.
for(std::size_t i = 0; i < ref.size(); ++i) double max_err = std::numeric_limits<float>::min();
{ for(std::size_t i = 0; i < ref.size(); ++i)
double o = type_convert<float>(out[i]); {
double r = type_convert<float>(ref[i]); double o = type_convert<float>(out[i]);
err = std::abs(o - r); double r = type_convert<float>(ref[i]);
if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r)) err = std::abs(o - r);
{ if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
max_err = err > max_err ? err : max_err; {
err_count++; max_err = err > max_err ? err : max_err;
if(err_count < 5) err_count++;
{ if(err_count < 5)
std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref[" {
<< i << "]: " << o << " != " << r << std::endl std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
<< msg << std::endl; << i << "]: " << o << " != " << r << std::endl
} << msg << std::endl;
res = false; }
} res = false;
} }
if(!res) }
{ if(!res)
std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl; {
} std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
return res; }
} return res;
}
template <typename T>
typename std::enable_if<std::is_same<T, half_t>::value || std::is_same<T, half_float::half>::value, template <typename T>
bool>::type typename std::enable_if<std::is_same<T, half_t>::value, bool>::type
check_err(const std::vector<T>& out, check_err(const std::vector<T>& out,
const std::vector<T>& ref, const std::vector<T>& ref,
const std::string& msg = "Error: Incorrect results!", const std::string& msg = "Error: Incorrect results!",
double rtol = 1e-3, double rtol = 1e-3,
double atol = 1e-3) double atol = 1e-3)
{ {
if(out.size() != ref.size()) if(out.size() != ref.size())
{ {
std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size() std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
<< std::endl << std::endl
<< msg << std::endl; << msg << std::endl;
return false; return false;
} }
bool res{true}; bool res{true};
int err_count = 0; int err_count = 0;
double err = 0; double err = 0;
double max_err = std::numeric_limits<T>::min(); double max_err = std::numeric_limits<T>::min();
for(std::size_t i = 0; i < ref.size(); ++i) for(std::size_t i = 0; i < ref.size(); ++i)
{ {
double o = type_convert<float>(out[i]); double o = type_convert<float>(out[i]);
double r = type_convert<float>(ref[i]); double r = type_convert<float>(ref[i]);
err = std::abs(o - r); err = std::abs(o - r);
if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r)) if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
{ {
max_err = err > max_err ? err : max_err; max_err = err > max_err ? err : max_err;
err_count++; err_count++;
if(err_count < 5) if(err_count < 5)
{ {
std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref[" std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
<< i << "]: " << o << " != " << r << std::endl << i << "]: " << o << " != " << r << std::endl
<< msg << std::endl; << msg << std::endl;
} }
res = false; res = false;
} }
} }
if(!res) if(!res)
{ {
std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl; std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
} }
return res; return res;
} }
template <typename T> template <typename T>
typename std::enable_if<std::is_integral<T>::value && !std::is_same<T, bhalf_t>::value, bool>::type typename std::enable_if<std::is_integral<T>::value && !std::is_same<T, bhalf_t>::value, bool>::type
check_err(const std::vector<T>& out, check_err(const std::vector<T>& out,
const std::vector<T>& ref, const std::vector<T>& ref,
const std::string& msg = "Error: Incorrect results!", const std::string& msg = "Error: Incorrect results!",
double = 0, double = 0,
double = 0) double = 0)
{ {
if(out.size() != ref.size()) if(out.size() != ref.size())
{ {
std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size() std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
<< std::endl << std::endl
<< msg << std::endl; << msg << std::endl;
return false; return false;
} }
for(std::size_t i = 0; i < ref.size(); ++i) bool res{true};
{ int err_count = 0;
if(out[i] != ref[i]) int64_t err = 0;
{ int64_t max_err = std::numeric_limits<int64_t>::min();
std::cout << "out[" << i << "] != ref[" << i << "]: " << static_cast<int>(out[i]) for(std::size_t i = 0; i < ref.size(); ++i)
<< " != " << static_cast<int>(ref[i]) << std::endl {
<< msg << std::endl; int64_t o = out[i];
return false; int64_t r = ref[i];
} err = std::abs(o - r);
}
return true; if(err > 0)
} {
max_err = err > max_err ? err : max_err;
} // namespace utils err_count++;
} // namespace ck if(err_count < 5)
{
template <typename T> std::cout << "out[" << i << "] != ref[" << i << "]: " << static_cast<int>(out[i])
std::ostream& operator<<(std::ostream& os, const std::vector<T>& v) << " != " << static_cast<int>(ref[i]) << std::endl
{ << msg << std::endl;
std::copy(std::begin(v), std::end(v), std::ostream_iterator<T>(os, " ")); }
return os; res = false;
} }
}
#endif if(!res)
{
std::cout << "max err: " << max_err << std::endl;
}
return res;
}
} // namespace utils
} // namespace ck
// Stream inserter for std::vector: writes each element of `v` to `os`,
// following every element (including the last) with a single space,
// and returns `os` so that insertions can be chained.
template <typename T>
std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
{
    // The ostream_iterator performs `os << element << " "` on each assignment.
    std::ostream_iterator<T> sink(os, " ");
    std::copy(v.begin(), v.end(), sink);
    return os;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
#include <cstdlib> #include <cstdlib>
...@@ -9,17 +12,17 @@ ...@@ -9,17 +12,17 @@
#include <type_traits> #include <type_traits>
#include <vector> #include <vector>
#include "check_err.hpp" #include "ck/ck.hpp"
#include "config.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "device.hpp" #include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
#include "device_conv_fwd.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "device_tensor.hpp"
#include "element_wise_operation.hpp" #include "ck/library/utility/check_err.hpp"
#include "fill.hpp" #include "ck/library/utility/fill.hpp"
#include "host_tensor.hpp" #include "ck/library/utility/op_instance_engine.hpp"
#include "op_instance_engine.hpp" #include "ck/library/host_tensor/device_memory.hpp"
#include "reference_conv_fwd.hpp" #include "ck/library/host_tensor/host_tensor.hpp"
#include "tensor_layout.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -402,8 +405,8 @@ template <typename InDataType, ...@@ -402,8 +405,8 @@ template <typename InDataType,
typename InElementwiseOp = ck::tensor_operation::element_wise::PassThrough, typename InElementwiseOp = ck::tensor_operation::element_wise::PassThrough,
typename WeiElementwiseOp = ck::tensor_operation::element_wise::PassThrough, typename WeiElementwiseOp = ck::tensor_operation::element_wise::PassThrough,
typename OutElementwiseOp = ck::tensor_operation::element_wise::PassThrough, typename OutElementwiseOp = ck::tensor_operation::element_wise::PassThrough,
typename InputInitFun = FillUniform<InDataType>, typename InputInitFun = FillUniformDistribution<InDataType>,
typename WeightsInitFun = FillUniform<WeiDataType>> typename WeightsInitFun = FillUniformDistribution<WeiDataType>>
class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType, WeiDataType> class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType, WeiDataType>
{ {
using DeviceConvFwdOp = tensor_operation::device:: using DeviceConvFwdOp = tensor_operation::device::
...@@ -422,8 +425,8 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType, ...@@ -422,8 +425,8 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType,
ConvFwdOpInstance(const ConvParams& params, ConvFwdOpInstance(const ConvParams& params,
bool do_init = true, bool do_init = true,
const InputInitFun& input_init_f = InputInitFun{}, const InputInitFun& input_init_f = InputInitFun(),
const WeightsInitFun& weights_init_f = WeightsInitFun{}) const WeightsInitFun& weights_init_f = WeightsInitFun())
: BaseType(), : BaseType(),
params_{params}, params_{params},
output_spatial_lengths_{params.GetOutputSpatialLengths()}, output_spatial_lengths_{params.GetOutputSpatialLengths()},
...@@ -560,8 +563,8 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType, ...@@ -560,8 +563,8 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType,
const ConvParams& params_; const ConvParams& params_;
const std::vector<ck::index_t> output_spatial_lengths_; const std::vector<ck::index_t> output_spatial_lengths_;
const bool do_init_; const bool do_init_;
const InputInitFun& input_init_f_; InputInitFun input_init_f_;
const WeightsInitFun& weights_init_f_; WeightsInitFun weights_init_f_;
}; };
} // namespace conv } // namespace conv
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
#include <algorithm> #include <algorithm>
#include <cmath>
#include <random> #include <random>
#include "data_type.hpp" #include "ck/utility/data_type.hpp"
namespace ck { namespace ck {
namespace utils { namespace utils {
// template <typename T, class Enable = void> template <typename T>
// struct FillUniform; struct FillUniformDistribution
{
float a_{-5.f};
float b_{5.f};
// TODO: what's wrong with this specialization??? template <typename ForwardIter>
// err: segmentation fault in mt19937 - infinite loop like. void operator()(ForwardIter first, ForwardIter last) const
// template <typename T> {
// struct FillUniform<T, typename std::enable_if<std::is_integral<T>::value && std::mt19937 gen(11939);
// !std::is_same<T, bhalf_t>::value>::type> std::uniform_real_distribution<float> dis(a_, b_);
// { std::generate(first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
// int a_{0}; }
// int b_{5}; };
// // T a_ = T{0};
// // T b_ = T{5};
// template <typename ForwardIter> // Normally FillUniformDistributionIntegerValue should use std::uniform_int_distribution as below.
// void operator()(ForwardIter first, ForwardIter last) const // However this produces segfaults in std::mt19937 which look like an infinite loop.
// { // template <typename T>
// std::mt19937 gen{11939}; // struct FillUniformDistributionIntegerValue
// std::uniform_int_distribution<int> dis(a_, b_); // {
// std::generate(first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); }); // int a_{-5};
// } // int b_{5};
// }; //
// template <typename ForwardIter>
// void operator()(ForwardIter first, ForwardIter last) const
// {
// std::mt19937 gen(11939);
// std::uniform_int_distribution<int> dis(a_, b_);
// std::generate(
// first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
// }
// };
// struct FillUniform<T, typename std::enable_if<std::is_floating_point<T>::value || // Workaround for uniform_int_distribution not working as expected. See note above.
// std::is_same<T, bhalf_t>::value>::type>
template <typename T> template <typename T>
struct FillUniform struct FillUniformDistributionIntegerValue
{ {
float a_{0}; float a_{-5.f};
float b_{5}; float b_{5.f};
template <typename ForwardIter> template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const void operator()(ForwardIter first, ForwardIter last) const
{ {
std::mt19937 gen{11939}; std::mt19937 gen(11939);
std::uniform_real_distribution<> dis(a_, b_); std::uniform_real_distribution<float> dis(a_, b_);
std::generate(first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); }); std::generate(
first, last, [&dis, &gen]() { return ck::type_convert<T>(std::round(dis(gen))); });
} }
}; };
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
#include <cstdlib> #include <cstdlib>
#include <iostream>
#include <limits> #include <limits>
#include <memory> #include <memory>
#include <stdexcept> #include <stdexcept>
...@@ -8,9 +12,12 @@ ...@@ -8,9 +12,12 @@
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "check_err.hpp" #include "ck/utility/functional2.hpp"
#include "device_base.hpp" #include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "functional2.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/host_tensor/device_memory.hpp"
#include "ck/library/host_tensor/host_tensor.hpp"
namespace ck { namespace ck {
namespace utils { namespace utils {
...@@ -78,7 +85,8 @@ class OpInstanceRunEngine ...@@ -78,7 +85,8 @@ class OpInstanceRunEngine
template <typename ReferenceOp = std::function<void()>> template <typename ReferenceOp = std::function<void()>>
OpInstanceRunEngine(const OpInstanceT& op_instance, OpInstanceRunEngine(const OpInstanceT& op_instance,
const ReferenceOp& reference_op = ReferenceOp{}) const ReferenceOp& reference_op = ReferenceOp{},
bool do_verification = true)
: op_instance_{op_instance} : op_instance_{op_instance}
{ {
in_tensors_ = op_instance_.GetInputTensors(); in_tensors_ = op_instance_.GetInputTensors();
...@@ -88,8 +96,11 @@ class OpInstanceRunEngine ...@@ -88,8 +96,11 @@ class OpInstanceRunEngine
const Tensor<InArgTypes>&..., const Tensor<InArgTypes>&...,
Tensor<OutDataType>&>) Tensor<OutDataType>&>)
{ {
ref_output_ = op_instance_.GetOutputTensor(); if(do_verification)
CallRefOpUnpackArgs(reference_op, std::make_index_sequence<kNInArgs_>{}); {
ref_output_ = op_instance_.GetOutputTensor();
CallRefOpUnpackArgs(reference_op, std::make_index_sequence<kNInArgs_>{});
}
} }
AllocateDeviceInputTensors(std::make_index_sequence<kNInArgs_>{}); AllocateDeviceInputTensors(std::make_index_sequence<kNInArgs_>{});
out_device_buffer_ = out_device_buffer_ =
...@@ -110,6 +121,7 @@ class OpInstanceRunEngine ...@@ -110,6 +121,7 @@ class OpInstanceRunEngine
op_ptr.get(), in_device_buffers_, out_device_buffer_); op_ptr.get(), in_device_buffers_, out_device_buffer_);
if(op_ptr->IsSupportedArgument(argument.get())) if(op_ptr->IsSupportedArgument(argument.get()))
{ {
std::cout << "Testing instance: " << op_ptr->GetTypeString() << std::endl;
invoker->Run(argument.get()); invoker->Run(argument.get());
out_device_buffer_->FromDevice(out_tensor_->mData.data()); out_device_buffer_->FromDevice(out_tensor_->mData.data());
if(!ref_output_) if(!ref_output_)
...@@ -119,9 +131,16 @@ class OpInstanceRunEngine ...@@ -119,9 +131,16 @@ class OpInstanceRunEngine
" You have to provide reference function."); " You have to provide reference function.");
} }
// TODO: enable flexible use of custom check_error functions // TODO: enable flexible use of custom check_error functions
res = res && check_err(out_tensor_->mData, ref_output_->mData); bool inst_res = CheckErr(out_tensor_->mData, ref_output_->mData);
std::cout << (inst_res ? "SUCCESS" : "FAILURE") << std::endl;
res = res && inst_res;
out_device_buffer_->SetZero(); out_device_buffer_->SetZero();
} }
else
{
std::cout << "Given conv problem is not supported by instance: \n\t>>>>"
<< op_ptr->GetTypeString() << std::endl;
}
} }
return res; return res;
} }
...@@ -132,7 +151,6 @@ class OpInstanceRunEngine ...@@ -132,7 +151,6 @@ class OpInstanceRunEngine
bool do_verification = false, bool do_verification = false,
bool do_log = false) bool do_log = false)
{ {
bool res{true};
ProfileBestConfig best_config; ProfileBestConfig best_config;
for(auto& op_ptr : op_ptrs) for(auto& op_ptr : op_ptrs)
...@@ -153,7 +171,7 @@ class OpInstanceRunEngine ...@@ -153,7 +171,7 @@ class OpInstanceRunEngine
std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec
<< " GB/s, " << op_name << std::endl; << " GB/s, " << op_name << std::endl;
if(tflops < best_config.best_tflops) if(avg_time < best_config.best_avg_time)
{ {
best_config.best_op_name = op_name; best_config.best_op_name = op_name;
best_config.best_tflops = tflops; best_config.best_tflops = tflops;
...@@ -171,7 +189,7 @@ class OpInstanceRunEngine ...@@ -171,7 +189,7 @@ class OpInstanceRunEngine
" You have to provide reference function."); " You have to provide reference function.");
} }
// TODO: enable flexible use of custom check_error functions // TODO: enable flexible use of custom check_error functions
res = res && CheckErr(out_tensor_->mData, ref_output_->mData); CheckErr(out_tensor_->mData, ref_output_->mData);
if(do_log) {} if(do_log) {}
} }
...@@ -223,7 +241,7 @@ class OpInstanceRunEngine ...@@ -223,7 +241,7 @@ class OpInstanceRunEngine
template <typename T> template <typename T>
bool CheckErr(const std::vector<T>& dev_out, const std::vector<T>& ref_out) const bool CheckErr(const std::vector<T>& dev_out, const std::vector<T>& ref_out) const
{ {
return ck::utils::check_err(dev_out, ref_out, "Error: incorrect results!", atol_, rtol_); return ck::utils::check_err(dev_out, ref_out, "Error: incorrect results!", rtol_, atol_);
} }
}; };
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment