Commit dd6a8de4 authored by Jehandad Khan

Merge branch 'develop' into jd/dev_pkg

parents 0aa899aa abf4bdb9
#pragma once

#include "config.hpp"
#include "array.hpp"
#include "container_helper.hpp"
@@ -20,30 +18,29 @@
#include "number.hpp"
#include "sequence.hpp"
#include "sequence_helper.hpp"
#include "tuple.hpp"
#include "tuple_helper.hpp"
#include "type.hpp"
#include "magic_division.hpp"
#include "utility.hpp"
#include "c_style_pointer_cast.hpp"
#include "is_known_at_compile_time.hpp"
#include "transpose_vectors.hpp"
#include "inner_product.hpp"
#include "element_wise_operation.hpp"
#include "debug.hpp"
#include "amd_buffer_addressing.hpp"
#include "get_id.hpp"
#include "synchronization.hpp"
#include "amd_address_space.hpp"
#include "static_buffer.hpp"
#include "dynamic_buffer.hpp"

// TODO: remove this
#if CK_USE_AMD_INLINE_ASM
#include "amd_inline_asm.hpp"
#endif

#ifdef CK_USE_AMD_MFMA
#include "amd_xdlops.hpp"
#endif
#pragma once

#include "statically_indexed_array.hpp"

namespace ck {
@@ -937,7 +935,7 @@ __host__ __device__ Y type_convert(X x)
// convert bfp16 to fp32
template <>
inline __host__ __device__ float type_convert<float, bhalf_t>(bhalf_t x)
{
    union
    {
@@ -950,7 +948,7 @@ inline __host__ __device__ float type_convert(bhalf_t x)
// convert fp32 to bfp16
template <>
inline __host__ __device__ bhalf_t type_convert<bhalf_t, float>(float x)
{
    union
    {
@@ -1090,4 +1088,3 @@ struct NumericLimits<half_t>
};

} // namespace ck
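// Hedged sketch (not part of the diff): the elided union bodies above implement the usual
// bf16 <-> fp32 trick of treating bf16 as the upper 16 bits of an IEEE-754 float. A plain
// host-side illustration of the same idea (uint16_t stands in for bhalf_t here; the
// float -> bf16 direction shown simply truncates, the library version may differ in rounding):
#include <cstdint>
#include <cstring>

inline float example_bf16_to_float(uint16_t x)
{
    const uint32_t bits = static_cast<uint32_t>(x) << 16; // bf16 occupies the high half
    float y;
    std::memcpy(&y, &bits, sizeof(y));
    return y;
}

inline uint16_t example_float_to_bf16(float x)
{
    uint32_t bits;
    std::memcpy(&bits, &x, sizeof(bits));
    return static_cast<uint16_t>(bits >> 16); // drop the lower mantissa bits
}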
@@ -3,7 +3,7 @@
namespace ck {

enum struct DataTypeEnum
{
    Half  = 0,
    Float = 1,
...
@@ -6,35 +6,35 @@
namespace ck {

template <DataTypeEnum DataTypeEnum>
struct get_datatype_from_enum;

template <>
struct get_datatype_from_enum<DataTypeEnum::Int8>
{
    using type = int8_t;
};

template <>
struct get_datatype_from_enum<DataTypeEnum::Int32>
{
    using type = int32_t;
};

template <>
struct get_datatype_from_enum<DataTypeEnum::Half>
{
    using type = half_t;
};

template <>
struct get_datatype_from_enum<DataTypeEnum::Float>
{
    using type = float;
};

template <>
struct get_datatype_from_enum<DataTypeEnum::Double>
{
    using type = double;
};

@@ -45,31 +45,31 @@ struct get_datatype_enum_from_type;
template <>
struct get_datatype_enum_from_type<int8_t>
{
    static constexpr DataTypeEnum value = DataTypeEnum::Int8;
};

template <>
struct get_datatype_enum_from_type<int32_t>
{
    static constexpr DataTypeEnum value = DataTypeEnum::Int32;
};

template <>
struct get_datatype_enum_from_type<half_t>
{
    static constexpr DataTypeEnum value = DataTypeEnum::Half;
};

template <>
struct get_datatype_enum_from_type<float>
{
    static constexpr DataTypeEnum value = DataTypeEnum::Float;
};

template <>
struct get_datatype_enum_from_type<double>
{
    static constexpr DataTypeEnum value = DataTypeEnum::Double;
};

} // namespace ck
...
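// Hedged usage sketch (not part of the diff): get_datatype_from_enum and
// get_datatype_enum_from_type are intended to be inverses of each other, so a compile-time
// round trip along these lines should hold once the defining headers are included:
#include <type_traits>

static_assert(std::is_same<ck::get_datatype_from_enum<ck::DataTypeEnum::Float>::type, float>::value,
              "enum -> type");
static_assert(ck::get_datatype_enum_from_type<float>::value == ck::DataTypeEnum::Float,
              "type -> enum");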
#pragma once

#include "amd_buffer_addressing.hpp"
#include "c_style_pointer_cast.hpp"
#include "config.hpp"
@@ -8,7 +6,7 @@
namespace ck {

template <AddressSpaceEnum BufferAddressSpace,
          typename T,
          typename ElementSpaceSize,
          bool InvalidElementUseNumericalZeroValue>
@@ -34,7 +32,7 @@ struct DynamicBuffer
    {
    }

    __host__ __device__ static constexpr AddressSpaceEnum GetAddressSpace()
    {
        return BufferAddressSpace;
    }
@@ -55,7 +53,7 @@ struct DynamicBuffer
        constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;

        static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
                      "wrong! X should contain multiple T");

#if CK_USE_AMD_BUFFER_LOAD
        bool constexpr use_amd_buffer_addressing = true;
@@ -63,7 +61,7 @@ struct DynamicBuffer
        bool constexpr use_amd_buffer_addressing = false;
#endif

        if constexpr(GetAddressSpace() == AddressSpaceEnum::Global && use_amd_buffer_addressing)
        {
            constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
@@ -81,50 +79,48 @@ struct DynamicBuffer
        }
        else
        {
            if(is_valid_element)
            {
#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
                X tmp;

                __builtin_memcpy(&tmp, &(p_data_[i]), sizeof(X));

                return tmp;
#else
                return *c_style_pointer_cast<const X*>(&p_data_[i]);
#endif
            }
            else
            {
                if constexpr(InvalidElementUseNumericalZeroValue)
                {
                    return X{0};
                }
                else
                {
                    return X{invalid_element_value_};
                }
            }
        }
    }
    template <InMemoryDataOperationEnum Op,
              typename X,
              typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
                                         typename scalar_type<remove_cvref_t<T>>::type>::value,
                                 bool>::type = false>
    __host__ __device__ void Update(index_t i, bool is_valid_element, const X& x)
    {
        if constexpr(Op == InMemoryDataOperationEnum::Set)
        {
            this->template Set<X>(i, is_valid_element, x);
        }
        else if constexpr(Op == InMemoryDataOperationEnum::AtomicAdd)
        {
            this->template AtomicAdd<X>(i, is_valid_element, x);
        }
        else if constexpr(Op == InMemoryDataOperationEnum::Add)
        {
            auto tmp = this->template Get<X>(i, is_valid_element);

            this->template Set<X>(i, is_valid_element, x + tmp);
@@ -145,143 +141,120 @@ struct DynamicBuffer
        constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;

        static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
                      "wrong! X should contain multiple T");
#if CK_USE_AMD_BUFFER_STORE
        bool constexpr use_amd_buffer_addressing = true;
#else
        bool constexpr use_amd_buffer_addressing = false;
#endif

#if CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE
        bool constexpr workaround_int8_ds_write_issue = true;
#else
        bool constexpr workaround_int8_ds_write_issue = false;
#endif

        if constexpr(GetAddressSpace() == AddressSpaceEnum::Global && use_amd_buffer_addressing)
        {
            constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;

            amd_buffer_store<remove_cvref_t<T>, t_per_x>(
                x, p_data_, i, is_valid_element, element_space_size_);
        }
        else if constexpr(GetAddressSpace() == AddressSpaceEnum::Lds &&
                          is_same<typename scalar_type<remove_cvref_t<T>>::type, int8_t>::value &&
                          workaround_int8_ds_write_issue)
        {
            if(is_valid_element)
            {
                // HACK: compiler would lower IR "store<i8, 16> address_space(3)" into inefficient
                // ISA, so I try to let compiler emit IR "store<i32, 4>" which would be lower to
                // ds_write_b128
                // TODO: remove this after compiler fix
                static_assert((is_same<remove_cvref_t<T>, int8_t>::value &&
                               is_same<remove_cvref_t<X>, int8_t>::value) ||
                                  (is_same<remove_cvref_t<T>, int8_t>::value &&
                                   is_same<remove_cvref_t<X>, int8x2_t>::value) ||
                                  (is_same<remove_cvref_t<T>, int8_t>::value &&
                                   is_same<remove_cvref_t<X>, int8x4_t>::value) ||
                                  (is_same<remove_cvref_t<T>, int8_t>::value &&
                                   is_same<remove_cvref_t<X>, int8x8_t>::value) ||
                                  (is_same<remove_cvref_t<T>, int8_t>::value &&
                                   is_same<remove_cvref_t<X>, int8x16_t>::value) ||
                                  (is_same<remove_cvref_t<T>, int8x4_t>::value &&
                                   is_same<remove_cvref_t<X>, int8x4_t>::value) ||
                                  (is_same<remove_cvref_t<T>, int8x8_t>::value &&
                                   is_same<remove_cvref_t<X>, int8x8_t>::value) ||
                                  (is_same<remove_cvref_t<T>, int8x16_t>::value &&
                                   is_same<remove_cvref_t<X>, int8x16_t>::value),
                              "wrong! not implemented for this combination, please add "
                              "implementation");

                if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
                             is_same<remove_cvref_t<X>, int8_t>::value)
                {
                    // HACK: cast pointer of x is bad
                    // TODO: remove this after compiler fix
                    *c_style_pointer_cast<int8_t*>(&p_data_[i]) =
                        *c_style_pointer_cast<const int8_t*>(&x);
                }
                else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
                                  is_same<remove_cvref_t<X>, int8x2_t>::value)
                {
                    // HACK: cast pointer of x is bad
                    // TODO: remove this after compiler fix
                    *c_style_pointer_cast<int16_t*>(&p_data_[i]) =
                        *c_style_pointer_cast<const int16_t*>(&x);
                }
                else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
                                  is_same<remove_cvref_t<X>, int8x4_t>::value)
                {
                    // HACK: cast pointer of x is bad
                    // TODO: remove this after compiler fix
                    *c_style_pointer_cast<int32_t*>(&p_data_[i]) =
                        *c_style_pointer_cast<const int32_t*>(&x);
                }
                else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
                                  is_same<remove_cvref_t<X>, int8x8_t>::value)
                {
                    // HACK: cast pointer of x is bad
                    // TODO: remove this after compiler fix
                    *c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
                        *c_style_pointer_cast<const int32x2_t*>(&x);
                }
                else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
                                  is_same<remove_cvref_t<X>, int8x16_t>::value)
                {
                    // HACK: cast pointer of x is bad
                    // TODO: remove this after compiler fix
                    *c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
                        *c_style_pointer_cast<const int32x4_t*>(&x);
                }
                else if constexpr(is_same<remove_cvref_t<T>, int8x4_t>::value &&
                                  is_same<remove_cvref_t<X>, int8x4_t>::value)
                {
                    // HACK: cast pointer of x is bad
                    // TODO: remove this after compiler fix
                    *c_style_pointer_cast<int32_t*>(&p_data_[i]) =
                        *c_style_pointer_cast<const int32_t*>(&x);
                }
                else if constexpr(is_same<remove_cvref_t<T>, int8x8_t>::value &&
                                  is_same<remove_cvref_t<X>, int8x8_t>::value)
                {
                    // HACK: cast pointer of x is bad
                    // TODO: remove this after compiler fix
                    *c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
                        *c_style_pointer_cast<const int32x2_t*>(&x);
                }
                else if constexpr(is_same<remove_cvref_t<T>, int8x16_t>::value &&
                                  is_same<remove_cvref_t<X>, int8x16_t>::value)
                {
                    // HACK: cast pointer of x is bad
                    // TODO: remove this after compiler fix
                    *c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
                        *c_style_pointer_cast<const int32x4_t*>(&x);
                }
            }
        }
        else
@@ -305,27 +278,49 @@ struct DynamicBuffer
                  bool>::type = false>
    __host__ __device__ void AtomicAdd(index_t i, bool is_valid_element, const X& x)
    {
        using scalar_t = typename scalar_type<remove_cvref_t<T>>::type;

        // X contains multiple T
        constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;

        constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;

        static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
                      "wrong! X should contain multiple T");

        static_assert(GetAddressSpace() == AddressSpaceEnum::Global, "only support global mem");

#if CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT
        bool constexpr use_amd_buffer_addressing =
            is_same_v<remove_cvref_t<scalar_t>, int32_t> ||
            is_same_v<remove_cvref_t<scalar_t>, float> ||
            (is_same_v<remove_cvref_t<scalar_t>, half_t> && scalar_per_x_vector % 2 == 0);
#elif CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && (!CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT)
        bool constexpr use_amd_buffer_addressing = is_same_v<remove_cvref_t<scalar_t>, int32_t>;
#elif(!CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER) && CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT
        bool constexpr use_amd_buffer_addressing =
            is_same_v<remove_cvref_t<scalar_t>, float> ||
            (is_same_v<remove_cvref_t<scalar_t>, half_t> && scalar_per_x_vector % 2 == 0);
#else
        bool constexpr use_amd_buffer_addressing = false;
#endif

        if constexpr(use_amd_buffer_addressing)
        {
            constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;

            amd_buffer_atomic_add<remove_cvref_t<T>, t_per_x>(
                x, p_data_, i, is_valid_element, element_space_size_);
        }
        else
        {
            if(is_valid_element)
            {
                // FIXME: atomicAdd is defined by HIP, need to avoid implicit type casting when
                // calling it
                atomicAdd(c_style_pointer_cast<X*>(&p_data_[i]), x);
            }
        }
    }

    __host__ __device__ static constexpr bool IsStaticBuffer() { return false; }
@@ -333,14 +328,14 @@ struct DynamicBuffer
    __host__ __device__ static constexpr bool IsDynamicBuffer() { return true; }
};
template <AddressSpaceEnum BufferAddressSpace, typename T, typename ElementSpaceSize>
__host__ __device__ constexpr auto make_dynamic_buffer(T* p, ElementSpaceSize element_space_size)
{
    return DynamicBuffer<BufferAddressSpace, T, ElementSpaceSize, true>{p, element_space_size};
}

template <
    AddressSpaceEnum BufferAddressSpace,
    typename T,
    typename ElementSpaceSize,
    typename X,
@@ -353,4 +348,3 @@ make_dynamic_buffer(T* p, ElementSpaceSize element_space_size, X invalid_element
}

} // namespace ck
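// Hedged usage sketch (not part of the diff): a global-memory DynamicBuffer is normally created
// with make_dynamic_buffer and accessed through Get/Set guarded by an is_valid_element
// predicate; invalid lanes read back X{0} (or invalid_element_value_) and their stores are
// dropped. The kernel below is hypothetical and only illustrates the calling pattern.
#include "common_header.hpp"

__global__ void example_copy_kernel(float* p_src, float* p_dst, ck::index_t n)
{
    auto src_buf = ck::make_dynamic_buffer<ck::AddressSpaceEnum::Global>(p_src, n);
    auto dst_buf = ck::make_dynamic_buffer<ck::AddressSpaceEnum::Global>(p_dst, n);

    const ck::index_t i = blockIdx.x * blockDim.x + threadIdx.x;
    const bool is_valid = i < n;

    // X = float here, so one scalar moves per call; wider vector types also work as long as
    // X is a whole multiple of T (enforced by the static_asserts above).
    const float v = src_buf.Get<float>(i, is_valid);
    dst_buf.Set<float>(i, is_valid, v);
}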
#pragma once

#include "config.hpp"

namespace ck {
@@ -16,5 +14,3 @@ __device__ index_t get_block_1d_id() { return blockIdx.x; }
__device__ index_t get_grid_size() { return gridDim.x; }

} // namespace ck
@@ -3,7 +3,7 @@
#include "common_header.hpp"

#if CK_EXPERIMENTAL_USE_DYNAMICALLY_INDEXED_MULTI_INDEX
#include "array_multi_index.hpp"
#else
#include "statically_indexed_array_multi_index.hpp"
...
@@ -28,7 +28,7 @@
namespace ck {

enum struct ReduceTensorOp
{
    ADD = 0,
    MUL = 1,
@@ -41,19 +41,19 @@ enum class ReduceTensorOp_t
    // MUL_NO_ZEROS = 8,
};

enum struct NanPropagation
{
    NOT_PROPAGATE_NAN = 0,
    PROPAGATE_NAN     = 1,
};

enum struct ReduceTensorIndices
{
    NO_INDICES        = 0,
    FLATTENED_INDICES = 1,
};

enum struct IndicesType
{
    INDICES_32BIT = 0,
    INDICES_64BIT = 1,
...
@@ -606,6 +606,12 @@ struct sequence_map_inverse
                                            SeqMap::Size()>::type;
};

template <index_t... Xs, index_t... Ys>
__host__ __device__ constexpr bool operator==(Sequence<Xs...>, Sequence<Ys...>)
{
    return ((Xs == Ys) && ...);
}

template <index_t... Xs, index_t... Ys>
__host__ __device__ constexpr auto operator+(Sequence<Xs...>, Sequence<Ys...>)
{
...
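// Hedged sketch (not part of the diff): the new operator== above fold-expands an element-wise
// comparison over two Sequences of equal length, so checks like these hold:
static_assert(ck::Sequence<0, 1, 2>{} == ck::Sequence<0, 1, 2>{}, "equal sequences compare equal");
static_assert(!(ck::Sequence<0, 1, 2>{} == ck::Sequence<0, 1, 3>{}), "one differing element");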
@@ -6,7 +6,7 @@
namespace ck {

// static buffer for scalar
template <AddressSpaceEnum AddressSpace,
          typename T,
          index_t N,
          bool InvalidElementUseNumericalZeroValue> // TODO remove this bool, no longer needed
@@ -17,10 +17,7 @@ struct StaticBuffer : public StaticallyIndexedArray<T, N>
    __host__ __device__ constexpr StaticBuffer() : base{} {}

    __host__ __device__ static constexpr AddressSpaceEnum GetAddressSpace() { return AddressSpace; }

    __host__ __device__ static constexpr bool IsStaticBuffer() { return true; }
@@ -42,7 +39,7 @@ struct StaticBuffer : public StaticallyIndexedArray<T, N>
};

// static buffer for vector
template <AddressSpaceEnum AddressSpace,
          typename S,
          index_t NumOfVector,
          index_t ScalarPerVector,
@@ -59,10 +56,7 @@ struct StaticBufferTupleOfVector
    __host__ __device__ constexpr StaticBufferTupleOfVector() : base{} {}

    __host__ __device__ static constexpr AddressSpaceEnum GetAddressSpace() { return AddressSpace; }

    __host__ __device__ static constexpr bool IsStaticBuffer() { return true; }
@@ -158,7 +152,7 @@ struct StaticBufferTupleOfVector
    }
};

template <AddressSpaceEnum AddressSpace, typename T, index_t N>
__host__ __device__ constexpr auto make_static_buffer(Number<N>)
{
    return StaticBuffer<AddressSpace, T, N, true>{};
...
@@ -7,7 +7,7 @@ namespace ck {
__device__ void block_sync_lds()
{
#if CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
    asm volatile("\
    s_waitcnt lgkmcnt(0) \n \
    s_barrier \
...
@@ -37,6 +37,10 @@ struct SpaceFillingCurve
    __host__ __device__ static constexpr index_t GetNumOfAccess()
    {
        static_assert(TensorLengths::Size() == ScalarsPerAccess::Size());
        static_assert(TensorLengths{} % ScalarsPerAccess{} ==
                      typename uniform_sequence_gen<TensorLengths::Size(), 0>::type{});

        return reduce_on_sequence(TensorLengths{}, math::multiplies{}, Number<1>{}) /
               ScalarPerVector;
    }
@@ -140,6 +144,15 @@ struct SpaceFillingCurve
        }();

        return idx_md;
    }

    // FIXME: rename this function
    template <index_t AccessIdx1d>
    static __device__ __host__ constexpr auto GetIndexTupleOfNumber(Number<AccessIdx1d>)
    {
        constexpr auto idx = GetIndex(Number<AccessIdx1d>{});

        return generate_tuple([&](auto i) { return Number<idx[i]>{}; }, Number<nDim>{});
    }
};

} // namespace ck
...
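// Hedged worked example (not part of the diff): assuming ScalarPerVector is the product of
// ScalarsPerAccess (e.g. TensorLengths = Sequence<4, 8>, ScalarsPerAccess = Sequence<1, 4>,
// ScalarPerVector = 4), the new static_asserts pass and GetNumOfAccess() = (4 * 8) / 4 = 8,
// i.e. eight vectorized accesses cover the whole tile.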
@@ -75,14 +75,14 @@ calculate_convolution_flops(const InDesc&, const WeiDesc& wei_desc, const OutDes
}

template <typename T>
inline auto activ(T v, const ck::ActivTypeEnum activ_type)
{
    const T alpha = 0.3;

    switch(activ_type)
    {
    case ck::ActivTypeEnum::None: return v;
    case ck::ActivTypeEnum::LeakyRelu: return (v >= 0 ? v : alpha * v);
    case ck::ActivTypeEnum::Sigmoid: return (1 / (1 + exp(-v)));
    default: throw std::runtime_error("unsupported activ type"); break;
    }
}
...
@@ -48,8 +48,10 @@ struct DeviceMem
    DeviceMem() = delete;
    DeviceMem(std::size_t mem_size);
    void* GetDeviceBuffer();
    std::size_t GetBufferSize();
    void ToDevice(const void* p);
    void FromDevice(void* p);
    void SetZero();
    ~DeviceMem();

    void* mpDeviceBuf;
@@ -109,8 +111,6 @@ float launch_and_time_kernel(
    timer.End();

    return timer.GetElapsedTime() / nrepeat;
#else
    std::ignore = nrepeat;
...
#pragma once

#include "host_tensor.hpp"
#include "common_header.hpp"

template <typename TensorDesc>
void ostream_tensor_descriptor(TensorDesc, std::ostream& os = std::cout)
...
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef HOST_GENERIC_REDUCTION_HPP_
#define HOST_GENERIC_REDUCTION_HPP_
#include <vector>
#include <functional>
#include <limits>
#include <type_traits>
#include <cassert>
#include <cmath>
#include "reduction_enums.hpp"
#include "host_reduce_util.hpp"
using float16 = half_float::half;
namespace ck {
namespace host_reduce {
template <typename T>
static void
get_all_indexes(const std::vector<T>& dimLengths, int dim, std::vector<std::vector<T>>& indexes)
{
if(dim < dimLengths.size())
{
std::vector<std::vector<T>> updated_indexes;
if(dim == 0)
{
assert(indexes.size() == 0);
assert(dimLengths[dim] > 0);
for(T i = 0; i < dimLengths[dim]; i++)
{
std::vector<T> index = {i};
updated_indexes.push_back(index);
};
}
else
{
// go through all the current indexes
for(const auto& index : indexes)
for(T i = 0; i < dimLengths[dim]; i++)
{
auto index_new = index;
index_new.push_back(i);
updated_indexes.push_back(index_new);
};
};
// update to the indexes (output)
indexes = updated_indexes;
// further to construct the indexes from the updated status
get_all_indexes(dimLengths, dim + 1, indexes);
};
};
template <typename T>
static T get_offset_from_index(const std::vector<T>& strides, const std::vector<T>& index)
{
T offset = 0;
assert(strides.size() == index.size());
for(int i = 0; i < index.size(); i++)
offset += strides[i] * static_cast<T>(index[i]);
return (offset);
};
template <typename T>
static inline T get_flatten_offset(const std::vector<T>& lengths, const std::vector<T>& index)
{
T offset = 0;
assert(lengths.size() == index.size() && lengths.size() > 0);
int len = lengths.size();
T stride = 1;
// for len==1, the loop is not executed
for(int i = len - 1; i > 0; i--)
{
offset += stride * static_cast<T>(index[i]);
stride *= lengths[i];
};
offset += stride * static_cast<T>(index[0]);
return (offset);
};
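// Hedged worked example (not part of the original file): get_flatten_offset computes the
// row-major (C-order) linear offset of a multi-dimensional index. For lengths = {2, 3, 4}
// and index = {1, 2, 3} it returns 1*12 + 2*4 + 3*1 = 23.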
template <typename InDataType,
typename AccDataType,
typename OutDataType,
ck::ReduceTensorOp_t ReduceOpId,
bool PropagateNan,
bool NeedIndices>
class ReductionHost
{
public:
ReductionHost() = default;
ReductionHost(HostTensorDescriptor& inDesc,
HostTensorDescriptor& outDesc,
const std::vector<int>& invariantDims_,
const std::vector<int>& toReduceDims_)
{
this->inLengths = to_int_vector(inDesc.GetLengths());
this->outLengths = to_int_vector(outDesc.GetLengths());
this->inStrides = to_int_vector(inDesc.GetStrides());
this->outStrides = to_int_vector(outDesc.GetStrides());
this->invariantDims = invariantDims_;
this->toReduceDims = toReduceDims_;
assert(this->inLengths.size() == this->outLengths.size());
assert(!this->toReduceDims.empty());
for(const auto dim : this->invariantDims)
this->invariantLengths.push_back(this->inLengths[dim]);
for(const auto dim : this->toReduceDims)
toReduceLengths.push_back(this->inLengths[dim]);
this->reduceAllDims = this->invariantDims.empty();
};
~ReductionHost(){};
void
Run(float alpha, const InDataType* in_data, float beta, OutDataType* out_data, int* indices)
{
if constexpr(NeedIndices)
RunImpl_with_indices(alpha, in_data, beta, out_data, indices);
else
RunImpl_no_indices(alpha, in_data, beta, out_data);
};
private:
std::vector<int> inLengths;
std::vector<int> outLengths;
std::vector<int> inStrides;
std::vector<int> outStrides;
std::vector<int> invariantLengths;
std::vector<int> toReduceLengths;
std::vector<int> invariantDims;
std::vector<int> toReduceDims;
bool reduceAllDims;
void RunImpl_with_indices(
float alpha, const InDataType* in_data, float beta, OutDataType* out_data, int* indices)
{
using ck::host_reduce::binop_with_nan_check;
using ck::host_reduce::binop_with_nan_check2;
using ck::host_reduce::float_equal_one;
using ck::host_reduce::float_equal_zero;
using ck::host_reduce::PosUnaryOpFn;
using ck::host_reduce::PreUnaryOpFn;
using ck::host_reduce::ReduceOpFn2;
using ck::host_reduce::ReduceOpZeroVal;
auto opReduce = ReduceOpFn2<AccDataType, ReduceOpId>();
int divider = 1;
for(int i = 0; i < toReduceLengths.size(); i++)
divider *= toReduceLengths[i];
auto PreUnaryOp = PreUnaryOpFn<AccDataType, ReduceOpId>(divider);
auto PosUnaryOp = PosUnaryOpFn<AccDataType, ReduceOpId>(divider);
if(reduceAllDims)
{
std::vector<std::vector<int>> indexes_1;
get_all_indexes(inLengths, 0, indexes_1); // generate the input indexes space
auto accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
int accuIndex = 0;
// go through indexes of the invariant dimensions
for(const auto& src_index : indexes_1)
{
auto src_offset = get_offset_from_index(this->inStrides, src_index);
auto currVal = static_cast<AccDataType>(in_data[src_offset]);
// unary operation before reducing, needed by AMAX. For MIN/MAX, nothing is actually
// done
PreUnaryOp(currVal);
auto currIndex = get_flatten_offset(inLengths, src_index);
binop_with_nan_check2<AccDataType, PropagateNan>(
opReduce, accuVal, currVal, accuIndex, currIndex);
};
// scale the accumulated value
if(!float_equal_one(alpha))
accuVal *= static_cast<AccDataType>(alpha);
// scale the prior dst value and add it to the accumulated value
if(!float_equal_zero(beta))
accuVal += static_cast<AccDataType>(out_data[0]) * static_cast<AccDataType>(beta);
// store the reduced value to dst location
out_data[0] = static_cast<OutDataType>(accuVal);
indices[0] = accuIndex;
}
else
{
std::vector<std::vector<int>> indexes_1, indexes_2;
get_all_indexes(
this->invariantLengths, 0, indexes_1); // generate the invariant indexes space
get_all_indexes(
this->toReduceLengths, 0, indexes_2); // generate the toReduce indexes space
// go through indexes of the invariant dimensions
for(const auto& index_1 : indexes_1)
{
std::vector<int> src_index;
std::vector<int> dst_index;
src_index.resize(this->inLengths.size());
// generate the part of src index belonging to invariant dims
for(int k = 0; k < invariantDims.size(); k++)
src_index[invariantDims[k]] = index_1[k];
for(int k = 0; k < invariantDims.size(); k++)
dst_index.push_back(index_1[k]);
int dst_offset = get_offset_from_index(this->outStrides, dst_index);
AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
int accuIndex = 0;
// go through indexes of the toReduce dimensions
for(const auto& index_2 : indexes_2)
{
// generate the part of src index belonging to toReduce dims
for(int k = 0; k < toReduceDims.size(); k++)
src_index[toReduceDims[k]] = index_2[k];
auto src_offset = get_offset_from_index(this->inStrides, src_index);
auto currVal = static_cast<AccDataType>(in_data[src_offset]);
// unary operation before reducing, needed by AMAX. For MIN/MAX, nothing is
// actually done
PreUnaryOp(currVal);
auto currIndex = get_flatten_offset(toReduceLengths, index_2);
binop_with_nan_check2<AccDataType, PropagateNan>(
opReduce, accuVal, currVal, accuIndex, currIndex);
};
// scale the accumulated value
if(!float_equal_one(alpha))
accuVal *= static_cast<AccDataType>(alpha);
// scale the prior dst value and add it to the accumulated value
if(!float_equal_zero(beta))
accuVal += static_cast<AccDataType>(out_data[dst_offset]) *
static_cast<AccDataType>(beta);
// store the reduced value to dst location
out_data[dst_offset] = static_cast<OutDataType>(accuVal);
indices[dst_offset] = accuIndex;
};
};
}; // end of RunImpl_with_indices()
void
RunImpl_no_indices(float alpha, const InDataType* in_data, float beta, OutDataType* out_data)
{
using ck::host_reduce::binop_with_nan_check;
using ck::host_reduce::binop_with_nan_check2;
using ck::host_reduce::float_equal_one;
using ck::host_reduce::float_equal_zero;
using ck::host_reduce::PosUnaryOpFn;
using ck::host_reduce::PreUnaryOpFn;
using ck::host_reduce::ReduceOpFn;
using ck::host_reduce::ReduceOpZeroVal;
auto opReduce = ReduceOpFn<AccDataType, ReduceOpId>();
int divider = 1;
for(int i = 0; i < toReduceLengths.size(); i++)
divider *= toReduceLengths[i];
auto PreUnaryOp = PreUnaryOpFn<AccDataType, ReduceOpId>(divider);
auto PosUnaryOp = PosUnaryOpFn<AccDataType, ReduceOpId>(divider);
if(reduceAllDims)
{
std::vector<std::vector<int>> indexes_1;
get_all_indexes(inLengths, 0, indexes_1); // generate the input indexes space
auto accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
// go through indexes of the invariant dimensions
for(const auto& src_index : indexes_1)
{
auto src_offset = get_offset_from_index(this->inStrides, src_index);
auto currVal = static_cast<AccDataType>(in_data[src_offset]);
PreUnaryOp(currVal);
binop_with_nan_check<AccDataType, PropagateNan>(opReduce, accuVal, currVal);
};
PosUnaryOp(accuVal);
// scale the accumulated value
if(!float_equal_one(alpha))
accuVal *= static_cast<AccDataType>(alpha);
// scale the prior dst value and add it to the accumulated value
if(!float_equal_zero(beta))
accuVal += static_cast<AccDataType>(out_data[0]) * static_cast<AccDataType>(beta);
// store the reduced value to dst location
out_data[0] = static_cast<OutDataType>(accuVal);
}
else
{
std::vector<std::vector<int>> indexes_1, indexes_2;
get_all_indexes(
this->invariantLengths, 0, indexes_1); // generate the invariant indexes space
get_all_indexes(
this->toReduceLengths, 0, indexes_2); // generate the toReduce indexes space
// go through indexes of the invariant dimensions
for(const auto& index_1 : indexes_1)
{
std::vector<int> src_index;
std::vector<int> dst_index;
src_index.resize(this->inLengths.size());
for(int k = 0; k < invariantDims.size(); k++)
dst_index.push_back(index_1[k]);
int dst_offset = get_offset_from_index(this->outStrides, dst_index);
// generate the part of src index belonging to invariant dims
for(int k = 0; k < invariantDims.size(); k++)
src_index[invariantDims[k]] = index_1[k];
AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
// go through indexes of the toReduce dimensions
for(const auto& index_2 : indexes_2)
{
// generate the part of src index belonging to toReduce dims
for(int k = 0; k < toReduceDims.size(); k++)
src_index[toReduceDims[k]] = index_2[k];
auto src_offset = get_offset_from_index(this->inStrides, src_index);
auto currVal = static_cast<AccDataType>(in_data[src_offset]);
PreUnaryOp(currVal);
binop_with_nan_check<AccDataType, PropagateNan>(opReduce, accuVal, currVal);
};
PosUnaryOp(accuVal);
// scale the accumulated value
if(!float_equal_one(alpha))
accuVal *= static_cast<AccDataType>(alpha);
// scale the prior dst value and add it to the accumulated value
if(!float_equal_zero(beta))
accuVal += static_cast<AccDataType>(out_data[dst_offset]) *
static_cast<AccDataType>(beta);
// store the reduced value to dst location
out_data[dst_offset] = static_cast<OutDataType>(accuVal);
};
};
}; // end of RunImpl_no_indices()
};
}; // end of namespace host_reduce
}; // end of namespace ck
#endif
@@ -39,8 +39,8 @@ namespace ck {
namespace host_reduce {

using ck::NanPropagation;
using ck::ReduceTensorOp;

template <typename T>
static inline bool float_equal_one(T);
@@ -66,95 +66,95 @@ static inline bool float_equal_zero(half_float::half x)
    return x == static_cast<half_float::half>(0.0f);
};

template <typename AccDataType, ReduceTensorOp ReduceOpId>
__host__ static inline std::function<void(AccDataType&)> PreUnaryOpFn(int)
{
    using std::abs;

    if constexpr(ReduceOpId == ReduceTensorOp::NORM1)
    {
        return ([&](AccDataType& a_) { a_ = abs(a_); });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::NORM2)
    {
        return ([&](AccDataType& a_) { a_ = a_ * a_; });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::AMAX)
    {
        return ([&](AccDataType& a_) { a_ = abs(a_); });
    }
    else
    {
        // ReduceTensorOp::AVG:
        // ReduceTensorOp::ADD:
        // ReduceTensorOp::MUL:
        // ReduceTensorOp::MIN:
        // ReduceTensorOp::MAX:
        return ([&](AccDataType&) {});
    };
};

template <typename AccDataType, ReduceTensorOp ReduceOpId>
__host__ static inline std::function<void(AccDataType&)> PosUnaryOpFn(int32_t divider)
{
    using std::sqrt;

    if constexpr(ReduceOpId == ReduceTensorOp::NORM2)
    {
        return ([&](AccDataType& a_) { a_ = sqrt(a_); });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::AVG)
    {
        return ([&, divider](AccDataType& a_) {
            a_ = a_ / static_cast<AccDataType>(static_cast<float>(divider));
        });
    }
    else
    {
        // ReduceTensorOp::ADD:
        // ReduceTensorOp::NORM1:
        // ReduceTensorOp::MUL:
        // ReduceTensorOp::MIN:
        // ReduceTensorOp::MAX:
        // ReduceTensorOp::AMAX:
        return ([&](AccDataType&) {});
    }
};

template <typename AccDataType, ReduceTensorOp ReduceOpId>
__host__ static inline std::function<void(AccDataType&, AccDataType)> ReduceOpFn()
{
    if constexpr(ReduceOpId == ReduceTensorOp::ADD || ReduceOpId == ReduceTensorOp::AVG ||
                 ReduceOpId == ReduceTensorOp::NORM1 || ReduceOpId == ReduceTensorOp::NORM2)
    {
        return ([&](AccDataType& a_, AccDataType b_) { a_ = a_ + b_; });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::MUL)
    {
        return ([&](AccDataType& a_, AccDataType b_) { a_ = a_ * b_; });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::MIN)
    {
        return ([&](AccDataType& a_, AccDataType b_) {
            if(a_ > b_)
                a_ = b_;
        });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::MAX || ReduceOpId == ReduceTensorOp::AMAX)
    {
        return ([&](AccDataType& a_, AccDataType b_) {
            if(a_ < b_)
                a_ = b_;
        });
    }
};

template <typename AccDataType, ReduceTensorOp ReduceOpId>
__host__ static inline std::function<void(AccDataType&, AccDataType, bool& changed)> ReduceOpFn2()
{
    if constexpr(ReduceOpId == ReduceTensorOp::MIN)
    {
        return ([&](AccDataType& a_, AccDataType b_, bool& changed) {
            if(a_ > b_)
            {
                a_ = b_;
@@ -164,9 +164,9 @@ __host__ static inline std::function<void(compType&, compType, bool& changed)> R
                changed = false;
        });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::MAX || ReduceOpId == ReduceTensorOp::AMAX)
    {
        return ([&](AccDataType& a_, AccDataType b_, bool& changed) {
            if(a_ < b_)
            {
                a_ = b_;
@@ -178,48 +178,49 @@ __host__ static inline std::function<void(compType&, compType, bool& changed)> R
    }
    else
    {
        // ReduceTensorOp::ADD:
        // ReduceTensorOp::MUL:
        // ReduceTensorOp::AVG:
        // ReduceTensorOp::NORM1:
        // ReduceTensorOp::NORM2:
        return (std::function<void(AccDataType&, AccDataType, bool&)>{});
    };
};

template <typename AccDataType, ReduceTensorOp ReduceOpId>
__host__ static inline AccDataType ReduceOpZeroVal()
{
    if constexpr(ReduceOpId == ReduceTensorOp::MUL)
    {
        return (static_cast<AccDataType>(1.0f));
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::MIN)
    {
        return (std::numeric_limits<AccDataType>::max());
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::MAX)
    {
        return (std::numeric_limits<AccDataType>::lowest());
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::AMAX)
    {
        return (static_cast<AccDataType>(0.0f));
    }
    else
    {
        // ReduceTensorOp::ADD
        // ReduceTensorOp::AVG
        // ReduceTensorOp::NORM1
        // ReduceTensorOp::NORM2
        return (static_cast<AccDataType>(0.0f));
    };
};

template <typename AccDataType, bool PropagateNan>
__host__ static inline void
binop_with_nan_check(std::function<void(AccDataType&, AccDataType)> opReduce,
                     AccDataType& accuVal,
                     AccDataType currVal)
{
    using std::isnan;
@@ -236,11 +237,11 @@ __host__ static inline void binop_with_nan_check(std::function<void(compType&, c
    };
};

template <typename AccDataType, bool PropagateNan>
__host__ static inline void
binop_with_nan_check2(std::function<void(AccDataType&, AccDataType, bool&)> opReduce,
                      AccDataType& accuVal,
                      AccDataType currVal,
                      int& accuIndex,
                      int currIndex)
{
...
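// Hedged usage sketch (not part of the diff): the factories above return std::function objects
// that the host reduction code drives in a plain accumulate loop. A hypothetical host-side MAX
// reduction over a vector of floats would look like this (header names taken from the includes
// used elsewhere in this commit):
#include <vector>
#include "reduction_enums.hpp"
#include "host_reduce_util.hpp"

inline float host_max_example(const std::vector<float>& xs)
{
    auto op      = ck::host_reduce::ReduceOpFn<float, ck::ReduceTensorOp::MAX>();
    auto accuVal = ck::host_reduce::ReduceOpZeroVal<float, ck::ReduceTensorOp::MAX>();

    for(float x : xs)
        op(accuVal, x); // accuVal = max(accuVal, x)

    return accuVal;
}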
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef HOST_REDUCTION_HPP_
#define HOST_REDUCTION_HPP_
#include <vector>
#include <array>
#include <functional>
#include "reduction_enums.hpp"
#include "host_reduce_util.hpp"
#include "host_tensor.hpp"
#include "data_type.hpp"
template <int NDim>
static void get_all_indexes(const std::array<size_t, NDim>& dimLengths,
std::vector<std::array<size_t, NDim>>& indexes)
{
static_assert(NDim >= 1, "NDim >= 1 is required to use this function!");
if constexpr(NDim == 1)
{
for(size_t i = 0; i < dimLengths[0]; i++)
{
std::array<size_t, 1> index{i};
indexes.push_back(index);
};
}
else
{
std::array<size_t, NDim - 1> partial_dim_lengths;
for(int i = 0; i < NDim - 1; i++)
partial_dim_lengths[i] = dimLengths[i + 1];
std::vector<std::array<size_t, NDim - 1>> partial_indexes;
get_all_indexes<NDim - 1>(partial_dim_lengths, partial_indexes);
for(size_t i = 0; i < dimLengths[0]; i++)
for(const auto& index : partial_indexes)
{
std::array<size_t, NDim> extIndex;
extIndex[0] = i;
for(int k = 0; k < NDim - 1; k++)
extIndex[k + 1] = index[k];
indexes.push_back(extIndex);
};
};
};
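// Hedged worked example (not part of the original file): for dimLengths = {2, 2} the recursion
// above fills `indexes` with {0,0}, {0,1}, {1,0}, {1,1}, i.e. the full Cartesian index space of
// the given lengths in row-major order.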
template <int NDim>
static size_t get_offset_from_index(const std::array<size_t, NDim>& strides,
const std::array<size_t, NDim>& index)
{
size_t offset = 0;
for(int i = 0; i < NDim; i++)
offset += strides[i] * index[i];
return (offset);
};
template <int NDim>
static size_t get_offset_from_index(const std::vector<size_t>& strides,
const std::array<size_t, NDim>& index)
{
size_t offset = 0;
for(int i = 0; i < NDim; i++)
offset += strides[i] * index[i];
return (offset);
};
template <typename InDataType,
typename AccDataType,
typename OutDataType,
ck::ReduceTensorOp ReduceOpId,
int Rank,
int NumReduceDim,
bool PropagateNan,
bool NeedIndices>
struct ReductionHost
{
using IndexDataType = int32_t;
static constexpr int NumInvariantDim = Rank - NumReduceDim;
std::vector<size_t> outStrides;
std::vector<int> invariantDims;
std::vector<int> reduceDims;
IndexDataType divider;
std::function<void(AccDataType&)> preUnaryOp;
std::function<void(AccDataType&)> posUnaryOp;
std::array<size_t, NumReduceDim> reduceLengths;
std::array<size_t, NumReduceDim> reduceStrides;
std::array<size_t, NumInvariantDim> invariantLengths;
std::array<size_t, NumInvariantDim> invariantStrides;
std::vector<std::array<size_t, NumReduceDim>> reduce_dim_indexes;
std::vector<std::array<size_t, NumInvariantDim>> invariant_dim_indexes;
ReductionHost(HostTensorDescriptor& inDesc,
HostTensorDescriptor& outDesc,
const std::vector<int>& invariantDims_,
const std::vector<int>& reduceDims_)
{
using ck::host_reduce::PosUnaryOpFn;
using ck::host_reduce::PreUnaryOpFn;
// this->outLengths = to_int_vector(outDesc.GetLengths());
this->outStrides = outDesc.GetStrides();
this->invariantDims = invariantDims_;
this->reduceDims = reduceDims_;
int product = 1;
for(int i = 0; i < NumReduceDim; i++)
{
reduceLengths[i] = inDesc.GetLengths()[reduceDims[i]];
reduceStrides[i] = inDesc.GetStrides()[reduceDims[i]];
product *= inDesc.GetLengths()[reduceDims[i]];
};
divider = product;
for(int i = 0; i < NumInvariantDim; i++)
{
invariantLengths[i] = inDesc.GetLengths()[invariantDims[i]];
invariantStrides[i] = inDesc.GetStrides()[invariantDims[i]];
};
reduce_dim_indexes.clear();
get_all_indexes<NumReduceDim>(reduceLengths, reduce_dim_indexes);
if constexpr(NumInvariantDim > 0)
{
invariant_dim_indexes.clear();
get_all_indexes<NumInvariantDim>(invariantLengths, invariant_dim_indexes);
};
preUnaryOp = PreUnaryOpFn<AccDataType, ReduceOpId>(divider);
posUnaryOp = PosUnaryOpFn<AccDataType, ReduceOpId>(divider);
};
void Run(float alpha,
const InDataType* in_data,
float beta,
OutDataType* out_data,
IndexDataType* out_indices)
{
if constexpr(NeedIndices)
{
RunImpl_with_index(alpha, in_data, beta, out_data, out_indices);
}
else
{
RunImpl_no_index(alpha, in_data, beta, out_data);
};
};
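    // Reduction that, for each output element, also records the flattened
    // position (within the reduced dimensions) of the value selected by the
    // reduction op, written to out_indices.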
void RunImpl_with_index(float alpha,
const InDataType* in_data,
float beta,
OutDataType* out_data,
IndexDataType* out_indices)
{
using ck::type_convert;
using ck::host_reduce::binop_with_nan_check2;
using ck::host_reduce::float_equal_one;
using ck::host_reduce::float_equal_zero;
using ck::host_reduce::ReduceOpFn2;
using ck::host_reduce::ReduceOpZeroVal;
auto opReduce2 = ReduceOpFn2<AccDataType, ReduceOpId>();
if constexpr(NumInvariantDim == 0)
{
AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
IndexDataType accuIndex = 0;
            for(std::size_t i = 0; i < reduce_dim_indexes.size(); i++)
{
auto offset_reduce =
get_offset_from_index<NumReduceDim>(reduceStrides, reduce_dim_indexes[i]);
auto currVal = type_convert<AccDataType>(in_data[offset_reduce]);
preUnaryOp(currVal);
                auto currIndex = static_cast<IndexDataType>(i);
binop_with_nan_check2<AccDataType, PropagateNan>(
opReduce2, accuVal, currVal, accuIndex, currIndex);
};
posUnaryOp(accuVal);
if(!float_equal_one(alpha))
accuVal *= type_convert<AccDataType>(alpha);
if(!float_equal_zero(beta))
accuVal += type_convert<AccDataType>(out_data[0]) * type_convert<AccDataType>(beta);
out_data[0] = type_convert<OutDataType>(accuVal);
out_indices[0] = accuIndex;
}
else
{
auto thread_reduce_func = [&](auto invariant_index) {
AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
IndexDataType accuIndex = 0;
auto offset_invariant =
get_offset_from_index<NumInvariantDim>(invariantStrides, invariant_index);
                for(std::size_t i = 0; i < reduce_dim_indexes.size(); i++)
{
auto offset_reduce =
get_offset_from_index<NumReduceDim>(reduceStrides, reduce_dim_indexes[i]);
auto currVal =
type_convert<AccDataType>(in_data[offset_invariant + offset_reduce]);
preUnaryOp(currVal);
                    auto currIndex = static_cast<IndexDataType>(i);
binop_with_nan_check2<AccDataType, PropagateNan>(
opReduce2, accuVal, currVal, accuIndex, currIndex);
};
posUnaryOp(accuVal);
if(!float_equal_one(alpha))
accuVal *= type_convert<AccDataType>(alpha);
auto dst_offset =
get_offset_from_index<NumInvariantDim>(outStrides, invariant_index);
if(!float_equal_zero(beta))
accuVal += type_convert<AccDataType>(out_data[dst_offset]) *
type_convert<AccDataType>(beta);
out_data[dst_offset] = type_convert<OutDataType>(accuVal);
out_indices[dst_offset] = accuIndex;
};
std::size_t num_thread = 1;
std::size_t work_per_thread =
(invariant_dim_indexes.size() + num_thread - 1) / num_thread;
std::vector<joinable_thread> threads(num_thread);
for(std::size_t it = 0; it < num_thread; ++it)
{
std::size_t iw_begin = it * work_per_thread;
std::size_t iw_end =
std::min((it + 1) * work_per_thread, invariant_dim_indexes.size());
auto f = [=] {
for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
{
thread_reduce_func(invariant_dim_indexes[iw]);
}
};
threads[it] = joinable_thread(f);
}
};
};
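    // Reduction that produces output values only, without tracking indices.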
void RunImpl_no_index(float alpha, const InDataType* in_data, float beta, OutDataType* out_data)
{
using ck::type_convert;
using ck::host_reduce::binop_with_nan_check;
using ck::host_reduce::float_equal_one;
using ck::host_reduce::float_equal_zero;
using ck::host_reduce::ReduceOpFn;
using ck::host_reduce::ReduceOpZeroVal;
auto opReduce = ReduceOpFn<AccDataType, ReduceOpId>();
if constexpr(NumInvariantDim == 0)
{
AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
for(const auto& reduce_index : reduce_dim_indexes)
{
auto offset_reduce =
get_offset_from_index<NumReduceDim>(reduceStrides, reduce_index);
auto currVal = type_convert<AccDataType>(in_data[offset_reduce]);
preUnaryOp(currVal);
binop_with_nan_check<AccDataType, PropagateNan>(opReduce, accuVal, currVal);
};
posUnaryOp(accuVal);
if(!float_equal_one(alpha))
accuVal *= type_convert<AccDataType>(alpha);
if(!float_equal_zero(beta))
accuVal += type_convert<AccDataType>(out_data[0]) * type_convert<AccDataType>(beta);
out_data[0] = type_convert<OutDataType>(accuVal);
}
else
{
auto thread_reduce_func = [&](auto invariant_index) {
AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
auto offset_invariant =
get_offset_from_index<NumInvariantDim>(invariantStrides, invariant_index);
for(const auto& reduce_index : reduce_dim_indexes)
{
auto offset_reduce =
get_offset_from_index<NumReduceDim>(reduceStrides, reduce_index);
auto currVal =
type_convert<AccDataType>(in_data[offset_invariant + offset_reduce]);
preUnaryOp(currVal);
binop_with_nan_check<AccDataType, PropagateNan>(opReduce, accuVal, currVal);
};
posUnaryOp(accuVal);
if(!float_equal_one(alpha))
accuVal *= type_convert<AccDataType>(alpha);
auto dst_offset =
get_offset_from_index<NumInvariantDim>(outStrides, invariant_index);
if(!float_equal_zero(beta))
accuVal += type_convert<AccDataType>(out_data[dst_offset]) *
type_convert<AccDataType>(beta);
out_data[dst_offset] = type_convert<OutDataType>(accuVal);
};
std::size_t num_thread = 1;
std::size_t work_per_thread =
(invariant_dim_indexes.size() + num_thread - 1) / num_thread;
std::vector<joinable_thread> threads(num_thread);
for(std::size_t it = 0; it < num_thread; ++it)
{
std::size_t iw_begin = it * work_per_thread;
std::size_t iw_end =
std::min((it + 1) * work_per_thread, invariant_dim_indexes.size());
auto f = [=] {
for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
{
thread_reduce_func(invariant_dim_indexes[iw]);
}
};
threads[it] = joinable_thread(f);
}
};
};
};
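// Illustrative usage sketch (not part of this header; the descriptors inDesc
// and outDesc, the tensors in and out, and the ADD enumerator are assumptions
// for the example): reduce a rank-3 float tensor over its last two
// dimensions, without index output.
//
//   std::vector<int> invariantDims{0};
//   std::vector<int> reduceDims{1, 2};
//
//   ReductionHost<float, float, float, ck::ReduceTensorOp::ADD,
//                 3, 2, false, false>
//       hostReduce(inDesc, outDesc, invariantDims, reduceDims);
//
//   // out = 1.0 * sum_over_dims_1_2(in) + 0.0 * out
//   hostReduce.Run(1.0f, in.mData.data(), 0.0f, out.mData.data(), nullptr);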
#endif
...@@ -40,20 +40,6 @@ std::ostream& LogRangeAsType(std::ostream& os, Range&& range, std::string delim) ...@@ -40,20 +40,6 @@ std::ostream& LogRangeAsType(std::ostream& os, Range&& range, std::string delim)
return os; return os;
} }
typedef enum
{
Half = 0,
Float = 1,
} DataType_t;
template <typename T>
struct DataType;
template <>
struct DataType<float> : std::integral_constant<DataType_t, DataType_t::Float>
{
};
template <typename F, typename T, std::size_t... Is> template <typename F, typename T, std::size_t... Is>
auto call_f_unpack_args_impl(F f, T args, std::index_sequence<Is...>) auto call_f_unpack_args_impl(F f, T args, std::index_sequence<Is...>)
{ {
...@@ -87,10 +73,10 @@ struct HostTensorDescriptor ...@@ -87,10 +73,10 @@ struct HostTensorDescriptor
HostTensorDescriptor() = delete; HostTensorDescriptor() = delete;
template <typename X> template <typename X>
HostTensorDescriptor(std::vector<X> lens); HostTensorDescriptor(const std::vector<X>& lens);
template <typename X, typename Y> template <typename X, typename Y>
HostTensorDescriptor(std::vector<X> lens, std::vector<Y> strides); HostTensorDescriptor(const std::vector<X>& lens, const std::vector<Y>& strides);
void CalculateStrides(); void CalculateStrides();
...@@ -177,7 +163,7 @@ struct ParallelTensorFunctor ...@@ -177,7 +163,7 @@ struct ParallelTensorFunctor
return indices; return indices;
} }
void operator()(std::size_t num_thread = std::thread::hardware_concurrency()) const void operator()(std::size_t num_thread = 1) const
{ {
std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread; std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread;
...@@ -227,7 +213,7 @@ struct Tensor ...@@ -227,7 +213,7 @@ struct Tensor
Tensor(const HostTensorDescriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpace()) {} Tensor(const HostTensorDescriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpace()) {}
template <typename G> template <typename G>
void GenerateTensorValue(G g, std::size_t num_thread = std::thread::hardware_concurrency()) void GenerateTensorValue(G g, std::size_t num_thread = 1)
{ {
switch(mDesc.GetNumOfDimension()) switch(mDesc.GetNumOfDimension())
{ {
...@@ -299,85 +285,69 @@ struct Tensor ...@@ -299,85 +285,69 @@ struct Tensor
}; };
template <typename X> template <typename X>
HostTensorDescriptor::HostTensorDescriptor(std::vector<X> lens) : mLens(lens) HostTensorDescriptor::HostTensorDescriptor(const std::vector<X>& lens) : mLens(lens)
{ {
this->CalculateStrides(); this->CalculateStrides();
} }
template <typename X, typename Y> template <typename X, typename Y>
HostTensorDescriptor::HostTensorDescriptor(std::vector<X> lens, std::vector<Y> strides) HostTensorDescriptor::HostTensorDescriptor(const std::vector<X>& lens,
const std::vector<Y>& strides)
: mLens(lens), mStrides(strides) : mLens(lens), mStrides(strides)
{ {
} }
void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream& os = std::cout); void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream& os = std::cout);
float bf16_to_f32_(ck::bhalf_t src_val); #if 1
// FIXME: remove
void bf16_to_f32_(const Tensor<ck::bhalf_t>& src, Tensor<float>& dst); void bf16_to_f32_(const Tensor<ck::bhalf_t>& src, Tensor<float>& dst);
#endif
template <typename T> template <typename T>
void check_error(const Tensor<T>& ref, const Tensor<T>& result) float check_error(const Tensor<T>& ref, const Tensor<T>& result)
{ {
float error = 0; float l1_error = 0;
float max_diff = -1; float linf_error = -1;
float ref_value = 0, result_value = 0; float linf_rel_error = -1;
float linf_ref_value = 0, linf_result_value = 0;
float linf_rel_ref_value = 0, linf_rel_result_value = 0;
constexpr float eps = 1e-10;
if constexpr(std::is_same<ck::bhalf_t, T>::value) for(int i = 0; i < ref.mData.size(); ++i)
{ {
for(int i = 0; i < ref.mData.size(); ++i) float ref_v = ck::type_convert<float>(ref.mData[i]);
float result_v = ck::type_convert<float>(result.mData[i]);
float diff = std::abs(ref_v - result_v);
float rel_diff = diff / std::max(std::abs(ref_v), eps);
l1_error += diff;
if(linf_error < diff)
{ {
error += std::abs(bf16_to_f32_(ref.mData[i]) - bf16_to_f32_(result.mData[i])); linf_error = diff;
float diff = std::abs(bf16_to_f32_(ref.mData[i]) - bf16_to_f32_(result.mData[i])); linf_ref_value = ref_v;
if(max_diff < diff) linf_result_value = result_v;
{
max_diff = diff;
ref_value = bf16_to_f32_(ref.mData[i]);
result_value = bf16_to_f32_(result.mData[i]);
}
} }
}
else if(linf_rel_error < rel_diff)
{
for(int i = 0; i < ref.mData.size(); ++i)
{ {
error += std::abs(double(ref.mData[i]) - double(result.mData[i])); linf_rel_error = rel_diff;
float diff = std::abs(double(ref.mData[i]) - double(result.mData[i])); linf_rel_ref_value = ref_v;
if(max_diff < diff) linf_rel_result_value = result_v;
{
max_diff = diff;
ref_value = ref.mData[i];
result_value = result.mData[i];
}
} }
} }
std::cout << "error: " << error << std::endl; std::cout << "Absolute Error L1 Norm (sum of abs diff): " << l1_error << std::endl;
std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value << std::endl; std::cout << "Absolute Error L-inf Norm (max abs diff): " << linf_error << ", ref "
} << linf_ref_value << ", result " << linf_result_value << std::endl;
std::cout << "Relative Error L-inf Norm (max relative abs diff): " << linf_rel_error << ", ref "
template <typename T> << linf_rel_ref_value << ", result " << linf_rel_result_value << std::endl;
void check_indices(const Tensor<T>& ref, const Tensor<T>& result)
{
bool has_error = false;
int error_count = 0;
for(int i = 0; i < ref.mData.size(); ++i)
{
if(ref.mData[i] != result.mData[i])
{
std::cerr << std::endl
<< "Indices different at position " << i << " (ref: " << ref.mData[i]
<< ", result: " << result.mData[i] << ")" << std::endl;
has_error = true;
error_count++;
if(error_count == 20)
break;
};
}
if(!has_error) return linf_error;
std::cout << std::endl << "Indices result is completely acccurate!" << std::endl;
} }
#endif #endif
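// Illustrative caller sketch (assumed names, not part of this change): since
// check_error now returns the max absolute difference, verification can be
// turned into a pass/fail threshold, e.g.
//
//   float max_abs_diff = check_error(ref_tensor, gpu_tensor);
//   bool pass          = max_abs_diff < 1e-3f;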
#ifndef HOST_TENSOR_GENERATOR_HPP #pragma once
#define HOST_TENSOR_GENERATOR_HPP
#include <cmath> #include <cmath>
#include <numeric>
#include "config.hpp" #include "config.hpp"
template <typename T> template <typename T>
...@@ -93,8 +94,8 @@ struct GeneratorTensor_2<int8_t> ...@@ -93,8 +94,8 @@ struct GeneratorTensor_2<int8_t>
template <typename T> template <typename T>
struct GeneratorTensor_3 struct GeneratorTensor_3
{ {
T min_value = 0; float min_value = 0;
T max_value = 1; float max_value = 1;
template <typename... Is> template <typename... Is>
T operator()(Is...) T operator()(Is...)
...@@ -122,22 +123,6 @@ struct GeneratorTensor_3<ck::bhalf_t> ...@@ -122,22 +123,6 @@ struct GeneratorTensor_3<ck::bhalf_t>
} }
}; };
template <>
struct GeneratorTensor_3<int8_t>
{
float min_value = 0;
float max_value = 1;
template <typename... Is>
int8_t operator()(Is...)
{
int8_t min_tmp = static_cast<int8_t>(min_value);
int8_t max_tmp = static_cast<int8_t>(max_value);
return (std::rand() % (max_tmp - min_tmp)) + min_tmp;
}
};
struct GeneratorTensor_Checkboard struct GeneratorTensor_Checkboard
{ {
template <typename... Ts> template <typename... Ts>
...@@ -163,5 +148,3 @@ struct GeneratorTensor_Sequential ...@@ -163,5 +148,3 @@ struct GeneratorTensor_Sequential
return dims[Dim]; return dims[Dim];
} }
}; };
#endif