Commit dd6a8de4 authored by Jehandad Khan

Merge branch 'develop' into jd/dev_pkg

parents 0aa899aa abf4bdb9
#ifndef CK_COMMON_HEADER_HPP
#define CK_COMMON_HEADER_HPP
#pragma once
#include "config.hpp"
#include "array.hpp"
#include "container_helper.hpp"
......@@ -20,30 +18,29 @@
#include "number.hpp"
#include "sequence.hpp"
#include "sequence_helper.hpp"
#include "synchronization.hpp"
#include "tuple.hpp"
#include "tuple_helper.hpp"
#include "type.hpp"
#include "magic_division.hpp"
#include "utility.hpp"
#include "c_style_pointer_cast.hpp"
#include "amd_address_space.hpp"
#include "amd_buffer_addressing.hpp"
#include "static_buffer.hpp"
#include "dynamic_buffer.hpp"
#include "is_known_at_compile_time.hpp"
#include "transpose_vectors.hpp"
#include "inner_product.hpp"
#include "element_wise_operation.hpp"
#include "debug.hpp"
#include "amd_buffer_addressing.hpp"
#include "get_id.hpp"
#include "synchronization.hpp"
#include "amd_address_space.hpp"
#include "static_buffer.hpp"
#include "dynamic_buffer.hpp"
// TODO: remove this
#if CK_USE_AMD_INLINE_ASM
#include "amd_inline_asm.hpp"
#endif
#if CK_USE_AMD_XDLOPS
#ifdef CK_USE_AMD_MFMA
#include "amd_xdlops.hpp"
#endif
#endif
#ifndef CK_FLOAT_TYPE_AMD_HPP
#define CK_FLOAT_TYPE_AMD_HPP
#pragma once
#include "statically_indexed_array.hpp"
namespace ck {
......@@ -937,7 +935,7 @@ __host__ __device__ Y type_convert(X x)
// convert bfp16 to fp32
template <>
inline __host__ __device__ float type_convert(bhalf_t x)
inline __host__ __device__ float type_convert<float, bhalf_t>(bhalf_t x)
{
union
{
......@@ -950,7 +948,7 @@ inline __host__ __device__ float type_convert(bhalf_t x)
// convert fp32 to bfp16
template <>
inline __host__ __device__ bhalf_t type_convert(float x)
inline __host__ __device__ bhalf_t type_convert<bhalf_t, float>(float x)
{
union
{
......@@ -1090,4 +1088,3 @@ struct NumericLimits<half_t>
};
} // namespace ck
#endif
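For context, a minimal host-side sketch of how the type_convert specializations above round-trip between fp32 and bf16. This is a hypothetical usage example, not part of the diff; the header name is assumed and the code is meant to be compiled as HIP/C++ so the __host__ qualifier resolves.
// Hypothetical round-trip using ck::type_convert (header name assumed).
#include "data_type.hpp"
int main()
{
    float x = 3.14159f;
    ck::bhalf_t b = ck::type_convert<ck::bhalf_t>(x); // fp32 -> bf16: precision is reduced
    float y       = ck::type_convert<float>(b);       // bf16 -> fp32
    // y matches x only to bf16 precision (~3 significant decimal digits)
    return (y > 3.10f && y < 3.18f) ? 0 : 1;
}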
......@@ -3,7 +3,7 @@
namespace ck {
enum DataTypeEnum_t
enum struct DataTypeEnum
{
Half = 0,
Float = 1,
......
......@@ -6,35 +6,35 @@
namespace ck {
template <DataTypeEnum_t DataTypeEnum>
template <DataTypeEnum DataTypeEnum>
struct get_datatype_from_enum;
template <>
struct get_datatype_from_enum<DataTypeEnum_t::Int8>
struct get_datatype_from_enum<DataTypeEnum::Int8>
{
using type = int8_t;
};
template <>
struct get_datatype_from_enum<DataTypeEnum_t::Int32>
struct get_datatype_from_enum<DataTypeEnum::Int32>
{
using type = int32_t;
};
template <>
struct get_datatype_from_enum<DataTypeEnum_t::Half>
struct get_datatype_from_enum<DataTypeEnum::Half>
{
using type = half_t;
};
template <>
struct get_datatype_from_enum<DataTypeEnum_t::Float>
struct get_datatype_from_enum<DataTypeEnum::Float>
{
using type = float;
};
template <>
struct get_datatype_from_enum<DataTypeEnum_t::Double>
struct get_datatype_from_enum<DataTypeEnum::Double>
{
using type = double;
};
......@@ -45,31 +45,31 @@ struct get_datatype_enum_from_type;
template <>
struct get_datatype_enum_from_type<int8_t>
{
static constexpr DataTypeEnum_t value = DataTypeEnum_t::Int8;
static constexpr DataTypeEnum value = DataTypeEnum::Int8;
};
template <>
struct get_datatype_enum_from_type<int32_t>
{
static constexpr DataTypeEnum_t value = DataTypeEnum_t::Int32;
static constexpr DataTypeEnum value = DataTypeEnum::Int32;
};
template <>
struct get_datatype_enum_from_type<half_t>
{
static constexpr DataTypeEnum_t value = DataTypeEnum_t::Half;
static constexpr DataTypeEnum value = DataTypeEnum::Half;
};
template <>
struct get_datatype_enum_from_type<float>
{
static constexpr DataTypeEnum_t value = DataTypeEnum_t::Float;
static constexpr DataTypeEnum value = DataTypeEnum::Float;
};
template <>
struct get_datatype_enum_from_type<double>
{
static constexpr DataTypeEnum_t value = DataTypeEnum_t::Double;
static constexpr DataTypeEnum value = DataTypeEnum::Double;
};
} // namespace ck
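As an illustration of the renamed traits, the enum-to-type and type-to-enum maps above are inverses of each other, which can be checked at compile time. This is a sketch only; the header name below is assumed, since it is not shown in this diff.
// Compile-time sanity checks for the DataTypeEnum traits above (illustrative only).
#include <type_traits>
#include "data_type_enum_helper.hpp" // assumed header name
static_assert(ck::get_datatype_enum_from_type<
                  typename ck::get_datatype_from_enum<ck::DataTypeEnum::Float>::type>::value ==
                  ck::DataTypeEnum::Float,
              "enum -> type -> enum must round-trip");
static_assert(std::is_same<typename ck::get_datatype_from_enum<ck::DataTypeEnum::Half>::type,
                           ck::half_t>::value,
              "DataTypeEnum::Half maps to half_t");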
......
#ifndef CK_BUFFER_HPP
#define CK_BUFFER_HPP
#pragma once
#include "amd_buffer_addressing.hpp"
#include "c_style_pointer_cast.hpp"
#include "config.hpp"
......@@ -8,7 +6,7 @@
namespace ck {
template <AddressSpaceEnum_t BufferAddressSpace,
template <AddressSpaceEnum BufferAddressSpace,
typename T,
typename ElementSpaceSize,
bool InvalidElementUseNumericalZeroValue>
......@@ -34,7 +32,7 @@ struct DynamicBuffer
{
}
__host__ __device__ static constexpr AddressSpaceEnum_t GetAddressSpace()
__host__ __device__ static constexpr AddressSpaceEnum GetAddressSpace()
{
return BufferAddressSpace;
}
......@@ -55,7 +53,7 @@ struct DynamicBuffer
constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
"wrong! X need to be multiple T");
"wrong! X should contain multiple T");
#if CK_USE_AMD_BUFFER_LOAD
bool constexpr use_amd_buffer_addressing = true;
......@@ -63,7 +61,7 @@ struct DynamicBuffer
bool constexpr use_amd_buffer_addressing = false;
#endif
if constexpr(GetAddressSpace() == AddressSpaceEnum_t::Global && use_amd_buffer_addressing)
if constexpr(GetAddressSpace() == AddressSpaceEnum::Global && use_amd_buffer_addressing)
{
constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
......@@ -81,50 +79,48 @@ struct DynamicBuffer
}
else
{
if constexpr(InvalidElementUseNumericalZeroValue)
if(is_valid_element)
{
#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
X tmp;
__builtin_memcpy(&tmp, &(p_data_[i]), sizeof(X));
return is_valid_element ? tmp : X{0};
return tmp;
#else
return is_valid_element ? *c_style_pointer_cast<const X*>(&p_data_[i]) : X{0};
return *c_style_pointer_cast<const X*>(&p_data_[i]);
#endif
}
else
{
#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
X tmp;
__builtin_memcpy(&tmp, &(p_data_[i]), sizeof(X));
return is_valid_element ? tmp : X{invalid_element_value_};
#else
return is_valid_element ? *c_style_pointer_cast<const X*>(&p_data_[i])
: X{invalid_element_value_};
#endif
if constexpr(InvalidElementUseNumericalZeroValue)
{
return X{0};
}
else
{
return X{invalid_element_value_};
}
}
}
}
template <InMemoryDataOperationEnum_t Op,
template <InMemoryDataOperationEnum Op,
typename X,
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<T>>::type>::value,
bool>::type = false>
__host__ __device__ void Update(index_t i, bool is_valid_element, const X& x)
{
if constexpr(Op == InMemoryDataOperationEnum_t::Set)
if constexpr(Op == InMemoryDataOperationEnum::Set)
{
this->template Set<X>(i, is_valid_element, x);
}
else if constexpr(Op == InMemoryDataOperationEnum_t::AtomicAdd)
else if constexpr(Op == InMemoryDataOperationEnum::AtomicAdd)
{
this->template AtomicAdd<X>(i, is_valid_element, x);
}
else if constexpr(Op == InMemoryDataOperationEnum_t::Add)
else if constexpr(Op == InMemoryDataOperationEnum::Add)
{
auto tmp = this->template Get<X>(i, is_valid_element);
this->template Set<X>(i, is_valid_element, x + tmp);
......@@ -145,143 +141,120 @@ struct DynamicBuffer
constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
"wrong! X need to be multiple T");
"wrong! X should contain multiple T");
if constexpr(GetAddressSpace() == AddressSpaceEnum_t::Global)
{
#if CK_USE_AMD_BUFFER_STORE
constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
amd_buffer_store<remove_cvref_t<T>, t_per_x>(
x, p_data_, i, is_valid_element, element_space_size_);
bool constexpr use_amd_buffer_addressing = true;
#else
if(is_valid_element)
{
#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
X tmp = x;
bool constexpr use_amd_buffer_addressing = false;
#endif
__builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X));
#if CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE
bool constexpr workaround_int8_ds_write_issue = true;
#else
*c_style_pointer_cast<X*>(&p_data_[i]) = x;
#endif
}
bool constexpr workaround_int8_ds_write_issue = false;
#endif
if constexpr(GetAddressSpace() == AddressSpaceEnum::Global && use_amd_buffer_addressing)
{
constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
amd_buffer_store<remove_cvref_t<T>, t_per_x>(
x, p_data_, i, is_valid_element, element_space_size_);
}
else if constexpr(GetAddressSpace() == AddressSpaceEnum_t::Lds)
else if constexpr(GetAddressSpace() == AddressSpaceEnum::Lds &&
is_same<typename scalar_type<remove_cvref_t<T>>::type, int8_t>::value &&
workaround_int8_ds_write_issue)
{
if(is_valid_element)
{
#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE
#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
X tmp = x;
__builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X));
#else
*c_style_pointer_cast<X*>(&p_data_[i]) = x;
#endif
#else
// HACK: compiler would lower IR "store<i8, 16> address_space(3)" into
// inefficient
// HACK: the compiler would lower IR "store<i8, 16> address_space(3)" into inefficient
// ISA, so I try to let the compiler emit IR "store<i32, 4>", which would be lowered to
// ds_write_b128
// TODO: remove this after compiler fix
if constexpr(is_same<typename scalar_type<remove_cvref_t<T>>::type, int8_t>::value)
static_assert((is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8_t>::value) ||
(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x2_t>::value) ||
(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x4_t>::value) ||
(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x8_t>::value) ||
(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x16_t>::value) ||
(is_same<remove_cvref_t<T>, int8x4_t>::value &&
is_same<remove_cvref_t<X>, int8x4_t>::value) ||
(is_same<remove_cvref_t<T>, int8x8_t>::value &&
is_same<remove_cvref_t<X>, int8x8_t>::value) ||
(is_same<remove_cvref_t<T>, int8x16_t>::value &&
is_same<remove_cvref_t<X>, int8x16_t>::value),
"wrong! not implemented for this combination, please add "
"implementation");
if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8_t>::value)
{
static_assert((is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8_t>::value) ||
(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x2_t>::value) ||
(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x4_t>::value) ||
(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x8_t>::value) ||
(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x16_t>::value) ||
(is_same<remove_cvref_t<T>, int8x4_t>::value &&
is_same<remove_cvref_t<X>, int8x4_t>::value) ||
(is_same<remove_cvref_t<T>, int8x8_t>::value &&
is_same<remove_cvref_t<X>, int8x8_t>::value) ||
(is_same<remove_cvref_t<T>, int8x16_t>::value &&
is_same<remove_cvref_t<X>, int8x16_t>::value),
"wrong! not implemented for this combination, please add "
"implementation");
if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8_t>::value)
{
// HACK: cast pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int8_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int8_t*>(&x);
}
else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x2_t>::value)
{
// HACK: cast pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int16_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int16_t*>(&x);
}
else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x4_t>::value)
{
// HACK: cast pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int32_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int32_t*>(&x);
}
else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x8_t>::value)
{
// HACK: cast pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int32x2_t*>(&x);
}
else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x16_t>::value)
{
// HACK: cast pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int32x4_t*>(&x);
}
else if constexpr(is_same<remove_cvref_t<T>, int8x4_t>::value &&
is_same<remove_cvref_t<X>, int8x4_t>::value)
{
// HACK: cast pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int32_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int32_t*>(&x);
}
else if constexpr(is_same<remove_cvref_t<T>, int8x8_t>::value &&
is_same<remove_cvref_t<X>, int8x8_t>::value)
{
// HACK: cast pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int32x2_t*>(&x);
}
else if constexpr(is_same<remove_cvref_t<T>, int8x16_t>::value &&
is_same<remove_cvref_t<X>, int8x16_t>::value)
{
// HACK: cast pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int32x4_t*>(&x);
}
// HACK: cast pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int8_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int8_t*>(&x);
}
else
else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x2_t>::value)
{
#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
X tmp = x;
__builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X));
#else
*c_style_pointer_cast<X*>(&p_data_[i]) = x;
#endif
// HACK: cast pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int16_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int16_t*>(&x);
}
else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x4_t>::value)
{
// HACK: cast pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int32_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int32_t*>(&x);
}
else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x8_t>::value)
{
// HACK: cast pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int32x2_t*>(&x);
}
else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x16_t>::value)
{
// HACK: cast pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int32x4_t*>(&x);
}
else if constexpr(is_same<remove_cvref_t<T>, int8x4_t>::value &&
is_same<remove_cvref_t<X>, int8x4_t>::value)
{
// HACK: cast pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int32_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int32_t*>(&x);
}
else if constexpr(is_same<remove_cvref_t<T>, int8x8_t>::value &&
is_same<remove_cvref_t<X>, int8x8_t>::value)
{
// HACK: cast pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int32x2_t*>(&x);
}
else if constexpr(is_same<remove_cvref_t<T>, int8x16_t>::value &&
is_same<remove_cvref_t<X>, int8x16_t>::value)
{
// HACK: cast pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int32x4_t*>(&x);
}
#endif
}
}
else
......@@ -305,27 +278,49 @@ struct DynamicBuffer
bool>::type = false>
__host__ __device__ void AtomicAdd(index_t i, bool is_valid_element, const X& x)
{
using scalar_t = typename scalar_type<remove_cvref_t<T>>::type;
// X contains multiple T
constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
"wrong! X need to be multiple T");
static_assert(GetAddressSpace() == AddressSpaceEnum_t::Global, "only support global mem");
"wrong! X should contain multiple T");
static_assert(GetAddressSpace() == AddressSpaceEnum::Global, "only support global mem");
#if CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT
bool constexpr use_amd_buffer_addressing =
is_same_v<remove_cvref_t<scalar_t>, int32_t> ||
is_same_v<remove_cvref_t<scalar_t>, float> ||
(is_same_v<remove_cvref_t<scalar_t>, half_t> && scalar_per_x_vector % 2 == 0);
#elif CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && (!CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT)
bool constexpr use_amd_buffer_addressing = is_same_v<remove_cvref_t<scalar_t>, int32_t>;
#elif(!CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER) && CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT
bool constexpr use_amd_buffer_addressing =
is_same_v<remove_cvref_t<scalar_t>, float> ||
(is_same_v<remove_cvref_t<scalar_t>, half_t> && scalar_per_x_vector % 2 == 0);
#else
bool constexpr use_amd_buffer_addressing = false;
#endif
#if CK_USE_AMD_BUFFER_ATOMIC_ADD
constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
if constexpr(use_amd_buffer_addressing)
{
constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
amd_buffer_atomic_add<remove_cvref_t<T>, t_per_x>(
x, p_data_, i, is_valid_element, element_space_size_);
#else
if(is_valid_element)
amd_buffer_atomic_add<remove_cvref_t<T>, t_per_x>(
x, p_data_, i, is_valid_element, element_space_size_);
}
else
{
atomicAdd(&p_data_[i], x);
if(is_valid_element)
{
// FIXME: atomicAdd is defined by HIP, need to avoid implicit type casting when
// calling it
atomicAdd(c_style_pointer_cast<X*>(&p_data_[i]), x);
}
}
#endif
}
__host__ __device__ static constexpr bool IsStaticBuffer() { return false; }
......@@ -333,14 +328,14 @@ struct DynamicBuffer
__host__ __device__ static constexpr bool IsDynamicBuffer() { return true; }
};
template <AddressSpaceEnum_t BufferAddressSpace, typename T, typename ElementSpaceSize>
template <AddressSpaceEnum BufferAddressSpace, typename T, typename ElementSpaceSize>
__host__ __device__ constexpr auto make_dynamic_buffer(T* p, ElementSpaceSize element_space_size)
{
return DynamicBuffer<BufferAddressSpace, T, ElementSpaceSize, true>{p, element_space_size};
}
template <
AddressSpaceEnum_t BufferAddressSpace,
AddressSpaceEnum BufferAddressSpace,
typename T,
typename ElementSpaceSize,
typename X,
......@@ -353,4 +348,3 @@ make_dynamic_buffer(T* p, ElementSpaceSize element_space_size, X invalid_element
}
} // namespace ck
#endif
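To show how the renamed DynamicBuffer is meant to be used, here is a hedged device-side sketch. The kernel name is hypothetical, the header name is assumed, and the sketch relies only on the Get/Set members whose bodies appear above (taking the generic, non-buffer-intrinsic path).
// Illustrative HIP kernel using make_dynamic_buffer / Get / Set from above.
#include <hip/hip_runtime.h>
#include "dynamic_buffer.hpp" // file guarded as CK_BUFFER_HPP above (name assumed)
__global__ void scale_kernel(float* p, ck::index_t n) // hypothetical kernel
{
    // Wrap raw global memory; invalid elements read back as numerical zero.
    auto buf = ck::make_dynamic_buffer<ck::AddressSpaceEnum::Global>(p, n);
    const ck::index_t i = blockIdx.x * blockDim.x + threadIdx.x;
    const bool is_valid = i < n;
    // Get/Set take an element index plus a validity flag.
    const float v = buf.Get<float>(i, is_valid);
    buf.Set<float>(i, is_valid, 2.0f * v);
}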
#ifndef CK_UTILITY_HPP
#define CK_UTILITY_HPP
#pragma once
#include "config.hpp"
namespace ck {
......@@ -16,5 +14,3 @@ __device__ index_t get_block_1d_id() { return blockIdx.x; }
__device__ index_t get_grid_size() { return gridDim.x; }
} // namespace ck
#endif
......@@ -3,7 +3,7 @@
#include "common_header.hpp"
#if CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX
#if CK_EXPERIMENTAL_USE_DYNAMICALLY_INDEXED_MULTI_INDEX
#include "array_multi_index.hpp"
#else
#include "statically_indexed_array_multi_index.hpp"
......
......@@ -28,7 +28,7 @@
namespace ck {
enum class ReduceTensorOp_t
enum struct ReduceTensorOp
{
ADD = 0,
MUL = 1,
......@@ -41,19 +41,19 @@ enum class ReduceTensorOp_t
// MUL_NO_ZEROS = 8,
};
enum class NanPropagation_t
enum struct NanPropagation
{
NOT_PROPAGATE_NAN = 0,
PROPAGATE_NAN = 1,
};
enum class ReduceTensorIndices_t
enum struct ReduceTensorIndices
{
NO_INDICES = 0,
FLATTENED_INDICES = 1,
};
enum class IndicesType_t
enum struct IndicesType
{
INDICES_32BIT = 0,
INDICES_64BIT = 1,
......
......@@ -606,6 +606,12 @@ struct sequence_map_inverse
SeqMap::Size()>::type;
};
template <index_t... Xs, index_t... Ys>
__host__ __device__ constexpr bool operator==(Sequence<Xs...>, Sequence<Ys...>)
{
return ((Xs == Ys) && ...);
}
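The element-wise fold above makes Sequence equality usable at compile time, for example in static_asserts. A sketch, assuming sequence.hpp (from the include list at the top of this diff) pulls in its own dependencies:
// Illustrative compile-time use of the new Sequence operator==.
#include "sequence.hpp"
static_assert(ck::Sequence<1, 2, 3>{} == ck::Sequence<1, 2, 3>{},
              "identical sequences compare equal");
static_assert(!(ck::Sequence<1, 2, 3>{} == ck::Sequence<1, 2, 4>{}),
              "sequences differing in any element compare unequal");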
template <index_t... Xs, index_t... Ys>
__host__ __device__ constexpr auto operator+(Sequence<Xs...>, Sequence<Ys...>)
{
......
......@@ -6,7 +6,7 @@
namespace ck {
// static buffer for scalar
template <AddressSpaceEnum_t AddressSpace,
template <AddressSpaceEnum AddressSpace,
typename T,
index_t N,
bool InvalidElementUseNumericalZeroValue> // TODO remove this bool, no longer needed
......@@ -17,10 +17,7 @@ struct StaticBuffer : public StaticallyIndexedArray<T, N>
__host__ __device__ constexpr StaticBuffer() : base{} {}
__host__ __device__ static constexpr AddressSpaceEnum_t GetAddressSpace()
{
return AddressSpace;
}
__host__ __device__ static constexpr AddressSpaceEnum GetAddressSpace() { return AddressSpace; }
__host__ __device__ static constexpr bool IsStaticBuffer() { return true; }
......@@ -42,7 +39,7 @@ struct StaticBuffer : public StaticallyIndexedArray<T, N>
};
// static buffer for vector
template <AddressSpaceEnum_t AddressSpace,
template <AddressSpaceEnum AddressSpace,
typename S,
index_t NumOfVector,
index_t ScalarPerVector,
......@@ -59,10 +56,7 @@ struct StaticBufferTupleOfVector
__host__ __device__ constexpr StaticBufferTupleOfVector() : base{} {}
__host__ __device__ static constexpr AddressSpaceEnum_t GetAddressSpace()
{
return AddressSpace;
}
__host__ __device__ static constexpr AddressSpaceEnum GetAddressSpace() { return AddressSpace; }
__host__ __device__ static constexpr bool IsStaticBuffer() { return true; }
......@@ -158,7 +152,7 @@ struct StaticBufferTupleOfVector
}
};
template <AddressSpaceEnum_t AddressSpace, typename T, index_t N>
template <AddressSpaceEnum AddressSpace, typename T, index_t N>
__host__ __device__ constexpr auto make_static_buffer(Number<N>)
{
return StaticBuffer<AddressSpace, T, N, true>{};
......
......@@ -7,7 +7,7 @@ namespace ck {
__device__ void block_sync_lds()
{
#if CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
#if CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
asm volatile("\
s_waitcnt lgkmcnt(0) \n \
s_barrier \
......
......@@ -37,6 +37,10 @@ struct SpaceFillingCurve
__host__ __device__ static constexpr index_t GetNumOfAccess()
{
static_assert(TensorLengths::Size() == ScalarsPerAccess::Size());
static_assert(TensorLengths{} % ScalarsPerAccess{} ==
typename uniform_sequence_gen<TensorLengths::Size(), 0>::type{});
return reduce_on_sequence(TensorLengths{}, math::multiplies{}, Number<1>{}) /
ScalarPerVector;
}
......@@ -140,6 +144,15 @@ struct SpaceFillingCurve
}();
return idx_md;
}
// FIXME: rename this function
template <index_t AccessIdx1d>
static __device__ __host__ constexpr auto GetIndexTupleOfNumber(Number<AccessIdx1d>)
{
constexpr auto idx = GetIndex(Number<AccessIdx1d>{});
return generate_tuple([&](auto i) { return Number<idx[i]>{}; }, Number<nDim>{});
}
};
} // namespace ck
......
......@@ -75,14 +75,14 @@ calculate_convolution_flops(const InDesc&, const WeiDesc& wei_desc, const OutDes
}
template <typename T>
inline auto activ(T v, const ck::ActivTypeEnum_t activ_type)
inline auto activ(T v, const ck::ActivTypeEnum activ_type)
{
const T alpha = 0.3;
switch(activ_type)
{
case ck::ActivTypeEnum_t::None: return v;
case ck::ActivTypeEnum_t::LeakyRelu: return (v >= 0 ? v : alpha * v);
case ck::ActivTypeEnum_t::Sigmoid: return (1 / (1 + exp(-v)));
case ck::ActivTypeEnum::None: return v;
case ck::ActivTypeEnum::LeakyRelu: return (v >= 0 ? v : alpha * v);
case ck::ActivTypeEnum::Sigmoid: return (1 / (1 + exp(-v)));
default: throw std::runtime_error("unsupported activ type"); break;
}
}
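For reference, a small host-side sketch exercising the branches of activ above. The wrapper function name is hypothetical; the 0.3 leak factor is the alpha hard-coded in activ.
// Hypothetical host-side usage of activ() defined above.
inline void activ_example()
{
    const double y_pos = activ(2.0, ck::ActivTypeEnum::LeakyRelu);  // v >= 0: returns v = 2.0
    const double y_neg = activ(-2.0, ck::ActivTypeEnum::LeakyRelu); // v < 0: returns 0.3 * v = -0.6
    const double y_sig = activ(0.0, ck::ActivTypeEnum::Sigmoid);    // 1 / (1 + exp(0)) = 0.5
    (void)y_pos;
    (void)y_neg;
    (void)y_sig;
}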
......
......@@ -48,8 +48,10 @@ struct DeviceMem
DeviceMem() = delete;
DeviceMem(std::size_t mem_size);
void* GetDeviceBuffer();
std::size_t GetBufferSize();
void ToDevice(const void* p);
void FromDevice(void* p);
void SetZero();
~DeviceMem();
void* mpDeviceBuf;
......@@ -109,8 +111,6 @@ float launch_and_time_kernel(
timer.End();
// std::this_thread::sleep_for (std::chrono::microseconds(10));
return timer.GetElapsedTime() / nrepeat;
#else
std::ignore = nrepeat;
......
#pragma once
#include "host_tensor.hpp"
#include "common_header.hpp"
template <typename TensorDesc>
void ostream_tensor_descriptor(TensorDesc, std::ostream& os = std::cout)
......
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef HOST_GENERIC_REDUCTION_HPP_
#define HOST_GENERIC_REDUCTION_HPP_
#include <vector>
#include <functional>
#include <limits>
#include <type_traits>
#include <cassert>
#include <cmath>
#include "reduction_enums.hpp"
#include "host_reduce_util.hpp"
using float16 = half_float::half;
namespace ck {
namespace host_reduce {
template <typename T>
static void
get_all_indexes(const std::vector<T>& dimLengths, int dim, std::vector<std::vector<T>>& indexes)
{
if(dim < dimLengths.size())
{
std::vector<std::vector<T>> updated_indexes;
if(dim == 0)
{
assert(indexes.size() == 0);
assert(dimLengths[dim] > 0);
for(T i = 0; i < dimLengths[dim]; i++)
{
std::vector<T> index = {i};
updated_indexes.push_back(index);
};
}
else
{
// go through all the current indexes
for(const auto& index : indexes)
for(T i = 0; i < dimLengths[dim]; i++)
{
auto index_new = index;
index_new.push_back(i);
updated_indexes.push_back(index_new);
};
};
// update to the indexes (output)
indexes = updated_indexes;
// further to construct the indexes from the updated status
get_all_indexes(dimLengths, dim + 1, indexes);
};
};
template <typename T>
static T get_offset_from_index(const std::vector<T>& strides, const std::vector<T>& index)
{
T offset = 0;
assert(strides.size() == index.size());
for(int i = 0; i < index.size(); i++)
offset += strides[i] * static_cast<T>(index[i]);
return (offset);
};
template <typename T>
static inline T get_flatten_offset(const std::vector<T>& lengths, const std::vector<T>& index)
{
T offset = 0;
assert(lengths.size() == index.size() && lengths.size() > 0);
int len = lengths.size();
T stride = 1;
// for len==1, the loop is not executed
for(int i = len - 1; i > 0; i--)
{
offset += stride * static_cast<T>(index[i]);
stride *= lengths[i];
};
offset += stride * static_cast<T>(index[0]);
return (offset);
};
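A worked example of the row-major flattening above, written as a small check placed in the same scope as the helper (the function name is hypothetical; <cassert> and <vector> are already included in this header):
// Worked example of get_flatten_offset (row-major, innermost dimension fastest):
//   lengths = {2, 3, 4}, index = {1, 2, 3}
//   loop: offset = 3*1, stride = 4; then offset += 2*4 -> 11, stride = 12
//   tail: offset += 1*12 -> 23
inline void flatten_offset_example()
{
    const std::vector<int> lengths = {2, 3, 4};
    const std::vector<int> index   = {1, 2, 3};
    assert(get_flatten_offset(lengths, index) == 23);
}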
template <typename InDataType,
typename AccDataType,
typename OutDataType,
ck::ReduceTensorOp_t ReduceOpId,
bool PropagateNan,
bool NeedIndices>
class ReductionHost
{
public:
ReductionHost() = default;
ReductionHost(HostTensorDescriptor& inDesc,
HostTensorDescriptor& outDesc,
const std::vector<int>& invariantDims_,
const std::vector<int>& toReduceDims_)
{
this->inLengths = to_int_vector(inDesc.GetLengths());
this->outLengths = to_int_vector(outDesc.GetLengths());
this->inStrides = to_int_vector(inDesc.GetStrides());
this->outStrides = to_int_vector(outDesc.GetStrides());
this->invariantDims = invariantDims_;
this->toReduceDims = toReduceDims_;
assert(this->inLengths.size() == this->outLengths.size());
assert(!this->toReduceDims.empty());
for(const auto dim : this->invariantDims)
this->invariantLengths.push_back(this->inLengths[dim]);
for(const auto dim : this->toReduceDims)
toReduceLengths.push_back(this->inLengths[dim]);
this->reduceAllDims = this->invariantDims.empty();
};
~ReductionHost(){};
void
Run(float alpha, const InDataType* in_data, float beta, OutDataType* out_data, int* indices)
{
if constexpr(NeedIndices)
RunImpl_with_indices(alpha, in_data, beta, out_data, indices);
else
RunImpl_no_indices(alpha, in_data, beta, out_data);
};
private:
std::vector<int> inLengths;
std::vector<int> outLengths;
std::vector<int> inStrides;
std::vector<int> outStrides;
std::vector<int> invariantLengths;
std::vector<int> toReduceLengths;
std::vector<int> invariantDims;
std::vector<int> toReduceDims;
bool reduceAllDims;
void RunImpl_with_indices(
float alpha, const InDataType* in_data, float beta, OutDataType* out_data, int* indices)
{
using ck::host_reduce::binop_with_nan_check;
using ck::host_reduce::binop_with_nan_check2;
using ck::host_reduce::float_equal_one;
using ck::host_reduce::float_equal_zero;
using ck::host_reduce::PosUnaryOpFn;
using ck::host_reduce::PreUnaryOpFn;
using ck::host_reduce::ReduceOpFn2;
using ck::host_reduce::ReduceOpZeroVal;
auto opReduce = ReduceOpFn2<AccDataType, ReduceOpId>();
int divider = 1;
for(int i = 0; i < toReduceLengths.size(); i++)
divider *= toReduceLengths[i];
auto PreUnaryOp = PreUnaryOpFn<AccDataType, ReduceOpId>(divider);
auto PosUnaryOp = PosUnaryOpFn<AccDataType, ReduceOpId>(divider);
if(reduceAllDims)
{
std::vector<std::vector<int>> indexes_1;
get_all_indexes(inLengths, 0, indexes_1); // generate the input indexes space
auto accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
int accuIndex = 0;
// go through all indexes of the input tensor (all dimensions are reduced)
for(const auto& src_index : indexes_1)
{
auto src_offset = get_offset_from_index(this->inStrides, src_index);
auto currVal = static_cast<AccDataType>(in_data[src_offset]);
// unary operation before reducing, needed by AMAX. For MIN/MAX, nothing is actually
// done
PreUnaryOp(currVal);
auto currIndex = get_flatten_offset(inLengths, src_index);
binop_with_nan_check2<AccDataType, PropagateNan>(
opReduce, accuVal, currVal, accuIndex, currIndex);
};
// scale the accumulated value
if(!float_equal_one(alpha))
accuVal *= static_cast<AccDataType>(alpha);
// scale the prior dst value and add it to the accumulated value
if(!float_equal_zero(beta))
accuVal += static_cast<AccDataType>(out_data[0]) * static_cast<AccDataType>(beta);
// store the reduced value to dst location
out_data[0] = static_cast<OutDataType>(accuVal);
indices[0] = accuIndex;
}
else
{
std::vector<std::vector<int>> indexes_1, indexes_2;
get_all_indexes(
this->invariantLengths, 0, indexes_1); // generate the invariant indexes space
get_all_indexes(
this->toReduceLengths, 0, indexes_2); // generate the toReduce indexes space
// go through indexes of the invariant dimensions
for(const auto& index_1 : indexes_1)
{
std::vector<int> src_index;
std::vector<int> dst_index;
src_index.resize(this->inLengths.size());
// generate the part of src index belonging to invariant dims
for(int k = 0; k < invariantDims.size(); k++)
src_index[invariantDims[k]] = index_1[k];
for(int k = 0; k < invariantDims.size(); k++)
dst_index.push_back(index_1[k]);
int dst_offset = get_offset_from_index(this->outStrides, dst_index);
AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
int accuIndex = 0;
// go through indexes of the toReduce dimensions
for(const auto& index_2 : indexes_2)
{
// generate the part of src index belonging to toReduce dims
for(int k = 0; k < toReduceDims.size(); k++)
src_index[toReduceDims[k]] = index_2[k];
auto src_offset = get_offset_from_index(this->inStrides, src_index);
auto currVal = static_cast<AccDataType>(in_data[src_offset]);
// unary operation before reducing, needed by AMAX. For MIN/MAX, nothing is
// actually done
PreUnaryOp(currVal);
auto currIndex = get_flatten_offset(toReduceLengths, index_2);
binop_with_nan_check2<AccDataType, PropagateNan>(
opReduce, accuVal, currVal, accuIndex, currIndex);
};
// scale the accumulated value
if(!float_equal_one(alpha))
accuVal *= static_cast<AccDataType>(alpha);
// scale the prior dst value and add it to the accumulated value
if(!float_equal_zero(beta))
accuVal += static_cast<AccDataType>(out_data[dst_offset]) *
static_cast<AccDataType>(beta);
// store the reduced value to dst location
out_data[dst_offset] = static_cast<OutDataType>(accuVal);
indices[dst_offset] = accuIndex;
};
};
}; // end of RunImpl_with_indices()
void
RunImpl_no_indices(float alpha, const InDataType* in_data, float beta, OutDataType* out_data)
{
using ck::host_reduce::binop_with_nan_check;
using ck::host_reduce::binop_with_nan_check2;
using ck::host_reduce::float_equal_one;
using ck::host_reduce::float_equal_zero;
using ck::host_reduce::PosUnaryOpFn;
using ck::host_reduce::PreUnaryOpFn;
using ck::host_reduce::ReduceOpFn;
using ck::host_reduce::ReduceOpZeroVal;
auto opReduce = ReduceOpFn<AccDataType, ReduceOpId>();
int divider = 1;
for(int i = 0; i < toReduceLengths.size(); i++)
divider *= toReduceLengths[i];
auto PreUnaryOp = PreUnaryOpFn<AccDataType, ReduceOpId>(divider);
auto PosUnaryOp = PosUnaryOpFn<AccDataType, ReduceOpId>(divider);
if(reduceAllDims)
{
std::vector<std::vector<int>> indexes_1;
get_all_indexes(inLengths, 0, indexes_1); // generate the input indexes space
auto accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
// go through all indexes of the input tensor (all dimensions are reduced)
for(const auto& src_index : indexes_1)
{
auto src_offset = get_offset_from_index(this->inStrides, src_index);
auto currVal = static_cast<AccDataType>(in_data[src_offset]);
PreUnaryOp(currVal);
binop_with_nan_check<AccDataType, PropagateNan>(opReduce, accuVal, currVal);
};
PosUnaryOp(accuVal);
// scale the accumulated value
if(!float_equal_one(alpha))
accuVal *= static_cast<AccDataType>(alpha);
// scale the prior dst value and add it to the accumulated value
if(!float_equal_zero(beta))
accuVal += static_cast<AccDataType>(out_data[0]) * static_cast<AccDataType>(beta);
// store the reduced value to dst location
out_data[0] = static_cast<OutDataType>(accuVal);
}
else
{
std::vector<std::vector<int>> indexes_1, indexes_2;
get_all_indexes(
this->invariantLengths, 0, indexes_1); // generate the invariant indexes space
get_all_indexes(
this->toReduceLengths, 0, indexes_2); // generate the toReduce indexes space
// go through indexes of the invariant dimensions
for(const auto& index_1 : indexes_1)
{
std::vector<int> src_index;
std::vector<int> dst_index;
src_index.resize(this->inLengths.size());
for(int k = 0; k < invariantDims.size(); k++)
dst_index.push_back(index_1[k]);
int dst_offset = get_offset_from_index(this->outStrides, dst_index);
// generate the part of src index belonging to invariant dims
for(int k = 0; k < invariantDims.size(); k++)
src_index[invariantDims[k]] = index_1[k];
AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
// go through indexes of the toReduce dimensions
for(const auto& index_2 : indexes_2)
{
// generate the part of src index belonging to toReduce dims
for(int k = 0; k < toReduceDims.size(); k++)
src_index[toReduceDims[k]] = index_2[k];
auto src_offset = get_offset_from_index(this->inStrides, src_index);
auto currVal = static_cast<AccDataType>(in_data[src_offset]);
PreUnaryOp(currVal);
binop_with_nan_check<AccDataType, PropagateNan>(opReduce, accuVal, currVal);
};
PosUnaryOp(accuVal);
// scale the accumulated value
if(!float_equal_one(alpha))
accuVal *= static_cast<AccDataType>(alpha);
// scale the prior dst value and add it to the accumulated value
if(!float_equal_zero(beta))
accuVal += static_cast<AccDataType>(out_data[dst_offset]) *
static_cast<AccDataType>(beta);
// store the reduced value to dst location
out_data[dst_offset] = static_cast<OutDataType>(accuVal);
};
};
}; // end of RunImpl_no_indices()
};
}; // end of namespace host_reduce
}; // end of namespace ck
#endif
......@@ -39,8 +39,8 @@ namespace ck {
namespace host_reduce {
using ck::NanPropagation_t;
using ck::ReduceTensorOp_t;
using ck::NanPropagation;
using ck::ReduceTensorOp;
template <typename T>
static inline bool float_equal_one(T);
......@@ -66,95 +66,95 @@ static inline bool float_equal_zero(half_float::half x)
return x == static_cast<half_float::half>(0.0f);
};
template <typename compType, ReduceTensorOp_t ReduceOpId>
__host__ static inline std::function<void(compType&)> PreUnaryOpFn(int)
template <typename AccDataType, ReduceTensorOp ReduceOpId>
__host__ static inline std::function<void(AccDataType&)> PreUnaryOpFn(int)
{
using std::abs;
if constexpr(ReduceOpId == ReduceTensorOp_t::NORM1)
if constexpr(ReduceOpId == ReduceTensorOp::NORM1)
{
return ([&](compType& a_) { a_ = abs(a_); });
return ([&](AccDataType& a_) { a_ = abs(a_); });
}
else if constexpr(ReduceOpId == ReduceTensorOp_t::NORM2)
else if constexpr(ReduceOpId == ReduceTensorOp::NORM2)
{
return ([&](compType& a_) { a_ = a_ * a_; });
return ([&](AccDataType& a_) { a_ = a_ * a_; });
}
else if constexpr(ReduceOpId == ReduceTensorOp_t::AMAX)
else if constexpr(ReduceOpId == ReduceTensorOp::AMAX)
{
return ([&](compType& a_) { a_ = abs(a_); });
return ([&](AccDataType& a_) { a_ = abs(a_); });
}
else
{
// ReduceTensorOp_t::AVG:
// ReduceTensorOp_t::ADD:
// ReduceTensorOp_t::MUL:
// ReduceTensorOp_t::MIN:
// ReduceTensorOp_t::MAX:
return ([&](compType&) {});
// ReduceTensorOp::AVG:
// ReduceTensorOp::ADD:
// ReduceTensorOp::MUL:
// ReduceTensorOp::MIN:
// ReduceTensorOp::MAX:
return ([&](AccDataType&) {});
};
};
template <typename compType, ReduceTensorOp_t ReduceOpId>
__host__ static inline std::function<void(compType&)> PosUnaryOpFn(int divider)
template <typename AccDataType, ReduceTensorOp ReduceOpId>
__host__ static inline std::function<void(AccDataType&)> PosUnaryOpFn(int32_t divider)
{
using std::sqrt;
if constexpr(ReduceOpId == ReduceTensorOp_t::NORM2)
if constexpr(ReduceOpId == ReduceTensorOp::NORM2)
{
return ([&](compType& a_) { a_ = sqrt(a_); });
return ([&](AccDataType& a_) { a_ = sqrt(a_); });
}
else if constexpr(ReduceOpId == ReduceTensorOp_t::AVG)
else if constexpr(ReduceOpId == ReduceTensorOp::AVG)
{
return ([&, divider](compType& a_) {
a_ = a_ / static_cast<compType>(static_cast<float>(divider));
return ([&, divider](AccDataType& a_) {
a_ = a_ / static_cast<AccDataType>(static_cast<float>(divider));
});
}
else
{
// ReduceTensorOp_t::ADD:
// ReduceTensorOp_t::NORM1:
// ReduceTensorOp_t::MUL:
// ReduceTensorOp_t::MIN:
// ReduceTensorOp_t::MAX:
// ReduceTensorOp_t::AMAX:
return ([&](compType&) {});
// ReduceTensorOp::ADD:
// ReduceTensorOp::NORM1:
// ReduceTensorOp::MUL:
// ReduceTensorOp::MIN:
// ReduceTensorOp::MAX:
// ReduceTensorOp::AMAX:
return ([&](AccDataType&) {});
}
};
template <typename compType, ReduceTensorOp_t ReduceOpId>
__host__ static inline std::function<void(compType&, compType)> ReduceOpFn()
template <typename AccDataType, ReduceTensorOp ReduceOpId>
__host__ static inline std::function<void(AccDataType&, AccDataType)> ReduceOpFn()
{
if constexpr(ReduceOpId == ReduceTensorOp_t::ADD || ReduceOpId == ReduceTensorOp_t::AVG ||
ReduceOpId == ReduceTensorOp_t::NORM1 || ReduceOpId == ReduceTensorOp_t::NORM2)
if constexpr(ReduceOpId == ReduceTensorOp::ADD || ReduceOpId == ReduceTensorOp::AVG ||
ReduceOpId == ReduceTensorOp::NORM1 || ReduceOpId == ReduceTensorOp::NORM2)
{
return ([&](compType& a_, compType b_) { a_ = a_ + b_; });
return ([&](AccDataType& a_, AccDataType b_) { a_ = a_ + b_; });
}
else if constexpr(ReduceOpId == ReduceTensorOp_t::MUL)
else if constexpr(ReduceOpId == ReduceTensorOp::MUL)
{
return ([&](compType& a_, compType b_) { a_ = a_ * b_; });
return ([&](AccDataType& a_, AccDataType b_) { a_ = a_ * b_; });
}
else if constexpr(ReduceOpId == ReduceTensorOp_t::MIN)
else if constexpr(ReduceOpId == ReduceTensorOp::MIN)
{
return ([&](compType& a_, compType b_) {
return ([&](AccDataType& a_, AccDataType b_) {
if(a_ > b_)
a_ = b_;
});
}
else if constexpr(ReduceOpId == ReduceTensorOp_t::MAX || ReduceOpId == ReduceTensorOp_t::AMAX)
else if constexpr(ReduceOpId == ReduceTensorOp::MAX || ReduceOpId == ReduceTensorOp::AMAX)
{
return ([&](compType& a_, compType b_) {
return ([&](AccDataType& a_, AccDataType b_) {
if(a_ < b_)
a_ = b_;
});
}
};
template <typename compType, ReduceTensorOp_t ReduceOpId>
__host__ static inline std::function<void(compType&, compType, bool& changed)> ReduceOpFn2()
template <typename AccDataType, ReduceTensorOp ReduceOpId>
__host__ static inline std::function<void(AccDataType&, AccDataType, bool& changed)> ReduceOpFn2()
{
if constexpr(ReduceOpId == ReduceTensorOp_t::MIN)
if constexpr(ReduceOpId == ReduceTensorOp::MIN)
{
return ([&](compType& a_, compType b_, bool& changed) {
return ([&](AccDataType& a_, AccDataType b_, bool& changed) {
if(a_ > b_)
{
a_ = b_;
......@@ -164,9 +164,9 @@ __host__ static inline std::function<void(compType&, compType, bool& changed)> R
changed = false;
});
}
else if constexpr(ReduceOpId == ReduceTensorOp_t::MAX || ReduceOpId == ReduceTensorOp_t::AMAX)
else if constexpr(ReduceOpId == ReduceTensorOp::MAX || ReduceOpId == ReduceTensorOp::AMAX)
{
return ([&](compType& a_, compType b_, bool& changed) {
return ([&](AccDataType& a_, AccDataType b_, bool& changed) {
if(a_ < b_)
{
a_ = b_;
......@@ -178,48 +178,49 @@ __host__ static inline std::function<void(compType&, compType, bool& changed)> R
}
else
{
// ReduceTensorOp_t::ADD:
// ReduceTensorOp_t::MUL:
// ReduceTensorOp_t::AVG:
// ReduceTensorOp_t::NORM1:
// ReduceTensorOp_t::NORM2:
return (std::function<void(compType&, compType, bool&)>{});
// ReduceTensorOp::ADD:
// ReduceTensorOp::MUL:
// ReduceTensorOp::AVG:
// ReduceTensorOp::NORM1:
// ReduceTensorOp::NORM2:
return (std::function<void(AccDataType&, AccDataType, bool&)>{});
};
};
template <typename compType, ReduceTensorOp_t ReduceOpId>
__host__ static inline compType ReduceOpZeroVal()
template <typename AccDataType, ReduceTensorOp ReduceOpId>
__host__ static inline AccDataType ReduceOpZeroVal()
{
if constexpr(ReduceOpId == ReduceTensorOp_t::MUL)
if constexpr(ReduceOpId == ReduceTensorOp::MUL)
{
return (static_cast<compType>(1.0f));
return (static_cast<AccDataType>(1.0f));
}
else if constexpr(ReduceOpId == ReduceTensorOp_t::MIN)
else if constexpr(ReduceOpId == ReduceTensorOp::MIN)
{
return (std::numeric_limits<compType>::max());
return (std::numeric_limits<AccDataType>::max());
}
else if constexpr(ReduceOpId == ReduceTensorOp_t::MAX)
else if constexpr(ReduceOpId == ReduceTensorOp::MAX)
{
return (std::numeric_limits<compType>::lowest());
return (std::numeric_limits<AccDataType>::lowest());
}
else if constexpr(ReduceOpId == ReduceTensorOp_t::AMAX)
else if constexpr(ReduceOpId == ReduceTensorOp::AMAX)
{
return (static_cast<compType>(0.0f));
return (static_cast<AccDataType>(0.0f));
}
else
{
// ReduceTensorOp_t::ADD
// ReduceTensorOp_t::AVG
// ReduceTensorOp_t::NORM1
// ReduceTensorOp_t::NORM2
return (static_cast<compType>(0.0f));
// ReduceTensorOp::ADD
// ReduceTensorOp::AVG
// ReduceTensorOp::NORM1
// ReduceTensorOp::NORM2
return (static_cast<AccDataType>(0.0f));
};
};
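To see how the pre-/post-unary ops, reduce op, and zero value above compose, consider ReduceTensorOp::NORM2: the pre-op squares each value, the additive reduce op accumulates, and the post-op takes the square root, so reducing {3, 4} gives 5. A hedged host-side sketch (the function name is hypothetical; the divider argument is unused by NORM2):
// Illustrative composition of the reduction helpers above for NORM2.
inline void norm2_example()
{
    auto pre  = ck::host_reduce::PreUnaryOpFn<float, ck::ReduceTensorOp::NORM2>(2);
    auto post = ck::host_reduce::PosUnaryOpFn<float, ck::ReduceTensorOp::NORM2>(2);
    auto op   = ck::host_reduce::ReduceOpFn<float, ck::ReduceTensorOp::NORM2>();
    float acc = ck::host_reduce::ReduceOpZeroVal<float, ck::ReduceTensorOp::NORM2>(); // 0
    for(float v : {3.0f, 4.0f})
    {
        pre(v);     // v -> v * v
        op(acc, v); // acc += v
    }
    post(acc);      // acc -> sqrt(acc) == 5
}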
template <typename compType, bool PropagateNan>
__host__ static inline void binop_with_nan_check(std::function<void(compType&, compType)> opReduce,
compType& accuVal,
compType currVal)
template <typename AccDataType, bool PropagateNan>
__host__ static inline void
binop_with_nan_check(std::function<void(AccDataType&, AccDataType)> opReduce,
AccDataType& accuVal,
AccDataType currVal)
{
using std::isnan;
......@@ -236,11 +237,11 @@ __host__ static inline void binop_with_nan_check(std::function<void(compType&, c
};
};
template <typename compType, bool PropagateNan>
template <typename AccDataType, bool PropagateNan>
__host__ static inline void
binop_with_nan_check2(std::function<void(compType&, compType, bool&)> opReduce,
compType& accuVal,
compType currVal,
binop_with_nan_check2(std::function<void(AccDataType&, AccDataType, bool&)> opReduce,
AccDataType& accuVal,
AccDataType currVal,
int& accuIndex,
int currIndex)
{
......
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef HOST_REDUCTION_HPP_
#define HOST_REDUCTION_HPP_
#include <vector>
#include <array>
#include <functional>
#include "reduction_enums.hpp"
#include "host_reduce_util.hpp"
#include "host_tensor.hpp"
#include "data_type.hpp"
template <int NDim>
static void get_all_indexes(const std::array<size_t, NDim>& dimLengths,
std::vector<std::array<size_t, NDim>>& indexes)
{
static_assert(NDim >= 1, "NDim >= 1 is required to use this function!");
if constexpr(NDim == 1)
{
for(size_t i = 0; i < dimLengths[0]; i++)
{
std::array<size_t, 1> index{i};
indexes.push_back(index);
};
}
else
{
std::array<size_t, NDim - 1> partial_dim_lengths;
for(int i = 0; i < NDim - 1; i++)
partial_dim_lengths[i] = dimLengths[i + 1];
std::vector<std::array<size_t, NDim - 1>> partial_indexes;
get_all_indexes<NDim - 1>(partial_dim_lengths, partial_indexes);
for(size_t i = 0; i < dimLengths[0]; i++)
for(const auto& index : partial_indexes)
{
std::array<size_t, NDim> extIndex;
extIndex[0] = i;
for(int k = 0; k < NDim - 1; k++)
extIndex[k + 1] = index[k];
indexes.push_back(extIndex);
};
};
};
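For example, for dimLengths {2, 3} the recursion above yields the six indexes (0,0), (0,1), (0,2), (1,0), (1,1), (1,2) in lexicographic order; a quick host-side sketch (function name hypothetical, illustrative only):
// Illustrative expansion of get_all_indexes<NDim> defined above.
inline void all_indexes_example()
{
    std::array<size_t, 2> lengths = {2, 3};
    std::vector<std::array<size_t, 2>> indexes;
    get_all_indexes<2>(lengths, indexes);
    // indexes.size() == 6; indexes.front() == {0, 0}; indexes.back() == {1, 2}
}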
template <int NDim>
static size_t get_offset_from_index(const std::array<size_t, NDim>& strides,
const std::array<size_t, NDim>& index)
{
size_t offset = 0;
for(int i = 0; i < NDim; i++)
offset += strides[i] * index[i];
return (offset);
};
template <int NDim>
static size_t get_offset_from_index(const std::vector<size_t>& strides,
const std::array<size_t, NDim>& index)
{
size_t offset = 0;
for(int i = 0; i < NDim; i++)
offset += strides[i] * index[i];
return (offset);
};
template <typename InDataType,
typename AccDataType,
typename OutDataType,
ck::ReduceTensorOp ReduceOpId,
int Rank,
int NumReduceDim,
bool PropagateNan,
bool NeedIndices>
struct ReductionHost
{
using IndexDataType = int32_t;
static constexpr int NumInvariantDim = Rank - NumReduceDim;
std::vector<size_t> outStrides;
std::vector<int> invariantDims;
std::vector<int> reduceDims;
IndexDataType divider;
std::function<void(AccDataType&)> preUnaryOp;
std::function<void(AccDataType&)> posUnaryOp;
std::array<size_t, NumReduceDim> reduceLengths;
std::array<size_t, NumReduceDim> reduceStrides;
std::array<size_t, NumInvariantDim> invariantLengths;
std::array<size_t, NumInvariantDim> invariantStrides;
std::vector<std::array<size_t, NumReduceDim>> reduce_dim_indexes;
std::vector<std::array<size_t, NumInvariantDim>> invariant_dim_indexes;
ReductionHost(HostTensorDescriptor& inDesc,
HostTensorDescriptor& outDesc,
const std::vector<int>& invariantDims_,
const std::vector<int>& reduceDims_)
{
using ck::host_reduce::PosUnaryOpFn;
using ck::host_reduce::PreUnaryOpFn;
// this->outLengths = to_int_vector(outDesc.GetLengths());
this->outStrides = outDesc.GetStrides();
this->invariantDims = invariantDims_;
this->reduceDims = reduceDims_;
int product = 1;
for(int i = 0; i < NumReduceDim; i++)
{
reduceLengths[i] = inDesc.GetLengths()[reduceDims[i]];
reduceStrides[i] = inDesc.GetStrides()[reduceDims[i]];
product *= inDesc.GetLengths()[reduceDims[i]];
};
divider = product;
for(int i = 0; i < NumInvariantDim; i++)
{
invariantLengths[i] = inDesc.GetLengths()[invariantDims[i]];
invariantStrides[i] = inDesc.GetStrides()[invariantDims[i]];
};
reduce_dim_indexes.clear();
get_all_indexes<NumReduceDim>(reduceLengths, reduce_dim_indexes);
if constexpr(NumInvariantDim > 0)
{
invariant_dim_indexes.clear();
get_all_indexes<NumInvariantDim>(invariantLengths, invariant_dim_indexes);
};
preUnaryOp = PreUnaryOpFn<AccDataType, ReduceOpId>(divider);
posUnaryOp = PosUnaryOpFn<AccDataType, ReduceOpId>(divider);
};
void Run(float alpha,
const InDataType* in_data,
float beta,
OutDataType* out_data,
IndexDataType* out_indices)
{
if constexpr(NeedIndices)
{
RunImpl_with_index(alpha, in_data, beta, out_data, out_indices);
}
else
{
RunImpl_no_index(alpha, in_data, beta, out_data);
};
};
void RunImpl_with_index(float alpha,
const InDataType* in_data,
float beta,
OutDataType* out_data,
IndexDataType* out_indices)
{
using ck::type_convert;
using ck::host_reduce::binop_with_nan_check2;
using ck::host_reduce::float_equal_one;
using ck::host_reduce::float_equal_zero;
using ck::host_reduce::ReduceOpFn2;
using ck::host_reduce::ReduceOpZeroVal;
auto opReduce2 = ReduceOpFn2<AccDataType, ReduceOpId>();
if constexpr(NumInvariantDim == 0)
{
AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
IndexDataType accuIndex = 0;
for(IndexDataType i = 0; i < reduce_dim_indexes.size(); i++)
{
auto offset_reduce =
get_offset_from_index<NumReduceDim>(reduceStrides, reduce_dim_indexes[i]);
auto currVal = type_convert<AccDataType>(in_data[offset_reduce]);
preUnaryOp(currVal);
auto currIndex = i;
binop_with_nan_check2<AccDataType, PropagateNan>(
opReduce2, accuVal, currVal, accuIndex, currIndex);
};
posUnaryOp(accuVal);
if(!float_equal_one(alpha))
accuVal *= type_convert<AccDataType>(alpha);
if(!float_equal_zero(beta))
accuVal += type_convert<AccDataType>(out_data[0]) * type_convert<AccDataType>(beta);
out_data[0] = type_convert<OutDataType>(accuVal);
out_indices[0] = accuIndex;
}
else
{
auto thread_reduce_func = [&](auto invariant_index) {
AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
IndexDataType accuIndex = 0;
auto offset_invariant =
get_offset_from_index<NumInvariantDim>(invariantStrides, invariant_index);
for(IndexDataType i = 0; i < reduce_dim_indexes.size(); i++)
{
auto offset_reduce =
get_offset_from_index<NumReduceDim>(reduceStrides, reduce_dim_indexes[i]);
auto currVal =
type_convert<AccDataType>(in_data[offset_invariant + offset_reduce]);
preUnaryOp(currVal);
auto currIndex = i;
binop_with_nan_check2<AccDataType, PropagateNan>(
opReduce2, accuVal, currVal, accuIndex, currIndex);
};
posUnaryOp(accuVal);
if(!float_equal_one(alpha))
accuVal *= type_convert<AccDataType>(alpha);
auto dst_offset =
get_offset_from_index<NumInvariantDim>(outStrides, invariant_index);
if(!float_equal_zero(beta))
accuVal += type_convert<AccDataType>(out_data[dst_offset]) *
type_convert<AccDataType>(beta);
out_data[dst_offset] = type_convert<OutDataType>(accuVal);
out_indices[dst_offset] = accuIndex;
};
std::size_t num_thread = 1;
std::size_t work_per_thread =
(invariant_dim_indexes.size() + num_thread - 1) / num_thread;
std::vector<joinable_thread> threads(num_thread);
for(std::size_t it = 0; it < num_thread; ++it)
{
std::size_t iw_begin = it * work_per_thread;
std::size_t iw_end =
std::min((it + 1) * work_per_thread, invariant_dim_indexes.size());
auto f = [=] {
for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
{
thread_reduce_func(invariant_dim_indexes[iw]);
}
};
threads[it] = joinable_thread(f);
}
};
};
void RunImpl_no_index(float alpha, const InDataType* in_data, float beta, OutDataType* out_data)
{
using ck::type_convert;
using ck::host_reduce::binop_with_nan_check;
using ck::host_reduce::float_equal_one;
using ck::host_reduce::float_equal_zero;
using ck::host_reduce::ReduceOpFn;
using ck::host_reduce::ReduceOpZeroVal;
auto opReduce = ReduceOpFn<AccDataType, ReduceOpId>();
if constexpr(NumInvariantDim == 0)
{
AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
for(const auto& reduce_index : reduce_dim_indexes)
{
auto offset_reduce =
get_offset_from_index<NumReduceDim>(reduceStrides, reduce_index);
auto currVal = type_convert<AccDataType>(in_data[offset_reduce]);
preUnaryOp(currVal);
binop_with_nan_check<AccDataType, PropagateNan>(opReduce, accuVal, currVal);
};
posUnaryOp(accuVal);
if(!float_equal_one(alpha))
accuVal *= type_convert<AccDataType>(alpha);
if(!float_equal_zero(beta))
accuVal += type_convert<AccDataType>(out_data[0]) * type_convert<AccDataType>(beta);
out_data[0] = type_convert<OutDataType>(accuVal);
}
else
{
auto thread_reduce_func = [&](auto invariant_index) {
AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
auto offset_invariant =
get_offset_from_index<NumInvariantDim>(invariantStrides, invariant_index);
for(const auto& reduce_index : reduce_dim_indexes)
{
auto offset_reduce =
get_offset_from_index<NumReduceDim>(reduceStrides, reduce_index);
auto currVal =
type_convert<AccDataType>(in_data[offset_invariant + offset_reduce]);
preUnaryOp(currVal);
binop_with_nan_check<AccDataType, PropagateNan>(opReduce, accuVal, currVal);
};
posUnaryOp(accuVal);
if(!float_equal_one(alpha))
accuVal *= type_convert<AccDataType>(alpha);
auto dst_offset =
get_offset_from_index<NumInvariantDim>(outStrides, invariant_index);
if(!float_equal_zero(beta))
accuVal += type_convert<AccDataType>(out_data[dst_offset]) *
type_convert<AccDataType>(beta);
out_data[dst_offset] = type_convert<OutDataType>(accuVal);
};
std::size_t num_thread = 1;
std::size_t work_per_thread =
(invariant_dim_indexes.size() + num_thread - 1) / num_thread;
std::vector<joinable_thread> threads(num_thread);
for(std::size_t it = 0; it < num_thread; ++it)
{
std::size_t iw_begin = it * work_per_thread;
std::size_t iw_end =
std::min((it + 1) * work_per_thread, invariant_dim_indexes.size());
auto f = [=] {
for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
{
thread_reduce_func(invariant_dim_indexes[iw]);
}
};
threads[it] = joinable_thread(f);
}
};
};
};
#endif
......@@ -40,20 +40,6 @@ std::ostream& LogRangeAsType(std::ostream& os, Range&& range, std::string delim)
return os;
}
typedef enum
{
Half = 0,
Float = 1,
} DataType_t;
template <typename T>
struct DataType;
template <>
struct DataType<float> : std::integral_constant<DataType_t, DataType_t::Float>
{
};
template <typename F, typename T, std::size_t... Is>
auto call_f_unpack_args_impl(F f, T args, std::index_sequence<Is...>)
{
......@@ -87,10 +73,10 @@ struct HostTensorDescriptor
HostTensorDescriptor() = delete;
template <typename X>
HostTensorDescriptor(std::vector<X> lens);
HostTensorDescriptor(const std::vector<X>& lens);
template <typename X, typename Y>
HostTensorDescriptor(std::vector<X> lens, std::vector<Y> strides);
HostTensorDescriptor(const std::vector<X>& lens, const std::vector<Y>& strides);
void CalculateStrides();
......@@ -177,7 +163,7 @@ struct ParallelTensorFunctor
return indices;
}
void operator()(std::size_t num_thread = std::thread::hardware_concurrency()) const
void operator()(std::size_t num_thread = 1) const
{
std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread;
......@@ -227,7 +213,7 @@ struct Tensor
Tensor(const HostTensorDescriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpace()) {}
template <typename G>
void GenerateTensorValue(G g, std::size_t num_thread = std::thread::hardware_concurrency())
void GenerateTensorValue(G g, std::size_t num_thread = 1)
{
switch(mDesc.GetNumOfDimension())
{
......@@ -299,85 +285,69 @@ struct Tensor
};
template <typename X>
HostTensorDescriptor::HostTensorDescriptor(std::vector<X> lens) : mLens(lens)
HostTensorDescriptor::HostTensorDescriptor(const std::vector<X>& lens) : mLens(lens)
{
this->CalculateStrides();
}
template <typename X, typename Y>
HostTensorDescriptor::HostTensorDescriptor(std::vector<X> lens, std::vector<Y> strides)
HostTensorDescriptor::HostTensorDescriptor(const std::vector<X>& lens,
const std::vector<Y>& strides)
: mLens(lens), mStrides(strides)
{
}
void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream& os = std::cout);
float bf16_to_f32_(ck::bhalf_t src_val);
#if 1
// FIXME: remove
void bf16_to_f32_(const Tensor<ck::bhalf_t>& src, Tensor<float>& dst);
#endif
template <typename T>
void check_error(const Tensor<T>& ref, const Tensor<T>& result)
float check_error(const Tensor<T>& ref, const Tensor<T>& result)
{
float error = 0;
float max_diff = -1;
float ref_value = 0, result_value = 0;
float l1_error = 0;
float linf_error = -1;
float linf_rel_error = -1;
float linf_ref_value = 0, linf_result_value = 0;
float linf_rel_ref_value = 0, linf_rel_result_value = 0;
constexpr float eps = 1e-10;
if constexpr(std::is_same<ck::bhalf_t, T>::value)
for(int i = 0; i < ref.mData.size(); ++i)
{
for(int i = 0; i < ref.mData.size(); ++i)
float ref_v = ck::type_convert<float>(ref.mData[i]);
float result_v = ck::type_convert<float>(result.mData[i]);
float diff = std::abs(ref_v - result_v);
float rel_diff = diff / std::max(std::abs(ref_v), eps);
l1_error += diff;
if(linf_error < diff)
{
error += std::abs(bf16_to_f32_(ref.mData[i]) - bf16_to_f32_(result.mData[i]));
float diff = std::abs(bf16_to_f32_(ref.mData[i]) - bf16_to_f32_(result.mData[i]));
if(max_diff < diff)
{
max_diff = diff;
ref_value = bf16_to_f32_(ref.mData[i]);
result_value = bf16_to_f32_(result.mData[i]);
}
linf_error = diff;
linf_ref_value = ref_v;
linf_result_value = result_v;
}
}
else
{
for(int i = 0; i < ref.mData.size(); ++i)
if(linf_rel_error < rel_diff)
{
error += std::abs(double(ref.mData[i]) - double(result.mData[i]));
float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
if(max_diff < diff)
{
max_diff = diff;
ref_value = ref.mData[i];
result_value = result.mData[i];
}
linf_rel_error = rel_diff;
linf_rel_ref_value = ref_v;
linf_rel_result_value = result_v;
}
}
std::cout << "error: " << error << std::endl;
std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value << std::endl;
}
template <typename T>
void check_indices(const Tensor<T>& ref, const Tensor<T>& result)
{
bool has_error = false;
int error_count = 0;
for(int i = 0; i < ref.mData.size(); ++i)
{
if(ref.mData[i] != result.mData[i])
{
std::cerr << std::endl
<< "Indices different at position " << i << " (ref: " << ref.mData[i]
<< ", result: " << result.mData[i] << ")" << std::endl;
has_error = true;
error_count++;
if(error_count == 20)
break;
};
}
std::cout << "Absolute Error L1 Norm (sum of abs diff): " << l1_error << std::endl;
std::cout << "Absolute Error L-inf Norm (max abs diff): " << linf_error << ", ref "
<< linf_ref_value << ", result " << linf_result_value << std::endl;
std::cout << "Relative Error L-inf Norm (max relative abs diff): " << linf_rel_error << ", ref "
<< linf_rel_ref_value << ", result " << linf_rel_result_value << std::endl;
if(!has_error)
std::cout << std::endl << "Indices result is completely accurate!" << std::endl;
return linf_error;
}
#endif
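A hedged sketch of how the reworked check_error above can gate a host-side test: it now returns the L-inf absolute error in addition to printing the L1, L-inf, and relative L-inf summaries. The wrapper name and tolerance below are illustrative only.
// Hypothetical wrapper around check_error() defined above.
template <typename T>
bool tensors_match(const Tensor<T>& ref, const Tensor<T>& dev, float tol = 1e-5f)
{
    const float linf_abs_error = check_error(ref, dev); // also prints the error summaries
    return linf_abs_error < tol;
}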
#ifndef HOST_TENSOR_GENERATOR_HPP
#define HOST_TENSOR_GENERATOR_HPP
#pragma once
#include <cmath>
#include <numeric>
#include "config.hpp"
template <typename T>
......@@ -93,8 +94,8 @@ struct GeneratorTensor_2<int8_t>
template <typename T>
struct GeneratorTensor_3
{
T min_value = 0;
T max_value = 1;
float min_value = 0;
float max_value = 1;
template <typename... Is>
T operator()(Is...)
......@@ -122,22 +123,6 @@ struct GeneratorTensor_3<ck::bhalf_t>
}
};
template <>
struct GeneratorTensor_3<int8_t>
{
float min_value = 0;
float max_value = 1;
template <typename... Is>
int8_t operator()(Is...)
{
int8_t min_tmp = static_cast<int8_t>(min_value);
int8_t max_tmp = static_cast<int8_t>(max_value);
return (std::rand() % (max_tmp - min_tmp)) + min_tmp;
}
};
struct GeneratorTensor_Checkboard
{
template <typename... Ts>
......@@ -163,5 +148,3 @@ struct GeneratorTensor_Sequential
return dims[Dim];
}
};
#endif