"git@developer.sourcefind.cn:modelzoo/resnet50_tensorflow.git" did not exist on "33fc3eebdc21821539bd1b17c3957f2184a9792c"
Unverified commit fcbb9788, authored by Chao Liu, committed by GitHub

Dynamic tensor descriptor (#24)



* support dynamic tensor descriptor

* use buffer load OOB feature for padding case

* add navi support

* add int8x4 inference kernel
Co-authored-by: Chao Liu <chao@ixt-rack-81.local.lan>
Co-authored-by: Jing Zhang <jizhan@amd.com>
parent bbcb67d0
#ifndef CK_AMD_LLVM_INTRINSIC_HPP
#define CK_AMD_LLVM_INTRINSIC_HPP
#include "float_type.hpp"
namespace ck {
__device__ int32_t __llvm_amdgcn_readfirstlane_i32(int32_t i) __asm("llvm.amdgcn.readfirstlane");
} // namespace ck
#endif
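// Illustration only, not part of this commit: readfirstlane broadcasts lane 0's
// value to all lanes, which lets the compiler keep a thread-invariant value (and
// arithmetic on it) in scalar registers. CK guards this use behind the
// CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
// config flag further below; the wrapper name here is hypothetical.
__device__ int32_t to_sgpr(int32_t thread_invariant_value)
{
    // If every lane computes the same value, reading lane 0's copy is lossless
    // and pins the result into an SGPR.
    return ck::__llvm_amdgcn_readfirstlane_i32(thread_invariant_value);
}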
#ifndef CK_ARRAY_HPP
#define CK_ARRAY_HPP

#include "functional2.hpp"
#include "sequence.hpp"

namespace ck {

template <typename TData, index_t NSize>
struct Array
{
    using type      = Array;
    using data_type = TData;

    TData mData[NSize];

    __host__ __device__ static constexpr index_t Size() { return NSize; }

    __host__ __device__ constexpr const TData& At(index_t i) const { return mData[i]; }

    __host__ __device__ constexpr TData& At(index_t i) { return mData[i]; }

    __host__ __device__ constexpr const TData& operator[](index_t i) const { return At(i); }

    __host__ __device__ constexpr TData& operator()(index_t i) { return At(i); }

    template <typename T>
    __host__ __device__ constexpr auto operator=(const T& a)
    {
        static_assert(T::Size() == Size(), "wrong! size not the same");

        static_for<0, Size(), 1>{}([&](auto i) { operator()(i) = a[i]; });

        return *this;
    }
};

// empty Array
template <typename TData>
struct Array<TData, 0>
{
    using type      = Array;
    using data_type = TData;

    __host__ __device__ static constexpr index_t Size() { return 0; }
};

template <typename X, typename... Xs>
__host__ __device__ constexpr auto make_array(X&& x, Xs&&... xs)
{
    using data_type = remove_cv_t<remove_reference_t<X>>;
    return Array<data_type, sizeof...(Xs) + 1>{{std::forward<X>(x), std::forward<Xs>(xs)...}};
}

// make empty array
template <typename X>
__host__ __device__ constexpr auto make_array()
{
    return Array<X, 0>{};
}

} // namespace ck
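// Illustration only, not part of this commit: make_array deduces the element
// type from its first argument; the values below are made up.
__device__ void array_example()
{
    auto a = ck::make_array(1, 2, 3); // ck::Array<int, 3>

    a(0)  = 7;                        // mutable access via operator()
    int x = a[1];                     // read-only access via operator[]
    (void)x;

    auto e = ck::make_array<int>();   // empty ck::Array<int, 0>
    (void)e;
}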
...
@@ -2,21 +2,26 @@
#define CK_COMMON_HEADER_HPP

#include "config.hpp"
#include "array.hpp"
#include "container_helper.hpp"
#include "statically_indexed_array.hpp"
#include "container_element_picker.hpp"
#include "float_type.hpp"
#include "functional.hpp"
#include "functional2.hpp"
#include "functional3.hpp"
#include "functional4.hpp"
#include "in_memory_operation.hpp"
#include "integral_constant.hpp"
#include "math.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "sequence_helper.hpp"
#include "synchronization.hpp"
#include "tuple.hpp"
#include "tuple_helper.hpp"
#include "type.hpp"
#include "utility.hpp"

#if CK_USE_AMD_INLINE_ASM
#include "amd_inline_asm.hpp"
...
#ifndef CK_CONFIG_AMD_HPP
#define CK_CONFIG_AMD_HPP

#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
#endif
#include "bfloat16_dev.hpp"

// device backend
#define CK_DEVICE_BACKEND_AMD 1

// GPU ID
#define CK_AMD_GPU_GFX906 1
#define CK_AMD_GPU_GFX908 0
#define CK_AMD_GPU_GFX1030 0

// HIP version
#ifndef CK_HIP_VERSION_FLAT
#define CK_HIP_VERSION_FLAT 0
#endif

// launch bounds
#define CK_USE_LAUNCH_BOUNDS 0

#ifdef CK_USE_LAUNCH_BOUNDS
#define CK_MAX_THREAD_PER_BLOCK 256
#define CK_MIN_BLOCK_PER_CU 1
#endif

// buffer resource
#if defined(CK_AMD_GPU_GFX906) || defined(CK_AMD_GPU_GFX908)
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000
#elif defined(CK_AMD_GPU_GFX1030)
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
#endif

// multi index
#define CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 0

// AMD inline asm
#ifndef CK_USE_AMD_INLINE_ASM
#define CK_USE_AMD_INLINE_ASM 1
@@ -20,14 +47,18 @@
#define CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM 1
#endif

#ifndef CK_USE_AMD_V_FMAC_F32
#define CK_USE_AMD_V_FMAC_F32 1
#endif

// AMD buffer addressing
#ifndef CK_USE_AMD_BUFFER_ADDRESSING
#define CK_USE_AMD_BUFFER_ADDRESSING 1
#endif

// only gfx908 supports native floating-point atomic add
#ifndef CK_USE_AMD_BUFFER_ATOMIC_FADD
#define CK_USE_AMD_BUFFER_ATOMIC_FADD 0
#endif

// AMD XDLOPS
@@ -49,8 +80,16 @@
#endif

// experimental implementation
#ifndef CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#endif

#ifndef CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
#define CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK 1
#endif

#ifndef CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK_OFFSET_TRICK
#define CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK_OFFSET_TRICK 1
#endif

#ifndef CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE
@@ -65,14 +104,33 @@
#define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK 0
#endif

// pass tensor descriptor by value, pointer, or void*
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE 1
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_POINTER 0
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 0

// hack: has an underlying assumption that must be satisfied, otherwise it's a bug
// hack for forcing the compiler to keep idx_diff_low_const in an SGPR; idx_diff_low_const
// must be thread-invariant, otherwise it's a bug
// TODO: separate index calculation into "compile-time", "global", "block", "wave", "thread"
#ifndef CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
#define CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 0
#endif

// workarounds: put all workarounds here
// workaround for unnecessary VGPR <--> AGPR data movement when using mfma LLVM intrinsic
#ifndef CK_WORKAROUND_SWDEV_229564
#define CK_WORKAROUND_SWDEV_229564 1
#endif

// workaround for accvgpr over-allocation
#ifndef CK_WORKAROUND_SWDEV_241664
#define CK_WORKAROUND_SWDEV_241664 1
#endif

// workaround for compiler crash when compiling recursive lambda
#ifndef CK_WORKAROUND_SWDEV_275126
#define CK_WORKAROUND_SWDEV_275126 1
#endif

namespace ck {
@@ -91,14 +149,8 @@ enum InMemoryDataOperation
AtomicAdd
};

// index type
using index_t = int32_t;

} // namespace ck
#endif
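// Illustration only, not part of this commit: CK_BUFFER_RESOURCE_3RD_DWORD fills
// the last dword of the 128-bit buffer resource descriptor consumed by AMD's raw
// buffer intrinsics. With the range field set to the tensor's valid size, the
// hardware returns 0 for out-of-range loads and drops out-of-range stores, which
// is what the OOB-check offset tricks above rely on for the padding case. The
// field layout below follows the GCN ISA docs; the names are illustrative.
union BufferResourceSketch
{
    ck::int32x4_t content;   // the 4-dword descriptor passed to buffer intrinsics
    struct
    {
        const void* address; // dwords 0-1: 64-bit base address
        int32_t range;       // dword 2: buffer size in bytes
        int32_t config;      // dword 3: CK_BUFFER_RESOURCE_3RD_DWORD
    } fields;
};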
#ifndef CK_CONTAINER_ELEMENT_PICKER_HPP
#define CK_CONTAINER_ELEMENT_PICKER_HPP
#include "functional2.hpp"
#include "sequence.hpp"
namespace ck {
// Arr: Array or StaticallyIndexedArray
// Picks: Sequence<...>
template <typename Arr, typename Picks>
struct ContainerElementPicker
{
using type = ContainerElementPicker;
#if 0
using data_type = typename Arr::data_type;
#endif
__host__ __device__ constexpr ContainerElementPicker() = delete;
__host__ __device__ constexpr ContainerElementPicker(Arr& array) : mArray{array}
{
constexpr index_t imax = reduce_on_sequence(Picks{}, math::maxer<index_t>{}, Number<0>{});
static_assert(imax < Arr::Size(), "wrong! exceeding # array element");
}
__host__ __device__ static constexpr auto Size() { return Picks::Size(); }
template <index_t I>
__host__ __device__ constexpr const auto& At(Number<I> i) const
{
static_assert(I < Size(), "wrong!");
constexpr auto IP = Picks{}[i];
return mArray[IP];
}
template <index_t I>
__host__ __device__ constexpr auto& At(Number<I> i)
{
static_assert(I < Size(), "wrong!");
constexpr auto IP = Picks{}[i];
return mArray(IP);
}
template <index_t I>
__host__ __device__ constexpr const auto& operator[](Number<I> i) const
{
return At(i);
}
template <index_t I>
__host__ __device__ constexpr auto& operator()(Number<I> i)
{
return At(i);
}
template <typename T>
__host__ __device__ constexpr auto operator=(const T& a)
{
static_assert(T::Size() == Size(), "wrong! size not the same");
static_for<0, Size(), 1>{}([&](auto i) { operator()(i) = a[i]; });
return *this;
}
private:
Arr& mArray;
};
// Arr: Array or StaticallyIndexedArray
// Picks: Sequence<...>
template <typename Arr, typename Picks>
struct ConstantContainerElementPicker
{
using type = ConstantContainerElementPicker;
#if 0
using data_type = typename Arr::data_type;
#endif
__host__ __device__ constexpr ConstantContainerElementPicker() = delete;
__host__ __device__ constexpr ConstantContainerElementPicker(const Arr& array) : mArray{array}
{
constexpr index_t imax = reduce_on_sequence(Picks{}, math::maxer<index_t>{}, Number<0>{});
static_assert(imax < Arr::Size(), "wrong! exceeding # array element");
}
__host__ __device__ static constexpr auto Size() { return Picks::Size(); }
template <index_t I>
__host__ __device__ constexpr const auto& At(Number<I> i) const
{
static_assert(I < Size(), "wrong!");
constexpr auto IP = Picks{}[i];
return mArray[IP];
}
template <index_t I>
__host__ __device__ constexpr const auto& operator[](Number<I> i) const
{
return At(i);
}
private:
const Arr& mArray;
};
template <typename Arr, typename Picks, typename X>
__host__ __device__ constexpr auto operator+=(ContainerElementPicker<Arr, Picks>& y, const X& x)
{
using Y = ContainerElementPicker<Arr, Picks>;
constexpr index_t nsize = Y::Size();
static_assert(nsize == X::Size(), "wrong! size not the same");
static_for<0, nsize, 1>{}([&](auto i) { y(i) += x[i]; });
return y;
}
template <typename Arr, typename Picks, typename X>
__host__ __device__ constexpr auto operator-=(ContainerElementPicker<Arr, Picks>& y, const X& x)
{
using Y = ContainerElementPicker<Arr, Picks>;
constexpr index_t nsize = Y::Size();
static_assert(nsize == X::Size(), "wrong! size not the same");
static_for<0, nsize, 1>{}([&](auto i) { y(i) -= x[i]; });
return y;
}
template <typename Arr, typename Picks>
__host__ __device__ constexpr auto pick_container_element(Arr& a, Picks)
{
return ContainerElementPicker<Arr, Picks>(a);
}
template <typename Arr, typename Picks>
__host__ __device__ constexpr auto pick_container_element(const Arr& a, Picks)
{
return ConstantContainerElementPicker<Arr, Picks>(a);
}
} // namespace ck
#endif
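// Illustration only, not part of this commit: a picker is a mutable view over a
// subset of a container's elements, so writes go through to the underlying
// storage. The values below are made up.
__device__ void picker_example()
{
    ck::Array<ck::index_t, 3> idx{0, 0, 0};

    // View over elements 0 and 2 of idx; no copy is made.
    auto sub = ck::pick_container_element(idx, ck::Sequence<0, 2>{});

    sub(ck::Number<0>{}) = 5;                   // idx becomes {5, 0, 0}
    sub += ck::Array<ck::index_t, 2>{1, 2};     // idx becomes {6, 0, 2}
}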
#ifndef CK_CONTAINER_HELPER_HPP
#define CK_CONTAINER_HELPER_HPP
#include "sequence.hpp"
#include "sequence_helper.hpp"
#include "array.hpp"
#include "tuple.hpp"
#include "tuple_helper.hpp"
#include "statically_indexed_array.hpp"
#include "container_element_picker.hpp"
namespace ck {
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto container_push_back(const Array<TData, NSize>& a, const TData& x)
{
Array<TData, NSize + 1> r;
static_for<0, NSize, 1>{}([&r, &a ](auto i) constexpr { r(i) = a[i]; });
r(Number<NSize>{}) = x;
return r;
}
template <typename... Ts, typename T>
__host__ __device__ constexpr auto container_push_front(const Tuple<Ts...>& a, const T& x)
{
return container_cat(make_tuple(x), a);
}
template <typename... Ts, typename T>
__host__ __device__ constexpr auto container_push_back(const Tuple<Ts...>& a, const T& x)
{
return container_cat(a, make_tuple(x));
}
template <typename TData, index_t NSize, index_t... IRs>
__host__ __device__ constexpr auto
container_reorder_given_new2old(const Array<TData, NSize>& old_array, Sequence<IRs...> /*new2old*/)
{
static_assert(NSize == sizeof...(IRs), "wrong! size not consistent");
static_assert(is_valid_sequence_map<Sequence<IRs...>>{}, "wrong! invalid reorder map");
return make_array(old_array[Number<IRs>{}]...);
}
template <typename TData, index_t NSize, index_t... IRs>
__host__ __device__ constexpr auto
container_reorder_given_old2new(const Array<TData, NSize>& old_array, Sequence<IRs...> old2new)
{
return container_reorder_given_new2old(
old_array, typename sequence_map_inverse<decltype(old2new)>::type{});
}
template <typename... Ts, index_t... IRs>
__host__ __device__ constexpr auto container_reorder_given_new2old(const Tuple<Ts...>& old_tuple,
Sequence<IRs...> /*new2old*/)
{
static_assert(sizeof...(Ts) == sizeof...(IRs), "wrong! size not consistent");
static_assert(is_valid_sequence_map<Sequence<IRs...>>{}, "wrong! invalid reorder map");
return make_tuple(old_tuple[Number<IRs>{}]...);
}
template <typename... Ts, index_t... IRs>
__host__ __device__ constexpr auto container_reorder_given_old2new(const Tuple<Ts...>& old_tuple,
Sequence<IRs...> old2new)
{
return container_reorder_given_new2old(
old_tuple, typename sequence_map_inverse<decltype(old2new)>::type{});
}
template <index_t... Is, index_t... IRs>
__host__ __device__ constexpr auto container_reorder_given_new2old(Sequence<Is...> /* old_seq */,
Sequence<IRs...> /*new2old*/)
{
static_assert(sizeof...(Is) == sizeof...(IRs), "wrong! size not consistent");
static_assert(is_valid_sequence_map<Sequence<IRs...>>{}, "wrong! invalid reorder map");
return Sequence<Sequence<Is...>::At(Number<IRs>{})...>{};
}
template <index_t... Is, index_t... IRs>
__host__ __device__ constexpr auto container_reorder_given_old2new(Sequence<Is...> old_seq,
Sequence<IRs...> /* old2new */)
{
static_assert(sizeof...(Is) == sizeof...(IRs), "wrong! size not consistent");
static_assert(is_valid_sequence_map<Sequence<IRs...>>{}, "wrong! invalid reorder map");
constexpr auto new2old = typename sequence_map_inverse<Sequence<IRs...>>::type{};
return container_reorder_given_new2old(old_seq, new2old);
}
#if !CK_WORKAROUND_SWDEV_275126
// rocm-4.1 compiler would crash for recursive lambda
template <typename Container,
typename Reduce,
typename Init,
index_t IBegin = 0,
index_t IEnd = Container::Size(),
index_t IStep = 1>
__host__ __device__ constexpr auto container_reduce(const Container& x,
Reduce reduce,
Init init,
Number<IBegin> = Number<0>{},
Number<IEnd> = Number<Container::Size()>{},
Number<IStep> = Number<1>{})
{
static_assert((IEnd - IBegin) % IStep == 0, "wrong!");
// f is recursive function, fs is a dummy of f
// i is index, r_old is current reduction
auto f = [&](auto fs, auto i, auto r_old) {
auto r_new = reduce(x[i], r_old);
if constexpr(i.value < IEnd - IStep)
{
// recursively call f/fs
return fs(fs, i + Number<IStep>{}, r_new);
}
else
{
return r_new;
}
};
// start recursion
return f(f, Number<IBegin>{}, init);
}
#else
// i is index, r_old is current reduction
template <typename Container,
typename Reduce,
typename ROld,
index_t I,
index_t IEnd,
index_t IStep>
__host__ __device__ constexpr auto container_reduce_impl(
const Container& x, Reduce reduce, ROld r_old, Number<I> i, Number<IEnd>, Number<IStep>)
{
auto r_new = reduce(x[i], r_old);
if constexpr(i.value < IEnd - IStep)
{
return container_reduce_impl(
x, reduce, r_new, i + Number<IStep>{}, Number<IEnd>{}, Number<IStep>{});
}
else
{
return r_new;
}
}
// rocm-4.1 compiler would crash for recursive lambda
template <typename Container,
typename Reduce,
typename Init,
index_t IBegin = 0,
index_t IEnd = Container::Size(),
index_t IStep = 1>
__host__ __device__ constexpr auto container_reduce(const Container& x,
Reduce reduce,
Init init,
Number<IBegin> = Number<0>{},
Number<IEnd> = Number<Container::Size()>{},
Number<IStep> = Number<1>{})
{
static_assert((IEnd - IBegin) % IStep == 0, "wrong!");
return container_reduce_impl(
x, reduce, init, Number<IBegin>{}, Number<IEnd>{}, Number<IStep>{});
}
#endif
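// Illustration only, not part of this commit: container_reduce folds a container
// left-to-right with the given binary op; the values below are made up.
__host__ __device__ constexpr index_t container_reduce_example()
{
    constexpr auto a = Array<index_t, 4>{1, 2, 3, 4};

    // ((((0 + 1) + 2) + 3) + 4) = 10
    return container_reduce(a, math::plus<index_t>{}, index_t{0});
}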
template <typename TData, index_t NSize, typename Reduce>
__host__ __device__ constexpr auto
container_reverse_inclusive_scan(const Array<TData, NSize>& x, Reduce f, TData init)
{
Array<TData, NSize> y;
TData r = init;
static_for<NSize - 1, 0, -1>{}([&](auto i) {
r = f(r, x[i]);
y(i) = r;
});
r = f(r, x[Number<0>{}]);
y(Number<0>{}) = r;
return y;
}
template <typename TData, index_t NSize, typename Reduce>
__host__ __device__ constexpr auto
container_reverse_exclusive_scan(const Array<TData, NSize>& x, Reduce f, TData init)
{
Array<TData, NSize> y;
TData r = init;
static_for<NSize - 1, 0, -1>{}([&](auto i) {
y(i) = r;
r = f(r, x[i]);
});
y(Number<0>{}) = r;
return y;
}
#if !CK_WORKAROUND_SWDEV_275126
// rocm4.1 compiler would crash with recursive lambda
template <typename... Xs, typename Reduce, typename Init>
__host__ __device__ constexpr auto
container_reverse_exclusive_scan(const Tuple<Xs...>& x, Reduce reduce, Init init)
{
constexpr index_t NSize = sizeof...(Xs);
// f is recursive function, fs is a dummy of f
// i is index, y_old is current scan, r_old is current reduction
auto f = [&](auto fs, auto i, auto y_old, auto r_old) {
auto r_new = reduce(x[i], r_old);
auto y_new = container_push_front(y_old, r_new);
if constexpr(i.value > 1)
{
// recursively call f/fs
return fs(fs, i - Number<1>{}, y_new, r_new);
}
else
{
return y_new;
}
};
// start recursion
return f(f, Number<NSize - 1>{}, make_tuple(init), init);
}
#else
// i is index, y_old is current scan, r_old is current reduction
template <typename... Xs, typename Reduce, index_t I, typename YOld, typename ROld>
__host__ __device__ constexpr auto container_reverse_exclusive_scan_impl(
const Tuple<Xs...>& x, Reduce reduce, Number<I> i, YOld y_old, ROld r_old)
{
auto r_new = reduce(x[i], r_old);
auto y_new = container_push_front(y_old, r_new);
if constexpr(i.value > 1)
{
// recursively call f/fs
return container_reverse_exclusive_scan_impl(x, reduce, i - Number<1>{}, y_new, r_new);
}
else
{
return y_new;
}
}
template <typename... Xs, typename Reduce, typename Init>
__host__ __device__ constexpr auto
container_reverse_exclusive_scan(const Tuple<Xs...>& x, Reduce reduce, Init init)
{
constexpr index_t NSize = sizeof...(Xs);
return container_reverse_exclusive_scan_impl(
x, reduce, Number<NSize - 1>{}, make_tuple(init), init);
}
#endif
// TODO: update to be like container_reverse_exclusive_scan, to deal with Tuple of Number<>
template <typename... Xs, typename Reduce, typename TData>
__host__ __device__ constexpr auto
container_reverse_inclusive_scan(const Tuple<Xs...>& x, Reduce f, TData init)
{
constexpr index_t NSize = sizeof...(Xs);
Tuple<Xs...> y;
TData r = init;
static_for<NSize - 1, 0, -1>{}([&](auto i) {
r = f(r, x[i]);
y(i) = r;
});
r = f(r, x[Number<0>{}]);
y(Number<0>{}) = r;
return y;
}
template <typename X, typename... Ys>
__host__ __device__ constexpr auto container_cat(const X& x, const Ys&... ys)
{
return container_cat(x, container_cat(ys...));
}
template <typename T, index_t NX, index_t NY>
__host__ __device__ constexpr auto container_cat(const Array<T, NX>& ax, const Array<T, NY>& ay)
{
return unpack2(
[&](auto&&... zs) { return make_array(std::forward<decltype(zs)>(zs)...); }, ax, ay);
}
template <typename... X, typename... Y>
__host__ __device__ constexpr auto container_cat(const Tuple<X...>& tx, const Tuple<Y...>& ty)
{
return unpack2(
[&](auto&&... zs) { return make_tuple(std::forward<decltype(zs)>(zs)...); }, tx, ty);
}
template <typename Container>
__host__ __device__ constexpr auto container_cat(const Container& x)
{
return x;
}
template <typename T, index_t N, index_t... Is>
__host__ __device__ constexpr auto get_container_subset(const Array<T, N>& arr, Sequence<Is...>)
{
static_assert(N >= sizeof...(Is), "wrong! size");
return make_array(arr[Number<Is>{}]...);
}
template <typename... Ts, index_t... Is>
__host__ __device__ constexpr auto get_container_subset(const Tuple<Ts...>& tup, Sequence<Is...>)
{
static_assert(sizeof...(Ts) >= sizeof...(Is), "wrong! size");
return make_tuple(tup[Number<Is>{}]...);
}
template <typename T, index_t N, index_t... Is>
__host__ __device__ constexpr void
set_container_subset(Array<T, N>& y, Sequence<Is...> picks, const Array<T, sizeof...(Is)>& x)
{
static_assert(N >= sizeof...(Is), "wrong! size");
static_for<0, sizeof...(Is), 1>{}([&](auto i) { y(picks[i]) = x[i]; });
}
template <typename... Ys, index_t... Is, typename... Xs>
__host__ __device__ constexpr void
set_container_subset(Tuple<Ys...>& y, Sequence<Is...> picks, const Tuple<Xs...>& x)
{
static_assert(sizeof...(Ys) >= sizeof...(Is) && sizeof...(Is) == sizeof...(Xs), "wrong! size");
static_for<0, sizeof...(Is), 1>{}([&](auto i) { y(picks[i]) = x[i]; });
}
template <index_t... Is>
__host__ __device__ constexpr auto sequence_to_tuple_of_number(Sequence<Is...>)
{
using Seq = Sequence<Is...>;
return generate_tuple(
[&](auto i) {
constexpr index_t tmp = Seq::At(i);
return Number<tmp>{};
},
Seq::Size());
}
} // namespace ck
#endif
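// Illustration only, not part of this commit: a reverse exclusive scan with
// multiplication turns packed tensor lengths into strides -- the core index
// arithmetic behind the dynamic tensor descriptor. Example values are made up:
// lengths {2, 3, 4} -> strides {12, 4, 1}.
__host__ __device__ constexpr auto packed_strides_example()
{
    constexpr auto lengths = ck::Array<ck::index_t, 3>{2, 3, 4};

    return ck::container_reverse_exclusive_scan(
        lengths, ck::math::multiplies<ck::index_t>{}, ck::index_t{1});
}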
...
@@ -3,264 +3,279 @@
namespace ck {

template <typename T, index_t N>
struct vector_type;

template <typename T>
struct vector_type<T, 1>
{
    using type = T;

    union
    {
        T d1_;
        StaticallyIndexedArray<T, 1> d1x1_;
    } data_;

    __host__ __device__ constexpr vector_type() : data_{type{0}} {}

    __host__ __device__ constexpr vector_type(type v) : data_{v} {}

    __host__ __device__ static constexpr index_t Size() { return 1; }

    __host__ __device__ constexpr const auto& Vector() const { return data_.d1_; }
    __host__ __device__ constexpr auto& Vector() { return data_.d1_; }

    __host__ __device__ constexpr const auto& Scalars() const { return data_.d1x1_; }
    __host__ __device__ constexpr auto& Scalars() { return data_.d1x1_; }

    __host__ __device__ constexpr const auto& Vectors(Number<1>) const { return data_.d1x1_; }
    __host__ __device__ constexpr auto& Vectors(Number<1>) { return data_.d1x1_; }
};

template <typename T>
struct vector_type<T, 2>
{
    using d1_t = T;
    typedef T d2_t __attribute__((ext_vector_type(2)));

    using type = d2_t;

    union
    {
        d2_t d2_;
        StaticallyIndexedArray<d1_t, 2> d1x2_;
        StaticallyIndexedArray<d2_t, 1> d2x1_;
    } data_;

    __host__ __device__ constexpr vector_type() : data_{type{0}} {}

    __host__ __device__ constexpr vector_type(type v) : data_{v} {}

    __host__ __device__ static constexpr index_t Size() { return 2; }

    __host__ __device__ constexpr const auto& Vector() const { return data_.d2_; }
    __host__ __device__ constexpr auto& Vector() { return data_.d2_; }

    __host__ __device__ constexpr const auto& Scalars() const { return data_.d1x2_; }
    __host__ __device__ constexpr auto& Scalars() { return data_.d1x2_; }

    __host__ __device__ constexpr const auto& Vectors(Number<1>) const { return data_.d1x2_; }
    __host__ __device__ constexpr const auto& Vectors(Number<2>) const { return data_.d2x1_; }

    __host__ __device__ constexpr auto& Vectors(Number<1>) { return data_.d1x2_; }
    __host__ __device__ constexpr auto& Vectors(Number<2>) { return data_.d2x1_; }
};

template <typename T>
struct vector_type<T, 4>
{
    using d1_t = T;
    typedef T d2_t __attribute__((ext_vector_type(2)));
    typedef T d4_t __attribute__((ext_vector_type(4)));

    using type = d4_t;

    union
    {
        d4_t d4_;
        StaticallyIndexedArray<d1_t, 4> d1x4_;
        StaticallyIndexedArray<d2_t, 2> d2x2_;
        StaticallyIndexedArray<d4_t, 1> d4x1_;
    } data_;

    __host__ __device__ constexpr vector_type() : data_{type{0}} {}

    __host__ __device__ constexpr vector_type(type v) : data_{v} {}

    __host__ __device__ static constexpr index_t Size() { return 4; }

    __host__ __device__ constexpr const auto& Vector() const { return data_.d4_; }
    __host__ __device__ constexpr auto& Vector() { return data_.d4_; }

    __host__ __device__ constexpr const auto& Scalars() const { return data_.d1x4_; }
    __host__ __device__ constexpr auto& Scalars() { return data_.d1x4_; }

    __host__ __device__ constexpr const auto& Vectors(Number<1>) const { return data_.d1x4_; }
    __host__ __device__ constexpr const auto& Vectors(Number<2>) const { return data_.d2x2_; }
    __host__ __device__ constexpr const auto& Vectors(Number<4>) const { return data_.d4x1_; }

    __host__ __device__ constexpr auto& Vectors(Number<1>) { return data_.d1x4_; }
    __host__ __device__ constexpr auto& Vectors(Number<2>) { return data_.d2x2_; }
    __host__ __device__ constexpr auto& Vectors(Number<4>) { return data_.d4x1_; }
};

template <typename T>
struct vector_type<T, 8>
{
    using d1_t = T;
    typedef T d2_t __attribute__((ext_vector_type(2)));
    typedef T d4_t __attribute__((ext_vector_type(4)));
    typedef T d8_t __attribute__((ext_vector_type(8)));

    using type = d8_t;

    union
    {
        d8_t d8_;
        StaticallyIndexedArray<d1_t, 8> d1x8_;
        StaticallyIndexedArray<d2_t, 4> d2x4_;
        StaticallyIndexedArray<d4_t, 2> d4x2_;
        StaticallyIndexedArray<d8_t, 1> d8x1_;
    } data_;

    __host__ __device__ constexpr vector_type() : data_{type{0}} {}

    __host__ __device__ constexpr vector_type(type v) : data_{v} {}

    __host__ __device__ static constexpr index_t Size() { return 8; }

    __host__ __device__ constexpr const auto& Vector() const { return data_.d8_; }
    __host__ __device__ constexpr auto& Vector() { return data_.d8_; }

    __host__ __device__ constexpr const auto& Scalars() const { return data_.d1x8_; }
    __host__ __device__ constexpr auto& Scalars() { return data_.d1x8_; }

    __host__ __device__ constexpr const auto& Vectors(Number<1>) const { return data_.d1x8_; }
    __host__ __device__ constexpr const auto& Vectors(Number<2>) const { return data_.d2x4_; }
    __host__ __device__ constexpr const auto& Vectors(Number<4>) const { return data_.d4x2_; }
    __host__ __device__ constexpr const auto& Vectors(Number<8>) const { return data_.d8x1_; }

    __host__ __device__ constexpr auto& Vectors(Number<1>) { return data_.d1x8_; }
    __host__ __device__ constexpr auto& Vectors(Number<2>) { return data_.d2x4_; }
    __host__ __device__ constexpr auto& Vectors(Number<4>) { return data_.d4x2_; }
    __host__ __device__ constexpr auto& Vectors(Number<8>) { return data_.d8x1_; }
};

template <>
struct vector_type<int8_t, 2>
{
    using d1_t = int8_t;
    typedef int16_t d2_t;

    using type = d2_t;

    union
    {
        d2_t d2_;
        StaticallyIndexedArray<d1_t, 2> d1x2_;
        StaticallyIndexedArray<d2_t, 1> d2x1_;
    } data_;

    __host__ __device__ constexpr vector_type() : data_{type{0}} {}

    __host__ __device__ constexpr vector_type(type v) : data_{v} {}

    __host__ __device__ static constexpr index_t Size() { return 2; }

    __host__ __device__ constexpr const auto& Vector() const { return data_.d2_; }
    __host__ __device__ constexpr auto& Vector() { return data_.d2_; }

    __host__ __device__ constexpr const auto& Scalars() const { return data_.d1x2_; }
    __host__ __device__ constexpr auto& Scalars() { return data_.d1x2_; }

    __host__ __device__ constexpr const auto& Vectors(Number<1>) const { return data_.d1x2_; }
    __host__ __device__ constexpr const auto& Vectors(Number<2>) const { return data_.d2x1_; }

    __host__ __device__ constexpr auto& Vectors(Number<1>) { return data_.d1x2_; }
    __host__ __device__ constexpr auto& Vectors(Number<2>) { return data_.d2x1_; }
};

template <>
struct vector_type<int8_t, 4>
{
    using d1_t = int8_t;
    typedef int16_t d2_t;
    typedef int32_t d4_t;

    using type = d4_t;

    union
    {
        d4_t d4_;
        StaticallyIndexedArray<d1_t, 4> d1x4_;
        StaticallyIndexedArray<d2_t, 2> d2x2_;
        StaticallyIndexedArray<d4_t, 1> d4x1_;
    } data_;

    __host__ __device__ constexpr vector_type() : data_{type{0}} {}

    __host__ __device__ constexpr vector_type(type v) : data_{v} {}

    __host__ __device__ static constexpr index_t Size() { return 4; }

    __host__ __device__ constexpr const auto& Vector() const { return data_.d4_; }
    __host__ __device__ constexpr auto& Vector() { return data_.d4_; }

    __host__ __device__ constexpr const auto& Scalars() const { return data_.d1x4_; }
    __host__ __device__ constexpr auto& Scalars() { return data_.d1x4_; }

    __host__ __device__ constexpr const auto& Vectors(Number<1>) const { return data_.d1x4_; }
    __host__ __device__ constexpr const auto& Vectors(Number<2>) const { return data_.d2x2_; }
    __host__ __device__ constexpr const auto& Vectors(Number<4>) const { return data_.d4x1_; }

    __host__ __device__ constexpr auto& Vectors(Number<1>) { return data_.d1x4_; }
    __host__ __device__ constexpr auto& Vectors(Number<2>) { return data_.d2x2_; }
    __host__ __device__ constexpr auto& Vectors(Number<4>) { return data_.d4x1_; }
};

// fp32
using float2_t = typename vector_type<float, 2>::type;
using float4_t = typename vector_type<float, 4>::type;
using float8_t = typename vector_type<float, 8>::type;

// fp16
using half_t  = _Float16;
using half2_t = typename vector_type<half_t, 2>::type;
using half4_t = typename vector_type<half_t, 4>::type;
using half8_t = typename vector_type<half_t, 8>::type;

// bfp16
using ushort2_t = typename vector_type<ushort, 2>::type;
using ushort4_t = typename vector_type<ushort, 4>::type;
using ushort8_t = typename vector_type<ushort, 8>::type;

// i32
using int32x2_t = typename vector_type<int32_t, 2>::type;
using int32x4_t = typename vector_type<int32_t, 4>::type;
using int32x8_t = typename vector_type<int32_t, 8>::type;

// i8
// hack for int8x4_t, because compiler does not have native support for int8x4_t;
// int8x4_t is defined as int32_t
using int8x4_t = typename vector_type<int8_t, 4>::type;

// data type conversion
template <typename T>
struct type_convert
@@ -291,113 +306,37 @@ struct inner_product_with_conversion
{
    static constexpr auto convert = type_convert<T>();

    template <typename X, index_t N>
    __device__ T operator()(typename vector_type<X, N>::type a,
                            typename vector_type<X, N>::type b) const
    {
        const vector_type<X, N> a_vector{a};
        const vector_type<X, N> b_vector{b};

        T acc = 0;

        static_for<0, N, 1>{}([&](auto i) {
            acc += convert(a_vector.Scalars()[i]) * convert(b_vector.Scalars()[i]);
        });

        return acc;
    }

    __device__ T operator()(float_t a, float_t b) const { return convert(a) * convert(b); }

    // hack for int8x4_t, because compiler does not have native support for int8x4_t;
    // int8x4_t is defined as int32_t
    __device__ T operator()(int8x4_t a, int8x4_t b) const
    {
        const vector_type<int8_t, 4> a_vector{a};
        const vector_type<int8_t, 4> b_vector{b};

        T acc = 0;

        static_for<0, 4, 1>{}([&](auto i) {
            acc += convert(a_vector.Scalars()[i]) * convert(b_vector.Scalars()[i]);
        });

        return acc;
    }
};
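// Illustration only, not part of this commit: the union lets the same registers
// be read either as one hardware vector or lane-by-lane, which is what the
// rewritten inner product above exploits.
__device__ float sum_half4(half4_t v)
{
    const vector_type<half_t, 4> x{v};

    float acc = 0;

    static_for<0, 4, 1>{}([&](auto i) { acc += static_cast<float>(x.Scalars()[i]); });

    return acc;
}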
...
@@ -32,16 +32,16 @@ struct vector_type
    typedef struct
    {
        T scalar[N];
    } type;
};

template <>
struct vector_type<float, 1>
{
    using type = float;

    template <index_t I>
    __host__ __device__ static void SetScalar(type& v, float s, Number<I>)
    {
        static_assert(I < 1, "wrong");
        *(reinterpret_cast<float*>(&v) + I) = s;
@@ -51,22 +51,22 @@ struct vector_type<float, 1>

template <>
struct vector_type<float, 2>
{
    using type = float2_t;

    union DataType
    {
        type vector;
        float scalar[2];
    };

    template <index_t I>
    __host__ __device__ static void SetScalar(type& v, float s, Number<I>)
    {
        static_assert(I < 2, "wrong");
        *(reinterpret_cast<float*>(&v) + I) = s;
    }

    __host__ __device__ static type Pack(float s0, float s1)
    {
        DataType data;
        data.scalar[0] = s0;
@@ -78,12 +78,12 @@ struct vector_type<float, 2>

template <>
struct vector_type<float, 4>
{
    using type = float4_t;

    __host__ __device__ static constexpr index_t GetSize() { return 4; }

    template <index_t I>
    __host__ __device__ static void SetScalar(type& v, float s, Number<I>)
    {
        static_assert(I < 4, "wrong");
        *(reinterpret_cast<float*>(&v) + I) = s;
@@ -93,10 +93,10 @@ struct vector_type<float, 4>

template <>
struct vector_type<half_t, 1>
{
    using type = half_t;

    template <index_t I>
    __host__ __device__ static void SetScalar(type& v, half_t s, Number<I>)
    {
        static_assert(I < 1, "wrong");
        *(reinterpret_cast<half_t*>(&v) + I) = s;
@@ -106,22 +106,22 @@ struct vector_type<half_t, 1>

template <>
struct vector_type<half_t, 2>
{
    using type = half2_t;

    union DataType
    {
        type vector;
        half_t scalar[2];
    };

    template <index_t I>
    __host__ __device__ static void SetScalar(type& v, half_t s, Number<I>)
    {
        static_assert(I < 2, "wrong");
        *(reinterpret_cast<half_t*>(&v) + I) = s;
    }

    __host__ __device__ static type Pack(half_t s0, half_t s1)
    {
        DataType data;
        data.scalar[0] = s0;
...
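// Illustration only, not part of this commit: usage of the renamed API in this
// second header. Pack builds a vector from scalars; SetScalar overwrites a
// single lane in place.
__device__ float2_t pack_and_patch(float a, float b)
{
    auto v = vector_type<float, 2>::Pack(a, b);             // v = {a, b}
    vector_type<float, 2>::SetScalar(v, 3.0f, Number<0>{}); // v = {3.0f, b}
    return v;
}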
...
@@ -2,7 +2,6 @@
#define CK_FUNCTIONAL_HPP

#include "integral_constant.hpp"
#include "type.hpp"

namespace ck {
@@ -56,8 +55,10 @@ struct static_if<true>
    __host__ __device__ constexpr auto operator()(F f) const
    {
        // This is a trick for the compiler: pass forwarder to lambda "f" as an
        // "auto" argument and make sure "f" uses it. That makes "f" a generic
        // lambda, so "f" is not compiled until it is instantiated here.
        f(forwarder{});
        return Type{};
@@ -84,8 +85,10 @@ struct static_if<false>
    __host__ __device__ static void Else(F f)
    {
        // Same trick as above: passing forwarder as an "auto" argument makes "f"
        // a generic lambda, so "f" is not compiled until it is instantiated here.
        f(forwarder{});
    }
...
@@ -32,7 +32,8 @@ struct static_for
    static_assert(Increment != 0 && (NEnd - NBegin) % Increment == 0,
                  "Wrong! should satisfy (NEnd - NBegin) % Increment == 0");
    static_assert((Increment > 0 && NBegin <= NEnd) || (Increment < 0 && NBegin >= NEnd),
                  "wrong! should satisfy (Increment > 0 && NBegin <= NEnd) || "
                  "(Increment < 0 && NBegin >= NEnd)");
}

template <class F>
...
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
#include "functional.hpp" #include "functional.hpp"
#include "functional2.hpp" #include "functional2.hpp"
#include "sequence.hpp" #include "sequence.hpp"
#include "array.hpp" #include "multi_index.hpp"
namespace ck { namespace ck {
...@@ -63,7 +63,7 @@ struct ford_impl ...@@ -63,7 +63,7 @@ struct ford_impl
for(index_t i = 0; i < RemainLengths::Front(); ++i) for(index_t i = 0; i < RemainLengths::Front(); ++i)
{ {
ford_impl<decltype(RemainLengths::PopFront()), Orders>{}( ford_impl<decltype(RemainLengths::PopFront()), Orders>{}(
f, current_ordered_id.PushBack(i)); f, container_push_back(current_ordered_id, i));
} }
} }
}; };
...@@ -77,14 +77,16 @@ struct ford_impl<Sequence<>, Orders> ...@@ -77,14 +77,16 @@ struct ford_impl<Sequence<>, Orders>
__host__ __device__ constexpr void operator()(F f, CurrentOrderedId current_ordered_id) const __host__ __device__ constexpr void operator()(F f, CurrentOrderedId current_ordered_id) const
{ {
// retrive unordered Id // retrive unordered Id
f(reorder_array_given_old2new(current_ordered_id, Orders{})); f(container_reorder_given_old2new(current_ordered_id, Orders{}));
} }
}; };
} // namespace detail } // namespace detail
// Lengths is Sequence<...>, it is the length of each dimension for N-dimensional loop // Lengths is Sequence<...>, it is the length of each dimension for
// Orders is Sequence<...>, it is the order of dimension in which static_ford will loop over each // N-dimensional loop
// Orders is Sequence<...>, it is the order of dimension in which static_ford
// will loop over each
// dimension // dimension
template <class Lengths, template <class Lengths,
class Orders = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::type> class Orders = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::type>
...@@ -106,8 +108,10 @@ struct static_ford ...@@ -106,8 +108,10 @@ struct static_ford
     }
 };

-// Lengths is Sequence<...>, it is the length of each dimension for N-dimensional loop
-// Orders is Sequence<...>, it is the order of dimension in which ford will loop over each
+// Lengths is Sequence<...>, it is the length of each dimension for
+// N-dimensional loop
+// Orders is Sequence<...>, it is the order of dimension in which ford will loop
+// over each
 // dimension
 template <class Lengths,
           class Orders = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::type>
@@ -129,7 +133,7 @@ struct ford
         for(index_t i = 0; i < ordered_lengths.Front(); ++i)
         {
-            detail::ford_impl<decltype(ordered_lengths.PopFront()), Orders>{}(f, Array<index_t, 1>{i});
+            detail::ford_impl<decltype(ordered_lengths.PopFront()), Orders>{}(f, make_multi_index(i));
         }
     }
 };
...
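The change above routes the running index through `container_push_back` and `make_multi_index` instead of `Array::PushBack`; the observable behaviour of `ford` is unchanged. A usage sketch (hypothetical, assuming the multi-index behaves like an `Array<index_t, N>` indexable with `operator[]`):

__host__ __device__ inline void visit_2x3()
{
    // Orders = Sequence<1, 0>: dimension 1 becomes the outer loop and
    // dimension 0 the inner loop; f still receives the index in the
    // original dimension order
    ford<Sequence<2, 3>, Sequence<1, 0>>{}([&](auto idx) {
        const index_t i0 = idx[0]; // runs over 0..1
        const index_t i1 = idx[1]; // runs over 0..2
        (void)i0;
        (void)i1;
    });
}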
@@ -16,18 +16,46 @@ template <index_t... Is>
 struct unpack_impl<Sequence<Is...>>
 {
     template <typename F, typename X>
-    __host__ __device__ constexpr auto operator()(F f, const X& x) const
+    __host__ __device__ constexpr auto operator()(F&& f, X&& x) const
     {
-        return f(x.At(Number<Is>{})...);
+        return std::forward<F>(f)(std::forward<X>(x).At(Number<Is>{})...);
     }
 };

+template <typename Seq0, typename Seq1>
+struct unpack2_impl;
+
+// TODO: remove this, after properly implementing unpack that takes any number of containers
+template <index_t... Is, index_t... Js>
+struct unpack2_impl<Sequence<Is...>, Sequence<Js...>>
+{
+    template <typename F, typename X, typename Y>
+    __host__ __device__ constexpr auto operator()(F&& f, X&& x, Y&& y) const
+    {
+        return std::forward<F>(f)(std::forward<X>(x).At(Number<Is>{})...,
+                                  std::forward<Y>(y).At(Number<Js>{})...);
+    }
+};
+
 } // namespace detail
 template <typename F, typename X>
-__host__ __device__ constexpr auto unpack(F f, const X& x)
+__host__ __device__ constexpr auto unpack(F&& f, X&& x)
 {
-    return detail::unpack_impl<typename arithmetic_sequence_gen<0, X::Size(), 1>::type>{}(f, x);
+    using X_ = remove_reference_t<X>;
+
+    return detail::unpack_impl<typename arithmetic_sequence_gen<0, X_::Size(), 1>::type>{}(
+        std::forward<F>(f), std::forward<X>(x));
+}
+
+// TODO: properly implement unpack that takes any number of containers
+template <typename F, typename X, typename Y>
+__host__ __device__ constexpr auto unpack2(F&& f, X&& x, Y&& y)
+{
+    using X_ = remove_reference_t<X>;
+    using Y_ = remove_reference_t<Y>;
+
+    return detail::unpack2_impl<typename arithmetic_sequence_gen<0, X_::Size(), 1>::type,
+                                typename arithmetic_sequence_gen<0, Y_::Size(), 1>::type>{}(
+        std::forward<F>(f), std::forward<X>(x), std::forward<Y>(y));
 }

 } // namespace ck
...
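With the perfect-forwarding rewrite, `unpack` now accepts rvalue containers as well as lvalues. A usage sketch (hypothetical helper, not in this commit): it expands a container into a plain argument list.

__host__ __device__ inline index_t volume_3d(const Array<index_t, 3>& lengths)
{
    // expands to f(lengths.At(Number<0>{}), lengths.At(Number<1>{}), lengths.At(Number<2>{}))
    return unpack([](index_t a, index_t b, index_t c) { return a * b * c; }, lengths);
}

`unpack2` does the same over the concatenation of two containers, as a stopgap until a truly variadic `unpack` exists (per the TODO above).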
@@ -5,6 +5,7 @@
 #if CK_USE_AMD_BUFFER_ADDRESSING
 #include "amd_buffer_addressing.hpp"
+#include "amd_buffer_addressing_v2.hpp"
 #endif

 namespace ck {
@@ -43,7 +44,7 @@ __device__ void atomic_add_impl<float4_t>(float4_t* p_dst, float4_t src)
 template <typename T, index_t DataPerAccess>
 struct SetData
 {
-    using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;
+    using vector_t = typename vector_type<T, DataPerAccess>::type;

     // This version is only for compatibility, don't use this version if possible
     template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
@@ -60,8 +61,13 @@ struct SetData
     {
         if(src_valid)
         {
+#if 0
             *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
                 *reinterpret_cast<const vector_t*>(&p_src[src_offset]);
+#else
+            *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
+                *reinterpret_cast<const vector_t*>(&p_src[0x3fffffff & src_offset]);
+#endif
         }
         else
         {
@@ -88,7 +94,7 @@ struct SetData
         if(dst_valid)
         {
             *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
-                amd_buffer_load<T, DataPerAccess>(p_src, src_offset, src_valid, src_range);
+                amd_buffer_load_v2<T, DataPerAccess>(p_src, src_offset, src_valid, src_range);
         }
     }
@@ -108,12 +114,12 @@ struct SetData
     {
         const auto zeros = vector_t(0);

-        amd_buffer_store<T, DataPerAccess>(src_valid ? &(p_src[src_offset])
-                                                     : reinterpret_cast<const T*>(&zeros),
-                                           p_dst,
-                                           dst_offset,
-                                           dst_valid,
-                                           dst_range);
+        amd_buffer_store_v2<T, DataPerAccess>(
+            src_valid ? *reinterpret_cast<const vector_t*>(&(p_src[src_offset])) : zeros,
+            p_dst,
+            dst_offset,
+            dst_valid,
+            dst_range);
     }
 #endif
 };
@@ -121,7 +127,7 @@ struct SetData
 template <typename T, index_t DataPerAccess>
 struct AtomicAddData
 {
-    using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;
+    using vector_t = typename vector_type<T, DataPerAccess>::type;

     // This version is only for compatibility, don't use this version if possible
     template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
@@ -141,7 +147,7 @@ struct AtomicAddData
         }
     }

-#if CK_USE_AMD_BUFFER_ADDRESSING && CK_USE_AMD_BUFFER_ATOMIC_ADD
+#if CK_USE_AMD_BUFFER_ADDRESSING && CK_USE_AMD_BUFFER_ATOMIC_FADD
     // buffer_atomic requires:
     // 1) p_src_thread must be in vgpr space, p_dst_thread must be global memory
     // 2) p_dst_thread to be a wavewise pointer.
@@ -185,25 +191,26 @@ __device__ void transfer_data(const T* p_src,
"wrong! InMemoryDataOperation not supported!"); "wrong! InMemoryDataOperation not supported!");
// keep it simple, don't use static_if here, otherwise compiler will do weird things // keep it simple, don't use static_if here, otherwise compiler will do weird things
if(SrcDataStride == 1 && DstDataStride == 1) if constexpr(SrcDataStride == 1 && DstDataStride == 1)
{ {
// TODO: use static_if::ElseIf if constexpr(DstInMemOp == InMemoryDataOperation::Set)
static_if<DstInMemOp == InMemoryDataOperation::Set>{}([&](auto) { {
SetData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>( SetData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, src_valid, src_range, p_dst, dst_offset, dst_valid, dst_range); p_src, src_offset, src_valid, src_range, p_dst, dst_offset, dst_valid, dst_range);
}); }
else if constexpr(DstInMemOp == InMemoryDataOperation::AtomicAdd)
static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) { {
AtomicAddData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>( AtomicAddData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, src_valid, src_range, p_dst, dst_offset, dst_valid, dst_range); p_src, src_offset, src_valid, src_range, p_dst, dst_offset, dst_valid, dst_range);
}); }
} }
else else
{ {
#pragma unroll
for(index_t i = 0; i < DataPerAccess; ++i) for(index_t i = 0; i < DataPerAccess; ++i)
{ {
// TODO: use static_if::ElseIf if constexpr(DstInMemOp == InMemoryDataOperation::Set)
static_if<DstInMemOp == InMemoryDataOperation::Set>{}([&](auto) { {
SetData<T, 1>{}.template Run<SrcAddressSpace, DstAddressSpace>( SetData<T, 1>{}.template Run<SrcAddressSpace, DstAddressSpace>(
p_src, p_src,
src_offset + i * SrcDataStride, src_offset + i * SrcDataStride,
...@@ -213,9 +220,9 @@ __device__ void transfer_data(const T* p_src, ...@@ -213,9 +220,9 @@ __device__ void transfer_data(const T* p_src,
dst_offset + i * DstDataStride, dst_offset + i * DstDataStride,
dst_valid, dst_valid,
dst_range); dst_range);
}); }
else if constexpr(DstInMemOp == InMemoryDataOperation::AtomicAdd)
static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) { {
AtomicAddData<T, 1>{}.template Run<SrcAddressSpace, DstAddressSpace>( AtomicAddData<T, 1>{}.template Run<SrcAddressSpace, DstAddressSpace>(
p_src, p_src,
src_offset + i * SrcDataStride, src_offset + i * SrcDataStride,
...@@ -225,7 +232,7 @@ __device__ void transfer_data(const T* p_src, ...@@ -225,7 +232,7 @@ __device__ void transfer_data(const T* p_src,
dst_offset + i * DstDataStride, dst_offset + i * DstDataStride,
dst_valid, dst_valid,
dst_range); dst_range);
}); }
} }
} }
} }
...
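The rewrite above replaces the `static_if` lambdas with C++17 `if constexpr`, which gives the same discarded-branch semantics without the generic-lambda machinery. A standalone sketch of the pattern (hypothetical, simplified from `transfer_data`):

enum class Op
{
    Set,
    Add
};

template <Op op>
__device__ void store_element(float* p_dst, float v)
{
    // only the taken branch is instantiated, so e.g. atomicAdd does not
    // have to compile for Op::Set
    if constexpr(op == Op::Set)
    {
        *p_dst = v;
    }
    else if constexpr(op == Op::Add)
    {
        atomicAdd(p_dst, v);
    }
}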
@@ -37,7 +37,7 @@ __device__ void atomic_add_impl<float4_t>(float4_t* p_dst, float4_t src)
 template <typename T, index_t DataPerAccess>
 struct SetData
 {
-    using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;
+    using vector_t = typename vector_type<T, DataPerAccess>::type;

     template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
     __device__ void Run(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset) const
@@ -50,7 +50,7 @@ struct SetData
 template <typename T, index_t DataPerAccess>
 struct AtomicAddData
 {
-    using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;
+    using vector_t = typename vector_type<T, DataPerAccess>::type;

     template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
     __device__ void Run(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset) const
...
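Both structs now use `vector_type<T, N>::type` (renamed from `MemoryType`). That alias is what lets a single assignment move N elements at once. A sketch (hypothetical, assuming the alias names a native vector type such as `float4_t` for `vector_type<float, 4>`):

template <typename T, index_t N>
__device__ void copy_vectorized(const T* p_src, T* p_dst)
{
    using vector_t = typename vector_type<T, N>::type;

    // one vector_t-wide load and store instead of N scalar ones
    *reinterpret_cast<vector_t*>(p_dst) = *reinterpret_cast<const vector_t*>(p_src);
}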
@@ -33,6 +33,15 @@ struct multiplies
     __host__ __device__ constexpr T operator()(T a, T b) const { return a * b; }
 };

+struct multiplies_v2
+{
+    template <typename A, typename B>
+    __host__ __device__ constexpr auto operator()(const A& a, const B& b) const
+    {
+        return a * b;
+    }
+};
+
 template <class T>
 struct maxer
 {
@@ -105,8 +114,7 @@ __host__ __device__ constexpr T min(T x, Ts... xs)
 }

 // greatest common divisor, aka highest common factor
-template <typename T>
-__host__ __device__ constexpr T gcd(T x, T y)
+__host__ __device__ constexpr index_t gcd(index_t x, index_t y)
 {
     if(x == y || x == 0)
     {
@@ -129,24 +137,29 @@ __host__ __device__ constexpr T gcd(T x, T y)
 template <index_t X, index_t Y>
 __host__ __device__ constexpr auto gcd(Number<X>, Number<Y>)
 {
-    constexpr auto result = gcd(X, Y);
-    return Number<result>{};
+    constexpr auto r = gcd(X, Y);
+
+    return Number<r>{};
 }
-template <typename X, typename... Ys>
+template <typename X,
+          typename... Ys,
+          typename std::enable_if<sizeof...(Ys) >= 2, bool>::type = false>
 __host__ __device__ constexpr auto gcd(X x, Ys... ys)
 {
-    return gcd(x, ys...);
+    return gcd(x, gcd(ys...));
 }

 // least common multiple
-template <typename T>
-__host__ __device__ constexpr T lcm(T x, T y)
+template <typename X, typename Y>
+__host__ __device__ constexpr auto lcm(X x, Y y)
 {
     return (x * y) / gcd(x, y);
 }

-template <typename X, typename... Ys>
+template <typename X,
+          typename... Ys,
+          typename std::enable_if<sizeof...(Ys) >= 2, bool>::type = false>
 __host__ __device__ constexpr auto lcm(X x, Ys... ys)
 {
     return lcm(x, lcm(ys...));
@@ -165,6 +178,6 @@ struct less
 };

 } // namespace math
-} // namspace ck
+} // namespace ck

 #endif
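The `Number<>` overload keeps a gcd result usable in further compile-time arithmetic. A usage sketch (hypothetical, assuming a translation unit that includes this header):

// scalar form, evaluable at compile time
static_assert(ck::math::gcd(12, 18) == 6, "");
static_assert(ck::math::lcm(12, 18) == 36, "");

// Number<> form: the result is itself a Number<>
constexpr auto g = ck::math::gcd(ck::Number<12>{}, ck::Number<18>{});
static_assert(decltype(g)::value == 6, "");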
#ifndef CK_PRINT_HPP
#define CK_PRINT_HPP
#include "array.hpp"
#include "statically_indexed_array.hpp"
#include "container_helper.hpp"
#include "sequence.hpp"
namespace ck {
template <typename T>
__host__ __device__ void print_array(const char* s, T a)
{
using data_type = decltype(a.At(Number<0>{}));
constexpr index_t nsize = a.Size();
#if 0
if constexpr(is_same<data_type, uint32_t>{})
{
printf("%s size %u, {", s, nsize);
static_for<0, nsize, 1>{}([&a](auto i) constexpr { printf("%u, ", uint32_t{a[i]}); });
printf("}\n");
}
else if constexpr(is_same<data_type, int32_t>{})
{
printf("%s size %d, {", s, nsize);
static_for<0, nsize, 1>{}([&a](auto i) constexpr { printf("%d, ", int32_t{a[i]}); });
printf("}\n");
}
else if constexpr(is_same<data_type, bool>{})
{
printf("%s size %d, {", s, nsize);
static_for<0, nsize, 1>{}([&a](auto i) constexpr { printf("%d, ", bool{a[i]}); });
printf("}\n");
}
#else
printf("%s size %d, {", s, nsize);
static_for<0, nsize, 1>{}([&a](auto i) constexpr { printf("%d, ", int32_t{a[i]}); });
printf("}\n");
#endif
}
template <typename T>
__host__ __device__ void print_array_v2(const char* s, T a)
{
using data_type = decltype(a.At(Number<0>{}));
constexpr index_t nsize = a.Size();
#if 0
if constexpr(is_same<data_type, uint32_t>{})
{
printf("%s size %u, {", s, nsize);
static_for<0, nsize, 1>{}([&a](auto i) constexpr { printf("[%u] %u, ", i.value, a[i]); });
printf("}\n");
}
else if constexpr(is_same<data_type, int32_t>{})
{
printf("%s size %d, {", s, nsize);
static_for<0, nsize, 1>{}([&a](auto i) constexpr { printf("[%d] %d, ", i.value, a[i]); });
printf("}\n");
}
#else
printf("%s size %d, {", s, nsize);
static_for<0, nsize, 1>{}([&a](auto i) constexpr { printf("[%d] %d, ", i.value, a[i]); });
printf("}\n");
#endif
}
} // namespace ck
#endif
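A usage sketch for the printers above (hypothetical values):

__device__ void debug_print_lengths()
{
    const auto lengths = Array<index_t, 3>{8, 16, 32};

    // prints: lengths size 3, {8, 16, 32, }
    print_array("lengths", lengths);

    // prints: lengths size 3, {[0] 8, [1] 16, [2] 32, }
    print_array_v2("lengths", lengths);
}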
#ifndef CK_PRINT_ARRAY_HPP
#define CK_PRINT_ARRAY_HPP
#include "array.hpp"
namespace ck {
template <index_t NSize>
__host__ __device__ void print_array(const char* s, Array<uint32_t, NSize> a)
{
constexpr index_t nsize = a.GetSize();
static_assert(nsize > 0 && nsize <= 10, "wrong!");
static_if<nsize == 1>{}([&](auto) { printf("%s size %u, {%u}\n", s, nsize, a[0]); });
static_if<nsize == 2>{}([&](auto) { printf("%s size %u, {%u %u}\n", s, nsize, a[0], a[1]); });
static_if<nsize == 3>{}(
[&](auto) { printf("%s size %u, {%u %u %u}\n", s, nsize, a[0], a[1], a[2]); });
static_if<nsize == 4>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3]); });
static_if<nsize == 5>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3], a[4]);
});
static_if<nsize == 6>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3], a[4], a[5]);
});
static_if<nsize == 7>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u %u %u}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6]);
});
static_if<nsize == 8>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u %u %u %u}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6],
a[7]);
});
static_if<nsize == 9>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u %u %u %u %u}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6],
a[7],
a[8]);
});
static_if<nsize == 10>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6],
a[7],
a[8],
a[9]);
});
}
template <index_t NSize>
__host__ __device__ void print_array(const char* s, Array<int32_t, NSize> a)
{
constexpr index_t nsize = a.GetSize();
static_assert(nsize > 0 && nsize <= 10, "wrong!");
static_if<nsize == 1>{}([&](auto) { printf("%s size %d, {%d}\n", s, nsize, a[0]); });
static_if<nsize == 2>{}([&](auto) { printf("%s size %d, {%d %d}\n", s, nsize, a[0], a[1]); });
static_if<nsize == 3>{}(
[&](auto) { printf("%s size %d, {%d %d %d}\n", s, nsize, a[0], a[1], a[2]); });
static_if<nsize == 4>{}(
[&](auto) { printf("%s size %d, {%d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3]); });
static_if<nsize == 5>{}([&](auto) {
printf("%s size %d, {%d %d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3], a[4]);
});
static_if<nsize == 6>{}([&](auto) {
printf("%s size %d, {%d %d %d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3], a[4], a[5]);
});
static_if<nsize == 7>{}([&](auto) {
printf("%s size %d, {%d %d %d %d %d %d %d}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6]);
});
static_if<nsize == 8>{}([&](auto) {
printf("%s size %d, {%d %d %d %d %d %d %d %d}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6],
a[7]);
});
static_if<nsize == 9>{}([&](auto) {
printf("%s size %d, {%d %d %d %d %d %d %d %d %d}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6],
a[7],
a[8]);
});
static_if<nsize == 10>{}([&](auto) {
printf("%s size %d, {%d %d %d %d %d %d %d %d %d %d}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6],
a[7],
a[8],
a[9]);
});
}
} // namespace ck
#endif
#ifndef CK_PRINT_SEQUENCE_HPP
#define CK_PRINT_SEQUENCE_HPP
#include "sequence.hpp"
namespace ck {
template <index_t... Xs>
__host__ __device__ void print_sequence(const char* s, Sequence<Xs...>)
{
constexpr index_t nsize = Sequence<Xs...>::Size();
static_assert(nsize <= 10, "wrong!");
static_if<nsize == 0>{}([&](auto) { printf("%s size %u, {}\n", s, nsize, Xs...); });
static_if<nsize == 1>{}([&](auto) { printf("%s size %u, {%u}\n", s, nsize, Xs...); });
static_if<nsize == 2>{}([&](auto) { printf("%s size %u, {%u %u}\n", s, nsize, Xs...); });
static_if<nsize == 3>{}([&](auto) { printf("%s size %u, {%u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 4>{}([&](auto) { printf("%s size %u, {%u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 5>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 6>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 7>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 8>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 9>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 10>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
}
} // namespace ck
#endif
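A usage sketch (hypothetical values):

__device__ void debug_print_strides()
{
    // prints: strides size 3, {64 8 1}
    print_sequence("strides", Sequence<64, 8, 1>{});
}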
@@ -168,6 +168,14 @@ struct Sequence
     {
         return Sequence<f(Is)...>{};
     }
+
+    __host__ __device__ static void Print()
+    {
+        printf("{");
+        printf("size %d, ", index_t{Size()});
+        static_for<0, Size(), 1>{}([&](auto i) { printf("%d ", At(i).value); });
+        printf("}");
+    }
 };
 // merge sequence
@@ -750,6 +758,13 @@ __host__ __device__ constexpr auto reverse_inclusive_scan_sequence(Seq, Reduce,
     return typename sequence_reverse_inclusive_scan<Seq, Reduce, Init>::type{};
 }

+template <typename Seq, typename Reduce, index_t Init>
+__host__ __device__ constexpr auto reverse_exclusive_scan_sequence(Seq, Reduce, Number<Init>)
+{
+    return reverse_inclusive_scan_sequence(Seq::PopFront(), Reduce{}, Number<Init>{})
+        .PushBack(Number<Init>{});
+}
+
 template <typename Seq, typename Reduce, index_t Init>
 __host__ __device__ constexpr auto inclusive_scan_sequence(Seq, Reduce, Number<Init>)
 {
...
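The new `reverse_exclusive_scan_sequence` is the natural building block for deriving packed strides from lengths, which is exactly what a tensor descriptor needs. A sketch (hypothetical, assuming it lives inside namespace ck):

__host__ __device__ constexpr auto packed_strides_2x3x4()
{
    // lengths {2, 3, 4} -> strides {3*4, 4, 1} = {12, 4, 1}
    return reverse_exclusive_scan_sequence(
        Sequence<2, 3, 4>{}, math::multiplies<index_t>{}, Number<1>{});
}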
#ifndef CK_SEQUENCE_HELPER_HPP
#define CK_SEQUENCE_HELPER_HPP
#include "sequence_helper.hpp"
namespace ck {
template <typename F, index_t N>
__host__ __device__ constexpr auto generate_sequence(F, Number<N>)
{
return typename sequence_gen<N, F>::type{};
}
} // namespace ck
#endif
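A usage sketch (hypothetical, assuming the functor protocol of `sequence_gen`: a default-constructible class whose call operator maps `Number<I>` to an `index_t` at compile time):

struct times_two
{
    template <index_t I>
    __host__ __device__ constexpr index_t operator()(Number<I>) const
    {
        return 2 * I;
    }
};

// would yield Sequence<0, 2, 4, 6>
using doubled = decltype(generate_sequence(times_two{}, Number<4>{}));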