"docs/en/git@developer.sourcefind.cn:OpenDAS/mmcv.git" did not exist on "5c5a8e0355951dacb4fae8f4ae7850dc461ee799"
Commit 109f1e90 authored by Chao Liu

Merge branch 'master' into implicit_gemm_v4_backward

parents 0b10c0bb a68b16a5
@@ -18,18 +18,22 @@ struct Array
     __host__ __device__ constexpr index_t GetSize() const { return NSize; }

-    __host__ __device__ constexpr TData operator[](index_t i) const { return mData[i]; }
+    template <index_t I>
+    __host__ __device__ constexpr TData operator[](Number<I>) const
+    {
+        return mData[I];
+    }
+
+    __host__ __device__ constexpr TData operator[](index_t i) const { return mData[i]; }

-    __host__ __device__ TData& operator[](index_t i) { return mData[i]; }
-
     template <index_t I>
-    __host__ __device__ constexpr TData Get(Number<I>) const
+    __host__ __device__ TData& operator()(Number<I>)
     {
-        static_assert(I < NSize, "wrong!");
         return mData[I];
     }

+    __host__ __device__ TData& operator()(index_t i) { return mData[i]; }
+
     template <index_t I>
     __host__ __device__ constexpr void Set(Number<I>, TData x)
     {
@@ -38,16 +42,33 @@ struct Array
         mData[I] = x;
     }

+    __host__ __device__ constexpr void Set(index_t I, TData x) { mData[I] = x; }
+
+    struct lambda_PushBack // emulate constexpr lambda
+    {
+        const Array<TData, NSize>& old_array;
+        Array<TData, NSize + 1>& new_array;
+
+        __host__ __device__ constexpr lambda_PushBack(const Array<TData, NSize>& old_array_,
+                                                      Array<TData, NSize + 1>& new_array_)
+            : old_array(old_array_), new_array(new_array_)
+        {
+        }
+
+        template <index_t I>
+        __host__ __device__ constexpr void operator()(Number<I>) const
+        {
+            new_array.Set(Number<I>{}, old_array[I]);
+        }
+    };
+
     __host__ __device__ constexpr auto PushBack(TData x) const
     {
         Array<TData, NSize + 1> new_array;

-        static_for<0, NSize, 1>{}([&](auto I) {
-            constexpr index_t i = I.Get();
-            new_array[i] = mData[i];
-        });
+        static_for<0, NSize, 1>{}(lambda_PushBack(*this, new_array));

-        new_array[NSize] = x;
+        new_array.Set(Number<NSize>{}, x);

        return new_array;
    }
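Not part of the diff: lambda_PushBack above is this commit's recurring workaround for C++14, where lambdas cannot be constexpr (that arrives in C++17). A minimal standalone sketch of the pattern, with all names local to the sketch; the functor's members emulate a lambda's reference captures while its operator() stays constexpr:

#include <cstdio>

template <int I>
struct Number
{
};

// apply f(Number<Iter>{}), ..., f(Number<End - 1>{}) by recursion
template <int Iter, int End>
struct static_for
{
    template <class F>
    constexpr void operator()(F f) const
    {
        f(Number<Iter>{});
        static_for<Iter + 1, End>{}(f);
    }
};

template <int End>
struct static_for<End, End>
{
    template <class F>
    constexpr void operator()(F) const
    {
    }
};

// hand-rolled functor standing in for a constexpr lambda
struct lambda_copy
{
    const int (&src)[4];
    int (&dst)[4];

    template <int I>
    constexpr void operator()(Number<I>) const
    {
        dst[I] = src[I];
    }
};

int main()
{
    const int src[4] = {1, 2, 3, 4};
    int dst[4]       = {0, 0, 0, 0};

    static_for<0, 4>{}(lambda_copy{src, dst});

    printf("%d %d %d %d\n", dst[0], dst[1], dst[2], dst[3]); // 1 2 3 4
}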
@@ -62,93 +83,60 @@ __host__ __device__ constexpr auto sequence2array(Sequence<Is...>)
 template <class TData, index_t NSize>
 __host__ __device__ constexpr auto make_zero_array()
 {
-#if 0
-    Array<TData, NSize> a;
-    static_for<0, NSize, 1>{}([&](auto I) {
-        constexpr index_t i = I.Get();
-        a[i] = static_cast<TData>(0);
-    });
-    return a;
-#else
     constexpr auto zero_sequence = typename uniform_sequence_gen<NSize, 0>::SeqType{};
     constexpr auto zero_array    = sequence2array(zero_sequence);
     return zero_array;
-#endif
 }

 template <class TData, index_t NSize, index_t... IRs>
 __host__ __device__ constexpr auto reorder_array_given_new2old(const Array<TData, NSize>& old_array,
-                                                               Sequence<IRs...> new2old)
+                                                               Sequence<IRs...> /*new2old*/)
 {
-    Array<TData, NSize> new_array;
-
     static_assert(NSize == sizeof...(IRs), "NSize not consistent");

-    static_for<0, NSize, 1>{}([&](auto IDim) {
-        constexpr index_t idim = IDim.Get();
-        new_array[idim] = old_array[new2old.Get(IDim)];
-    });
+    static_assert(is_valid_sequence_map<Sequence<IRs...>>::value, "wrong! invalid reorder map");

-    return new_array;
+    return Array<TData, NSize>{old_array[IRs]...};
 }

-#if 0
-template <class TData, index_t NSize, index_t... IRs>
-__host__ __device__ constexpr auto reorder_array_given_old2new(const Array<TData, NSize>& old_array,
-                                                               Sequence<IRs...> old2new)
-{
-    Array<TData, NSize> new_array;
-
-    static_assert(NSize == sizeof...(IRs), "NSize not consistent");
-
-    static_for<0, NSize, 1>{}([&](auto IDim) {
-        constexpr index_t idim = IDim.Get();
-        new_array[old2new.Get(IDim)] = old_array[idim];
-    });
-
-    return new_array;
-}
-#else
 template <class TData, index_t NSize, class MapOld2New>
-struct reorder_array_given_old2new_impl
+struct lambda_reorder_array_given_old2new
 {
-    const Array<TData, NSize>& old_array_ref;
-    Array<TData, NSize>& new_array_ref;
+    const Array<TData, NSize>& old_array;
+    Array<TData, NSize>& new_array;

-    __host__
-    __device__ constexpr reorder_array_given_old2new_impl(const Array<TData, NSize>& old_array,
-                                                          Array<TData, NSize>& new_array)
-        : old_array_ref(old_array), new_array_ref(new_array)
+    __host__ __device__ constexpr lambda_reorder_array_given_old2new(
+        const Array<TData, NSize>& old_array_, Array<TData, NSize>& new_array_)
+        : old_array(old_array_), new_array(new_array_)
     {
     }

     template <index_t IOldDim>
     __host__ __device__ constexpr void operator()(Number<IOldDim>) const
     {
-        TData old_data = old_array_ref.Get(Number<IOldDim>{});
+        TData old_data = old_array[IOldDim];

         constexpr index_t INewDim = MapOld2New::Get(Number<IOldDim>{});

-        new_array_ref.Set(Number<INewDim>{}, old_data);
+        new_array.Set(Number<INewDim>{}, old_data);
     }
 };

 template <class TData, index_t NSize, index_t... IRs>
 __host__ __device__ constexpr auto reorder_array_given_old2new(const Array<TData, NSize>& old_array,
-                                                               Sequence<IRs...> old2new)
+                                                               Sequence<IRs...> /*old2new*/)
 {
     Array<TData, NSize> new_array;

     static_assert(NSize == sizeof...(IRs), "NSize not consistent");

+    static_assert(is_valid_sequence_map<Sequence<IRs...>>::value, "wrong! invalid reorder map");
+
     static_for<0, NSize, 1>{}(
-        reorder_array_given_old2new_impl<TData, NSize, Sequence<IRs...>>(old_array, new_array));
+        lambda_reorder_array_given_old2new<TData, NSize, Sequence<IRs...>>(old_array, new_array));

     return new_array;
 }
-#endif
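Not part of the diff: a quick refresher on the two reorder conventions these functions implement, since the maps are easy to confuse. new2old gathers (new_array[i] = old_array[new2old[i]]); old2new scatters (new_array[old2new[i]] = old_array[i]) and is the inverse map. A standalone sketch with made-up numbers:

#include <cstdio>

int main()
{
    const int old_array[3] = {10, 20, 30};
    const int new2old[3]   = {2, 0, 1}; // new position i takes old position new2old[i]
    const int old2new[3]   = {1, 2, 0}; // inverse of new2old

    int a[3], b[3];
    for(int i = 0; i < 3; ++i)
        a[i] = old_array[new2old[i]]; // gather
    for(int i = 0; i < 3; ++i)
        b[old2new[i]] = old_array[i]; // scatter

    // both print: 30 10 20 / 30 10 20
    printf("%d %d %d / %d %d %d\n", a[0], a[1], a[2], b[0], b[1], b[2]);
}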
 template <class TData, index_t NSize, class ExtractSeq>
 __host__ __device__ constexpr auto extract_array(const Array<TData, NSize>& old_array, ExtractSeq)
@@ -159,25 +147,44 @@ __host__ __device__ constexpr auto extract_array(const Array<TData, NSize>& old_array, ExtractSeq)
     static_assert(new_size <= NSize, "wrong! too many extract");

-    static_for<0, new_size, 1>{}([&](auto I) {
-        constexpr index_t i = I.Get();
-        new_array[i] = old_array[ExtractSeq::Get(I)];
-    });
+    static_for<0, new_size, 1>{}([&](auto I) { new_array(I) = old_array[ExtractSeq::Get(I)]; });

     return new_array;
 }

+// emulate constexpr lambda for array math
+template <class F, class X, class Y, class Z>
+struct lambda_array_math
+{
+    const F& f;
+    const X& x;
+    const Y& y;
+    Z& z;
+
+    __host__ __device__ constexpr lambda_array_math(const F& f_, const X& x_, const Y& y_, Z& z_)
+        : f(f_), x(x_), y(y_), z(z_)
+    {
+    }
+
+    template <index_t IDim_>
+    __host__ __device__ constexpr void operator()(Number<IDim_>) const
+    {
+        constexpr auto IDim = Number<IDim_>{};
+        z.Set(IDim, f(x[IDim], y[IDim]));
+    }
+};
 // Array = Array + Array
 template <class TData, index_t NSize>
 __host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Array<TData, NSize> b)
 {
     Array<TData, NSize> result;

-    static_for<0, NSize, 1>{}([&](auto I) {
-        constexpr index_t i = I.Get();
-        result[i] = a[i] + b[i];
-    });
+    auto f = mod_conv::plus<index_t>{};
+
+    static_for<0, NSize, 1>{}(
+        lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
+            f, a, b, result));

     return result;
 }

@@ -188,11 +195,11 @@ __host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Array<TData, NSize> b)
 {
     Array<TData, NSize> result;

-    static_for<0, NSize, 1>{}([&](auto I) {
-        constexpr index_t i = I.Get();
-        result[i] = a[i] - b[i];
-    });
+    auto f = mod_conv::minus<index_t>{};
+
+    static_for<0, NSize, 1>{}(
+        lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
+            f, a, b, result));

     return result;
 }

@@ -205,11 +212,11 @@ __host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Sequence<Is...> b)
     Array<TData, NSize> result;

-    static_for<0, NSize, 1>{}([&](auto I) {
-        constexpr index_t i = I.Get();
-        result[i] = a[i] + b.Get(I);
-    });
+    auto f = mod_conv::plus<index_t>{};
+
+    static_for<0, NSize, 1>{}(
+        lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
+            f, a, b, result));

     return result;
 }

@@ -222,11 +229,11 @@ __host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Sequence<Is...> b)
     Array<TData, NSize> result;

-    static_for<0, NSize, 1>{}([&](auto I) {
-        constexpr index_t i = I.Get();
-        result[i] = a[i] - b.Get(I);
-    });
+    auto f = mod_conv::minus<index_t>{};
+
+    static_for<0, NSize, 1>{}(
+        lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
+            f, a, b, result));

     return result;
 }

@@ -239,11 +246,11 @@ __host__ __device__ constexpr auto operator*(Array<TData, NSize> a, Sequence<Is...> b)
     Array<TData, NSize> result;

-    static_for<0, NSize, 1>{}([&](auto I) {
-        constexpr index_t i = I.Get();
-        result[i] = a[i] * b.Get(I);
-    });
+    auto f = mod_conv::multiplies<index_t>{};
+
+    static_for<0, NSize, 1>{}(
+        lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
+            f, a, b, result));

     return result;
 }

@@ -256,11 +263,11 @@ __host__ __device__ constexpr auto operator-(Sequence<Is...> a, Array<TData, NSize> b)
     Array<TData, NSize> result;

-    static_for<0, NSize, 1>{}([&](auto I) {
-        constexpr index_t i = I.Get();
-        result[i] = a.Get(I) - b[i];
-    });
+    auto f = mod_conv::minus<index_t>{};
+
+    static_for<0, NSize, 1>{}(
+        lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
+            f, a, b, result));

     return result;
 }

@@ -273,10 +280,7 @@ accumulate_on_array(const Array<TData, NSize>& a, Reduce f, TData init)
     static_assert(NSize > 0, "wrong");

-    static_for<0, NSize, 1>{}([&](auto I) {
-        constexpr index_t i = I.Get();
-        result = f(result, a[i]);
-    });
+    static_for<0, NSize, 1>{}([&](auto I) { result = f(result, a[I]); });

     return result;
 }
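Not part of the diff: all the operators above now reuse the single lambda_array_math functor, parameterized by a mod_conv binary op. A standalone sketch of that shape, with a plain loop standing in for static_for and all names local to the sketch:

#include <cstdio>

template <class F, int N>
void elementwise(F f, const int (&x)[N], const int (&y)[N], int (&z)[N])
{
    for(int i = 0; i < N; ++i)
        z[i] = f(x[i], y[i]); // one functor, any binary op
}

int main()
{
    int a[3] = {1, 2, 3}, b[3] = {4, 5, 6}, c[3];

    elementwise([](int p, int q) { return p + q; }, a, b, c); // operator+ analogue
    printf("%d %d %d\n", c[0], c[1], c[2]);                   // 5 7 9

    elementwise([](int p, int q) { return p * q; }, a, b, c); // operator* analogue
    printf("%d %d %d\n", c[0], c[1], c[2]);                   // 4 10 18
}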
...
@@ -9,6 +9,8 @@
 template <class OriginalTensorDesc, class... OriginalDimMergeSeqs>
 struct ConstantMergedTensorDescriptor
 {
+    using Type = ConstantMergedTensorDescriptor;
+
     static constexpr auto mOriginalDimMergeSeqs = std::tuple<OriginalDimMergeSeqs...>{};

     static constexpr index_t nDim = sizeof...(OriginalDimMergeSeqs);
@@ -74,43 +76,17 @@ struct ConstantMergedTensorDescriptor
         return OriginalTensorDesc::GetElementSize();
     }

-#if 0
-    __host__ __device__ static constexpr auto
-    GetOriginalMultiIndexFromMultiIndex(Array<index_t, nDim> multi_id)
-    {
-        Array<index_t, nOriginalDim> original_multi_id;
-
-        static_for<0, nDim, 1>{}([&](auto IDim) {
-            constexpr index_t idim = IDim.Get();
-
-            constexpr auto original_dims_partial = std::get<idim>(mOriginalDimMergeSeqs);
-
-            // get partial original-multi-id corresponding to this merged dimension
-            const auto original_multi_id_partial =
-                OriginalTensorDesc::Extract(original_dims_partial)
-                    .GetMultiIndexFrom1dIndex(multi_id[idim]);
-
-            static_for<0, original_dims_partial.GetSize(), 1>{}([&](auto I_) {
-                constexpr auto I = decltype(I_){};
-
-                constexpr index_t idim_original = original_dims_partial.Get(I);
-
-                original_multi_id[idim_original] = original_multi_id_partial[I.Get()];
-            });
-        });
-
-        return original_multi_id;
-    }
-#else
     template <class OriginalDimsPartial>
-    struct GetOriginalMultiIndexFromMultiIndex_impl1
+    struct lambda_1_GetOriginalMultiIndexFromMultiIndex
     {
-        const Array<index_t, OriginalDimsPartial::GetSize()>& original_multi_id_partial_ref;
-        Array<index_t, nOriginalDim>& original_multi_id_ref;
+        const Array<index_t, OriginalDimsPartial::GetSize()>& original_multi_id_partial;
+        Array<index_t, nOriginalDim>& original_multi_id;

-        __host__ __device__ constexpr GetOriginalMultiIndexFromMultiIndex_impl1(
-            const Array<index_t, OriginalDimsPartial::GetSize()>& original_multi_id_partial,
-            Array<index_t, nOriginalDim>& original_multi_id)
-            : original_multi_id_partial_ref(original_multi_id_partial),
-              original_multi_id_ref(original_multi_id)
+        __host__ __device__ constexpr lambda_1_GetOriginalMultiIndexFromMultiIndex(
+            const Array<index_t, OriginalDimsPartial::GetSize()>& original_multi_id_partial_,
+            Array<index_t, nOriginalDim>& original_multi_id_)
+            : original_multi_id_partial(original_multi_id_partial_),
+              original_multi_id(original_multi_id_)
         {
         }

@@ -119,37 +95,36 @@ struct ConstantMergedTensorDescriptor
         {
             constexpr index_t idim_original = OriginalDimsPartial::Get(Number<I>{});

-            index_t itmp = original_multi_id_partial_ref.Get(Number<I>{});
+            index_t itmp = original_multi_id_partial[I];

-            original_multi_id_ref.Set(Number<idim_original>{}, itmp);
+            original_multi_id.Set(Number<idim_original>{}, itmp);
         }
     };

-    struct GetOriginalMultiIndexFromMultiIndex_impl0
+    struct lambda_0_GetOriginalMultiIndexFromMultiIndex
     {
-        const Array<index_t, nDim>& multi_id_ref;
-        Array<index_t, nOriginalDim>& original_multi_id_ref;
+        const Array<index_t, nDim>& multi_id;
+        Array<index_t, nOriginalDim>& original_multi_id;

-        __host__ __device__ constexpr GetOriginalMultiIndexFromMultiIndex_impl0(
-            const Array<index_t, nDim>& multi_id, Array<index_t, nOriginalDim>& original_multi_id)
-            : multi_id_ref(multi_id), original_multi_id_ref(original_multi_id)
+        __host__ __device__ constexpr lambda_0_GetOriginalMultiIndexFromMultiIndex(
+            const Array<index_t, nDim>& multi_id_, Array<index_t, nOriginalDim>& original_multi_id_)
+            : multi_id(multi_id_), original_multi_id(original_multi_id_)
         {
         }

         template <index_t IDim>
         __host__ __device__ constexpr void operator()(Number<IDim>) const
         {
-            constexpr auto original_dims_partial =
-                std::get<IDim>(std::tuple<OriginalDimMergeSeqs...>{});
+            constexpr auto original_dims_partial = std::get<IDim>(Type::mOriginalDimMergeSeqs);

             // get partial original-multi-id corresponding to this merged dimension
             const auto original_multi_id_partial =
                 OriginalTensorDesc::Extract(original_dims_partial)
-                    .GetMultiIndexFrom1dIndex(multi_id_ref[IDim]);
+                    .GetMultiIndexFrom1dIndex(multi_id[IDim]);

             static_for<0, original_dims_partial.GetSize(), 1>{}(
-                GetOriginalMultiIndexFromMultiIndex_impl1<decltype(original_dims_partial)>(
-                    original_multi_id_partial, original_multi_id_ref));
+                lambda_1_GetOriginalMultiIndexFromMultiIndex<decltype(original_dims_partial)>(
+                    original_multi_id_partial, original_multi_id));
         }
     };

@@ -160,7 +135,7 @@ struct ConstantMergedTensorDescriptor
         Array<index_t, nOriginalDim> original_multi_id;

         static_for<0, nDim, 1>{}(
-            GetOriginalMultiIndexFromMultiIndex_impl0(multi_id, original_multi_id));
+            lambda_0_GetOriginalMultiIndexFromMultiIndex(multi_id, original_multi_id));

         return original_multi_id;
     }

@@ -174,7 +149,6 @@ struct ConstantMergedTensorDescriptor
         return OriginalTensorDesc::GetOffsetFromMultiIndex(original_multi_id);
     }
-#endif

     __host__ __device__ static constexpr index_t
     GetOffsetFromMultiIndex(Array<index_t, nDim> multi_id)
@@ -192,9 +166,9 @@ struct ConstantMergedTensorDescriptor
     __host__ __device__ static constexpr Array<index_t, nDim> GetMultiIndexFrom1dIndex(index_t id)
     {
-        constexpr auto dummy_desc = make_ConstantTensorDescriptor_packed(GetLengths());
+        constexpr auto packed_desc = make_ConstantTensorDescriptor_packed(GetLengths());

-        return dummy_desc.GetMultiIndexFrom1dIndex(id);
+        return packed_desc.GetMultiIndexFrom1dIndex(id);
     }
 };
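Not part of the diff: a worked example, with made-up sizes, of what decomposing one merged dimension means. A merged dimension fusing original dims (H, W) = (3, 4) behaves like a packed descriptor with strides (4, 1), so a merged index splits as h = id / 4, w = id % 4:

#include <cstdio>

int main()
{
    const int W = 4; // inner length of the hypothetical merged (H, W) = (3, 4) pair

    for(int id = 0; id < 12; ++id)
    {
        int h = id / W;
        int w = id % W;
        printf("merged %2d -> (h, w) = (%d, %d)\n", id, h, w);
    }
}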
...
@@ -48,26 +48,48 @@ struct ConstantTensorDescriptor
     template <index_t I>
     __host__ __device__ static constexpr index_t GetLength(Number<I>)
     {
-        return Lengths{}.Get(Number<I>{});
+        return Lengths::Get(Number<I>{});
     }

     template <index_t I>
     __host__ __device__ static constexpr index_t GetStride(Number<I>)
     {
-        return Strides{}.Get(Number<I>{});
+        return Strides::Get(Number<I>{});
     }

-    __host__ __device__ static constexpr bool AreStridesNonAscending()
-    {
-        bool flag = true;
-
-        static_for<0, nDim - 1, 1>{}([&](auto IDim) {
-            constexpr auto IDim_p1 = Number<IDim.Get() + 1>{};
-
-            flag = flag && (GetLength(IDim) >= GetLength(IDim_p1));
-        });
-
-        return flag;
-    }
+    struct lambda_AreDimensionsContinuous
+    {
+        bool& is_continuous;
+
+        __host__ __device__ constexpr lambda_AreDimensionsContinuous(bool& is_continuous_)
+            : is_continuous(is_continuous_)
+        {
+        }
+
+        template <index_t IDim_>
+        __host__ __device__ constexpr void operator()(Number<IDim_>) const
+        {
+            constexpr auto IDim    = Number<IDim_>{};
+            constexpr auto IDim_p1 = Number<IDim_ + 1>{};
+
+            is_continuous =
+                is_continuous && (GetStride(IDim) >= GetStride(IDim_p1) &&
+                                  GetStride(IDim) == GetStride(IDim_p1) * GetLength(IDim_p1));
+        }
+    };
+
+    __host__ __device__ static constexpr bool AreDimensionsContinuous()
+    {
+        bool is_continuous = true;
+
+        static_for<0, nDim - 1, 1>{}(lambda_AreDimensionsContinuous(is_continuous));
+
+        return is_continuous;
+    }
+
+    __host__ __device__ static constexpr bool IsPackedTensor()
+    {
+        return AreDimensionsContinuous() && GetStride(Number<nDim - 1>{}) == 1;
+    }
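Not part of the diff: the new predicates in a standalone form, with made-up numbers. Dimension i is continuous with i+1 iff stride[i] == stride[i+1] * length[i+1]; a packed tensor additionally requires the innermost stride to be 1:

#include <cstdio>

bool is_packed(const int* lengths, const int* strides, int ndim)
{
    bool ok = strides[ndim - 1] == 1;
    for(int i = 0; i + 1 < ndim; ++i)
        ok = ok && strides[i] == strides[i + 1] * lengths[i + 1];
    return ok;
}

int main()
{
    const int len[3]     = {2, 3, 4};
    const int packed[3]  = {12, 4, 1}; // 3*4, 4, 1 -> packed
    const int aligned[3] = {16, 4, 1}; // padded outer stride -> not packed

    printf("%d %d\n", is_packed(len, packed, 3), is_packed(len, aligned, 3)); // 1 0
}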
     template <class T>
@@ -92,40 +114,24 @@ struct ConstantTensorDescriptor
         return align.Get() * ((element_space_unaligned + align.Get() - 1) / align.Get());
     }

-#if 0
-    template <index_t NSize>
-    __host__ __device__ static constexpr index_t
-    GetOffsetFromMultiIndex(Array<index_t, NSize> multi_id)
-    {
-        static_assert(NSize == nDim, "wrong! Dimension not consistent");
-
-        index_t offset = 0;
-
-        static_for<0, nDim, 1>{}([&](auto IDim) {
-            constexpr index_t idim = IDim.Get();
-            offset += multi_id[idim] * GetStride(IDim);
-        });
-
-        return offset;
-    }
-#else
+    // emulate constexpr lambda
     template <index_t NSize>
-    struct GetOffsetFromMultiIndex_impl
+    struct lambda_GetOffsetFromMultiIndex
     {
-        Array<index_t, NSize>& multi_id_ref;
-        index_t& offset_ref;
+        Array<index_t, NSize>& multi_id;
+        index_t& offset;

-        __host__ __device__ constexpr GetOffsetFromMultiIndex_impl(Array<index_t, NSize>& multi_id,
-                                                                   index_t& offset)
-            : multi_id_ref(multi_id), offset_ref(offset)
+        __host__
+        __device__ constexpr lambda_GetOffsetFromMultiIndex(Array<index_t, NSize>& multi_id_,
+                                                            index_t& offset_)
+            : multi_id(multi_id_), offset(offset_)
         {
         }

-        template <index_t IDim>
-        __host__ __device__ constexpr bool operator()(Number<IDim>) const
+        template <class X>
+        __host__ __device__ constexpr void operator()(X IDim) const
         {
-            offset_ref += multi_id_ref.Get(Number<IDim>{}) * Type::GetStride(Number<IDim>{});
-            return true;
+            offset += multi_id[IDim] * Type::GetStride(IDim);
         }
     };

@@ -137,11 +143,10 @@ struct ConstantTensorDescriptor
         index_t offset = 0;

-        static_for<0, nDim, 1>{}(GetOffsetFromMultiIndex_impl<NSize>(multi_id, offset));
+        static_for<0, nDim, 1>{}(lambda_GetOffsetFromMultiIndex<NSize>(multi_id, offset));

         return offset;
     }
-#endif
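Not part of the diff: the functor above just accumulates the usual strided dot product, offset = sum over i of multi_id[i] * stride[i]. A worked example with made-up numbers:

#include <cstdio>

int main()
{
    const int stride[3]   = {12, 4, 1}; // packed strides of a 2x3x4 tensor
    const int multi_id[3] = {1, 2, 3};

    int offset = 0;
    for(int i = 0; i < 3; ++i)
        offset += multi_id[i] * stride[i];

    printf("%d\n", offset); // 1*12 + 2*4 + 3*1 = 23
}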
     template <class... Is>
     __host__ __device__ static constexpr index_t GetOffsetFromMultiIndex(Is... is)
@@ -160,47 +165,27 @@ struct ConstantTensorDescriptor
             multi_id * GetStrides(), mod_conv::plus<index_t>{}, Number<0>{});
     }

-#if 0
-    __host__ __device__ static constexpr Array<index_t, nDim> GetMultiIndexFrom1dIndex(index_t id)
-    {
-        Array<index_t, nDim> multi_id;
-
-        constexpr auto dummy_strides = calculate_tensor_strides_packed(GetLengths());
-
-        // calculate index in each of the dimensions in the order of their dimension
-        static_for<0, nDim - 1, 1>{}([&](auto IDim) {
-            constexpr index_t idim   = IDim.Get();
-            constexpr index_t stride = dummy_strides.Get(Number<idim>{});
-            multi_id[idim]           = id / stride;
-            id -= multi_id[idim] * stride;
-        });
-
-        multi_id[nDim - 1] = id / dummy_strides.Get(Number<nDim - 1>{});
-
-        return multi_id;
-    }
-#else
-    struct GetMultiIndexFrom1dIndex_impl
+    // emulate constexpr lambda
+    template <class PackedStrides>
+    struct lambda_GetMultiIndexFrom1dIndex
     {
-        using DummyStrides = decltype(calculate_tensor_strides_packed(GetLengths()));
-
-        index_t& id_ref;
-        Array<index_t, nDim>& multi_id_ref;
+        index_t& id;
+        Array<index_t, nDim>& multi_id;

-        __host__ __device__ constexpr GetMultiIndexFrom1dIndex_impl(index_t& id,
-                                                                    Array<index_t, nDim>& multi_id)
-            : id_ref(id), multi_id_ref(multi_id)
+        __host__
+        __device__ constexpr lambda_GetMultiIndexFrom1dIndex(index_t& id_,
+                                                             Array<index_t, nDim>& multi_id_)
            : id(id_), multi_id(multi_id_)
         {
         }

-        template <index_t IDim>
-        __host__ __device__ constexpr bool operator()(Number<IDim>) const
+        template <class IDim_>
+        __host__ __device__ constexpr void operator()(IDim_) const
         {
-            constexpr index_t stride = DummyStrides::Get(Number<IDim>{});
-            multi_id_ref.Set(Number<IDim>{}, id_ref / stride);
-            id_ref -= multi_id_ref.Get(Number<IDim>{}) * stride;
-            return true;
+            constexpr auto IDim      = IDim_{};
+            constexpr index_t stride = PackedStrides::Get(IDim);
+            multi_id.Set(IDim, id / stride);
+            id -= multi_id[IDim] * stride;
         }
     };

@@ -208,27 +193,15 @@ struct ConstantTensorDescriptor
     {
         Array<index_t, nDim> multi_id;

-        constexpr auto dummy_strides = calculate_tensor_strides_packed(GetLengths());
+        using PackedStrides = decltype(calculate_tensor_strides_packed(GetLengths()));

         // calculate index in each of the dimensions in the order of their dimension
-        static_for<0, nDim - 1, 1>{}(GetMultiIndexFrom1dIndex_impl(id, multi_id));
+        static_for<0, nDim - 1, 1>{}(lambda_GetMultiIndexFrom1dIndex<PackedStrides>(id, multi_id));

-        index_t itmp = id / dummy_strides.Get(Number<nDim - 1>{});
-
-        multi_id.Set(Number<nDim - 1>{}, itmp);
+        multi_id.Set(Number<nDim - 1>{}, id / PackedStrides::Get(Number<nDim - 1>{}));

         return multi_id;
     }
-#endif
-
-#if 0
-    // return type is Sequence<...>
-    template<index_t Id>
-    __host__ __device__ static constexpr auto GetMultiIndexFrom1dIndex(Number<Id>)
-    {
-        return inclusive_scan_sequence(f_impl, GetStrides(), Number<Id>{});
-    }
-#endif
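Not part of the diff: a worked example of the 1d-to-multi-index decomposition with made-up numbers. Note the last dimension divides by its own stride instead of taking a remainder, mirroring the code above:

#include <cstdio>

int main()
{
    const int stride[3] = {12, 4, 1}; // packed strides of a 2x3x4 tensor
    int id              = 23;

    int multi_id[3];
    for(int i = 0; i < 2; ++i) // nDim - 1 iterations
    {
        multi_id[i] = id / stride[i];
        id -= multi_id[i] * stride[i];
    }
    multi_id[2] = id / stride[2];

    printf("%d %d %d\n", multi_id[0], multi_id[1], multi_id[2]); // 1 2 3
}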
     __host__ __device__ static constexpr auto
     GetOriginalMultiIndexFromMultiIndex(Array<index_t, nDim> multi_id)
@@ -236,9 +209,10 @@ struct ConstantTensorDescriptor
         return multi_id;
     }

-    // This function doesn't do carry check on the highest dimension, for performance reason.
-    // It is the user's responsibility to make sure the result "new_mutli_id" is not out-of-bound
-    // on the highest dimension
+    // This function doesn't do a carry check on the highest dimension for positive stepping (or
+    // a borrow check on the lowest dimension for negative stepping), for performance reasons. It
+    // is the user's responsibility to make sure the result "new_multi_id" is not out-of-bound on
+    // the highest dimension for positive stepping (or on the lowest dimension for negative
+    // stepping).
     template <bool PositiveDirection>
     __host__ __device__ static Array<index_t, nDim>
     UpdateMultiIndexGivenStepSizeOf1dIndex(Array<index_t, nDim> old_multi_id,
@@ -262,14 +236,14 @@ struct ConstantTensorDescriptor
             if(carry)
             {
-                ++new_multi_id[idim];
+                ++new_multi_id(idim);
             }

             carry = false;

             if(new_multi_id[idim] >= GetLength(IDim))
             {
-                new_multi_id[idim] -= GetLength(IDim);
+                new_multi_id(idim) -= GetLength(IDim);
                 carry = true;
             }
         });
@@ -288,14 +262,14 @@ struct ConstantTensorDescriptor
             if(borrow)
             {
-                --new_multi_id[idim];
+                --new_multi_id(idim);
             }

             borrow = false;

             if(new_multi_id[idim] < GetLength(IDim))
             {
-                new_multi_id[idim] += GetLength(IDim);
+                new_multi_id(idim) += GetLength(IDim);
                 borrow = true;
             }
         });
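Not part of the diff: a standalone sketch of the carry pass for positive stepping, with made-up lengths. After adding a step to the lowest dimension, carries propagate toward the highest dimension; by design the highest dimension itself is never checked, so the top-level carry below is added only to complete the demo:

#include <cstdio>

int main()
{
    const int length[3] = {2, 3, 4};
    int multi_id[3]     = {0, 2, 3}; // 1d index 11 in a packed 2x3x4 tensor

    multi_id[2] += 3; // step of 3 -> expect {1, 0, 2}, i.e. 1d index 14

    bool carry = false;
    for(int idim = 2; idim > 0; --idim) // lowest to highest, highest unchecked
    {
        if(carry)
            ++multi_id[idim];
        carry = false;
        if(multi_id[idim] >= length[idim])
        {
            multi_id[idim] -= length[idim];
            carry = true;
        }
    }
    if(carry)
        ++multi_id[0]; // demo only: the code above leaves this to the caller

    printf("%d %d %d\n", multi_id[0], multi_id[1], multi_id[2]); // 1 0 2
}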
@@ -382,15 +356,7 @@ struct ConstantTensorDescriptor
         return ConstantTensorDescriptor<decltype(new_lengths), decltype(new_strides)>{};
     }

-    template <index_t Threashold, index_t Delta>
-    struct f_unfold_impl
-    {
-        __host__ __device__ constexpr index_t operator()(index_t x) const
-        {
-            return x > Threashold ? x - Delta : x;
-        }
-    };
+    // this function unfolds dimensions [FirstUnfoldDim, ..., LastUnfoldDim] into 1 dimension

     template <index_t FirstUnfoldDim, index_t LastUnfoldDim>
     __host__ __device__ static constexpr auto Unfold(Number<FirstUnfoldDim>, Number<LastUnfoldDim>)
     {
@@ -398,24 +364,6 @@ struct ConstantTensorDescriptor
                       FirstUnfoldDim <= LastUnfoldDim,
                       "wrong! should have FirstUnfoldDim <= LastUnfoldDim!");

-#if 0 // cannot compile: compiler complain about constexpr
-        // dimensions to be unfold need to be in descending order (w.r.t. strides), and need to be
-        // packed in memory, otherwise, unfolding is invalid
-        static_for<FirstUnfoldDim, LastUnfoldDim, 1>{}([&](auto IDim_) {
-            constexpr auto IDim    = decltype(IDim_){};
-            constexpr auto IDim_p1 = IDim + Number<1>{};
-
-            // check stride
-            static_assert(
-                GetStride(IDim) >= GetStride(IDim_p1),
-                "wrong! dimensions to be unfolded need to be in descending order w.r.t strides");
-
-            // check if packed
-            static_assert(GetStride(IDim_p1) * GetLength(IDim_p1) == GetStride(IDim),
-                          "wrong! dimensions to be unfolded need to be packed");
-        });
-#endif
-
         // left and right
         constexpr auto left = typename arithmetic_sequence_gen<0, FirstUnfoldDim, 1>::SeqType{};
         constexpr auto middle =
@@ -423,6 +371,9 @@ struct ConstantTensorDescriptor
         constexpr auto right =
             typename arithmetic_sequence_gen<LastUnfoldDim + 1, GetNumOfDimension(), 1>::SeqType{};

+        // dimensions to be unfolded need to be continuous
+        static_assert(Type::Extract(middle).AreDimensionsContinuous(), "wrong! not unfoldable");
+
         // unfolded length, stride
         constexpr index_t unfold_length = accumulate_on_sequence(
             GetLengths().Extract(middle), mod_conv::multiplies<index_t>{}, Number<1>{});
@@ -446,16 +397,16 @@ struct ConstantTensorDescriptor
     template <class MapNew2Old>
     __host__ __device__ static constexpr auto ReorderGivenNew2Old(MapNew2Old)
     {
-        return ConstantTensorDescriptor<decltype(Lengths{}.ReorderGivenNew2Old(MapNew2Old{})),
-                                        decltype(Strides{}.ReorderGivenNew2Old(MapNew2Old{}))>{};
+        return ConstantTensorDescriptor<decltype(Lengths::ReorderGivenNew2Old(MapNew2Old{})),
+                                        decltype(Strides::ReorderGivenNew2Old(MapNew2Old{}))>{};
     }

 #if 0 // require sequence_sort, which is not implemented yet
     template <class MapOld2New>
     __host__ __device__ static constexpr auto ReorderGivenOld2New(MapOld2New)
     {
-        return ConstantTensorDescriptor<decltype(Lengths{}.ReorderGivenOld2New(MapOld2New{})),
-                                        decltype(Strides{}.ReorderGivenOld2New(MapOld2New{}))>{}
+        return ConstantTensorDescriptor<decltype(Lengths::ReorderGivenOld2New(MapOld2New{})),
+                                        decltype(Strides::ReorderGivenOld2New(MapOld2New{}))>{}
     }
 #endif
 };
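Not part of the diff: a worked example of Unfold with made-up numbers. Folding dims 1..2 of a packed 2x3x4 tensor merges lengths 3*4 = 12 and keeps the innermost stride, giving a 2x12 view; the new static_assert above guards that the middle dims really are continuous, which is what makes the flat view legal:

#include <cstdio>

int main()
{
    const int length[3] = {2, 3, 4};
    const int stride[3] = {12, 4, 1}; // packed, so dims 1..2 are continuous

    const int unfold_length = length[1] * length[2]; // 12
    const int unfold_stride = stride[2];             // stride of the last dim in the fold

    printf("view: %d x %d, strides %d, %d\n", length[0], unfold_length, stride[0], unfold_stride);
}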
...
@@ -2,6 +2,9 @@
 #include "integral_constant.hip.hpp"
 #include "functional.hip.hpp"

+template <class Seq>
+struct is_valid_sequence_map;
+
 template <index_t... Is>
 struct Sequence
 {
@@ -16,7 +19,23 @@ struct Sequence
     {
         static_assert(I < mSize, "wrong! I too large");

-        // the last dummy element is to prevent compiler complain about empty Sequence
+        // the last dummy element is to prevent the compiler complaining about an empty array,
+        // when mSize = 0
+        const index_t mData[mSize + 1] = {Is..., 0};
+        return mData[I];
+    }
+
+    template <index_t I>
+    __host__ __device__ constexpr index_t operator[](Number<I>) const
+    {
+        static_assert(I < mSize, "wrong! I too large");
+
+        const index_t mData[mSize + 1] = {Is..., 0};
+        return mData[I];
+    }
+
+    // make sure I is constexpr
+    __host__ __device__ constexpr index_t operator[](index_t I) const
+    {
         const index_t mData[mSize + 1] = {Is..., 0};
         return mData[I];
     }
@@ -24,24 +43,24 @@ struct Sequence
     template <index_t... IRs>
     __host__ __device__ static constexpr auto ReorderGivenNew2Old(Sequence<IRs...> /*new2old*/)
     {
-#if 0 // require sequence_sort, which is not implemented yet
-        static_assert(is_same<sequence_sort<Sequence<IRs...>>::SortedSeqType,
-                              arithmetic_sequence_gen<0, mSize, 1>::SeqType>::value,
-                      "wrong! invalid new2old map");
-#endif
+        static_assert(sizeof...(Is) == sizeof...(IRs),
+                      "wrong! reorder map should have the same size as the Sequence to be reordered");
+
+        static_assert(is_valid_sequence_map<Sequence<IRs...>>::value, "wrong! invalid reorder map");

-        return Sequence<Type{}.Get(Number<IRs>{})...>{};
+        return Sequence<Type::Get(Number<IRs>{})...>{};
     }

 #if 0 // require sequence_sort, which is not implemented yet
     template <class MapOld2New>
     __host__ __device__ static constexpr auto ReorderGivenOld2New(MapOld2New /*old2new*/)
     {
-#if 0
-        static_assert(is_same<sequence_sort<MapOld2New>::SortedSeqType,
-                              arithmetic_sequence_gen<0, mSize, 1>::SeqType>::value,
-                      "wrong! invalid old2new map");
-#endif
+        static_assert(sizeof...(Is) == MapOld2New::GetSize(),
+                      "wrong! reorder map should have the same size as the Sequence to be reordered");
+
+        static_assert(is_valid_sequence_map<MapOld2New>::value,
+                      "wrong! invalid reorder map");

         constexpr auto map_new2old = typename sequence_map_inverse<MapOld2New>::SeqMapType{};

         return ReorderGivenNew2Old(map_new2old);
@@ -87,13 +106,13 @@ struct Sequence
     template <index_t... Ns>
     __host__ __device__ static constexpr auto Extract(Number<Ns>...)
     {
-        return Sequence<Type{}.Get(Number<Ns>{})...>{};
+        return Sequence<Type::Get(Number<Ns>{})...>{};
     }

     template <index_t... Ns>
     __host__ __device__ static constexpr auto Extract(Sequence<Ns...>)
     {
-        return Sequence<Type{}.Get(Number<Ns>{})...>{};
+        return Sequence<Type::Get(Number<Ns>{})...>{};
     }

     template <index_t I, index_t X>
@@ -297,6 +316,7 @@ struct sequence_map_inverse<Sequence<Is...>>
 };
 #endif

 template <class Seq>
 struct is_valid_sequence_map
 {
@@ -322,11 +342,6 @@ __host__ __device__ constexpr auto operator-(Sequence<Xs...> seq_x, Sequence<Ys...> seq_y)
 {
     static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong! inconsistent size");

-#if 0
-    static_for<0, seq_x.GetSize(), 1>{}(
-        [&](auto I) { static_assert(seq_x.Get(I) >= seq_y.Get(I), "wrong! going to undeflow"); });
-#endif
-
     return Sequence<(Xs - Ys)...>{};
 }

@@ -363,15 +378,6 @@ __host__ __device__ constexpr auto operator+(Sequence<Xs...>, Number<Y>)
 template <index_t... Xs, index_t Y>
 __host__ __device__ constexpr auto operator-(Sequence<Xs...>, Number<Y>)
 {
-#if 0 // TODO: turn it on. Doesn't compile
-    constexpr auto seq_x = Sequence<Xs...>{};
-
-    static_for<0, sizeof...(Xs), 1>{}([&](auto Iter) {
-        constexpr auto I = decltype(Iter){};
-        static_assert(seq_x.Get(I) >= Y, "wrong! going to underflow");
-    });
-#endif
-
     return Sequence<(Xs - Y)...>{};
 }

@@ -404,13 +410,6 @@ __host__ __device__ constexpr auto operator-(Number<Y>, Sequence<Xs...>)
 {
     constexpr auto seq_x = Sequence<Xs...>{};

-#if 0
-    static_for<0, sizeof...(Xs), 1>{}([&](auto Iter) {
-        constexpr auto I = decltype(Iter){};
-        static_assert(seq_x.Get(I) <= Y, "wrong! going to underflow");
-    });
-#endif
-
     return Sequence<(Y - Xs)...>{};
 }
@@ -482,25 +481,6 @@ __host__ __device__ constexpr auto inclusive_scan_sequence(Seq, Reduce, Number<Init>)
     return reverse_inclusive_scan_sequence(Seq{}.Reverse(), Reduce{}, Number<Init>{}).Reverse();
 }

-template <class Seq>
-struct accumulate_on_sequence_impl
-{
-    template <class IDim>
-    __host__ __device__ constexpr index_t operator()(IDim) const
-    {
-        return Seq{}.Get(IDim{});
-    }
-};
-
-template <class Seq, class Reduce, index_t I>
-__host__ __device__ constexpr index_t
-accumulate_on_sequence(Seq, Reduce, Number<I> /*initial_value*/)
-{
-    constexpr index_t a =
-        static_const_reduce_n<Seq::mSize>{}(accumulate_on_sequence_impl<Seq>{}, Reduce{});
-    return Reduce{}(a, I);
-}
-
 template <index_t... Is>
 __host__ __device__ constexpr auto Sequence<Is...>::PopFront()
 {
...
#pragma once
__device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
__device__ index_t get_block_1d_id() { return blockIdx.x; }
template <class T1, class T2>
struct is_same
{
static constexpr bool value = false;
};
template <class T>
struct is_same<T, T>
{
static constexpr bool value = true;
};
template <class X, class Y>
__host__ __device__ constexpr bool is_same_type(X, Y)
{
return is_same<X, Y>::value;
}
namespace mod_conv { // namespace mod_conv
template <class T, T s>
struct scales
{
__host__ __device__ constexpr T operator()(T a) const { return s * a; }
};
template <class T>
struct plus
{
__host__ __device__ constexpr T operator()(T a, T b) const { return a + b; }
};
template <class T>
struct minus
{
__host__ __device__ constexpr T operator()(T a, T b) const { return a - b; }
};
template <class T>
struct multiplies
{
__host__ __device__ constexpr T operator()(T a, T b) const { return a * b; }
};
template <class T>
struct integer_divide_ceiler
{
__host__ __device__ constexpr T operator()(T a, T b) const
{
static_assert(is_same<T, index_t>::value || is_same<T, int>::value, "wrong type");
return (a + b - 1) / b;
}
};
template <class T>
__host__ __device__ constexpr T integer_divide_ceil(T a, T b)
{
static_assert(is_same<T, index_t>::value || is_same<T, int>::value, "wrong type");
return (a + b - 1) / b;
}
template <class T>
__host__ __device__ constexpr T max(T x, T y)
{
return x > y ? x : y;
}
template <class T, class... Ts>
__host__ __device__ constexpr T max(T x, Ts... xs)
{
static_assert(sizeof...(xs) > 0, "not enough argument");
auto y = max(xs...);
static_assert(is_same<decltype(y), T>::value, "not the same type");
return x > y ? x : y;
}
template <class T>
__host__ __device__ constexpr T min(T x, T y)
{
return x < y ? x : y;
}
template <class T, class... Ts>
__host__ __device__ constexpr T min(T x, Ts... xs)
{
static_assert(sizeof...(xs) > 0, "not enough argument");
auto y = min(xs...);
static_assert(is_same<decltype(y), T>::value, "not the same type");
return x < y ? x : y;
}
// this is wrong
// TODO: implement correct least common multiple, instead of calling max()
template <class T, class... Ts>
__host__ __device__ constexpr T lcm(T x, Ts... xs)
{
return max(x, xs...);
}
} // namespace mod_conv
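Not part of the diff: the TODO above is honest, mod_conv::lcm is only a stand-in that forwards to max(). A hedged sketch of what a real constexpr lcm could look like in this file's idiom (Euclidean gcd, then a pairwise fold); this is my sketch, not code from the repo:

template <class T>
__host__ __device__ constexpr T gcd(T x, T y)
{
    // Euclidean algorithm; C++14 allows the self-recursion in constexpr
    return y == 0 ? x : gcd(y, x % y);
}

template <class T>
__host__ __device__ constexpr T lcm(T x, T y)
{
    // divide before multiplying to reduce overflow risk
    return (x / gcd(x, y)) * y;
}

template <class T, class... Ts>
__host__ __device__ constexpr T lcm(T x, T y, Ts... zs)
{
    return lcm(lcm(x, y), zs...);
}

For the power-of-two vector widths used in this codebase, max() happens to coincide with lcm, which is presumably why the shortcut has worked so far.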
@@ -122,7 +122,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
             constexpr auto src_partial_original_desc =
                 SrcDesc::GetOriginalTensorDescriptor().Extract(src_partial_original_dims);

-            mThreadSrcPartialOffsets[idim] = src_partial_original_desc.GetOffsetFromMultiIndex(
+            mThreadSrcPartialOffsets(idim) = src_partial_original_desc.GetOffsetFromMultiIndex(
                 extract_array(mThreadSrcOriginalMultiId, src_partial_original_dims));
         });
@@ -136,7 +136,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
             constexpr auto dst_partial_original_desc =
                 DstDesc::GetOriginalTensorDescriptor().Extract(dst_partial_original_dims);

-            mThreadDstPartialOffsets[idim] = dst_partial_original_desc.GetOffsetFromMultiIndex(
+            mThreadDstPartialOffsets(idim) = dst_partial_original_desc.GetOffsetFromMultiIndex(
                 extract_array(mThreadDstOriginalMultiId, dst_partial_original_dims));
         });
@@ -206,18 +206,16 @@ struct BlockwiseGenericTensorSliceCopy_v1
 #if 0
             constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});

-            const auto src_thread_data_multi_id_begin =
-                repeat_multi_id * data_per_cluster_per_dims; // cannot not constexpr, why?
+            const auto src_thread_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;

-            const auto clipboard_data_multi_id_begin =
-                repeat_multi_id * thread_sub_tensor_lengths; // cannot not constexpr, why?
+            const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;

-            const index_t src_offset = SrcDesc{}.GetOffsetFromMultiIndex(
-                src_thread_data_multi_id_begin); // cannot not constexpr, why?
+            const index_t src_offset =
+                SrcDesc{}.GetOffsetFromMultiIndex(src_thread_data_multi_id_begin);

-            const index_t clipboard_offset = thread_tensor_desc.GetOffsetFromMultiIndex(
-                clipboard_data_multi_id_begin); // cannot not constexpr, why?
-#else
+            const index_t clipboard_offset =
+                thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
+#else // HIP compiler performs better with this code
             constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};

             constexpr auto src_thread_data_multi_id_begin =
@@ -261,18 +259,15 @@ struct BlockwiseGenericTensorSliceCopy_v1
 #if 0
             constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});

-            const auto clipboard_data_multi_id_begin =
-                repeat_multi_id * thread_sub_tensor_lengths; // cannot not constexpr, why?
+            const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;

-            const auto dst_data_multi_id_begin =
-                repeat_multi_id * data_per_cluster_per_dims; // cannot not constexpr, why?
+            const auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;

-            const index_t clipboard_offset = thread_tensor_desc.GetOffsetFromMultiIndex(
-                clipboard_data_multi_id_begin); // cannot not constexpr, why?
+            const index_t clipboard_offset =
+                thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);

-            const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(
-                dst_data_multi_id_begin); // cannot not constexpr, why?
-#else
+            const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_id_begin);
+#else // HIP compiler performs better with this code
             constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};

             constexpr auto clipboard_data_multi_id_begin =
@@ -343,33 +338,12 @@ struct BlockwiseGenericTensorSliceCopy_v1
                 src_partial_original_desc.UpdateMultiIndexGivenStepSizeOf1dIndex(
                     old_src_partial_original_multi_id, StepSize, direction);

-#if 0
-            {
-                if(debug_flag && get_block_1d_id() == 0)
-                {
-                    printf("id %5u %5u: "
-                           "old_src_partial_original_multi_id %u %u %u, "
-                           "new_src_partial_original_multi_id %u %u %u, "
-                           "mThreadSrcOffset %u, mThreadDstOffset %u \n",
-                           get_block_1d_id(),
-                           get_thread_local_1d_id(),
-                           old_src_partial_original_multi_id[0],
-                           old_src_partial_original_multi_id[1],
-                           old_src_partial_original_multi_id[2],
-                           new_src_partial_original_multi_id[0],
-                           new_src_partial_original_multi_id[1],
-                           new_src_partial_original_multi_id[2]);
-                }
-            }
-#endif
-
             // update "mThreadSrcOriginalMultiId"
             static_for<0, decltype(src_partial_original_dims)::GetSize(), 1>{}([&](auto I_) {
                 constexpr auto I = decltype(I_){};

                 constexpr index_t idim_original = src_partial_original_dims.Get(I);

-                mThreadSrcOriginalMultiId[idim_original] =
+                mThreadSrcOriginalMultiId(idim_original) =
                     new_src_partial_original_multi_id[I.Get()];
             });
@@ -381,7 +355,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
                 new_src_partial_original_multi_id);

             // update "mThreadSrcPartialOffsets"
-            mThreadSrcPartialOffsets[idim] = new_src_partial_offset;
+            mThreadSrcPartialOffsets(idim) = new_src_partial_offset;

             // update "mThreadSrcOffset", do "+" before "-" to avoid underflow
             mThreadSrcOffset = (mThreadSrcOffset + new_src_partial_offset) - old_src_partial_offset;
@@ -401,15 +375,15 @@ struct BlockwiseGenericTensorSliceCopy_v1
             static_if<PositiveDirection>{}([&](auto fwd) {
                 mThreadSrcOffset += StepSize * fwd(SrcDesc{}).GetStride(IDim);

-                mThreadSrcOriginalMultiId[idim_original] += StepSize;
+                mThreadSrcOriginalMultiId(idim_original) += StepSize;

-                mThreadSrcPartialOffsets[idim] += StepSize * fwd(SrcDesc{}).GetStride(IDim);
+                mThreadSrcPartialOffsets(idim) += StepSize * fwd(SrcDesc{}).GetStride(IDim);
             }).Else([&](auto fwd) {
                 mThreadSrcOffset -= StepSize * fwd(SrcDesc{}).GetStride(IDim);

-                mThreadSrcOriginalMultiId[idim_original] -= StepSize;
+                mThreadSrcOriginalMultiId(idim_original) -= StepSize;

-                mThreadSrcPartialOffsets[idim] -= StepSize * fwd(SrcDesc{}).GetStride(IDim);
+                mThreadSrcPartialOffsets(idim) -= StepSize * fwd(SrcDesc{}).GetStride(IDim);
             });
         });
     }
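Not part of the diff: the bracket-to-parenthesis churn in this file follows the Array interface change at the top of this commit, where operator[] became the constexpr read-only accessor and operator() returns a mutable reference for writes. A standalone sketch of that convention, with names hypothetical:

#include <cstdio>

template <class TData, int NSize>
struct ArraySketch
{
    TData mData[NSize];

    constexpr TData operator[](int i) const { return mData[i]; } // read path
    TData& operator()(int i) { return mData[i]; }                // write path
};

int main()
{
    ArraySketch<int, 3> a{{1, 2, 3}};

    a(1) = 20;            // writes go through operator()
    printf("%d\n", a[1]); // reads go through operator[] -> 20
}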
...
 #pragma once
+#include "base.hip.hpp"
 #include "vector_type.hip.hpp"
 #include "integral_constant.hip.hpp"
 #include "Sequence.hip.hpp"
@@ -10,109 +11,3 @@
 #if USE_AMD_INLINE_ASM
 #include "amd_inline_asm.hip.hpp"
 #endif
-
-__device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
-
-__device__ index_t get_block_1d_id() { return blockIdx.x; }
-
-template <class T1, class T2>
-struct is_same
-{
-    static constexpr bool value = false;
-};
-
-template <class T>
-struct is_same<T, T>
-{
-    static constexpr bool value = true;
-};
-
-template <class X, class Y>
-__host__ __device__ constexpr bool is_same_type(X, Y)
-{
-    return is_same<X, Y>::value;
-}
-
-namespace mod_conv { // namespace mod_conv
-
-template <class T, T s>
-struct scales
-{
-    __host__ __device__ constexpr T operator()(T a) const { return s * a; }
-};
-
-template <class T>
-struct plus
-{
-    __host__ __device__ constexpr T operator()(T a, T b) const { return a + b; }
-};
-
-template <class T>
-struct multiplies
-{
-    __host__ __device__ constexpr T operator()(T a, T b) const { return a * b; }
-};
-
-template <class T>
-struct integer_divide_ceiler
-{
-    __host__ __device__ constexpr T operator()(T a, T b) const
-    {
-        static_assert(is_same<T, index_t>::value || is_same<T, int>::value, "wrong type");
-        return (a + b - 1) / b;
-    }
-};
-
-template <class T>
-__host__ __device__ constexpr T integer_divide_ceil(T a, T b)
-{
-    static_assert(is_same<T, index_t>::value || is_same<T, int>::value, "wrong type");
-    return (a + b - 1) / b;
-}
-
-template <class T>
-__host__ __device__ constexpr T max(T x, T y)
-{
-    return x > y ? x : y;
-}
-
-template <class T, class... Ts>
-__host__ __device__ constexpr T max(T x, Ts... xs)
-{
-    static_assert(sizeof...(xs) > 0, "not enough argument");
-    auto y = max(xs...);
-    static_assert(is_same<decltype(y), T>::value, "not the same type");
-    return x > y ? x : y;
-}
-
-template <class T>
-__host__ __device__ constexpr T min(T x, T y)
-{
-    return x < y ? x : y;
-}
-
-template <class T, class... Ts>
-__host__ __device__ constexpr T min(T x, Ts... xs)
-{
-    static_assert(sizeof...(xs) > 0, "not enough argument");
-    auto y = min(xs...);
-    static_assert(is_same<decltype(y), T>::value, "not the same type");
-    return x < y ? x : y;
-}
-
-// this is wrong
-// TODO: implement correct least common multiple, instead of calling max()
-template <class T, class... Ts>
-__host__ __device__ constexpr T least_common_multiple(T x, Ts... xs)
-{
-    return max(x, xs...);
-}
-
-} // namespace mod_conv
@@ -19,18 +19,7 @@ struct swallow
     }
 };

-#if 0 // Emulate if constexpr
-template<class F>
-__host__ __device__ constexpr auto unpacker(F f)
-{
-    return [=](auto xs_array){ f(xs...); };
-}
-#endif
-
-// Emulate compile time if statement for C++14
-// Get the idea from
-// "https://baptiste-wicht.com/posts/2015/07/simulate-static_if-with-c11c14.html"
-// TODO: use if constexpr, when C++17 is supported
 template <bool Predicate>
 struct static_if
 {
@@ -81,28 +70,3 @@ struct static_if<false>
         return Type{};
     }
 };
-
-template <index_t NLoop>
-struct static_const_reduce_n
-{
-    // signature of F: F(Number<I>)
-    template <class F, class Reduce>
-    __host__ __device__ constexpr auto operator()(F f, Reduce r) const
-    {
-        static_assert(NLoop > 1, "out-of-range");
-        constexpr auto a = f(Number<NLoop - 1>{});
-        auto b = static_const_reduce_n<NLoop - 1>{}(f, r); // TODO: cannot use constexpr here, weird
-        return r(a, b);
-    }
-};
-
-template <>
-struct static_const_reduce_n<1>
-{
-    template <class F, class Reduce>
-    __host__ __device__ constexpr auto operator()(F f, Reduce) const
-    {
-        return f(Number<0>{});
-    }
-};
@@ -2,29 +2,16 @@
 #include "functional.hip.hpp"
 #include "Sequence.hip.hpp"

-#if 0
-template <index_t Iter, index_t Remaining, index_t Increment>
-struct static_for_impl
-{
-    template <class F>
-    constexpr __host__ __device__ void operator()(F f) const
-    {
-        static_assert(Remaining % Increment == 0, "wrong! Remaining % Increment != 0");
-        static_assert(Increment <= Remaining, "will go out-of-range");
-
-        f(Number<Iter>{});
-
-        static_for_impl<Iter + Increment, Remaining - Increment, Increment>{}(f);
-    }
-};
-
-template <index_t Iter, index_t Increment>
-struct static_for_impl<Iter, 0, Increment>
-{
-    template <class F>
-    constexpr __host__ __device__ void operator()(F) const
-    {
-        // no work left, just return
-        return;
-    }
-};
+template <class>
+struct static_for_impl;
+
+template <index_t... Is>
+struct static_for_impl<Sequence<Is...>>
+{
+    template <class F>
+    __host__ __device__ constexpr void operator()(F f) const
+    {
+        swallow{(f(Number<Is>{}), 0)...};
+    }
+};

@@ -33,48 +20,42 @@ template <index_t NBegin, index_t NEnd, index_t Increment>
 struct static_for
 {
     template <class F>
-    constexpr __host__ __device__ void operator()(F f) const
+    __host__ __device__ constexpr void operator()(F f) const
     {
         static_assert(NBegin <= NEnd, "wrongs! should have NBegin <= NEnd");
         static_assert((NEnd - NBegin) % Increment == 0,
                       "Wrong! should satisfy (NEnd - NBegin) % Increment == 0");

-#if 0
-        static_if<(NBegin < NEnd)>{}(
-            [&](auto fwd) { static_for_impl<NBegin, NEnd - NBegin, fwd(Increment)>{}(f); });
-#else
-        static_for_impl<NBegin, NEnd - NBegin, Increment>{}(f);
-#endif
+        static_for_impl<typename arithmetic_sequence_gen<NBegin, NEnd, Increment>::SeqType>{}(f);
     }
 };
-#else
-template <class>
-struct static_for_impl;
-
-template <index_t... Is>
-struct static_for_impl<Sequence<Is...>>
-{
-    template <class F>
-    __host__ __device__ constexpr void operator()(F f) const
-    {
-        swallow{(f(Number<Is>{}), 0)...};
-    }
-};
-
-// F signature: F(Number<Iter>)
-template <index_t NBegin, index_t NEnd, index_t Increment>
-struct static_for
-{
-    template <class F>
-    __host__ __device__ constexpr void operator()(F f) const
-    {
-        static_assert(NBegin <= NEnd, "wrongs! should have NBegin <= NEnd");
-        static_assert((NEnd - NBegin) % Increment == 0,
-                      "Wrong! should satisfy (NEnd - NBegin) % Increment == 0");
-        static_for_impl<typename arithmetic_sequence_gen<NBegin, NEnd, Increment>::SeqType>{}(f);
-    }
-};
-#endif
+
+template <class Seq, class Reduce>
+struct lambda_accumulate_on_sequence
+{
+    const Reduce& f;
+    index_t& result;
+
+    __host__ __device__ constexpr lambda_accumulate_on_sequence(const Reduce& f_, index_t& result_)
+        : f(f_), result(result_)
+    {
+    }
+
+    template <class IDim>
+    __host__ __device__ constexpr index_t operator()(IDim) const
+    {
+        return result = f(result, Seq::Get(IDim{}));
+    }
+};
+
+template <class Seq, class Reduce, index_t Init>
+__host__ __device__ constexpr index_t
+accumulate_on_sequence(Seq, Reduce f, Number<Init> /*initial_value*/)
+{
+    index_t result = Init;
+
+    static_for<0, Seq::mSize, 1>{}(lambda_accumulate_on_sequence<Seq, Reduce>(f, result));
+
+    return result;
+}
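Not part of the diff: a standalone sketch of the swallow pack-expansion trick the new static_for_impl relies on. The comma operator turns each void call into an int so the whole pack can initialize a dummy aggregate, and a braced-init-list guarantees left-to-right evaluation, so no recursion is needed:

#include <cstdio>

struct swallow
{
    template <class... Ts>
    constexpr swallow(Ts&&...)
    {
    }
};

template <int I>
struct Number
{
};

template <int I>
void body(Number<I>)
{
    printf("%d ", I); // the per-index work
}

template <int... Is>
void call_for_each()
{
    swallow{(body(Number<Is>{}), 0)...};
}

int main()
{
    call_for_each<0, 1, 2, 3>(); // prints: 0 1 2 3
}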
@@ -11,7 +11,7 @@ struct static_ford_impl
     // F signature: F(Sequence<...> multi_id)
     // CurrentMultiIndex: Sequence<...>
     template <class F, class CurrentMultiIndex>
-    __host__ __device__ void operator()(F f, CurrentMultiIndex) const
+    __host__ __device__ constexpr void operator()(F f, CurrentMultiIndex) const
     {
         static_assert(RemainLengths::GetSize() > 0, "wrong! should not get here");
@@ -28,7 +28,7 @@ struct static_ford_impl<Sequence<>>
     // F signature: F(Sequence<...> multi_id)
     // CurrentMultiIndex: Sequence<...>
     template <class F, class CurrentMultiIndex>
-    __host__ __device__ void operator()(F f, CurrentMultiIndex) const
+    __host__ __device__ constexpr void operator()(F f, CurrentMultiIndex) const
     {
         f(CurrentMultiIndex{});
     }
@@ -40,7 +40,7 @@ struct static_ford
 {
     // F signature: F(Sequence<...> multi_id)
     template <class F>
-    __host__ __device__ void operator()(F f) const
+    __host__ __device__ constexpr void operator()(F f) const
     {
         static_assert(Lengths::GetSize() > 0, "wrong! Lengths is empty");
@@ -55,7 +55,7 @@ struct ford_impl
     // CurrentMultiIndex: Array<...>
     // RemainLengths: Sequence<...>
     template <class F, class CurrentMultiIndex, class RemainLengths>
-    __host__ __device__ void
+    __host__ __device__ constexpr void
     operator()(F f, CurrentMultiIndex current_multi_id, RemainLengths) const
     {
         static_assert(RemainLengths::GetSize() == RemainDim, "wrong!");
@@ -77,7 +77,7 @@ struct ford_impl<1>
     // CurrentMultiIndex: Array<...>
     // RemainLengths: Sequence<...>
     template <class F, class CurrentMultiIndex, class RemainLengths>
-    __host__ __device__ void
+    __host__ __device__ constexpr void
    operator()(F f, CurrentMultiIndex current_multi_id, RemainLengths) const
     {
         static_assert(RemainLengths::GetSize() == 1, "wrong!");
@@ -97,7 +97,7 @@ struct ford
 {
     // F signature: F(Array<...> multi_id)
     template <class F>
-    __host__ __device__ void operator()(F f) const
+    __host__ __device__ constexpr void operator()(F f) const
     {
         constexpr index_t first_length = Lengths{}.Front();
...
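Editor's note: the `constexpr` qualification added throughout this file lets the loop helpers run inside other constant expressions. Semantically, `ford<Lengths>`/`static_ford<Lengths>` just visit every multi-index of a box in row-major order, with the bounds known at compile time. A hypothetical runtime analogue for the 2-D case (`ford_2d` and the use of `std::array` in place of the repo's `Array` are illustrative, not the repo's API):

#include <array>
#include <cstdio>

// Runtime analogue of ford<Sequence<L0, L1>>{}(f): call f on every
// multi-index {i0, i1} in row-major order. The real helper unrolls the
// nest at compile time because L0 and L1 are template constants.
template <class F>
void ford_2d(int L0, int L1, F f)
{
    for(int i0 = 0; i0 < L0; ++i0)
        for(int i1 = 0; i1 < L1; ++i1)
            f(std::array<int, 2>{{i0, i1}});
}

int main()
{
    // Prints {0,0} {0,1} {0,2} {1,0} {1,1} {1,2}
    ford_2d(2, 3, [](std::array<int, 2> id) { std::printf("{%d,%d} ", id[0], id[1]); });
    std::printf("\n");
    return 0;
}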
@@ -103,7 +103,7 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
 
         // tensor view of blockwise input and weight in LDS
         // be careful of alignment
-        constexpr index_t max_align = mod_conv::max(InBlockCopyDataPerRead_N,
+        constexpr index_t max_align = mod_conv::lcm(InBlockCopyDataPerRead_N,
                                                     WeiBlockCopyDataPerRead_K,
                                                     GemmDataPerReadA,
                                                     GemmDataPerReadB);
@@ -119,11 +119,11 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
 
         constexpr auto wei_cyx_k_block_desc = make_ConstantTensorDescriptor_aligned(
             Sequence<CPerBlock * Y * X, KPerBlock>{},
-            Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
+            Number<mod_conv::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
 
         constexpr auto wei_c_y_x_k_block_desc = make_ConstantTensorDescriptor_aligned(
             Sequence<CPerBlock, Y, X, KPerBlock>{},
-            Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
+            Number<mod_conv::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
 
         // tensor view of threadwise output in register
         constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor(
...
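Editor's note: the max → lcm switch in this and every following gridwise kernel is the substantive fix of this merge. An LDS offset aligned to max(a, b) is not necessarily aligned to both a and b, whereas lcm(a, b) satisfies every requirement at once; for powers of two (the usual case here) the two coincide, so the change matters exactly when one width does not divide the other. (The variable keeps its historical max_align name.) A hedged sketch of the variadic helper this implies — gcd_/lcm_ are stand-in names, not the repo's mod_conv implementation:

#include <cstdio>

// Example: a = 4, b = 6 gives max = 6, but 6 is not a multiple of 4;
// lcm(4, 6) = 12 is aligned to both.
constexpr int gcd_(int a, int b) { return b == 0 ? a : gcd_(b, a % b); }
constexpr int lcm_(int a, int b) { return a / gcd_(a, b) * b; }

// Variadic form, folding left over the argument list.
template <class... Ts>
constexpr int lcm_(int a, int b, Ts... rest)
{
    return lcm_(lcm_(a, b), rest...);
}

int main()
{
    static_assert(lcm_(4, 6) == 12, "aligned to both 4 and 6");
    std::printf("lcm(4, 2, 4, 4) = %d\n", lcm_(4, 2, 4, 4)); // 4: powers of two, same as max
    return 0;
}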
@@ -104,7 +104,7 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
 
         // LDS tensor view
         // be careful of alignment
-        constexpr index_t max_align = mod_conv::max(InBlockCopyDataPerRead_N,
+        constexpr index_t max_align = mod_conv::lcm(InBlockCopyDataPerRead_N,
                                                     WeiBlockCopyDataPerRead_K,
                                                     GemmDataPerReadA,
                                                     GemmDataPerReadB);
@@ -120,7 +120,7 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
 
         constexpr auto wei_c_x_k_block_desc = make_ConstantTensorDescriptor_aligned(
             Sequence<CPerBlock, X, KPerBlock>{},
-            Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
+            Number<mod_conv::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
 
         // tensor view of threadwise output in register
         constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor(
...
@@ -108,7 +108,7 @@ struct GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn
 
         // LDS tensor view
         // be careful of alignment
-        constexpr index_t max_align = mod_conv::max(InBlockReorderDataPerWrite_N,
+        constexpr index_t max_align = mod_conv::lcm(InBlockReorderDataPerWrite_N,
                                                     WeiBlockCopyDataPerRead_K,
                                                     GemmDataPerReadA,
                                                     GemmDataPerReadB);
...
@@ -99,7 +99,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
 
         // LDS tensor view
         // be careful of alignment
-        constexpr index_t max_align = mod_conv::max(InBlockCopyDataPerRead_N,
+        constexpr index_t max_align = mod_conv::lcm(InBlockCopyDataPerRead_N,
                                                     WeiBlockCopyDataPerRead_K,
                                                     GemmDataPerReadA,
                                                     GemmDataPerReadB);
@@ -115,7 +115,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
 
         constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
             Sequence<CPerBlock, KPerBlock>{},
-            Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
+            Number<mod_conv::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
 
         // tensor view of threadwise output in register
         constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor(
...
@@ -104,7 +104,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
 
         // LDS tensor view
         // be careful of alignment
-        constexpr index_t max_align = mod_conv::max(InBlockCopyDataPerRead_N,
+        constexpr index_t max_align = mod_conv::lcm(InBlockCopyDataPerRead_N,
                                                     WeiBlockCopyDataPerRead_K,
                                                     GemmDataPerReadA,
                                                     GemmDataPerReadB);
@@ -120,7 +120,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
 
         constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
             Sequence<CPerBlock, KPerBlock>{},
-            Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
+            Number<mod_conv::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
 
         // tensor view of threadwise output in register
         constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed(
...
@@ -106,7 +106,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
 
         // LDS tensor view
         // be careful of alignment
-        constexpr index_t max_align = mod_conv::max(InBlockReorderDataPerWrite_N,
+        constexpr index_t max_align = mod_conv::lcm(InBlockReorderDataPerWrite_N,
                                                     WeiBlockCopyDataPerRead_K,
                                                     GemmDataPerReadA,
                                                     GemmDataPerReadB);
@@ -122,7 +122,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
 
         constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
             Sequence<CPerBlock, KPerBlock>{},
-            Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
+            Number<mod_conv::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
 
         // tensor view of threadwise output in register
         constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed(
...
@@ -105,7 +105,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
 
         // LDS tensor view
         // be careful of alignment
-        constexpr index_t max_align = mod_conv::max(InBlockReorderDataPerWrite_N,
+        constexpr index_t max_align = mod_conv::lcm(InBlockReorderDataPerWrite_N,
                                                     WeiBlockCopyDataPerRead_K,
                                                     GemmDataPerReadA,
                                                     GemmDataPerReadB);
@@ -121,7 +121,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
 
         constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
             Sequence<CPerBlock, KPerBlock>{},
-            Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
+            Number<mod_conv::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
 
         // tensor view of threadwise output in register
         constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed(
...
@@ -104,7 +104,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn
 
         // LDS tensor view
         // be careful of alignment
-        constexpr index_t max_align = mod_conv::max(InBlockReorderDataPerWrite_N,
+        constexpr index_t max_align = mod_conv::lcm(InBlockReorderDataPerWrite_N,
                                                     WeiBlockCopyDataPerRead_K,
                                                     GemmDataPerReadA,
                                                     GemmDataPerReadB);
@@ -120,7 +120,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn
 
         constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
             Sequence<CPerBlock, KPerBlock>{},
-            Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
+            Number<mod_conv::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
 
         // tensor view of threadwise output in register
         constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor(
...
@@ -103,7 +103,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
 
         // LDS tensor view
         // be careful of alignment
-        constexpr index_t max_align = mod_conv::max(InBlockReorderDataPerWrite_N,
+        constexpr index_t max_align = mod_conv::lcm(InBlockReorderDataPerWrite_N,
                                                     WeiBlockCopyDataPerRead_K,
                                                     GemmDataPerReadA,
                                                     GemmDataPerReadB);
@@ -119,7 +119,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
 
         constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
             Sequence<CPerBlock, KPerBlock>{},
-            Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
+            Number<mod_conv::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
 
         // tensor view of threadwise output in register
         constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed(
...
@@ -181,7 +181,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
 
         // LDS: be careful of alignment
         constexpr index_t max_align =
-            mod_conv::max(index_t(4), InBlockCopyDataPerRead, WeiBlockCopyDataPerRead);
+            mod_conv::lcm(index_t(4), InBlockCopyDataPerRead, WeiBlockCopyDataPerRead);
 
         constexpr index_t in_block_space = in_cb_block_desc.GetElementSpace(Number<max_align>{});
...
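Editor's note: `GetElementSpace(Number<max_align>{})` then sizes the LDS allocation under that combined alignment. A hedged sketch of the idea only — `round_up` and `aligned_element_space` are illustrative names, not the repo's API, and the real descriptors generalize this to arbitrary rank:

#include <cstdio>

// For a row-major [M, N] block: pad the row stride to a multiple of the
// alignment so every row starts on a boundary all vector widths can use,
// then size the buffer from the padded stride.
constexpr int round_up(int x, int align) { return (x + align - 1) / align * align; }

constexpr int aligned_element_space(int M, int N, int align)
{
    return M * round_up(N, align);
}

int main()
{
    // e.g. an 8 x 150 block under alignment lcm(4, 2, 4, 4) = 4:
    std::printf("%d elements\n", aligned_element_space(8, 150, 4)); // 8 * 152 = 1216
    return 0;
}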