"include/vscode:/vscode.git/clone" did not exist on "1c8126a4c2372530db822c28fe6d2a4eb8f3998b"
Commit 52423948 authored by Jehandad Khan

Merge branch 'master' into jd_redux

parents b97af4ec 98a2cfcc
@@ -6,7 +6,7 @@
 namespace ck {
 template <class Lengths>
-__host__ __device__ constexpr auto calculate_tensor_strides_packed(Lengths)
+__host__ __device__ constexpr auto calculate_tensor_strides_packed_old(Lengths)
 {
     return reverse_inclusive_scan_sequence(
                Lengths{}.PopFront(), math::multiplies<index_t>{}, Number<1>{})
@@ -14,12 +14,12 @@ __host__ __device__ constexpr auto calculate_tensor_strides_packed(Lengths)
 }
 template <class Lengths, index_t Align>
-__host__ __device__ constexpr auto calculate_tensor_strides_aligned(Lengths, Number<Align>)
+__host__ __device__ constexpr auto calculate_tensor_strides_aligned_old(Lengths, Number<Align>)
 {
     constexpr index_t L_back_align =
         Align * math::integer_divide_ceiler<index_t>{}(Lengths{}.Back(), Align);
-    return calculate_tensor_strides_packed(
+    return calculate_tensor_strides_packed_old(
         Lengths{}.Modify(Number<Lengths{}.GetSize() - 1>{}, Number<L_back_align>{}));
 }
@@ -96,13 +96,12 @@ struct ConstantTensorDescriptor
     __host__ __device__ static constexpr auto GetElementSize()
     {
-        return Number<accumulate_on_sequence(
-            Lengths{}, math::multiplies<index_t>{}, Number<1>{})>{};
+        return Number<reduce_on_sequence(Lengths{}, math::multiplies<index_t>{}, Number<1>{})>{};
     }
     __host__ __device__ static constexpr auto GetElementSpace()
     {
-        constexpr index_t element_space_unaligned = accumulate_on_sequence(
+        constexpr index_t element_space_unaligned = reduce_on_sequence(
             (GetLengths() - Number<1>{}) * GetStrides(), math::plus<index_t>{}, Number<1>{});
         return Number<element_space_unaligned>{};
@@ -155,7 +154,7 @@ struct ConstantTensorDescriptor
         constexpr auto multi_id = Sequence<Is...>{};
-        return Number<accumulate_on_sequence(
+        return Number<reduce_on_sequence(
             multi_id * GetStrides(), math::plus<index_t>{}, Number<0>{})>{};
     }
@@ -178,7 +177,7 @@ struct ConstantTensorDescriptor
        {
            constexpr auto IDim = IDim_{};
            constexpr index_t stride = PackedStrides::Get(IDim);
-           multi_id.Set(IDim, id / stride);
+           multi_id(IDim) = id / stride;
            id -= multi_id[IDim] * stride;
        }
    };
@@ -187,12 +186,12 @@ struct ConstantTensorDescriptor
    {
        Array<index_t, nDim> multi_id;
-       using PackedStrides = decltype(calculate_tensor_strides_packed(GetLengths()));
+       using PackedStrides = decltype(calculate_tensor_strides_packed_old(GetLengths()));
        // calculate index in each of the dimensions in the order of their dimension
        static_for<0, nDim - 1, 1>{}(lambda_GetMultiIndexFrom1dIndex<PackedStrides>(id, multi_id));
-       multi_id.Set(Number<nDim - 1>{}, id / PackedStrides::Get(Number<nDim - 1>{}));
+       multi_id(Number<nDim - 1>{}) = id / PackedStrides::Get(Number<nDim - 1>{});
        return multi_id;
    }
@@ -204,7 +203,7 @@ struct ConstantTensorDescriptor
    }
    // This function doesn't do carry check on the highest dimension for positive stepping (or
-   // borrow check on the lowest dimension for negative stepping) , for performance reason. It is
+   // borrow check on the highest dimension for negative stepping) , for performance reason. It is
    // the user's responsibility to make sure the result "new_mutli_id" is not out-of-bound on the
    // highest dimension for positive stepping (or on the lowest dimension for negative stepping)
    template <bool PositiveDirection>
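The carry/borrow behaviour described in the comment above (carries propagate upward on positive steps, borrows on negative steps, and the highest dimension is deliberately left unchecked) is easier to see with concrete numbers. Below is a minimal standalone sketch of the positive-direction carry pass, using plain std::array instead of the library's Array/Sequence types; the function name and the 2x4 example are illustrative only.

// Standalone sketch (not part of the commit): carry propagation with the highest
// dimension (index 0) left unchecked, as the comment above describes.
#include <array>
#include <cstddef>
#include <cstdio>

template <std::size_t N>
void step_positive(std::array<int, N>& idx,
                   const std::array<int, N>& lengths,
                   const std::array<int, N>& step)
{
    for(std::size_t i = 0; i < N; ++i)
        idx[i] += step[i];

    // propagate carries from the lowest dimension upward; dimension 0 is not range-checked
    for(std::size_t i = N - 1; i > 0; --i)
    {
        if(idx[i] >= lengths[i])
        {
            idx[i] -= lengths[i];
            ++idx[i - 1];
        }
    }
}

int main()
{
    std::array<int, 2> idx{0, 3};
    step_positive<2>(idx, {2, 4}, {0, 1}); // {0, 3} + {0, 1} -> carry -> {1, 0}
    std::printf("%d %d\n", idx[0], idx[1]);
    return 0;
}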
@@ -304,14 +303,73 @@ struct ConstantTensorDescriptor
                                           GetStrides().PushBack(leaf_tensor::GetStrides()))>{};
    }
+   template <index_t IDimVector, index_t DataPerVector>
+   struct lambda_IsVectorizationAllowed
+   {
+       bool& is_allowed;
+       __host__ __device__ constexpr lambda_IsVectorizationAllowed(bool& is_allowed_)
+           : is_allowed(is_allowed_)
+       {
+       }
+       template <index_t IDim_>
+       __host__ __device__ constexpr void operator()(Number<IDim_>) const
+       {
+           constexpr auto IDim = Number<IDim_>{};
+           if(IDimVector != IDim && Strides::Get(IDim) % DataPerVector != 0)
+           {
+               is_allowed = false;
+           }
+       }
+   };
+   template <index_t IDimVector, index_t DataPerVector>
+   __host__ __device__ static constexpr bool IsVectorizationAllowed(Number<IDimVector>,
+                                                                    Number<DataPerVector>)
+   {
+       bool is_allowed = (Strides{}[IDimVector] == 1 || DataPerVector == 1) &&
+                         Lengths{}[IDimVector] % DataPerVector == 0;
+       static_for<0, nDim, 1>{}(
+           lambda_IsVectorizationAllowed<IDimVector, DataPerVector>{is_allowed});
+       return is_allowed;
+   }
+   template <index_t IDim, index_t DataPerVector>
+   __host__ __device__ static constexpr auto Vectorize(Number<IDim>, Number<DataPerVector>)
+   {
+       constexpr auto idim = Number<IDim>{};
+       constexpr auto data_per_vector = Number<DataPerVector>{};
+       static_assert(IsVectorizationAllowed(idim, data_per_vector), "wrong!");
+       using vectorized_lengths =
+           decltype(Lengths::Modify(Number<IDim>{}, Number<Lengths{}[IDim] / DataPerVector>{}));
+       using vectorized_strides =
+           decltype((Strides{} / Number<DataPerVector>{}).Modify(Number<IDim>{}, Number<1>{}));
+       return ConstantTensorDescriptor<vectorized_lengths, vectorized_strides>{};
+   }
    template <index_t IDim, index_t SliceLen>
    __host__ __device__ static constexpr auto Slice(Number<IDim>, Number<SliceLen>)
    {
-       using slice_lengths = decltype(Lengths{}.Modify(Number<IDim>{}, Number<SliceLen>{}));
+       using slice_lengths = decltype(Lengths::Modify(Number<IDim>{}, Number<SliceLen>{}));
        return ConstantTensorDescriptor<slice_lengths, Strides>{};
    }
+   template <index_t... Is>
+   __host__ __device__ static constexpr auto Slice(Sequence<Is...> slice_lengths)
+   {
+       static_assert(slice_lengths.GetSize() == nDim, "wrong!");
+       return ConstantTensorDescriptor<decltype(slice_lengths), Strides>{};
+   }
    template <index_t IDim, index_t SliceLength, index_t SliceStride>
    __host__ __device__ static constexpr auto
    StridedSlice(Number<IDim>, Number<SliceLength>, Number<SliceStride>)
@@ -330,7 +388,7 @@ struct ConstantTensorDescriptor
        constexpr auto fold_intervals = Sequence<FoldIntervals...>{};
        constexpr index_t fold_intervals_product =
-           accumulate_on_sequence(fold_intervals, math::multiplies<index_t>{}, Number<1>{});
+           reduce_on_sequence(fold_intervals, math::multiplies<index_t>{}, Number<1>{});
        constexpr auto unfold_length = GetLength(Number<IDim>{});
        constexpr auto unfold_stride = GetStride(Number<IDim>{});
@@ -388,7 +446,7 @@ struct ConstantTensorDescriptor
        static_assert(Type::Extract(middle).AreDimensionsContinuous(), "wrong! not unfoldable");
        // unfolded length, stride
-       constexpr index_t unfold_length = accumulate_on_sequence(
+       constexpr index_t unfold_length = reduce_on_sequence(
            GetLengths().Extract(middle), math::multiplies<index_t>{}, Number<1>{});
        constexpr index_t unfold_stride = GetStride(Number<LastUnfoldDim>{});
@@ -409,7 +467,7 @@ struct ConstantTensorDescriptor
    __host__ __device__ static constexpr auto Pack()
    {
-       using packed_strides = decltype(calculate_tensor_strides_packed(Lengths{}));
+       using packed_strides = decltype(calculate_tensor_strides_packed_old(Lengths{}));
        return ConstantTensorDescriptor<Lengths, packed_strides>{};
    }
@@ -431,7 +489,7 @@ struct ConstantTensorDescriptor
 template <class Lengths>
 __host__ __device__ constexpr auto make_ConstantTensorDescriptor_packed(Lengths)
 {
-    using Strides = decltype(calculate_tensor_strides_packed(Lengths{}));
+    using Strides = decltype(calculate_tensor_strides_packed_old(Lengths{}));
     return ConstantTensorDescriptor<Lengths, Strides>{};
 }
@@ -444,7 +502,7 @@ __host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Stride
 template <class Lengths, index_t Align>
 __host__ __device__ constexpr auto make_ConstantTensorDescriptor_aligned(Lengths, Number<Align>)
 {
-    using Strides = decltype(calculate_tensor_strides_aligned(Lengths{}, Number<Align>{}));
+    using Strides = decltype(calculate_tensor_strides_aligned_old(Lengths{}, Number<Align>{}));
     return ConstantTensorDescriptor<Lengths, Strides>{};
 }
...
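For reference, the helper renamed above (calculate_tensor_strides_packed_old, now built on reduce_on_sequence instead of accumulate_on_sequence) implements the usual packed-stride rule: each stride is the product of all lengths to its right, i.e. a reverse inclusive scan of the lengths under multiplication. A minimal standalone sketch of that rule, with illustrative names and a 2x3x4 example (this is not the library code):

// Standalone sketch: stride[i] = product of lengths[i+1..N-1], stride[N-1] = 1.
#include <array>
#include <cstddef>
#include <cstdio>

template <std::size_t N>
std::array<int, N> packed_strides(const std::array<int, N>& lengths)
{
    std::array<int, N> strides{};
    int running = 1;
    for(std::size_t r = 0; r < N; ++r)
    {
        const std::size_t i = N - 1 - r; // fill from the rightmost (fastest-varying) dimension
        strides[i] = running;
        running *= lengths[i];
    }
    return strides;
}

int main()
{
    const auto s = packed_strides<3>({2, 3, 4}); // expect {12, 4, 1}
    std::printf("%d %d %d\n", s[0], s[1], s[2]);
    return 0;
}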
#ifndef CK_DIMENSION_HPP
#define CK_DIMENSION_HPP
#include "common_header.hpp"
namespace ck {
template <index_t Length>
struct Dimension
{
__host__ __device__ static constexpr auto GetLength() { return Number<Length>{}; }
};
template <index_t Length, index_t Stride>
struct NativeDimension
{
__host__ __device__ static constexpr auto GetLength() { return Number<Length>{}; }
__host__ __device__ static constexpr auto GetStride() { return Number<Stride>{}; }
__host__ __device__ static constexpr index_t CalculateOffset(index_t i) { return i * Stride; }
__host__ __device__ static constexpr index_t CalculateOffsetDiff(index_t i_diff)
{
return i_diff * Stride;
}
};
} // namespace ck
#endif
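NativeDimension above pairs a compile-time length with a compile-time stride, and its entire job is the linear rule offset = index * Stride (and the same rule for index differences). A standalone sketch of that rule with illustrative names (DimSketch, W, H), assuming a packed 3x4 layout; it is not the library type:

// Standalone sketch: a dimension contributes index * stride to the linear offset.
#include <cstdio>

template <int Length, int Stride>
struct DimSketch
{
    static constexpr int CalculateOffset(int i) { return i * Stride; }
    static constexpr int CalculateOffsetDiff(int i_diff) { return i_diff * Stride; }
};

int main()
{
    using W = DimSketch<4, 1>; // innermost dimension: length 4, stride 1
    using H = DimSketch<3, 4>; // next dimension: length 3, stride 4 (packed on top of W)
    std::printf("%d\n", H::CalculateOffset(2) + W::CalculateOffset(3)); // 2*4 + 3*1 = 11
    return 0;
}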
#ifndef CK_MULTI_INDEX_TRANSFORM_HPP
#define CK_MULTI_INDEX_TRANSFORM_HPP
#include "common_header.hpp"
namespace ck {
template <index_t N>
using MultiIndex = Array<index_t, N>;
template <typename... Xs>
__host__ __device__ constexpr auto make_multi_index(Xs... xs)
{
return MultiIndex<sizeof...(Xs)>(xs...);
}
template <index_t Length>
struct PassThrough
{
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<1>;
__host__ __device__ static constexpr auto GetNumOfLowerDimension() { return Number<1>{}; }
__host__ __device__ static constexpr auto GetNumOfUpperDimension() { return Number<1>{}; }
__host__ __device__ static constexpr auto GetUpperLengths() { return Sequence<Length>{}; }
__host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
{
return idx_up;
}
__host__ __device__ static constexpr auto
CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
const UpperIndex& /* idx_up_old */,
const LowerIndex& /* idx_low_old */)
{
return idx_up_diff;
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool
IsUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */)
{
return true;
}
};
// LowerLengths: Sequence<...>
template <typename LowerLengths, typename LeftPads, typename RightPads>
struct Pad
{
static constexpr index_t nDim = LowerLengths::Size();
using LowerIndex = MultiIndex<nDim>;
using UpperIndex = MultiIndex<nDim>;
__host__ __device__ static constexpr auto GetNumOfLowerDimension() { return Number<nDim>{}; }
__host__ __device__ static constexpr auto GetNumOfUpperDimension() { return Number<nDim>{}; }
__host__ __device__ static constexpr auto GetUpperLengths()
{
return LowerLengths{} + LeftPads{} + RightPads{};
}
__host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
{
return idx_up - LeftPads{};
}
__host__ __device__ static constexpr auto
CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
const UpperIndex& /* idx_up_old */,
const LowerIndex& /* idx_low_old */)
{
return idx_up_diff;
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ constexpr bool
IsUpperIndexMappedToValidLowerIndex(const UpperIndex& idx_up) const
{
#if 0
struct lambda_no_pad
{
__host__ __device__ constexpr bool operator()(index_t x) const { return x == 0; }
};
if(sequence_all_of(LeftPads{}, lambda_no_pad{}) &&
sequence_all_of(RightPads{}, lambda_no_pad{}))
{
return true;
}
else
#endif
{
bool flag = true;
static_for<0, nDim, 1>{}([&](auto idim) {
// only check if there is left-padding
static_if<(LeftPads::At(idim) != 0)>{}(
[&](auto) { flag = flag && idx_up[idim] >= LeftPads::At(idim); });
// only check if there is right-padding
static_if<(RightPads::At(idim) != 0)>{}([&](auto) {
flag = flag && (idx_up[idim] < LeftPads::At(idim) + LowerLengths::At(idim));
});
});
return flag;
}
}
};
// LowerLengths: Sequence<...>
template <typename LowerLengths>
struct Merge
{
static constexpr index_t nDimLow = LowerLengths::Size();
static constexpr index_t nDimUp = 1;
using LowerIndex = MultiIndex<nDimLow>;
using UpperIndex = MultiIndex<nDimUp>;
__host__ __device__ static constexpr auto GetNumOfLowerDimension() { return Number<nDimLow>{}; }
__host__ __device__ static constexpr auto GetNumOfUpperDimension() { return Number<nDimUp>{}; }
__host__ __device__ static constexpr auto GetUpperLengths()
{
return Sequence<reduce_on_sequence(
LowerLengths{}, math::multiplies<index_t>{}, Number<1>{})>{};
}
// emulate constexpr lambda
template <typename PseudoLowStrides>
struct lambda_CalculateLowerIndex
{
index_t& itmp;
LowerIndex& idx_low;
__host__ __device__ explicit constexpr lambda_CalculateLowerIndex(index_t& itmp_,
LowerIndex& idx_low_)
: itmp(itmp_), idx_low(idx_low_)
{
}
template <typename IDim>
__host__ __device__ constexpr void operator()(IDim idim) const
{
constexpr index_t stride = PseudoLowStrides::At(idim);
idx_low(idim) = itmp / stride;
itmp -= idx_low[idim] * stride;
}
};
__host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
{
LowerIndex idx_low;
index_t itmp = idx_up[0];
constexpr auto pseudo_low_strides =
reverse_inclusive_scan_sequence(
LowerLengths::PopFront(), math::multiplies<index_t>{}, Number<1>{})
.PushBack(Number<1>{});
static_for<0, nDimLow - 1, 1>{}(
lambda_CalculateLowerIndex<decltype(pseudo_low_strides)>(itmp, idx_low));
idx_low(nDimLow - 1) = itmp / pseudo_low_strides[nDimLow - 1];
return idx_low;
}
// idx_low_diff depends on idx_low_old, so idx_low needs to be up-to-date
// If idx_up_diff is known at compile-time, many calculations can be optimized
// away by the compiler
// This function assumes idx_low_old is not out-of-bound
__host__ __device__ static constexpr auto
CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
const UpperIndex& /* idx_up_old */,
const LowerIndex& idx_low_old)
{
// do nothing if idx_up_diff == 0
if(idx_up_diff[0] == 0)
{
return make_zero_array<index_t, nDimLow>();
}
// CalculateLowerIndex(idx_up_diff) has multiple integer divisions.
// If idx_up_diff is known at compile-time, the calculation can
// be done at compile-time. However, if idx_up_diff is only known
// at run-time, then the calculation will also be computed at
// run-time, and can be very expensive.
LowerIndex idx_low_new = idx_low_old + CalculateLowerIndex(idx_up_diff);
if(idx_up_diff[0] > 0)
{
bool carry = false;
// do carry check in reversed order, starting from lowest dimension
// don't check the highest dimension
static_for<0, nDimLow - 1, 1>{}([&](auto ireverse) {
constexpr index_t i = nDimLow - 1 - ireverse;
if(carry)
{
++idx_low_new(i);
}
carry = false;
if(idx_low_new[i] >= LowerLengths::At(i))
{
idx_low_new(i) -= LowerLengths::At(i);
carry = true;
}
});
// highest dimension, no out-of-bound check
if(carry)
{
++idx_low_new(0);
}
}
else if(idx_up_diff[0] < 0)
{
bool borrow = false;
// do borrow check in reversed order, starting from lowest dimension
// don't check the highest dimension
static_for<0, nDimLow - 1, 1>{}([&](auto ireverse) {
constexpr index_t i = nDimLow - 1 - ireverse;
if(borrow)
{
--idx_low_new(i);
}
borrow = false;
if(idx_low_new[i] < 0)
{
idx_low_new(i) += LowerLengths::At(i);
borrow = true;
}
});
// highest dimension, no out-of-bound check
if(borrow)
{
--idx_low_new(0);
}
}
return idx_low_new - idx_low_old;
}
__host__ __device__ static constexpr bool IsLinearTransform() { return false; }
__host__ __device__ static constexpr bool
IsUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */)
{
return true;
}
};
// UpperLengths: Sequence<...>
template <typename UpperLengths>
struct UnMerge
{
static constexpr index_t nDimLow = 1;
static constexpr index_t nDimUp = UpperLengths::Size();
using LowerIndex = MultiIndex<nDimLow>;
using UpperIndex = MultiIndex<nDimUp>;
__host__ __device__ static constexpr auto GetNumOfLowerDimension() { return Number<nDimLow>{}; }
__host__ __device__ static constexpr auto GetNumOfUpperDimension() { return Number<nDimUp>{}; }
__host__ __device__ static constexpr auto GetUpperLengths() { return UpperLengths{}; }
__host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
{
LowerIndex idx_low{0};
constexpr auto pseudo_up_strides =
reverse_inclusive_scan_sequence(
UpperLengths::PopFront(), math::multiplies<index_t>{}, Number<1>{})
.PushBack(Number<1>{});
static_for<0, nDimUp, 1>{}(
[&](auto idim) { idx_low(0) += idx_up[idim] * pseudo_up_strides[idim]; });
return idx_low;
}
__host__ __device__ static constexpr auto
CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
const UpperIndex& /* idx_up_old */,
const LowerIndex& /* idx_low_old */)
{
return CalculateLowerIndex(idx_up_diff);
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool
IsUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */)
{
return true;
}
};
// UpperLengths: Sequence<...>
// Coefficients: Sequence<...>
// idx_low = coefficients[0, ...nDimUp-1] * idx_up[0, ...nDimUp-1] + coefficients[nDimUp]
template <typename UpperLengths, typename Coefficients>
struct Embed
{
static constexpr index_t nDimLow = 1;
static constexpr index_t nDimUp = UpperLengths::Size();
using LowerIndex = MultiIndex<nDimLow>;
using UpperIndex = MultiIndex<nDimUp>;
__host__ __device__ explicit constexpr Embed()
{
static_assert(UpperLengths::GetSize() == nDimUp && Coefficients::GetSize() == nDimUp + 1,
"wrong! # of dimensions not consistent");
}
__host__ __device__ static constexpr auto GetNumOfUpperDimension() { return Number<nDimUp>{}; }
__host__ __device__ static constexpr auto GetNumOfLowerDimension() { return Number<nDimLow>{}; }
__host__ __device__ static constexpr auto GetUpperLengths() { return UpperLengths{}; }
__host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
{
LowerIndex idx_low(Coefficients{}[nDimUp]);
static_for<0, nDimUp, 1>{}(
[&](auto idim) { idx_low(0) += idx_up[idim] * Coefficients{}[idim]; });
return idx_low;
}
__host__ __device__ static constexpr auto
CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
const UpperIndex& /* idx_up_old */,
const LowerIndex& /* idx_low_old */)
{
LowerIndex idx_low_diff{0};
static_for<0, nDimUp, 1>{}(
[&](auto idim) { idx_low_diff(0) += idx_up_diff[idim] * Coefficients{}[idim]; });
return idx_low_diff;
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool
IsUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */)
{
return true;
}
};
template <index_t LowerLength, index_t VectorSize>
struct Vectorize
{
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<1>;
__host__ __device__ constexpr Vectorize()
{
static_assert(VectorSize > 0 && LowerLength % VectorSize == 0,
"wrong! cannot evenly divide");
}
__host__ __device__ static constexpr auto GetNumOfLowerDimension() { return Number<1>{}; }
__host__ __device__ static constexpr auto GetNumOfUpperDimension() { return Number<1>{}; }
__host__ __device__ static constexpr auto GetUpperLengths()
{
return Sequence<LowerLength / VectorSize>{};
}
__host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
{
return VectorSize * idx_up;
}
__host__ __device__ static constexpr auto
CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
const UpperIndex& /* idx_up_old */,
const LowerIndex& /* idx_low_old */)
{
return VectorSize * idx_up_diff;
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool
IsUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */)
{
return true;
}
};
} // namespace ck
#endif
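Merge is the transform that folds several lower dimensions into a single upper dimension; its CalculateLowerIndex splits the merged index back apart using the pseudo-strides produced by reverse_inclusive_scan_sequence. A standalone sketch of that decomposition with illustrative names, using lower lengths {3, 4, 5} (pseudo-strides {20, 5, 1}) and upper index 37 (this mirrors the arithmetic above but is not the library code):

// Standalone sketch: decompose one merged index into a lower multi-index.
#include <array>
#include <cstddef>
#include <cstdio>

template <std::size_t N>
std::array<int, N> merge_lower_index(int idx_up, const std::array<int, N>& low_lengths)
{
    // pseudo-stride[i] = product of the lower lengths to the right of i
    std::array<int, N> pseudo_strides{};
    int running = 1;
    for(std::size_t r = 0; r < N; ++r)
    {
        const std::size_t i = N - 1 - r;
        pseudo_strides[i] = running;
        running *= low_lengths[i];
    }

    std::array<int, N> idx_low{};
    for(std::size_t i = 0; i < N; ++i)
    {
        idx_low[i] = idx_up / pseudo_strides[i];
        idx_up -= idx_low[i] * pseudo_strides[i];
    }
    return idx_low;
}

int main()
{
    const auto idx = merge_lower_index<3>(37, {3, 4, 5}); // pseudo-strides {20, 5, 1} -> {1, 3, 2}
    std::printf("%d %d %d\n", idx[0], idx[1], idx[2]);     // check: 1*20 + 3*5 + 2 = 37
    return 0;
}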
Old version:

#ifndef CK_TENSOR_COORDINATE_HPP
#define CK_TENSOR_COORDINATE_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
namespace ck {
template <class TensorDesc>
struct NormalTensorCoordinate
{
    using type = NormalTensorCoordinate;
    using tensor_desc_type = TensorDesc;
    static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();
    __host__ __device__ constexpr NormalTensorCoordinate(Array<index_t, nDim> tensor_index)
        : mOffset{tensor_desc_type::GetOffsetFromMultiIndex(tensor_index)}
    {
    }
    template <class... Xs>
    __host__ __device__ constexpr NormalTensorCoordinate(Xs... xs)
        : NormalTensorCoordinate(Array<index_t, nDim>{xs...})
    {
    }
    __host__ __device__ constexpr index_t GetOffset() const { return mOffset; }
    // T is Array or Sequence
    template <class T>
    __host__ __device__ type operator+=(T step_sizes)
    {
        static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");
        mOffset += tensor_desc_type::GetOffsetFromMultiIndex(step_sizes);
        return *this;
    }
    template <class T>
    __host__ __device__ type operator-=(T step_sizes)
    {
        static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");
        mOffset -= tensor_desc_type::GetOffsetFromMultiIndex(step_sizes);
        return *this;
    }
    template <class T>
    __host__ __device__ constexpr type operator+(T step_sizes) const
    {
        type coord = *this;
        coord += step_sizes;
        return coord;
    }
    template <class T>
    __host__ __device__ constexpr type operator-(T step_sizes) const
    {
        type coord = *this;
        coord -= step_sizes;
        return coord;
    }
    // reposition point of origin, and return compensated offset.
    // This is a hack to reduce index calculation during looping over
    // a tensor whose origin is this TensorCoordinate. It does so, by spitting
    // out the run-time offset to the pointer (to the tensor data) held by this
    // TensorCoordiante, so the caller can add the offset into the run-time pointer of
    // the data, so only 1 run-time variable (update pointer) is needed, instead
    // of 2 run-time variables (old pointer and this offset)
    // TODO: after introducing the concept of "run-time tensor view", which contains the
    // run-time pointer to the data, always keep track of the pointer, instead of both
    // offset and the pointer. This also bring additional benefit that we don't need to
    // worry the offset might underflow (because offset is unsigned integer) when updating it.
    __host__ __device__ constexpr index_t RepositionOrigin()
    {
        index_t offset_diff = mOffset;
        mOffset = 0;
        return offset_diff;
    }
    private:
    index_t mOffset;
};
template <class TensorDesc>
struct MergedTensorCoordinate
{
    using type = MergedTensorCoordinate;
    using tensor_desc_type = TensorDesc;
    static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();
    static constexpr index_t nOriginalDim =
        tensor_desc_type::GetOriginalTensorDescriptor().GetNumOfDimension();
    __host__ __device__ constexpr MergedTensorCoordinate(Array<index_t, nDim> tensor_index)
        : mOriginalIndex{tensor_desc_type::GetOriginalMultiIndexFromMultiIndex(tensor_index)}
    {
        // partial offset on each dimension
        static_for<0, nDim, 1>{}([&](auto idim) {
            constexpr auto partial_original_dims =
                tensor_desc_type::GetContainedOriginalDimensions(idim);
            constexpr auto partial_original_desc =
                tensor_desc_type::GetOriginalTensorDescriptor().Extract(partial_original_dims);
            mPartialOffsets(idim) = partial_original_desc.GetOffsetFromMultiIndex(
                extract_array(mOriginalIndex, partial_original_dims));
        });
        // complete offset
        mOffset =
            accumulate_on_array(mPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
    }
    template <class... Xs>
    __host__ __device__ constexpr MergedTensorCoordinate(Xs... xs)
        : MergedTensorCoordinate(Array<index_t, nDim>{xs...})
    {
    }
    __host__ __device__ constexpr index_t GetOffset() const { return mOffset; }
    template <class IDim, class T, bool PositiveDirection>
    __host__ __device__ void
    MoveOnDimension(IDim idim_, T step_size, integral_constant<bool, PositiveDirection>)
    {
        constexpr auto idim = idim_;
        // if step_size is known at compile time
        static_if<is_static<T>::value>{}(
            [&](auto) { static_if<T{} == 0>{}([&](auto) { return; }); });
        // update original index
        static_if<tensor_desc_type::ContainMultipleOriginalDimensions(idim)>{}([&](auto) {
            constexpr auto partial_original_dims =
                tensor_desc_type::GetContainedOriginalDimensions(idim);
            constexpr index_t ndim_partial_original = partial_original_dims.GetSize();
            constexpr auto partial_original_desc =
                tensor_desc_type::GetOriginalTensorDescriptor().Extract(partial_original_dims);
            const auto partial_original_step_sizes =
                partial_original_desc.GetMultiIndexFrom1dIndex(step_size);
            // update partial original multi-id
            auto partial_original_id = extract_array(mOriginalIndex, partial_original_dims);
            static_if<PositiveDirection>{}([&](auto) {
                partial_original_id += partial_original_step_sizes;
                bool carry = false;
                // do carry check in reversed order, starting from lowest dimension
                // don't check the highest dimension
                static_for<0, ndim_partial_original, 1>{}([&](auto IReverse) {
                    constexpr index_t i = ndim_partial_original - 1 - IReverse;
                    if(carry)
                    {
                        ++partial_original_id(i);
                    }
                    carry = false;
                    if(partial_original_id[i] >= partial_original_desc.GetLength(i))
                    {
                        partial_original_id(i) -= partial_original_desc.GetLength(i);
                        carry = true;
                    }
                });
            }).Else([&](auto) {
                // shift up multi-id to avoid unsigned integer underflow during intermediate
                // calculations. After the shift, should have new_multi_id[...] >= 1
                partial_original_id +=
                    partial_original_desc.GetLengths() - partial_original_step_sizes;
                bool borrow = false;
                // do borrow check in reversed order, starting from lowest dimension
                // don't check the highest dimension
                static_for<0, ndim_partial_original, 1>{}([&](auto IReverse) {
                    constexpr index_t i = ndim_partial_original - 1 - IReverse;
                    if(borrow)
                    {
                        --partial_original_id(i);
                    }
                    borrow = false;
                    if(partial_original_id[i] < partial_original_desc.GetLength(i))
                    {
                        partial_original_id(i) += partial_original_desc.GetLength(i);
                        borrow = true;
                    }
                });
                // shift back down multi-id
                // here, should have new_multi_id[...] >= GetLengths()
                partial_original_id = partial_original_id - partial_original_desc.GetLengths();
            });
            // update "mOriginalIndex"
            static_for<0, ndim_partial_original, 1>{}([&](auto I) {
                constexpr auto idim_original = partial_original_dims[I];
                mOriginalIndex(idim_original) = partial_original_id[I];
            });
            // calculate new partial offset on this merged dimension
            const index_t old_partial_offset = mPartialOffsets[idim];
            mPartialOffsets(idim) =
                partial_original_desc.GetOffsetFromMultiIndex(partial_original_id);
            // update "mThreadSrcOffset", do "+" before "-" to avoid underflow
            mOffset = (mOffset + mPartialOffsets[idim]) - old_partial_offset;
        }).Else([&](auto fwd) {
            static_if<PositiveDirection>{}([&](auto) {
                mOffset += step_size * fwd(tensor_desc_type{}).GetStride(idim);
            }).Else([&](auto) { mOffset -= step_size * fwd(tensor_desc_type{}).GetStride(idim); });
        });
    }
    // T is Array or Sequence
    template <class T>
    __host__ __device__ type operator+=(T step_sizes)
    {
        static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");
        index_t normal_offset_diff = 0;
        static_for<0, nDim, 1>{}([&](auto idim) {
            if(step_sizes[idim] != 0)
            {
                this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, true>{});
            }
        });
        return *this;
    }
    template <class T>
    __host__ __device__ type operator-=(T step_sizes)
    {
        static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");
        static_for<0, nDim, 1>{}([&](auto idim) {
            if(step_sizes[idim] != 0)
            {
                this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, false>{});
            }
        });
        return *this;
    }
    template <class T>
    __host__ __device__ constexpr type operator+(T step_sizes) const
    {
        type coord = *this;
        coord += step_sizes;
        return coord;
    }
    template <class T>
    __host__ __device__ constexpr type operator-(T step_sizes) const
    {
        type coord = *this;
        coord -= step_sizes;
        return coord;
    }
    __host__ __device__ static constexpr index_t RepositionOrigin() { return 0; }
    private:
    // Allocate register memory for all merged dimensions and normal dimensions.
    // However, only those merged dimensions, whose index will be involved in arithmetic
    // after the construction of this TensorCoordinate (e.g. when user move a slicing
    // window on the merged dimension), will use these register memory.
    // Let's hope compiler will optimize away those register memory allocated for normal
    // dimensions, and those merged dimensions, that would never be involved in index
    // arithmetic after construction of TensorCoordinate.
    // TODO: refactor TensorCoordinate, after introducing the concept of "dimensions"
    // and simplify implementation of ConstantMergedTensorDescriptor, so we don't need to
    // count on compiler to optimize way those register memory for us
    Array<index_t, nOriginalDim> mOriginalIndex;
    Array<index_t, nDim> mPartialOffsets;
    // complete offset
    index_t mOffset;
};
} // namespace ck

New version:

#ifndef CK_TENSOR_COORDINATE_V2_HPP
#define CK_TENSOR_COORDINATE_V2_HPP
#include "common_header.hpp"
#include "dimension.hpp"
#include "multi_index_transform.hpp"
#include "tensor_descriptor.hpp"
namespace ck {
template <typename TensorDesc>
struct TensorCoordinate;
template <typename NativeTensorDesc>
struct NativeTensorCoordinate
{
    using type = NativeTensorCoordinate;
    using tensor_desc_type = NativeTensorDesc;
    static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();
    using Index = MultiIndex<nDim>;
    __host__ __device__ constexpr NativeTensorCoordinate(Index idx)
        : mIndex(idx), mOffset(tensor_desc_type::CalculateOffset(idx))
    {
    }
    template <typename... Xs>
    __host__ __device__ constexpr NativeTensorCoordinate(Xs... xs)
        : NativeTensorCoordinate(Index{xs...})
    {
    }
    template <index_t... Xs>
    __host__ __device__ constexpr NativeTensorCoordinate(Sequence<Xs...>)
        : NativeTensorCoordinate(Index{Xs...})
    {
    }
    __host__ __device__ static constexpr auto GetTensorDescriptor() { return tensor_desc_type{}; }
    __host__ __device__ constexpr const Index& GetIndex() const { return mIndex; }
    __host__ __device__ constexpr const index_t& GetOffset() const { return mOffset; }
    __host__ __device__ constexpr type operator+=(const Index& idx_diff)
    {
        // mIndex is updated here, but some (or all) of its entries may never be used
        // compiler should remove those entries as dead code
        mIndex += idx_diff;
        mOffset += tensor_desc_type::CalculateOffsetDiff(idx_diff);
        return *this;
    }
    __host__ __device__ constexpr type operator-=(const Index& idx_diff)
    {
        // mIndex is updated here, but some (or all) of its entries may never be used
        // compiler should remove those entries as dead code
        mIndex -= idx_diff;
        mOffset -= tensor_desc_type::CalculateOffsetDiff(idx_diff);
        return *this;
    }
    __host__ __device__ constexpr type operator+(const Index& idx_diff) const
    {
        type coord = *this;
        coord += idx_diff;
        return coord;
    }
    __host__ __device__ constexpr type operator-(const Index& idx_diff) const
    {
        type coord = *this;
        coord -= idx_diff;
        return coord;
    }
    __host__ __device__ static constexpr bool IsUpperIndexMappedToValidOffset() { return true; }
    private:
    // mIndex may be saved and updated, however, the value of some (or all) of its entries may
    // never be used. Compiler should be able to remove these entries as well as its calculation
    // as dead code.
    // TODO: make sure compiler indeed remove these dead code
    Index mIndex;
    index_t mOffset;
};
template <typename TransformedTensorDesc>
struct TransformedTensorCoordinate
{
    using tensor_desc_type = TransformedTensorDesc;
    using LowerCoord =
        typename TensorCoordinate<decltype(tensor_desc_type::GetLowerTensorDescriptor())>::type;
    using UpperCoord = TransformedTensorCoordinate;
    static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();
    using UpperIndex = MultiIndex<nDim>;
    __host__ __device__ constexpr TransformedTensorCoordinate(UpperIndex idx)
        : mIndexUp{idx}, mCoordLow{tensor_desc_type::CalculateLowerIndex(idx)}
    {
    }
    template <typename... Xs>
    __host__ __device__ constexpr TransformedTensorCoordinate(Xs... xs)
        : TransformedTensorCoordinate(UpperIndex{xs...})
    {
    }
    template <index_t... Xs>
    __host__ __device__ constexpr TransformedTensorCoordinate(Sequence<Xs...>)
        : TransformedTensorCoordinate(UpperIndex{Xs...})
    {
    }
    __host__ __device__ static constexpr auto GetTensorDescriptor() { return tensor_desc_type{}; }
    __host__ __device__ constexpr const LowerCoord& GetLowerCoordinate() const { return mCoordLow; }
    __host__ __device__ constexpr const UpperIndex& GetUpperIndex() const { return mIndexUp; }
    __host__ __device__ constexpr const UpperIndex& GetIndex() const { return GetUpperIndex(); }
    __host__ __device__ constexpr const index_t& GetOffset() const
    {
        return GetLowerCoordinate().GetOffset();
    }
    __host__ __device__ constexpr UpperCoord operator+=(const UpperIndex& idx_up_diff)
    {
        // For transformation of multi-index difference, not all transformation functions need to
        // know the old lower-index or the old upper-index. We pass both of them to the
        // transformation function. The transformation function itself decides to use them or not.
        mCoordLow += tensor_desc_type::CalculateLowerIndexDiff(
            idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex());
        // mIndexUp is updated here, but some (or all) of its entries may never be used
        // compiler should remove those entries as dead code
        mIndexUp += idx_up_diff;
        return *this;
    }
    __host__ __device__ constexpr UpperCoord operator-=(const UpperIndex& idx_up_diff)
    {
        mCoordLow -= tensor_desc_type::CalculateLowerIndexDiff(
            idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex());
        // mIndex is updated here, but some (or all) of its entries may never be used
        // compiler should remove those entries as dead code
        mIndexUp -= idx_up_diff;
        return *this;
    }
    __host__ __device__ constexpr UpperCoord operator+(const UpperIndex& idx_up_diff) const
    {
        UpperCoord coord_up = *this;
        coord_up += idx_up_diff;
        return coord_up;
    }
    __host__ __device__ constexpr UpperCoord operator-(const UpperIndex& idx_up_diff) const
    {
        UpperCoord coord_up = *this;
        coord_up -= idx_up_diff;
        return coord_up;
    }
    // this function should be inexpensive, because there is no upper-to-lower index transformation
    __host__ __device__ constexpr bool IsUpperIndexMappedToValidOffset() const
    {
        return tensor_desc_type::IsUpperIndexMappedToValidLowerIndex(GetIndex()) &&
               mCoordLow.IsUpperIndexMappedToValidOffset();
    }
    private:
    // mIndexUp may be calculated and updated, however, the value of some (or all) of its entries
    // may never be used. Compiler should be able to remove these entries as well as its
    // calculation as dead code.
    // TODO: make sure compiler indeed remove these dead code
    UpperIndex mIndexUp;
    LowerCoord mCoordLow;
};
template <typename TensorDesc>
struct TensorCoordinate
{
    private:
    template <typename... Ts>
    __host__ __device__ static constexpr auto
    MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
    {
        return NativeTensorCoordinate<NativeTensorDescriptor<Ts...>>(
            make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
    }
    template <typename... Ts>
    __host__ __device__ static constexpr auto
    MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
    {
        return TransformedTensorCoordinate<TransformedTensorDescriptor<Ts...>>(
            make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
    }
    public:
    using type = decltype(MakeDummyTensorCoordinate(TensorDesc{}));
};
} // namespace ck
...
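The new NativeTensorCoordinate never recomputes the full offset on a move; operator+= adds CalculateOffsetDiff(idx_diff), i.e. the dot product of the index step with the strides. A standalone sketch of that bookkeeping with illustrative names and the packed strides {12, 4, 1} of a 2x3x4 tensor (not the library code):

// Standalone sketch: advance a stored offset by step . strides instead of recomputing it.
#include <array>
#include <cstddef>
#include <cstdio>

template <std::size_t N>
int offset_diff(const std::array<int, N>& idx_diff, const std::array<int, N>& strides)
{
    int diff = 0;
    for(std::size_t i = 0; i < N; ++i)
        diff += idx_diff[i] * strides[i];
    return diff;
}

int main()
{
    int offset = 0;
    const std::array<int, 3> strides{12, 4, 1};   // packed strides of a 2x3x4 tensor
    offset += offset_diff<3>({0, 1, 0}, strides); // move by one in the middle dimension: +4
    offset += offset_diff<3>({1, 0, 2}, strides); // then by {1, 0, 2}: +14
    std::printf("%d\n", offset);                  // 4 + 14 = 18
    return 0;
}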
#ifndef CK_TENSOR_COORDINATE_DEPRECATED_HPP
#define CK_TENSOR_COORDINATE_DEPRECATED_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
namespace ck {
// TensorDesc is ConstantTensorDescriptor
template <class TensorDesc>
struct NormalTensorCoordinate_deprecated
{
using type = NormalTensorCoordinate_deprecated;
using tensor_desc_type = TensorDesc;
static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();
__host__
__device__ constexpr NormalTensorCoordinate_deprecated(Array<index_t, nDim> tensor_index)
: mOffset{tensor_desc_type::GetOffsetFromMultiIndex(tensor_index)}
{
}
template <class... Xs>
__host__ __device__ constexpr NormalTensorCoordinate_deprecated(Xs... xs)
: NormalTensorCoordinate_deprecated(Array<index_t, nDim>{xs...})
{
}
template <index_t... Xs>
__host__ __device__ constexpr NormalTensorCoordinate_deprecated(Sequence<Xs...>)
: NormalTensorCoordinate_deprecated(Array<index_t, nDim>{Xs...})
{
}
__host__ __device__ constexpr index_t GetOffset() const { return mOffset; }
// T is Array or Sequence
template <class T>
__host__ __device__ type operator+=(T step_sizes)
{
static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");
mOffset += tensor_desc_type::GetOffsetFromMultiIndex(step_sizes);
return *this;
}
template <class T>
__host__ __device__ type operator-=(T step_sizes)
{
static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");
mOffset -= tensor_desc_type::GetOffsetFromMultiIndex(step_sizes);
return *this;
}
template <class T>
__host__ __device__ constexpr type operator+(T step_sizes) const
{
type coord = *this;
coord += step_sizes;
return coord;
}
template <class T>
__host__ __device__ constexpr type operator-(T step_sizes) const
{
type coord = *this;
coord -= step_sizes;
return coord;
}
// reposition point of origin, and return compensated offset.
// This is a hack to reduce index calculation during looping over
// a tensor whose origin is this TensorCoordinate. It does so by spitting
// out the run-time offset to the pointer (to the tensor data) held by this
// TensorCoordinate, so the caller can add the offset into the run-time pointer of
// the data, so only 1 run-time variable (the updated pointer) is needed, instead
// of 2 run-time variables (old pointer and this offset)
// TODO: after introducing the concept of "run-time tensor view", which contains the
// run-time pointer to the data, always keep track of the pointer, instead of both
// offset and the pointer. This also brings the additional benefit that we don't need to
// worry the offset might underflow (because offset is unsigned integer) when updating it.
__host__ __device__ constexpr index_t RepositionOrigin()
{
index_t offset_diff = mOffset;
mOffset = 0;
return offset_diff;
}
private:
index_t mOffset;
};
// TensorDesc is ConstantMergedTensorDescriptor
template <class TensorDesc>
struct MergedTensorCoordinate
{
using type = MergedTensorCoordinate;
using tensor_desc_type = TensorDesc;
static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();
static constexpr index_t nOriginalDim =
tensor_desc_type::GetOriginalTensorDescriptor().GetNumOfDimension();
__host__ __device__ constexpr MergedTensorCoordinate(Array<index_t, nDim> tensor_index)
: mOriginalIndex{tensor_desc_type::GetOriginalMultiIndexFromMultiIndex(tensor_index)}
{
// partial offset on each dimension
static_for<0, nDim, 1>{}([&](auto idim) {
constexpr auto partial_original_dims =
tensor_desc_type::GetContainedOriginalDimensions(idim);
constexpr auto partial_original_desc =
tensor_desc_type::GetOriginalTensorDescriptor().Extract(partial_original_dims);
mPartialOffsets(idim) = partial_original_desc.GetOffsetFromMultiIndex(
extract_array(mOriginalIndex, partial_original_dims));
});
// complete offset
mOffset =
accumulate_on_array(mPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
}
template <class... Xs>
__host__ __device__ constexpr MergedTensorCoordinate(Xs... xs)
: MergedTensorCoordinate(Array<index_t, nDim>{xs...})
{
}
__host__ __device__ constexpr index_t GetOffset() const { return mOffset; }
template <class IDim, class T, bool PositiveDirection>
__host__ __device__ void
MoveOnDimension(IDim idim_, T step_size, integral_constant<bool, PositiveDirection>)
{
constexpr auto idim = idim_;
// if step_size is known at compile time
static_if<is_static<T>::value>{}(
[&](auto) { static_if<T{} == 0>{}([&](auto) { return; }); });
// update original index
static_if<tensor_desc_type::ContainMultipleOriginalDimensions(idim)>{}([&](auto) {
constexpr auto partial_original_dims =
tensor_desc_type::GetContainedOriginalDimensions(idim);
constexpr index_t ndim_partial_original = partial_original_dims.GetSize();
constexpr auto partial_original_desc =
tensor_desc_type::GetOriginalTensorDescriptor().Extract(partial_original_dims);
const auto partial_original_step_sizes =
partial_original_desc.GetMultiIndexFrom1dIndex(step_size);
// update partial original multi-id
auto partial_original_id = extract_array(mOriginalIndex, partial_original_dims);
static_if<PositiveDirection>{}([&](auto) {
partial_original_id += partial_original_step_sizes;
bool carry = false;
// do carry check in reversed order, starting from lowest dimension
// don't check the highest dimension
static_for<0, ndim_partial_original - 1, 1>{}([&](auto IReverse) {
constexpr index_t i = ndim_partial_original - 1 - IReverse;
if(carry)
{
++partial_original_id(i);
}
carry = false;
if(partial_original_id[i] >= partial_original_desc.GetLength(i))
{
partial_original_id(i) -= partial_original_desc.GetLength(i);
carry = true;
}
});
// highest dimension
if(carry)
{
++partial_original_id(0);
}
}).Else([&](auto) {
// shift up multi-id to avoid unsigned integer underflow during intermediate
// calculations. After the shift, should have new_multi_id[...] >= 1
partial_original_id +=
partial_original_desc.GetLengths() - partial_original_step_sizes;
bool borrow = false;
// do borrow check in reversed order, starting from lowest dimension
// don't check the highest dimension
static_for<0, ndim_partial_original - 1, 1>{}([&](auto IReverse) {
constexpr index_t i = ndim_partial_original - 1 - IReverse;
if(borrow)
{
--partial_original_id(i);
}
borrow = false;
if(partial_original_id[i] < partial_original_desc.GetLength(i))
{
partial_original_id(i) += partial_original_desc.GetLength(i);
borrow = true;
}
});
// highest dimension
if(borrow)
{
--partial_original_id(0);
}
// shift back down multi-id
// here, should have new_multi_id[...] >= GetLengths()
partial_original_id = partial_original_id - partial_original_desc.GetLengths();
});
// update "mOriginalIndex"
static_for<0, ndim_partial_original, 1>{}([&](auto I) {
constexpr auto idim_original = partial_original_dims[I];
mOriginalIndex(idim_original) = partial_original_id[I];
});
// calculate new partial offset on this merged dimension
const index_t old_partial_offset = mPartialOffsets[idim];
mPartialOffsets(idim) =
partial_original_desc.GetOffsetFromMultiIndex(partial_original_id);
// update "mThreadSrcOffset", do "+" before "-" to avoid underflow
mOffset = (mOffset + mPartialOffsets[idim]) - old_partial_offset;
}).Else([&](auto fwd) {
static_if<PositiveDirection>{}([&](auto) {
mOffset += step_size * fwd(tensor_desc_type{}).GetStride(idim);
}).Else([&](auto) { mOffset -= step_size * fwd(tensor_desc_type{}).GetStride(idim); });
});
}
// T is Array or Sequence
template <class T>
__host__ __device__ type operator+=(T step_sizes)
{
static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");
static_for<0, nDim, 1>{}([&](auto idim) {
// compiler should remove dead code path, because step_sizes is known at
// compile time
if(step_sizes[idim] != 0)
{
this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, true>{});
}
});
return *this;
}
template <class T>
__host__ __device__ type operator-=(T step_sizes)
{
static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");
static_for<0, nDim, 1>{}([&](auto idim) {
// compiler should remove dead code path, because step_sizes is known at
// compile time
if(step_sizes[idim] != 0)
{
this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, false>{});
}
});
return *this;
}
template <class T>
__host__ __device__ constexpr type operator+(T step_sizes) const
{
type coord = *this;
coord += step_sizes;
return coord;
}
template <class T>
__host__ __device__ constexpr type operator-(T step_sizes) const
{
type coord = *this;
coord -= step_sizes;
return coord;
}
__host__ __device__ static constexpr index_t RepositionOrigin() { return 0; }
private:
// Allocate register memory for all merged dimensions and normal dimensions.
// However, only those merged dimensions, whose index will be involved in arithmetic
// after the construction of this TensorCoordinate (e.g. when user move a slicing
// window on the merged dimension), will use these register memory.
// Let's hope compiler will optimize away those register memory allocated for normal
// dimensions, and those merged dimensions, that would never be involved in index
// arithmetic after construction of TensorCoordinate.
// TODO: refactor TensorCoordinate, after introducing the concept of "dimensions"
// and simplify implementation of ConstantMergedTensorDescriptor, so we don't need to
// count on compiler to optimize away those register memory for us
Array<index_t, nOriginalDim> mOriginalIndex;
Array<index_t, nDim> mPartialOffsets;
// complete offset
index_t mOffset;
};
template <class TensorDesc>
struct TensorCoordinate_deprecated
{
private:
template <class... Ts>
__host__ __device__ static constexpr auto
MakeDummyTensorCoordinate(ConstantTensorDescriptor<Ts...>)
{
return NormalTensorCoordinate_deprecated<ConstantTensorDescriptor<Ts...>>();
}
template <class... Ts>
__host__ __device__ static constexpr auto
MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<Ts...>)
{
return MergedTensorCoordinate<ConstantMergedTensorDescriptor<Ts...>>();
}
public:
using type = decltype(MakeDummyTensorCoordinate(TensorDesc{}));
};
} // namespace ck
#endif
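The negative-direction branch of MoveOnDimension above relies on the "shift up, borrow, shift back down" trick described in its comments, so that an unsigned multi-index never underflows mid-calculation. A standalone sketch of that trick with illustrative names (plain std::array of unsigned values, a 2x4 merged dimension), assuming the step stays in bounds as the original requires:

// Standalone sketch: step a multi-index backwards using only unsigned arithmetic.
#include <array>
#include <cstddef>
#include <cstdio>

using uindex = unsigned int;

template <std::size_t N>
void step_negative(std::array<uindex, N>& idx,
                   const std::array<uindex, N>& lengths,
                   const std::array<uindex, N>& step)
{
    // shift every component up by its length, so each entry is >= 1 afterwards
    for(std::size_t i = 0; i < N; ++i)
        idx[i] += lengths[i] - step[i];

    // propagate borrows from the lowest dimension upward;
    // the highest dimension absorbs the final borrow and is not range-checked
    for(std::size_t i = N - 1; i > 0; --i)
    {
        if(idx[i] < lengths[i])
        {
            idx[i] += lengths[i];
            --idx[i - 1];
        }
    }

    // shift back down; every entry is >= its length at this point
    for(std::size_t i = 0; i < N; ++i)
        idx[i] -= lengths[i];
}

int main()
{
    std::array<uindex, 2> idx{1, 0};
    step_negative<2>(idx, {2, 4}, {0, 1}); // {1, 0} - {0, 1} -> borrow -> {0, 3}
    std::printf("%u %u\n", idx[0], idx[1]);
    return 0;
}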
#ifndef CK_TENSOR_COORDINATE_HELPER_HPP
#define CK_TENSOR_COORDINATE_HELPER_HPP
#include "tensor_coordiante_v2.hpp"
namespace ck {
template <typename TensorDesc>
__host__ __device__ constexpr auto
make_tensor_coordinate_v2(TensorDesc, MultiIndex<TensorDesc::GetNumOfDimension()> idx)
{
return typename TensorCoordinate<TensorDesc>::type(idx);
}
} // namespace ck
#endif
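TensorCoordinate<TensorDesc>::type, which make_tensor_coordinate_v2 returns an instance of, selects the coordinate implementation by overload resolution on a dummy factory evaluated only inside decltype. A standalone sketch of that selection pattern; DescA/DescB/CoordA/CoordB are illustrative stand-ins, not library types:

// Standalone sketch: map a descriptor type to its coordinate type via an overload set
// that is only named in an unevaluated decltype context.
#include <type_traits>

struct DescA {};
struct DescB {};
struct CoordA {};
struct CoordB {};

template <typename Desc>
struct CoordinateFor
{
    private:
    // declarations only; used solely inside decltype below
    static CoordA MakeDummy(DescA);
    static CoordB MakeDummy(DescB);

    public:
    using type = decltype(MakeDummy(Desc{}));
};

static_assert(std::is_same<CoordinateFor<DescA>::type, CoordA>::value, "picks CoordA for DescA");
static_assert(std::is_same<CoordinateFor<DescB>::type, CoordB>::value, "picks CoordB for DescB");

int main() { return 0; }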
#ifndef CK_TENSOR_DESCRIPTOR_HPP
#define CK_TENSOR_DESCRIPTOR_HPP
#include "common_header.hpp"
#include "dimension.hpp"
#include "multi_index_transform.hpp"
namespace ck {
template <typename... NativeDimensions>
struct NativeTensorDescriptor
{
using type = NativeTensorDescriptor;
static constexpr index_t nDim = sizeof...(NativeDimensions);
static constexpr auto mDimensions = make_tuple(NativeDimensions{}...);
using Index = MultiIndex<nDim>;
__host__ __device__ static constexpr auto GetNumOfDimension() { return Number<nDim>{}; }
template <index_t IDim>
__host__ __device__ static constexpr auto GetLength(Number<IDim>)
{
return mDimensions.At(Number<IDim>{}).GetLength();
}
template <index_t IDim>
__host__ __device__ static constexpr auto GetStride(Number<IDim>)
{
return mDimensions.At(Number<IDim>{}).GetStride();
}
template <index_t... IDims>
__host__ __device__ static constexpr auto GetLengths(Sequence<IDims...>)
{
return Sequence<GetLength(Number<IDims>{})...>{};
}
template <index_t... IDims>
__host__ __device__ static constexpr auto GetStrides(Sequence<IDims...>)
{
return Sequence<GetStride(Number<IDims>{})...>{};
}
template <index_t IDim, index_t... IDims>
__host__ __device__ static constexpr auto GetLengths(Number<IDim>, Number<IDims>...)
{
return GetLengths(Sequence<IDim, IDims...>{});
}
template <index_t IDim, index_t... IDims>
__host__ __device__ static constexpr auto GetStrides(Number<IDim>, Number<IDims>...)
{
return GetStrides(Sequence<IDim, IDims...>{});
}
__host__ __device__ static constexpr auto GetLengths()
{
return GetLengths(typename arithmetic_sequence_gen<0, nDim, 1>::type{});
}
__host__ __device__ static constexpr auto GetStrides()
{
return GetStrides(typename arithmetic_sequence_gen<0, nDim, 1>::type{});
}
__host__ __device__ static constexpr index_t GetElementSize()
{
return reduce_on_sequence(GetLengths(), math::multiplies<index_t>{}, Number<1>{});
}
__host__ __device__ static constexpr index_t GetElementSpace()
{
return reduce_on_sequence(
(GetLengths() - Number<1>{}) * GetStrides(), math::plus<index_t>{}, Number<1>{});
}
// TODO: this cannot return constexpr because of the use of a lambda
__host__ __device__ static constexpr index_t CalculateOffset(const Index& idx)
{
index_t offset = 0;
static_for<0, nDim, 1>{}([&](auto idim) { offset += idx[idim] * GetStride(idim); });
return offset;
}
__host__ __device__ static constexpr index_t CalculateOffsetDiff(const Index& idx_diff)
{
index_t offset_diff = 0;
static_for<0, nDim, 1>{}(
[&](auto idim) { offset_diff += idx_diff[idim] * GetStride(idim); });
return offset_diff;
}
template <index_t IDim>
__host__ __device__ static constexpr bool IsLinearDimension(Number<IDim>)
{
return true;
}
__host__ __device__ static constexpr auto GetLinearDimensionMask()
{
return typename uniform_sequence_gen<nDim, 1>::type{};
}
__host__ __device__ static constexpr auto GetNonLinearDimensionMask()
{
return typename uniform_sequence_gen<nDim, 0>::type{};
}
__host__ __device__ static constexpr auto GetNonLinearDimensions() { return Sequence<>{}; }
#if 0
__host__ __device__ static constexpr auto GetNonLinearIndependentDimensionGroups()
{
return Tuple<>{};
}
#endif
__host__ __device__ static constexpr bool
IsUpperIndexMappedToValidOffset(const Index& /* idx */)
{
return true;
}
};
// LowerTensorDescriptor
// Transforms: Tuple<DimensionTransforms...>
// LowerDimensionIds: Tuple<Sequence<...>>
// UpperDimensionIds: Tuple<Sequence<...>>
template <typename LowTensorDescriptor,
typename Transforms,
typename LowDimensionIds,
typename UpDimensionIds>
struct TransformedTensorDescriptor
{
using type = TransformedTensorDescriptor;
static constexpr index_t nTransform = Transforms::Size();
struct lambda_merge_sequences
{
template <typename... Seqs>
__host__ __device__ constexpr auto operator()(Seqs... seqs) const
{
return merge_sequences(seqs...);
}
};
__host__ __device__ static constexpr auto GetNumOfLowerDimension()
{
// Here, we assume all lower-dimensions are active
// TODO: sanity-check all lower-dimensions are indeed active
using duplicated_low_active_dims =
decltype(unpack(lambda_merge_sequences{}, LowDimensionIds{}));
using low_active_dims = typename sequence_unique_sort<duplicated_low_active_dims,
math::less<index_t>,
math::equal<index_t>>::type;
return low_active_dims::Size();
}
__host__ __device__ static constexpr auto GetNumOfUpperDimension()
{
using duplicated_up_active_dims =
decltype(unpack(lambda_merge_sequences{}, UpDimensionIds{}));
using up_active_dims = typename sequence_unique_sort<duplicated_up_active_dims,
math::less<index_t>,
math::equal<index_t>>::type;
return up_active_dims::Size();
}
static constexpr index_t nDimUp = GetNumOfUpperDimension();
static constexpr index_t nDimLow = GetNumOfLowerDimension();
using UpperIndex = MultiIndex<nDimUp>;
using LowerIndex = MultiIndex<nDimLow>;
__host__ __device__ constexpr TransformedTensorDescriptor()
{
static_assert(nTransform == Transforms::Size() && nTransform == LowDimensionIds::Size() &&
nTransform == UpDimensionIds::Size(),
"wrong! # of transformations not the same");
// sanity check:
// LowDimensionIds should include all low-dimensions,
// UpDimensionIds should include all up-dimensions
using mingled_up_dimension_ids =
decltype(unpack(lambda_merge_sequences{}, UpDimensionIds{}));
using sorted_up_dimension_ids =
typename sequence_sort<mingled_up_dimension_ids, math::less<index_t>>::type;
static_assert(sorted_up_dimension_ids::Size() == nDimUp &&
is_valid_sequence_map<sorted_up_dimension_ids>{},
"wrong! UpDimensionIds is not configured correctly");
using mingled_low_dimension_ids =
decltype(unpack(lambda_merge_sequences{}, LowDimensionIds{}));
using sorted_low_dimension_ids =
typename sequence_sort<mingled_low_dimension_ids, math::less<index_t>>::type;
static_assert(sorted_low_dimension_ids::Size() == nDimLow &&
is_valid_sequence_map<sorted_low_dimension_ids>{},
"wrong! LowDimensionIds is not configured correctly");
// TODO: sanity check: while an up-dimension could be associated with multiple
// transformations, a low-dimension should be associated with only one transformation
// TODO: sanity-check: GetLowerLengths of each transform should be consistent with lengths
// of lower-tensor-descriptor
}
__host__ __device__ static constexpr auto GetNumOfDimension()
{
return GetNumOfUpperDimension();
}
__host__ __device__ static constexpr auto GetLowerTensorDescriptor()
{
return LowTensorDescriptor{};
}
struct lambda_GetUpperLengths
{
template <typename Transform>
__host__ __device__ constexpr auto operator()(const Transform& tran) const
{
return tran.GetUpperLengths();
}
};
__host__ __device__ static constexpr auto GetUpperLengths()
{
constexpr auto tuple_of_up_lengths =
transform_tuples(lambda_GetUpperLengths{}, Transforms{});
constexpr auto mingled_up_lengths = unpack(lambda_merge_sequences{}, tuple_of_up_lengths);
constexpr auto mingled_up_dimension_ids =
unpack(lambda_merge_sequences{}, UpDimensionIds{});
// TODO: sanity-check mingled_up_dimension_ids contain all upper-dimensions
// TODO: sanity-check mingled_up_lengths have no conflicting upper-length
// sort by upper-dimension-ids
using sort_up_dimension_ids = sequence_unique_sort<decltype(mingled_up_dimension_ids),
math::less<index_t>,
math::equal<index_t>>;
// sanity-check sorted-upper-dimension-ids should be Sequence<0, 1, ... nDimUp-1>
static_assert(is_same<typename sort_up_dimension_ids::type,
typename arithmetic_sequence_gen<0, nDimUp, 1>::type>{},
"wrong! UpDimensionIds is not configured correctly");
constexpr auto sorted2unsorted_map = typename sort_up_dimension_ids::sorted2unsorted_map{};
constexpr auto sorted_up_lengths =
pick_sequence_elements_by_ids(mingled_up_lengths, sorted2unsorted_map);
return sorted_up_lengths;
}
__host__ __device__ static constexpr auto GetLengths() { return GetUpperLengths(); }
template <index_t IDim>
__host__ __device__ static constexpr auto GetLength(Number<IDim>)
{
return GetLengths()[IDim];
}
template <index_t... IDims>
__host__ __device__ static constexpr auto GetLengths(Sequence<IDims...>)
{
return Sequence<GetLength(Number<IDims>{})...>{};
}
template <index_t IDim, index_t... IDims>
__host__ __device__ static constexpr auto GetLengths(Number<IDim>, Number<IDims>...)
{
return GetLengths(Sequence<IDim, IDims...>{});
}
__host__ __device__ static constexpr index_t GetElementSize()
{
return reduce_on_sequence(GetLengths(), math::multiplies<index_t>{}, Number<1>{});
}
__host__ __device__ static constexpr index_t GetElementSpace()
{
// TODO: Is this the correct definition for transformed tensor?
return GetLowerTensorDescriptor().GetElementSpace();
}
// TODO: right now return value is not constexpr because use of non-constexpr lambda
__host__ __device__ static constexpr LowerIndex CalculateLowerIndex(const UpperIndex& idx_up)
{
LowerIndex idx_low;
static_for<0, nTransform, 1>{}([&](auto itran) {
constexpr auto tran = Transforms{}.At(itran);
const auto idx_up_part = pick_array_element(idx_up, UpDimensionIds{}.At(itran));
auto idx_low_part = pick_array_element(idx_low, LowDimensionIds{}.At(itran));
// this assumes each lower (single) index is associated with only one transformation,
// which is required for index transformation, and has been checked in the constructor
// of TransformedTensorDescriptor
idx_low_part = tran.CalculateLowerIndex(to_array(idx_up_part));
});
return idx_low;
}
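// Illustrative note (not from the original source): if the single upper dimension of
// length 12 comes from a Merge of two lower dimensions of lengths {3, 4}, the upper
// index {7} is mapped here to the lower index {7 / 4, 7 % 4} = {1, 3} by that Merge
// transform's CalculateLowerIndex().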
// TODO: right now return value is not constexpr because use of non-constexpr lambda
__host__ __device__ static constexpr LowerIndex CalculateLowerIndexDiff(
const UpperIndex& idx_up_diff, const UpperIndex& idx_up_old, const LowerIndex& idx_low_old)
{
LowerIndex idx_low_diff;
static_for<0, nTransform, 1>{}([&](auto itran) {
constexpr auto tran = Transforms{}.At(itran);
const auto idx_up_diff_part =
pick_array_element(idx_up_diff, UpDimensionIds{}.At(itran));
const auto idx_up_old_part = pick_array_element(idx_up_old, UpDimensionIds{}.At(itran));
const auto idx_low_old_part =
pick_array_element(idx_low_old, LowDimensionIds{}.At(itran));
auto idx_low_diff_part = pick_array_element(idx_low_diff, LowDimensionIds{}.At(itran));
// this assumes each lower (single) index is associated with only one transformation,
// which is required for index transformation, and has been checked in the constructor
// of TransformedTensorDescriptor
idx_low_diff_part = tran.CalculateLowerIndexDiff(
to_array(idx_up_diff_part), to_array(idx_up_old_part), to_array(idx_low_old_part));
});
return idx_low_diff;
}
__host__ __device__ static constexpr index_t CalculateOffset(const UpperIndex& idx_up)
{
return GetLowerTensorDescriptor().CalculateOffset(CalculateLowerIndex(idx_up));
}
struct lambda_sequence_logical_and
{
template <typename... Seqs>
__host__ __device__ constexpr auto operator()(Seqs...) const
{
return typename sequence_reduce<logical_and<index_t>, Seqs...>::type{};
}
};
template <typename T>
struct lambda_is_true
{
__host__ __device__ constexpr auto operator()(const T& x) const
{
// TODO: remove static_cast once Sequence can take bool as entries
return static_cast<bool>(x) == true;
}
};
struct lambda_get_linear_dimension_mask_of_single_tranform
{
// check only one transform at a time
template <typename Transform, typename LowDimensionId, typename UpDimensionId>
__host__ __device__ constexpr auto
operator()(Transform, LowDimensionId, UpDimensionId) const
{
// judge whether the transformation is linear
constexpr bool is_linear_transform = Transform::IsLinearTransform();
// judge whether all lower dimensions are linear
constexpr bool are_all_low_dim_linear = sequence_all_of(
pick_sequence_elements_by_ids(GetLowerTensorDescriptor().GetLinearDimensionMask(),
LowDimensionId{}),
lambda_is_true<index_t>{});
// create linear mask for upper dimensions
constexpr bool are_up_dim_linear = is_linear_transform && are_all_low_dim_linear;
constexpr auto mask_of_up_linear_dims = modify_sequence_elements_by_ids(
typename uniform_sequence_gen<nDimUp, 1>::type{},
typename uniform_sequence_gen<UpDimensionId::Size(), are_up_dim_linear>::type{},
UpDimensionId{});
return mask_of_up_linear_dims;
}
};
// TODO: this is a hack; transform_tuples() doesn't compile here, complaining about constexpr
template <typename F, typename X, typename Y, typename Z, index_t... Is>
__host__ __device__ static constexpr auto
dummy_transform_tuples_impl(F f, X x, Y y, Z z, Sequence<Is...>)
{
return make_tuple(f(x.At(Number<Is>{}), y.At(Number<Is>{}), z.At(Number<Is>{}))...);
}
__host__ __device__ static constexpr auto GetLinearDimensionMask()
{
#if 0
// create tuple of linear dimension masks, for all transformations
constexpr auto tuple_of_linear_dimension_mask =
transform_tuples(lambda_get_linear_dimension_mask_of_single_tranform{},
Transforms{},
LowDimensionIds{},
UpDimensionIds{});
#else
// create tuple of linear dimension masks, for all transformations
// TODO: this is a hack; transform_tuples() doesn't compile here, complaining about constexpr
constexpr auto tuple_of_linear_dimension_mask = dummy_transform_tuples_impl(
lambda_get_linear_dimension_mask_of_single_tranform{},
Transforms{},
LowDimensionIds{},
UpDimensionIds{},
typename arithmetic_sequence_gen<0, Transforms::Size(), 1>::type{});
#endif
// reduce tuple of masks into one mask
constexpr auto linear_dimension_mask =
unpack(lambda_sequence_logical_and{}, tuple_of_linear_dimension_mask);
return linear_dimension_mask;
}
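// Illustrative note (assumption, not from the original source): an upper dimension is
// marked linear only if every transformation it participates in is linear and all of
// the lower dimensions it maps to are linear themselves; e.g. upper dimensions produced
// by PassThrough over linear lower dimensions would be masked 1, while an upper
// dimension produced by a Merge would be masked 0, giving a mask like Sequence<1, 1, 0>.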
__host__ __device__ static constexpr auto GetNonLinearDimensionMask()
{
return GetLinearDimensionMask().Transform(logical_not<index_t>{});
}
template <index_t IDim>
__host__ __device__ static constexpr bool IsLinearDimension(Number<IDim>)
{
return GetLinearDimensionMask().At(Number<IDim>{});
}
__host__ __device__ static constexpr auto GetLinearDimensions()
{
constexpr auto linear_dimension_mask = GetLinearDimensionMask();
return pick_sequence_elements_by_mask(
typename arithmetic_sequence_gen<0, nDimUp, 1>::type{}, linear_dimension_mask);
}
__host__ __device__ static constexpr auto GetNonLinearDimensions()
{
constexpr auto nonlinear_dimension_mask = GetNonLinearDimensionMask();
return pick_sequence_elements_by_mask(
typename arithmetic_sequence_gen<0, nDimUp, 1>::type{}, nonlinear_dimension_mask);
}
#if 0
__host__ __device__ static constexpr auto GetNonLinearIndependentDimensionGroups()
{
// not implemented
}
#endif
__host__ __device__ static constexpr bool
IsUpperIndexMappedToValidLowerIndex(const UpperIndex& idx_up)
{
bool flag = true;
static_for<0, nTransform, 1>{}([&](auto itran) {
constexpr auto tran = Transforms{}.At(itran);
const auto idx_up_part = pick_array_element(idx_up, UpDimensionIds{}.At(itran));
flag = flag && tran.IsUpperIndexMappedToValidLowerIndex(to_array(idx_up_part));
});
return flag;
}
// Whenever this function is called, it will call CalculateLowerIndex() recursively.
// If you have created a tensor coordinate already, instead of calling this function,
// you should call TensorCoordinate::IsUpperIndexMappedToValidOffset() which would
// be less expensive.
__host__ __device__ static constexpr bool
IsUpperIndexMappedToValidOffset(const UpperIndex& idx_up)
{
return IsUpperIndexMappedToValidLowerIndex(idx_up) &&
GetLowerTensorDescriptor().IsUpperIndexMappedToValidOffset(
CalculateLowerIndex(idx_up));
}
};
} // namespace ck
#endif
#ifndef CK_TENSOR_DESCRIPTOR_HELPER_HPP
#define CK_TENSOR_DESCRIPTOR_HELPER_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
namespace ck {
template <typename Lengths>
__host__ __device__ constexpr auto calculate_tensor_strides_packed(Lengths)
{
return reverse_inclusive_scan_sequence(
Lengths{}.PopFront(), math::multiplies<index_t>{}, Number<1>{})
.PushBack(Number<1>{});
}
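// Worked example (illustrative, not part of the original header): for packed lengths
// {2, 3, 4}, the reverse inclusive scan of {3, 4} with multiplication plus a trailing 1
// yields strides {12, 4, 1}.
#if 0
static_assert(is_same<decltype(calculate_tensor_strides_packed(Sequence<2, 3, 4>{})),
Sequence<12, 4, 1>>{},
"strides of packed {2, 3, 4} should be {12, 4, 1}");
#endif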
template <typename Lengths, index_t Align>
__host__ __device__ constexpr auto calculate_tensor_strides_aligned(Lengths, Number<Align>)
{
constexpr index_t L_back_align =
Align * math::integer_divide_ceiler<index_t>{}(Lengths{}.Back(), Align);
return calculate_tensor_strides_packed(
Lengths{}.Modify(Number<Lengths{}.GetSize() - 1>{}, Number<L_back_align>{}));
}
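// Worked example (illustrative, not part of the original header): with lengths {2, 3, 5}
// and Align = 4, the last length is first rounded up to 8, so the resulting strides are
// {24, 8, 1}; only the stride layout is padded, the lengths themselves are unchanged.
#if 0
static_assert(
is_same<decltype(calculate_tensor_strides_aligned(Sequence<2, 3, 5>{}, Number<4>{})),
Sequence<24, 8, 1>>{},
"aligned strides of {2, 3, 5} with Align = 4 should be {24, 8, 1}");
#endif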
template <index_t... Lengths, index_t... Strides>
__host__ __device__ constexpr auto make_native_tensor_descriptor(Sequence<Lengths...>,
Sequence<Strides...>)
{
return NativeTensorDescriptor<NativeDimension<Lengths, Strides>...>{};
}
template <typename Lengths>
__host__ __device__ constexpr auto make_native_tensor_descriptor_packed(Lengths)
{
constexpr auto strides = calculate_tensor_strides_packed(Lengths{});
return make_native_tensor_descriptor(Lengths{}, strides);
}
template <typename Lengths, index_t Align>
__host__ __device__ constexpr auto make_native_tensor_descriptor_aligned(Lengths, Number<Align>)
{
constexpr auto strides = calculate_tensor_strides_aligned(Lengths{}, Number<Align>{});
return make_native_tensor_descriptor(Lengths{}, strides);
}
template <typename LowTensorDescriptor,
typename Transforms,
typename LowDimensionIds,
typename UpDimensionIds>
__host__ __device__ constexpr auto
transform_tensor_descriptor(LowTensorDescriptor, Transforms, LowDimensionIds, UpDimensionIds)
{
return TransformedTensorDescriptor<LowTensorDescriptor,
Transforms,
LowDimensionIds,
UpDimensionIds>{};
}
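// Usage sketch (illustrative, not part of the original header): merge the last two
// dimensions of a packed 4-d descriptor into one upper dimension; the concrete lengths
// below are made up for the example.
#if 0
constexpr auto desc_nchw = make_native_tensor_descriptor_packed(Sequence<8, 16, 4, 4>{});
constexpr auto desc_nc_hw = transform_tensor_descriptor(
desc_nchw,
make_tuple(PassThrough<8>{}, PassThrough<16>{}, Merge<Sequence<4, 4>>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
// desc_nc_hw.GetLengths() is {8, 16, 16}; an upper index {n, c, hw} is mapped back to
// the lower index {n, c, hw / 4, hw % 4} by CalculateLowerIndex()
#endif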
template <typename LowerTensorDescriptor,
index_t... LowerLengths,
index_t... LowerDimensionIds,
index_t... UpperDimensionIds>
__host__ __device__ constexpr auto reorder_tensor_descriptor_impl(LowerTensorDescriptor,
Sequence<LowerLengths...>,
Sequence<LowerDimensionIds...>,
Sequence<UpperDimensionIds...>)
{
return TransformedTensorDescriptor<LowerTensorDescriptor,
Tuple<PassThrough<LowerLengths>...>,
Tuple<Sequence<LowerDimensionIds>...>,
Tuple<Sequence<UpperDimensionIds>...>>{};
}
template <typename LowerTensorDescriptor, typename MapLower2Upper>
__host__ __device__ constexpr auto
reorder_tensor_descriptor_given_lower2upper(LowerTensorDescriptor, MapLower2Upper)
{
static_assert(is_valid_sequence_map<MapLower2Upper>{},
"wrong! MapLower2Upper is not a valid map");
return reorder_tensor_descriptor_impl(
LowerTensorDescriptor{},
LowerTensorDescriptor::GetLengths(),
typename arithmetic_sequence_gen<0, LowerTensorDescriptor::GetNumOfDimension(), 1>::type{},
MapLower2Upper{});
}
template <typename LowerTensorDescriptor, typename MapUpper2Lower>
__host__ __device__ constexpr auto
reorder_tensor_descriptor_given_upper2lower(LowerTensorDescriptor, MapUpper2Lower)
{
return reorder_tensor_descriptor_given_lower2upper(
LowerTensorDescriptor{}, typename sequence_map_inverse<MapUpper2Lower>::type{});
}
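// Usage sketch (illustrative, not part of the original header): permute a packed NCHW
// descriptor into NHWC order; the upper-to-lower map {0, 2, 3, 1} says upper dimension 1
// reads from lower dimension 2, upper dimension 2 from lower dimension 3, and so on.
#if 0
constexpr auto desc_nchw = make_native_tensor_descriptor_packed(Sequence<2, 8, 4, 4>{});
constexpr auto desc_nhwc =
reorder_tensor_descriptor_given_upper2lower(desc_nchw, Sequence<0, 2, 3, 1>{});
// desc_nhwc.GetLengths() is {2, 4, 4, 8}, while offsets are still computed with the
// original NCHW strides
#endif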
template <typename Lengths, typename Strides>
__host__ __device__ constexpr bool AreDimensionsUnfoldable(Lengths, Strides)
{
static_assert(Lengths::Size() == Strides::Size(), "wrong!");
bool flag = true;
for(index_t i = 0; i < Lengths::Size() - 1; ++i)
{
flag = flag && Strides::At(i) == Strides::At(i + 1) * Lengths::At(i + 1);
}
return flag;
}
// unfold only supports NativeTensorDescriptor, for now
template <index_t FirstUnfoldDim, index_t LastUnfoldDim, typename... Ts>
__host__ __device__ constexpr auto unfold_tensor_descriptor(NativeTensorDescriptor<Ts...> desc,
Number<FirstUnfoldDim>,
Number<LastUnfoldDim>)
{
constexpr index_t nDim = desc.GetNumOfDimension();
static_assert(FirstUnfoldDim >= 0 && LastUnfoldDim < nDim && FirstUnfoldDim <= LastUnfoldDim,
"wrong! should have 0 <= FirstUnfoldDim <= LastUnfoldDim < nDim!");
// left and right
constexpr auto left = typename arithmetic_sequence_gen<0, FirstUnfoldDim, 1>::type{};
constexpr auto middle =
typename arithmetic_sequence_gen<FirstUnfoldDim, LastUnfoldDim + 1, 1>::type{};
constexpr auto right = typename arithmetic_sequence_gen<LastUnfoldDim + 1, nDim, 1>::type{};
// sanity-check: dimensions must be unfoldable
static_assert(AreDimensionsUnfoldable(desc.GetLengths(middle), desc.GetStrides(middle)),
"wrong! not unfoldable");
// unfolded length, stride
constexpr index_t unfold_length =
reduce_on_sequence(desc.GetLengths(middle), math::multiplies<index_t>{}, Number<1>{});
constexpr index_t unfold_stride = desc.GetStride(Number<LastUnfoldDim>{});
// new lengths, strides
constexpr auto new_lengths =
desc.GetLengths(left).PushBack(Number<unfold_length>{}).PushBack(desc.GetLengths(right));
constexpr auto new_strides =
desc.GetStrides(left).PushBack(Number<unfold_stride>{}).PushBack(desc.GetStrides(right));
return make_native_tensor_descriptor(new_lengths, new_strides);
}
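// Usage sketch (illustrative, not part of the original header): unfold dimensions 1..2
// of a packed {2, 3, 4, 5} descriptor; the two middle dimensions are contiguous
// (stride 20 == stride 5 * length 4), so they can be fused into one dimension of
// length 12 that keeps the stride of the last unfolded dimension.
#if 0
constexpr auto desc_4d = make_native_tensor_descriptor_packed(Sequence<2, 3, 4, 5>{});
constexpr auto desc_3d = unfold_tensor_descriptor(desc_4d, Number<1>{}, Number<2>{});
// desc_3d.GetLengths() is {2, 12, 5} and desc_3d.GetStrides() is {60, 5, 1}
#endif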
#if 0
// not implemented
template <typename LowerTensorDescriptor,
typename PadDimensionIds,
typename LeftPads,
typename RightPads>
__host__ __device__ constexpr auto
pad_tensor_descriptor(LowerTensorDescriptor, PadLowerDimensionIds, LeftPads, RightPads)
{
constexpr index_t nDim = LowerTensorDescriptor::GetNumOfDimension();
constexpr auto non_pad_low_dim_ids = xxx;
return transform_tensor_descriptor(
LowerTensorDescriptor{},
make_tuple(Pad<decltype(LowerTensorDescriptor::GetLengths(PadLowerDimensionIds{})),
LeftPads,
RightPads>{})
.PushBack(PassThrough<xxxx>...),
make_tuple(PadLowerDimensionIds{}).PushBack(xxxx),
sequence_to_tuple(typename arithmetic_sequence_gen<0, nDim, 1> i::type{}));
}
#endif
// a cluster descriptor maps a 1-d index to an N-d index
template <typename Lengths, typename ArrangeOrder>
struct ClusterDescriptor
{
static constexpr index_t nDim = Lengths::Size();
static constexpr auto mDesc = transform_tensor_descriptor(
make_native_tensor_descriptor_packed(Lengths{}),
make_tuple(Merge<decltype(Lengths::ReorderGivenNew2Old(ArrangeOrder{}))>{}),
make_tuple(ArrangeOrder{}),
make_tuple(Sequence<0>{}));
__host__ __device__ constexpr ClusterDescriptor()
{
static_assert(Lengths::Size() == nDim && ArrangeOrder::Size() == nDim,
"wrong! size not the same");
static_assert(is_valid_sequence_map<ArrangeOrder>{}, "wrong! ArrangeOrder is wrong");
}
__host__ __device__ static constexpr index_t GetElementSize() { return mDesc.GetElementSize(); }
__host__ __device__ static constexpr auto CalculateClusterIndex(index_t idx_1d)
{
return mDesc.CalculateLowerIndex(MultiIndex<1>{idx_1d});
}
};
template <typename Lengths,
typename ArrangeOrder = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type>
__host__ __device__ constexpr auto make_cluster_descriptor(
Lengths, ArrangeOrder order = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type{})
{
return ClusterDescriptor<Lengths, ArrangeOrder>{};
}
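// Usage sketch (illustrative, not part of the original header): map the 128 threads of a
// block onto an {8, 16} cluster with the default arrange order.
#if 0
constexpr auto cluster_desc = make_cluster_descriptor(Sequence<8, 16>{});
// thread 35 is mapped to cluster index {35 / 16, 35 % 16} = {2, 3}
const auto cluster_idx = cluster_desc.CalculateClusterIndex(35);
#endif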
template <typename... NativeDimensions>
__host__ __device__ void
print_tensor_descriptor(const char* s, const NativeTensorDescriptor<NativeDimensions...>& desc)
{
print_tensor_descriptor_impl(s, desc.GetLengths(), desc.GetStrides());
}
template <typename... Ts>
__host__ __device__ void print_tensor_descriptor(const char* s,
const TransformedTensorDescriptor<Ts...>& desc)
{
print_tensor_descriptor_impl(s, desc.GetLengths());
}
template <index_t... Lengths, index_t... Strides>
__host__ __device__ void
print_tensor_descriptor_impl(const char* s, Sequence<Lengths...>, Sequence<Strides...>)
{
constexpr index_t nDim = sizeof...(Lengths);
static_assert(nDim > 0 && nDim <= 12, "wrong!");
static_if<nDim == 1>{}([&](auto) {
printf("%s dim %u, lengths {%u}, strides {%u}\n", s, nDim, Lengths..., Strides...);
});
static_if<nDim == 2>{}([&](auto) {
printf("%s dim %u, lengths {%u %u}, strides {%u %u}\n", s, nDim, Lengths..., Strides...);
});
static_if<nDim == 3>{}([&](auto) {
printf(
"%s dim %u, lengths {%u %u %u}, strides {%u %u %u}\n", s, nDim, Lengths..., Strides...);
});
static_if<nDim == 4>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u}, strides {%u %u %u %u}\n",
s,
nDim,
Lengths...,
Strides...);
});
static_if<nDim == 5>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u}, strides {%u %u %u %u %u}\n",
s,
nDim,
Lengths...,
Strides...);
});
static_if<nDim == 6>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u}, strides {%u %u %u %u %u %u}\n",
s,
nDim,
Lengths...,
Strides...);
});
static_if<nDim == 7>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u}\n",
s,
nDim,
Lengths...,
Strides...);
});
static_if<nDim == 8>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u %u}\n",
s,
nDim,
Lengths...,
Strides...);
});
static_if<nDim == 9>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u %u "
"%u}\n",
s,
nDim,
Lengths...,
Strides...);
});
static_if<nDim == 10>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u "
"%u %u %u}\n",
s,
nDim,
Lengths...,
Strides...);
});
static_if<nDim == 11>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u "
"%u %u "
"%u %u %u}\n",
s,
nDim,
Lengths...,
Strides...);
});
static_if<nDim == 12>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u "
"%u %u %u %u "
"%u %u %u}\n",
s,
nDim,
Lengths...,
Strides...);
});
}
template <index_t... Lengths>
__host__ __device__ void print_tensor_descriptor_impl(const char* s, Sequence<Lengths...>)
{
constexpr index_t nDim = sizeof...(Lengths);
static_assert(nDim > 0 && nDim <= 12, "wrong!");
static_if<nDim == 1>{}([&](auto) { printf("%s dim %u, lengths {%u}\n", s, nDim, Lengths...); });
static_if<nDim == 2>{}(
[&](auto) { printf("%s dim %u, lengths {%u %u}\n", s, nDim, Lengths...); });
static_if<nDim == 3>{}(
[&](auto) { printf("%s dim %u, lengths {%u %u %u}\n", s, nDim, Lengths...); });
static_if<nDim == 4>{}(
[&](auto) { printf("%s dim %u, lengths {%u %u %u %u}\n", s, nDim, Lengths...); });
static_if<nDim == 5>{}(
[&](auto) { printf("%s dim %u, lengths {%u %u %u %u %u}\n", s, nDim, Lengths...); });
static_if<nDim == 6>{}(
[&](auto) { printf("%s dim %u, lengths {%u %u %u %u %u %u}\n", s, nDim, Lengths...); });
static_if<nDim == 7>{}(
[&](auto) { printf("%s dim %u, lengths {%u %u %u %u %u %u %u}\n", s, nDim, Lengths...); });
static_if<nDim == 8>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...);
});
static_if<nDim == 9>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...);
});
static_if<nDim == 10>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...);
});
static_if<nDim == 11>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...);
});
static_if<nDim == 12>{}([&](auto) {
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...);
});
}
} // namespace ck
#endif
#ifndef CK_TENSOR_VIEW_HPP
#define CK_TENSOR_VIEW_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
#include "tensor_coordinate_deprecated.hpp"
namespace ck {
// TensorDesc is ConstantTensorDescriptor or ConstantMergedTensorDescriptor
template <class TensorDesc, class TData>
struct NormalTensorView
{
using type = NormalTensorView;
using tensor_desc_type = TensorDesc;
using coordinate_type = typename NormalTensorCoordinate_deprecated<TensorDesc>::type;
using data_type = TData;
static constexpr auto nDim = TensorDesc::GetNumOfDimension();
__host__ __device__ constexpr NormalTensorView(TData* p_data) : mpData{p_data} {}
__host__ __device__ constexpr NormalTensorView() : NormalTensorView{nullptr} {}
__host__ __device__ static constexpr auto GetNumOfDimension() { return nDim; }
__host__ __device__ static constexpr auto GetLengths() { return TensorDesc::GetLengths(); }
__host__ __device__ const TData& operator[](coordinate_type coord) const
{
return mpData[coord.GetOffset()];
}
__host__ __device__ TData& operator()(coordinate_type coord) const
{
return mpData[coord.GetOffset()];
}
template <class IDim, class DataPerVector>
__host__ __device__ static constexpr auto IsVectorizationAllowed(IDim, DataPerVector)
{
return TensorDesc::IsVectorizationAllowed(IDim{}, DataPerVector{});
}
template <class IDim, class DataPerVector>
__host__ __device__ auto Vectorize(IDim idim, DataPerVector data_per_vector) const
{
static_assert(IsVectorizationAllowed(idim, data_per_vector), "wrong!");
using vector_t = typename vector_type<TData, data_per_vector>::MemoryType;
return NormalTensorView<decltype(TensorDesc::Vectorize(idim, data_per_vector)), vector_t>(
reinterpret_cast<vector_t*>(mpData));
}
template <index_t... Is>
__host__ __device__ auto Slice(coordinate_type slice_origin, Sequence<Is...> slice_lengths)
{
static_assert(slice_lengths.GetSize() == nDim, "wrong!");
return NormalTensorView<decltype(TensorDesc::Slice(slice_lengths)), TData>(
mpData + slice_origin.GetOffset());
}
template <class IDim, class SliceLen>
__host__ __device__ auto
Slice(coordinate_type slice_origin, IDim idim, SliceLen slice_len) const
{
return NormalTensorView<decltype(TensorDesc::Slice(idim, slice_len)), TData>(
mpData + slice_origin.GetOffset());
}
// slice_window is a slicing window on "*this"
template <class SliceWindow, class T, bool PositiveDirection>
__device__ void MoveSliceWindow(SliceWindow& slice_window,
T step_sizes,
integral_constant<bool, PositiveDirection>)
{
if(PositiveDirection)
{
slice_window.mpData += coordinate_type{step_sizes}.GetOffset();
}
else
{
slice_window.mpData -= coordinate_type{step_sizes}.GetOffset();
}
}
// private:
data_type* mpData;
};
template <class... Xs, class TData>
__host__ __device__ constexpr auto make_TensorView(ConstantTensorDescriptor<Xs...>, TData* p_data)
{
return NormalTensorView<ConstantTensorDescriptor<Xs...>, TData>{p_data};
}
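// Usage sketch (illustrative, not part of the original header): wrap a raw pointer in a
// view over a packed 2-d ConstantTensorDescriptor; p_data stands in for a real buffer.
#if 0
constexpr auto desc = make_ConstantTensorDescriptor_packed(Sequence<4, 8>{});
float* p_data = nullptr; // stand-in for a pointer to 32 floats
auto view = make_TensorView(desc, p_data);
#endif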
} // namespace ck
#endif
#ifndef CK_TENSOR_VISIT_HPP
#define CK_TENSOR_VISIT_HPP
#include "common_header.hpp"
#include "dimension.hpp"
#include "dimension_transform.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_coordinate.hpp"
namespace ck {
template <class TensorDescriptor>
struct TensorVisit
{
using Index = typename TensorDescriptor::Index;
using Coordinate = typename TensorCoordinate<TensorDescriptor>::type;
__host__ __device__ static void Run_v1(Index idx_begin)
{
const auto coord_begin = Coordinate(idx_begin);
ford<TensorDescriptor::GetLengths()>{}(
[&](auto idx_diff) { index_t offset = (coord_begin + idx_diff).GetOffset(); });
}
__host__ __device__ static void Run_v2(Index idx_begin)
{
const auto coord_begin = Coordinate(idx_begin);
ford<TensorDescriptor::GetLengths()>{}([&](auto idx_diff) {
index_t offset_diff = coord_begin.GetOffsetDiff(idx_diff);
index_t offset = coord_begin.GetOffset() + offset_diff;
});
}
__host__ __device__ static void Run_v3(Index idx_begin)
{
const auto coord_begin = Coordinate(idx_begin);
constexpr auto linear_dimensions = TensorDescriptor::GetLinearDimensions();
constexpr auto nonlinear_dimensions = TensorDescriptor::GetNonLinearDimensions();
constexpr auto lengths = TensorDescriptor::GetLengths();
constexpr auto linear_dimension_lengths_hack =
lambda_HackLengths{}(lengths, linear_dimensions);
constexpr auto nonlinear_dimension_lengths_hack =
lambda_HackLengths{}(lengths, nonlinear_dimensions);
ford<nonlinear_dimension_lengths_hack>{}([&](auto idx_diff_nonlinear_hack) {
// run-time component
index_t offset_diff_nonlinear = coord_begin.GetOffsetDiff(idx_diff_nonlinear_hack);
ford<linear_dimension_lengths_hack>{}([&](auto idx_diff_linear_hack) {
// compile-time component
index_t offset_diff_linear = coord_begin.GetOffsetDiff(idx_diff_linear_hack);
index_t offset =
coord_begin.GetOffset() + offset_diff_nonlinear + offset_diff_linear;
});
});
}
// TODO: Run_v4 is only a sketch: it depends on GetNonLinearIndependentDimensionGroups(),
// which is not implemented yet, and the local helper struct below would have to be
// hoisted out of this function (a local class cannot have member templates), so this
// function does not compile as-is
__host__ __device__ static void Run_v4(Index idx_begin)
{
const auto coord_begin = Coordinate(idx_begin);
constexpr auto linear_dimensions = TensorDescriptor::GetLinearDimensions();
constexpr auto nonlinear_independent_dimension_groups =
TensorDescriptor::GetNonLinearIndependentDimensionGroups();
constexpr auto lengths = TensorDescriptor::GetLengths();
constexpr auto linear_dimension_lengths = lambda_HackLengths{}(lengths, linear_dimensions);
// run-time component: offset-diff accumulated over the non-linear dimension groups
index_t offset_diff_nonlinear = 0;
template <index_t NGroup>
struct f_recursion
{
template <index_t IGroup>
__host__ __device__ void Run(Number<IGroup>)
{
constexpr auto nonlinear_independent_dimensions_igroup =
nonlinear_independent_dimension_groups.At(Number<IGroup>{});
constexpr auto nonlinear_independent_lengths_igroup =
lambda_HackLengths{}(lengths, nonlinear_independent_dimensions_igroup);
ford<nonlinear_independent_lengths_igroup>{}(
[&](auto idx_diff_nonlinear_igroup_hack) {
// run-time component
offset_diff_nonlinear +=
coord_begin.GetOffsetDiff(idx_diff_nonlinear_igroup_hack);
Run(Number<IGroup + 1>{});
});
}
// inner-most work: visit the linear dimensions
__host__ __device__ void Run(Number<NGroup>)
{
ford<linear_dimension_lengths>{}([&](auto idx_diff_linear_hack) {
// compile-time component
index_t offset_diff_linear = coord_begin.GetOffsetDiff(idx_diff_linear_hack);
index_t offset =
coord_begin.GetOffset() + offset_diff_nonlinear + offset_diff_linear;
});
}
};
f_recursion<nonlinear_independent_dimension_groups.GetSize()>{}.Run(Number<0>{});
}
};
} // namespace ck
#endif
...@@ -563,7 +563,7 @@ struct Blockwise2dTensorCopy3 ...@@ -563,7 +563,7 @@ struct Blockwise2dTensorCopy3
} }
} }
__device__ constexpr index_t GetRegisterClipboardSize() const __device__ constexpr index_t GetRegisterBufferSize() const
{ {
static_assert(is_same<Float, float>{}, "wrong! only support float!\n"); static_assert(is_same<Float, float>{}, "wrong! only support float!\n");
...@@ -579,8 +579,8 @@ struct Blockwise2dTensorCopy3 ...@@ -579,8 +579,8 @@ struct Blockwise2dTensorCopy3
return DataPerRead * (L0 + thread_per_d0 - 1) / thread_per_d0; return DataPerRead * (L0 + thread_per_d0 - 1) / thread_per_d0;
} }
__device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src, __device__ void RunLoadRegisterBuffer(const Float* __restrict__ p_src,
Float* __restrict__ p_clipboard) const Float* __restrict__ p_clipboard) const
{ {
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
...@@ -630,8 +630,8 @@ struct Blockwise2dTensorCopy3 ...@@ -630,8 +630,8 @@ struct Blockwise2dTensorCopy3
} }
} }
__device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard, __device__ void RunStoreRegisterBuffer(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const Float* __restrict__ p_dst) const
{ {
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
...@@ -681,8 +681,8 @@ struct Blockwise2dTensorCopy3 ...@@ -681,8 +681,8 @@ struct Blockwise2dTensorCopy3
} }
#if CK_USE_AMD_INLINE_ASM #if CK_USE_AMD_INLINE_ASM
__device__ void RunLoadRegisterClipboard_asm(const Float* __restrict__ p_src, __device__ void RunLoadRegisterBuffer_asm(const Float* __restrict__ p_src,
Float* p_clipboard) const Float* p_clipboard) const
{ {
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
...@@ -741,8 +741,8 @@ struct Blockwise2dTensorCopy3 ...@@ -741,8 +741,8 @@ struct Blockwise2dTensorCopy3
} }
} }
__device__ void RunStoreRegisterClipboard_asm(const Float* __restrict__ p_clipboard, __device__ void RunStoreRegisterBuffer_asm(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const Float* __restrict__ p_dst) const
{ {
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
......
...@@ -162,7 +162,7 @@ struct Blockwise3dTensorCopy3 ...@@ -162,7 +162,7 @@ struct Blockwise3dTensorCopy3
"wrrong! BlockSize is not big enough for ThreadPerDims!"); "wrrong! BlockSize is not big enough for ThreadPerDims!");
constexpr index_t num_active_thread = constexpr index_t num_active_thread =
accumulate_on_sequence(ThreadPerDims{}, math::multiplies<index_t>{}, Number<1>{}); reduce_on_sequence(ThreadPerDims{}, math::multiplies<index_t>{}, Number<1>{});
if(BlockSize > num_active_thread) if(BlockSize > num_active_thread)
{ {
...@@ -237,7 +237,7 @@ struct Blockwise3dTensorCopy3 ...@@ -237,7 +237,7 @@ struct Blockwise3dTensorCopy3
} }
} }
__device__ static constexpr index_t GetRegisterClipboardSize() __device__ static constexpr index_t GetRegisterBufferSize()
{ {
static_assert(is_same<Float, float>{}, "wrong! only support float!\n"); static_assert(is_same<Float, float>{}, "wrong! only support float!\n");
...@@ -260,8 +260,8 @@ struct Blockwise3dTensorCopy3 ...@@ -260,8 +260,8 @@ struct Blockwise3dTensorCopy3
return DataPerRead * nloop_d0 * nloop_d1 * nloop_d2; return DataPerRead * nloop_d0 * nloop_d1 * nloop_d2;
} }
__device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src, __device__ void RunLoadRegisterBuffer(const Float* __restrict__ p_src,
Float* __restrict__ p_clipboard) const Float* __restrict__ p_clipboard) const
{ {
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
...@@ -316,8 +316,8 @@ struct Blockwise3dTensorCopy3 ...@@ -316,8 +316,8 @@ struct Blockwise3dTensorCopy3
} }
} }
__device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard, __device__ void RunStoreRegisterBuffer(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const Float* __restrict__ p_dst) const
{ {
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
......
...@@ -505,7 +505,7 @@ struct Blockwise4dTensorCopy3 ...@@ -505,7 +505,7 @@ struct Blockwise4dTensorCopy3
"wrrong! BlockSize is not big enough for ThreadPerDims!"); "wrrong! BlockSize is not big enough for ThreadPerDims!");
constexpr index_t num_active_thread = constexpr index_t num_active_thread =
accumulate_on_sequence(ThreadPerDims{}, math::multiplies<index_t>{}, Number<1>{}); reduce_on_sequence(ThreadPerDims{}, math::multiplies<index_t>{}, Number<1>{});
if(BlockSize > num_active_thread) if(BlockSize > num_active_thread)
{ {
...@@ -596,7 +596,7 @@ struct Blockwise4dTensorCopy3 ...@@ -596,7 +596,7 @@ struct Blockwise4dTensorCopy3
} }
} }
__device__ constexpr index_t GetRegisterClipboardSize() const __device__ constexpr index_t GetRegisterBufferSize() const
{ {
static_assert(is_same<Float, float>{}, "wrong! only support float!\n"); static_assert(is_same<Float, float>{}, "wrong! only support float!\n");
...@@ -623,8 +623,8 @@ struct Blockwise4dTensorCopy3 ...@@ -623,8 +623,8 @@ struct Blockwise4dTensorCopy3
return DataPerRead * nloop_d0 * nloop_d1 * nloop_d2 * nloop_d3; return DataPerRead * nloop_d0 * nloop_d1 * nloop_d2 * nloop_d3;
} }
__device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src, __device__ void RunLoadRegisterBuffer(const Float* __restrict__ p_src,
Float* __restrict__ p_clipboard) const Float* __restrict__ p_clipboard) const
{ {
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
...@@ -690,8 +690,8 @@ struct Blockwise4dTensorCopy3 ...@@ -690,8 +690,8 @@ struct Blockwise4dTensorCopy3
} }
} }
__device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard, __device__ void RunStoreRegisterBuffer(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const Float* __restrict__ p_dst) const
{ {
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
......
...@@ -5,6 +5,10 @@ ...@@ -5,6 +5,10 @@
#include "ConstantMatrixDescriptor.hpp" #include "ConstantMatrixDescriptor.hpp"
#include "threadwise_gemm.hpp" #include "threadwise_gemm.hpp"
#ifndef CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM
#define CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM 1
#endif
namespace ck { namespace ck {
template <index_t BlockSize, template <index_t BlockSize,
...@@ -97,24 +101,6 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 ...@@ -97,24 +101,6 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
mMyThreadOffsetB = c_thread_mtx_index.batch * BlockMatrixStrideB + mMyThreadOffsetB = c_thread_mtx_index.batch * BlockMatrixStrideB +
b_block_mtx.GetOffsetFromMultiIndex(0, c_thread_mtx_index.col); b_block_mtx.GetOffsetFromMultiIndex(0, c_thread_mtx_index.col);
#if 0
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
{
print_ConstantMatrixDescriptor(BlockMatrixA{}, "a_block_mtx: ");
print_ConstantMatrixDescriptor(BlockMatrixB{}, "b_block_mtx: ");
print_ConstantMatrixDescriptor(ThreadMatrixC{}, "c_thread_mtx: ");
printf("%u %u, %u %u %u, %u %u\n",
get_block_1d_id(),
get_thread_local_1d_id(),
c_thread_mtx_index.batch,
c_thread_mtx_index.row,
c_thread_mtx_index.col,
mMyThreadOffsetA,
mMyThreadOffsetB);
}
#endif
} }
__device__ MatrixIndex GetBeginOfThreadMatrixC(index_t thread_id) const __device__ MatrixIndex GetBeginOfThreadMatrixC(index_t thread_id) const
...@@ -257,29 +243,6 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 ...@@ -257,29 +243,6 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
} }
} }
#if 0
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
{
printf("a: %f %f %f %f %f %f %f %f, b: %f %f %f %f %f %f %f %f\n",
p_a_thread[0],
p_a_thread[1],
p_a_thread[2],
p_a_thread[3],
p_a_thread[4],
p_a_thread[5],
p_a_thread[6],
p_a_thread[7],
p_b_thread[0],
p_b_thread[1],
p_b_thread[2],
p_b_thread[3],
p_b_thread[4],
p_b_thread[5],
p_b_thread[6],
p_b_thread[7]);
}
#endif
threadwise_gemm(a_thread_mtx, threadwise_gemm(a_thread_mtx,
True, True,
p_a_thread, p_a_thread,
...@@ -311,10 +274,10 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 ...@@ -311,10 +274,10 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
// thread A, B for GEMM // thread A, B for GEMM
// A is transposed, b is not // A is transposed, b is not
constexpr auto a_thread_mtx = constexpr auto a_thread_mtx =
make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<MPerThread>{}); make_ConstantMatrixDescriptor_packed(Number<KPerThreadLoop>{}, Number<MPerThread>{});
constexpr auto b_thread_mtx = constexpr auto b_thread_mtx =
make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<NPerThread>{}); make_ConstantMatrixDescriptor_packed(Number<KPerThreadLoop>{}, Number<NPerThread>{});
// thread A-sub, B-sub for copy // thread A-sub, B-sub for copy
constexpr auto a_thread_sub_mtx = make_ConstantMatrixDescriptor( constexpr auto a_thread_sub_mtx = make_ConstantMatrixDescriptor(
...@@ -382,102 +345,6 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 ...@@ -382,102 +345,6 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
outerProduct4x4(reg_a[1], reg_b[0], reg_c[8], reg_c[10], reg_c[12], reg_c[14]); outerProduct4x4(reg_a[1], reg_b[0], reg_c[8], reg_c[10], reg_c[12], reg_c[14]);
outerProduct4x4(reg_a[1], reg_b[1], reg_c[9], reg_c[11], reg_c[13], reg_c[15]); outerProduct4x4(reg_a[1], reg_b[1], reg_c[9], reg_c[11], reg_c[13], reg_c[15]);
} }
template <class FloatA, class FloatB, class FloatC>
__device__ void Run_asm_v2(const FloatA* __restrict__ p_a_block,
const FloatB* __restrict__ p_b_block,
FloatC* __restrict__ p_c_thread) const
{
constexpr auto a_block_mtx = BlockMatrixA{};
constexpr auto b_block_mtx = BlockMatrixB{};
constexpr auto c_thread_mtx = ThreadMatrixC{};
constexpr index_t M = a_block_mtx.NCol();
constexpr index_t N = b_block_mtx.NCol();
constexpr index_t K = a_block_mtx.NRow(); // A is transposed
constexpr index_t MPerThread = c_thread_mtx.NRow();
constexpr index_t NPerThread = c_thread_mtx.NCol();
// thread A, B for GEMM
// A is transposed, b is not
constexpr auto a_thread_mtx =
make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<MPerThread>{});
constexpr auto b_thread_mtx =
make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<NPerThread>{});
// thread A-sub, B-sub for copy
constexpr auto a_thread_sub_mtx = make_ConstantMatrixDescriptor(
Number<KPerThreadLoop>{}, Number<MPerThreadSubC>{}, Number<MPerThread>{});
constexpr auto b_thread_sub_mtx = make_ConstantMatrixDescriptor(
Number<KPerThreadLoop>{}, Number<NPerThreadSubC>{}, Number<NPerThread>{});
FloatA p_a_thread[a_thread_mtx.GetElementSpace()];
FloatB p_b_thread[b_thread_mtx.GetElementSpace()];
constexpr index_t MPerLevel1Cluster = MPerThreadSubC * MLevel0Cluster * MLevel1Cluster;
constexpr index_t NPerLevel1Cluster = NPerThreadSubC * NLevel0Cluster * NLevel1Cluster;
// assertion for inline asm
static_assert(is_same<FloatA, float>{} && is_same<FloatB, float>{} &&
is_same<FloatC, float>{},
"Run_amd_asm only deal with float\n");
static_assert(MPerThreadSubC == 4 && NPerThreadSubC == 4 && KPerThreadLoop == 1 &&
MPerThread == 8 && NPerThread == 8,
"Run_amd_asm cannot deal with this GEMM shape yet\n");
static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_amd_asm only do float4 read\n");
static_assert(BlockMatrixStrideA == 0 && BatchPerThread == 1,
"Run_amd_asm can only deal with BlockMatrixStrideA == 0 && BatchPerThread == "
"1 for now\n");
using Float4 = vector_type<float, 4>::MemoryType;
Float4* reg_a = (Float4*)(p_a_thread);
Float4* reg_b = (Float4*)(p_b_thread);
Float4* reg_c = (Float4*)(p_c_thread);
void* a_lds_loc = (void*)(p_a_block + mMyThreadOffsetA);
void* b_lds_loc = (void*)(p_b_block + mMyThreadOffsetB);
constexpr index_t a_lds_row_stride = sizeof(float) * a_block_mtx.RowStride();
constexpr index_t b_lds_row_stride = sizeof(float) * b_block_mtx.RowStride();
constexpr index_t a_lds_cluster_col_stride = sizeof(float) * MPerLevel1Cluster;
constexpr index_t b_lds_cluster_col_stride = sizeof(float) * NPerLevel1Cluster;
ds_read_b128(reg_a[0], a_lds_loc, 0);
ds_read_b128(reg_b[0], b_lds_loc, 0);
ds_read_b128(reg_b[1], b_lds_loc, b_lds_cluster_col_stride);
ds_read_b128(reg_a[1], a_lds_loc, a_lds_cluster_col_stride);
lgkmcnt(2);
outerProduct4x4(reg_a[0], reg_b[0], reg_c[0], reg_c[2], reg_c[4], reg_c[6]);
lgkmcnt(1);
outerProduct4x4(reg_a[0], reg_b[1], reg_c[1], reg_c[3], reg_c[5], reg_c[7]);
#pragma unroll
for(index_t k = 1; k < K; ++k)
{
ds_read_b128(reg_a[0], a_lds_loc, k * a_lds_row_stride);
lgkmcnt(1);
outerProduct4x4(reg_a[1], reg_b[0], reg_c[8], reg_c[10], reg_c[12], reg_c[14]);
ds_read_b128(reg_b[0], b_lds_loc, k * b_lds_row_stride);
outerProduct4x4(reg_a[1], reg_b[1], reg_c[9], reg_c[11], reg_c[13], reg_c[15]);
ds_read_b128(reg_b[1], b_lds_loc, b_lds_cluster_col_stride + k * b_lds_row_stride);
ds_read_b128(reg_a[1], a_lds_loc, a_lds_cluster_col_stride + k * a_lds_row_stride);
lgkmcnt(2);
outerProduct4x4(reg_a[0], reg_b[0], reg_c[0], reg_c[2], reg_c[4], reg_c[6]);
lgkmcnt(1);
outerProduct4x4(reg_a[0], reg_b[1], reg_c[1], reg_c[3], reg_c[5], reg_c[7]);
}
lgkmcnt(0);
outerProduct4x4(reg_a[1], reg_b[0], reg_c[8], reg_c[10], reg_c[12], reg_c[14]);
outerProduct4x4(reg_a[1], reg_b[1], reg_c[9], reg_c[11], reg_c[13], reg_c[15]);
}
#endif #endif
template <class FloatA, class FloatB, class FloatC> template <class FloatA, class FloatB, class FloatC>
......
...@@ -2,466 +2,57 @@ ...@@ -2,466 +2,57 @@
#define CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_HPP #define CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_HPP
#include "common_header.hpp" #include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp" #include "tensor_descriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp" #include "tensor_descriptor_helper.hpp"
#include "tensor_coordinate.hpp" #include "tensor_coordinate.hpp"
#include "threadwise_generic_tensor_slice_copy.hpp" #include "threadwise_generic_tensor_slice_copy.hpp"
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
#endif
namespace ck { namespace ck {
// Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor
// memory layout (ordering of dimensions) can be different between src and dst.
// This functions assume each thread is reading and writing a normal (not merged) tensor,
// to simplify index calculations. To satisfy this assumption, the user need to make sure
// that, on a merged dimension that constains multiple original dimensions, the length of
// the last original dimension need to be evenly dividable by its sub-lengths. Also, the
// repeat-length on the merged dimension need to be 1. These sanity checks are performed
// in constructor of BlockwiseGenericTensorSliceCopy_v1
template <index_t BlockSize,
class SrcDesc,
class DstDesc,
class SliceLengths,
class SubLengths,
class ThreadClusterLengths,
class ThreadClusterArrangeOrder,
class SrcDimAccessOrder,
class DstDimAccessOrder,
index_t SrcVectorAccessDim,
index_t DstVectorAccessDim,
index_t SrcDataPerAccess,
index_t DstDataPerAccess>
struct BlockwiseGenericTensorSliceCopy_v1
{
static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
static constexpr index_t nOriginalDimSrc =
SrcDesc::GetOriginalTensorDescriptor().GetNumOfDimension();
static constexpr index_t nOriginalDimDst =
DstDesc::GetOriginalTensorDescriptor().GetNumOfDimension();
// per-thread offset
index_t mThreadSrcOffset;
index_t mThreadDstOffset;
// "mThreadSrcOriginalMultiId", "mThreadSrcPartialOffsets, "mThreadDstOriginalMultiId",
// "mThreadDstPartialOffsets" are always calculated inside constructor, and would be
// updated if slicing-window is moved. However, they will not be used if you always move
// the slicing-window along a non-merged dimension. In that case, compiler should be
// able to remove these calculation.
// TODO: make sure compiler would actually remove them in that case
// partial offset in each (merged) dimension
Array<index_t, nDim> mThreadSrcPartialOffsets;
Array<index_t, nDim> mThreadDstPartialOffsets;
// multi-id of original tensor
Array<index_t, nOriginalDimSrc> mThreadSrcOriginalMultiId;
Array<index_t, nOriginalDimDst> mThreadDstOriginalMultiId;
__device__ BlockwiseGenericTensorSliceCopy_v1(Array<index_t, nDim> src_block_data_id_begin,
Array<index_t, nDim> dst_block_data_id_begin)
{
// check NDim consistency
static_assert(
nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() &&
nDim == SliceLengths::GetSize() && nDim == SubLengths::GetSize() &&
nDim == ThreadClusterLengths::GetSize() &&
nDim == ThreadClusterArrangeOrder::GetSize() &&
nDim == SrcDimAccessOrder::GetSize() && nDim == DstDimAccessOrder::GetSize(),
"wrong");
// check thread arrange order and read/write access order are valid
static_assert(is_valid_sequence_map<ThreadClusterArrangeOrder>::value &&
is_valid_sequence_map<SrcDimAccessOrder>::value &&
is_valid_sequence_map<DstDimAccessOrder>::value,
"wrong!");
// thread cluster
constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
// BlockSize
static_assert(BlockSize == thread_cluster_desc.GetElementSize(), "wrong! BlockSize");
// divide work
constexpr auto data_per_cluster_per_dims = SubLengths{} * ThreadClusterLengths{};
static_for<0, nDim, 1>{}([&](auto IDim) {
static_assert(SliceLengths::Get(IDim) % data_per_cluster_per_dims.Get(IDim) == 0,
"wrong! cannot evenly divide sliced tensor into cluster");
});
constexpr auto repeat_lengths = SliceLengths{} / data_per_cluster_per_dims;
// additional check for merged dimension
static_for<0, nDim, 1>{}([&](auto IDim_) {
// src
static_if<SrcDesc::ContainMultipleOriginalDimensions(IDim_)>{}([&](auto) {
constexpr auto IDim = decltype(IDim_){};
// on a merged dimension that constains multiple original dimensions,
// the length of the last original dimension need to evenly dividable by its
// sub-length,
// so each thread is effectively reading a normal (not merged) tensor
constexpr auto idim_last_original_src =
SrcDesc::GetContainedOriginalDimensions(IDim).Back();
static_assert(
SrcDesc::GetOriginalTensorDescriptor().GetLength(idim_last_original_src) %
SubLengths::Get(IDim) ==
0,
"wrong!");
// merged dimension should have repeat_lengths = 1
static_assert(repeat_lengths[IDim] == 1,
"wrong! repeat_lengths shoud be 1 on merged dimension");
});
// dst
static_if<DstDesc::ContainMultipleOriginalDimensions(IDim_)>{}([&](auto) {
constexpr auto IDim = decltype(IDim_){};
// on a merged dimension that constains multiple original dimensions,
// the length of the last original dimension need to evenly dividable by its
// sub-length,
// so each thread is effectively reading a normal (not merged) tensor
constexpr auto idim_last_original_dst =
DstDesc::GetContainedOriginalDimensions(IDim).Back();
static_assert(
DstDesc::GetOriginalTensorDescriptor().GetLength(idim_last_original_dst) %
SubLengths::Get(IDim) ==
0,
"wrong!");
// merged dimension should have repeat_lengths = 1
static_assert(repeat_lengths[IDim] == 1,
"wrong! repeat_lengths shoud be 1 on merged dimension");
});
});
// calculate mThreadSrcOffset, mThreadDstOffset
const auto thread_cluster_id =
thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
const auto data_cluster_id =
reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{});
const auto thread_data_id_begin = data_cluster_id * SubLengths{};
// original multi-id
mThreadSrcOriginalMultiId = SrcDesc::GetOriginalMultiIndexFromMultiIndex(
src_block_data_id_begin + thread_data_id_begin);
mThreadDstOriginalMultiId = DstDesc::GetOriginalMultiIndexFromMultiIndex(
dst_block_data_id_begin + thread_data_id_begin);
// partial offset on each dimension
static_for<0, nDim, 1>{}([&](auto IDim) {
constexpr auto src_partial_original_dims =
SrcDesc::GetContainedOriginalDimensions(IDim);
constexpr auto src_partial_original_desc =
SrcDesc::GetOriginalTensorDescriptor().Extract(src_partial_original_dims);
mThreadSrcPartialOffsets(IDim) = src_partial_original_desc.GetOffsetFromMultiIndex(
extract_array(mThreadSrcOriginalMultiId, src_partial_original_dims));
});
static_for<0, nDim, 1>{}([&](auto IDim) {
constexpr auto dst_partial_original_dims =
DstDesc::GetContainedOriginalDimensions(IDim);
constexpr auto dst_partial_original_desc =
DstDesc::GetOriginalTensorDescriptor().Extract(dst_partial_original_dims);
mThreadDstPartialOffsets(IDim) = dst_partial_original_desc.GetOffsetFromMultiIndex(
extract_array(mThreadDstOriginalMultiId, dst_partial_original_dims));
});
// complete offset
mThreadSrcOffset = accumulate_on_array(
mThreadSrcPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
mThreadDstOffset = accumulate_on_array(
mThreadDstPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
}
__device__ static constexpr auto GetRegisterBufferDescriptor()
{
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});
return make_ConstantTensorDescriptor_packed(SubLengths{} * repeat_lengths);
}
__device__ static constexpr index_t GetRegisterBufferSize()
{
return GetRegisterBufferDescriptor().GetElementSpace();
}
template <class TData>
__device__ void RunLoadRegisterBuffer(const TData* __restrict__ p_src,
TData* __restrict__ p_buffer) const
{
constexpr auto thread_sub_tensor_lengths = SubLengths{};
constexpr auto data_per_cluster_per_dims =
thread_sub_tensor_lengths * ThreadClusterLengths{};
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});
constexpr auto thread_buffer_desc = GetRegisterBufferDescriptor();
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_id) {
constexpr auto src_thread_data_id_begin = repeat_id * data_per_cluster_per_dims;
constexpr auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths;
constexpr index_t src_offset =
SrcDesc::GetOffsetFromMultiIndex(src_thread_data_id_begin);
constexpr index_t buffer_offset =
thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);
#else
ford<decltype(repeat_lengths)>{}([&](auto repeat_id) {
const auto src_thread_data_id_begin = repeat_id * data_per_cluster_per_dims;
const auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths;
const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex(src_thread_data_id_begin);
const index_t buffer_offset =
thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);
#endif
// By position the origin of the per-thread window at the point, where multi-index
// of the SrcDesc (might be a merged tensor) is all-zero. This threadwise slice copy
// is assuming each thread is copy a noraml (not merged) tensor.
// To satisfy this assumption, the user need to make sure that, on a merged dimension
// that constains multiple original dimensions, the length of the last original
// dimension need to be evenly dividable by its sub-lengths. Also, the repeat-length on
// the merged dimension need to be 1. These sanity checks are performed in constructor
// of BlockwiseGenericTensorSliceCopy_v1
ThreadwiseGenericTensorSliceCopy_v1r2<SrcDesc,
decltype(thread_buffer_desc),
SubLengths,
SrcDimAccessOrder,
SrcVectorAccessDim,
SrcDataPerAccess,
1>(make_zero_array<index_t, nDim>(),
make_zero_array<index_t, nDim>())
.Run(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset);
});
}
template <class TData>
__device__ void RunStoreRegisterBuffer(const TData* __restrict__ p_buffer,
TData* __restrict__ p_dst) const
{
constexpr auto thread_sub_tensor_lengths = SubLengths{};
constexpr auto data_per_cluster_per_dims =
thread_sub_tensor_lengths * ThreadClusterLengths{};
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});
constexpr auto thread_buffer_desc = GetRegisterBufferDescriptor();
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_id) {
constexpr auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths;
constexpr auto dst_data_id_begin = repeat_id * data_per_cluster_per_dims;
constexpr index_t buffer_offset =
thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);
constexpr index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_id_begin);
#else
ford<decltype(repeat_lengths)>{}([&](auto repeat_id) {
const auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths;
const auto dst_data_id_begin = repeat_id * data_per_cluster_per_dims;
const index_t buffer_offset =
thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);
const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_id_begin);
#endif
// By position the origin of the per-thread window at the point, where multi-index
// of the SrcDesc (might be a merged tensor) is all-zero. This threadwise slice copy
// is assuming each thread is copy a noraml (not merged) tensor.
// To satisfy this assumption, the user need to make sure that, on a merged dimension
// that constains multiple original dimensions, the length of the last original
// dimension need to be evenly dividable by its sub-lengths. Also, the repeat-length on
// the merged dimension need to be 1. These sanity checks are performed in constructor
// of BlockwiseGenericTensorSliceCopy_v1
ThreadwiseGenericTensorSliceCopy_v1r2<decltype(thread_buffer_desc),
DstDesc,
SubLengths,
DstDimAccessOrder,
DstVectorAccessDim,
1,
DstDataPerAccess>(
make_zero_array<index_t, nDim>(), make_zero_array<index_t, nDim>())
.Run(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset);
});
}
template <class TData>
__device__ void Run(const TData* __restrict__ p_src, TData* __restrict__ p_dst) const
{
TData p_buffer[GetRegisterBufferSize()];
RunLoadRegisterBuffer(p_src, p_buffer);
RunStoreRegisterBuffer(p_buffer, p_dst);
}
// When moving the slicing windows along a merged dimension, if the strides of the
// contained (by the merged dimension) original dimensions are not in descending order,
// then there is no guarantee that the new offset will be larger than the old offset
// for movement in positive direction (vice versue for movement in negative direction).
// As a result, there is the possiblity that the offset calculation may result in
// unsigned integer underflow (due to "-" operation). However, this hazard should not
// happen, as long as the users make sure the slicing window would not be moved out of
// the boundary of the tensor being sliced. This functions doesn't do runtime sanity
// check on out-of-bound slicing window, for performance reason
template <index_t IDim_, index_t StepSize, bool PositiveDirection>
__device__ void MoveSlicingWindowOnSourceTensor(
Number<IDim_>, Number<StepSize>, integral_constant<bool, PositiveDirection> direction)
{
constexpr auto IDim = Number<IDim_>{};
static_if<SrcDesc::ContainMultipleOriginalDimensions(IDim)>{}([&](auto) {
// logic for a merged dimension, also works for non-merged dimension, but its logic may
// be unncessarily complicated for compiler to remove calculations that are useless for
// a non-merged dimension
// extract partial original dimensions
constexpr auto src_partial_original_dims =
SrcDesc::GetContainedOriginalDimensions(IDim);
constexpr auto src_partial_original_desc =
SrcDesc::GetOriginalTensorDescriptor().Extract(src_partial_original_dims);
// calculate new partial original multi-id
auto old_src_partial_original_id =
extract_array(mThreadSrcOriginalMultiId, src_partial_original_dims);
auto new_src_partial_original_id =
src_partial_original_desc.UpdateMultiIndexGivenStepSizeOf1dIndex(
old_src_partial_original_id, StepSize, direction);
// update "mThreadSrcOriginalMultiId"
static_for<0, decltype(src_partial_original_dims)::GetSize(), 1>{}([&](auto I) {
constexpr auto IDimOriginal = src_partial_original_dims[I];
mThreadSrcOriginalMultiId(IDimOriginal) = new_src_partial_original_id[I];
});
// calculate new partial offset on this merged dimension
const index_t old_src_partial_offset = mThreadSrcPartialOffsets[IDim];
const index_t new_src_partial_offset =
src_partial_original_desc.GetOffsetFromMultiIndex(new_src_partial_original_id);
// update "mThreadSrcPartialOffsets"
mThreadSrcPartialOffsets(IDim) = new_src_partial_offset;
// update "mThreadSrcOffset", do "+" before "-" to avoid underflow
mThreadSrcOffset = (mThreadSrcOffset + new_src_partial_offset) - old_src_partial_offset;
}).Else([&](auto) {
// Logic for non-merged dimension. If you are never going to move the slicing window on
// a merged dimension, then "mThreadSrcOriginalMultiId" and "mThreadSrcPartialOffsets",
// which are being calculated here, will never be used later. In this case, compiler
// should be able to remove these calculations.
// TODO: make sure compiler would actually remove them in this case.
// It is the user's responsiblity to make sure the slicing window will not be moved out
// of the boundary of the tensor being sliced. Otherwise, there might be hazard like
// unsigned integer underflow. That is NO runtime sanity check to prevent the hazard
constexpr auto IDimOriginal = SrcDesc::GetContainedOriginalDimensions(IDim).Front();
static_if<PositiveDirection>{}([&](auto fwd) {
mThreadSrcOffset += StepSize * fwd(SrcDesc{}).GetStride(IDim);
mThreadSrcOriginalMultiId(IDimOriginal) += StepSize;
mThreadSrcPartialOffsets(IDim) += StepSize * fwd(SrcDesc{}).GetStride(IDim);
}).Else([&](auto fwd) {
mThreadSrcOffset -= StepSize * fwd(SrcDesc{}).GetStride(IDim);
mThreadSrcOriginalMultiId(IDimOriginal) -= StepSize;
mThreadSrcPartialOffsets(IDim) -= StepSize * fwd(SrcDesc{}).GetStride(IDim);
});
});
}
template <class T, bool PositiveDirection>
__device__ void
MoveSrcSlicingWindow(T step_sizes,
integral_constant<bool, PositiveDirection> positive_direction)
{
static_for<0, nDim, 1>{}([&](auto idim) {
if(step_sizes[idim] != 0)
{
MoveSlicingWindowOnSourceTensor(idim, step_sizes[idim], positive_direction);
}
});
}
};
template <index_t BlockSize, template <index_t BlockSize,
class SrcDesc, typename BlockSrcDesc,
class DstDesc, typename BlockDstDesc,
class SrcCoordinate, typename BlockSliceLengths,
class DstCoordinate, typename ThreadSliceLengths,
class SliceLengths, typename ThreadClusterLengths,
class SubLengths, typename ThreadClusterArrangeOrder,
class ThreadClusterLengths, typename SrcDimAccessOrder,
class ThreadClusterArrangeOrder, typename DstDimAccessOrder,
class SrcDimAccessOrder,
class DstDimAccessOrder,
index_t SrcVectorAccessDim, index_t SrcVectorAccessDim,
index_t DstVectorAccessDim, index_t DstVectorAccessDim,
index_t SrcDataPerAccess, index_t SrcDataPerAccess,
index_t DstDataPerAccess> index_t DstDataPerAccess>
struct BlockwiseGenericTensorSliceCopy_v2 struct BlockwiseGenericTensorSliceCopy_v4
{ {
static constexpr index_t nDim = SrcDesc::GetNumOfDimension(); static constexpr index_t nDim = BlockSrcDesc::GetNumOfDimension();
using Index = MultiIndex<nDim>;
__device__ constexpr BlockwiseGenericTensorSliceCopy_v2(SrcCoordinate src_block_slice_origin, __device__ constexpr BlockwiseGenericTensorSliceCopy_v4(const Index& src_block_slice_origin,
DstCoordinate dst_block_slice_origin) const Index& dst_block_slice_origin)
{ {
static_assert(nDim == SrcDesc::GetNumOfDimension() && static_assert(nDim == BlockSrcDesc::GetNumOfDimension() &&
nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() && nDim == BlockDstDesc::GetNumOfDimension() &&
nDim == SubLengths::GetSize() && nDim == BlockSliceLengths::Size() && nDim == ThreadSliceLengths::Size() &&
nDim == ThreadClusterLengths::GetSize() && nDim == ThreadClusterLengths::Size() &&
nDim == ThreadClusterArrangeOrder::GetSize(), nDim == ThreadClusterArrangeOrder::Size() &&
nDim == SrcDimAccessOrder::Size() && nDim == DstDimAccessOrder::Size(),
"wrong! nDim not consistent"); "wrong! nDim not consistent");
static_assert(is_same<SliceLengths, decltype(SubLengths{} * ThreadClusterLengths{})>{}, static_assert(
"wrong! threads should be mapped to cover entire slicing window"); is_same<BlockSliceLengths, decltype(ThreadSliceLengths{} * ThreadClusterLengths{})>{},
"wrong! threads should be mapped to cover entire slicing window");
constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed( // map threads to cluster
ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{})); constexpr auto thread_cluster_desc =
make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
static_assert(BlockSize == thread_cluster_desc.GetElementSize(), static_assert(BlockSize == thread_cluster_desc.GetElementSize(),
"wrong! BlockSize not consistent with ThreadClusterLengths"); "wrong! BlockSize not consistent with ThreadClusterLengths");
const auto thread_cluster_id = const auto thread_cluster_id =
thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id()); thread_cluster_desc.CalculateClusterIndex(get_thread_local_1d_id());
const auto data_cluster_id =
reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{});
const auto thread_data_id_begin = data_cluster_id * SubLengths{}; const auto thread_data_id_begin = thread_cluster_id * ThreadSliceLengths{};
mThreadwiseLoad.SetSrcSliceOrigin(src_block_slice_origin + thread_data_id_begin); mThreadwiseLoad.SetSrcSliceOrigin(src_block_slice_origin + thread_data_id_begin);
mThreadwiseLoad.SetDstSliceOrigin(make_zero_array<index_t, nDim>()); mThreadwiseLoad.SetDstSliceOrigin(make_zero_array<index_t, nDim>());
...@@ -470,76 +61,106 @@ struct BlockwiseGenericTensorSliceCopy_v2 ...@@ -470,76 +61,106 @@ struct BlockwiseGenericTensorSliceCopy_v2
mThreadwiseStore.SetDstSliceOrigin(dst_block_slice_origin + thread_data_id_begin); mThreadwiseStore.SetDstSliceOrigin(dst_block_slice_origin + thread_data_id_begin);
} }
__device__ static constexpr index_t GetRegisterBufferSize() __device__ static constexpr index_t GetThreadBufferSize()
{ {
return RegisterBufferDesc::GetElementSpace(); return ThreadBufferDesc::GetElementSpace();
} }
template <class TData> template <typename BlockSrcData,
__device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const typename ThreadBufferData,
address_space_t BlockSrcAddressSpace = address_space_t::generic,
address_space_t ThreadBufferAddressSpace = address_space_t::generic>
__device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src,
ThreadBufferData* p_thread_buffer) const
{ {
mThreadwiseLoad.Run(p_src, p_buffer); #if 0
mThreadwiseLoad.template Run<BlockSrcData,
ThreadBufferData,
BlockSrcAddressSpace,
ThreadBufferAddressSpace>(p_block_src,
p_thread_buffer);
#else // tweaking
mThreadwiseLoad.template Run_optimized_src_address_calculation<BlockSrcData,
ThreadBufferData,
BlockSrcAddressSpace,
ThreadBufferAddressSpace>(
p_block_src, p_thread_buffer);
#endif
} }
template <class TData> template <typename ThreadBufferData,
__device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const typename BlockDstData,
address_space_t ThreadBufferAddressSpace = address_space_t::generic,
address_space_t BlockDstAddressSpace = address_space_t::generic>
__device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer,
BlockDstData* p_block_dst) const
{ {
mThreadwiseStore.Run(p_buffer, p_dst); #if 0
mThreadwiseStore.template Run<ThreadBufferData,
BlockDstData,
ThreadBufferAddressSpace,
BlockDstAddressSpace>(p_thread_buffer, p_block_dst);
#else // tweaking
mThreadwiseStore.template Run_optimized_dst_address_calculation<ThreadBufferData,
BlockDstData,
ThreadBufferAddressSpace,
BlockDstAddressSpace>(
p_thread_buffer, p_block_dst);
#endif
} }
template <class TData> template <typename BlockSrcData,
__device__ void Run(const TData* p_src, TData* p_dst) const typename BlockDstData,
address_space_t BlockSrcAddressSpace = address_space_t::generic,
address_space_t BlockDstAddressSpace = address_space_t::generic>
__device__ void Run(const BlockSrcData* p_block_src, BlockDstData* p_block_dst) const
{ {
TData p_buffer[GetRegisterBufferSize()]; BlockSrcData p_thread_buffer[GetThreadBufferSize()];
mThreadwiseLoad.Run(p_src, p_buffer); RunLoadThreadBuffer<BlockSrcData,
mThreadwiseStore.Run(p_buffer, p_dst); BlockSrcData,
BlockSrcAddressSpace,
address_space_t::generic>(p_block_src, p_thread_buffer);
RunStoreThreadBuffer<BlockSrcData,
BlockDstData,
address_space_t::generic,
BlockDstAddressSpace>(p_thread_buffer, p_block_dst);
} }
template <class T, bool PositiveDirection> template <typename T, bool PositiveDirection>
__device__ void __device__ void
MoveSrcSlicingWindow(T step_sizes, MoveSrcSliceWindow(const T& step_sizes,
integral_constant<bool, PositiveDirection> positive_direction) integral_constant<bool, PositiveDirection> positive_direction)
{ {
mThreadwiseLoad.MoveSrcSlicingWindow(step_sizes, positive_direction); mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction);
} }
template <class T, bool PositiveDirection> template <typename T, bool PositiveDirection>
__device__ void __device__ void
MoveDstSlicingWindow(T step_sizes, MoveDstSliceWindow(const T& step_sizes,
integral_constant<bool, PositiveDirection> positive_direction) integral_constant<bool, PositiveDirection> positive_direction)
{ {
mThreadwiseLoad.MoveDstSlicingWindow(step_sizes, positive_direction); mThreadwiseStore.MoveDstSliceWindow(step_sizes, positive_direction);
} }
private: private:
using RegisterBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{})); using ThreadBufferDesc = decltype(make_native_tensor_descriptor_packed(ThreadSliceLengths{}));
using ThreadwiseLoad = using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v4r2<BlockSrcDesc,
ThreadwiseGenericTensorSliceCopy_v2r1<SrcDesc, ThreadBufferDesc,
RegisterBufferDesc, ThreadSliceLengths,
SrcCoordinate, SrcDimAccessOrder,
NormalTensorCoordinate<RegisterBufferDesc>, SrcVectorAccessDim,
SubLengths, SrcDataPerAccess,
SrcDimAccessOrder, 1>;
SrcDimAccessOrder,
SrcVectorAccessDim, using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v4r2<ThreadBufferDesc,
SrcVectorAccessDim, BlockDstDesc,
SrcDataPerAccess, ThreadSliceLengths,
1>; DstDimAccessOrder,
DstVectorAccessDim,
using ThreadwiseStore = 1,
ThreadwiseGenericTensorSliceCopy_v2r1<RegisterBufferDesc, DstDataPerAccess>;
DstDesc,
NormalTensorCoordinate<RegisterBufferDesc>,
DstCoordinate,
SubLengths,
DstDimAccessOrder,
DstDimAccessOrder,
DstVectorAccessDim,
DstVectorAccessDim,
1,
DstDataPerAccess>;
ThreadwiseLoad mThreadwiseLoad; ThreadwiseLoad mThreadwiseLoad;
ThreadwiseStore mThreadwiseStore; ThreadwiseStore mThreadwiseStore;
......
#ifndef CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_DEPRECATED_HPP
#define CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_DEPRECATED_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
#include "tensor_view.hpp"
#include "tensor_coordinate_deprecated.hpp"
#include "threadwise_generic_tensor_slice_copy_deprecated.hpp"
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
#endif
namespace ck {
// Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor.
// The memory layout (ordering of dimensions) can be different between src and dst.
// This function assumes each thread is reading and writing a normal (not merged) tensor,
// to simplify index calculations. To satisfy this assumption, the user needs to make sure
// that, on a merged dimension that contains multiple original dimensions, the length of
// the last original dimension is evenly divisible by its sub-length. Also, the
// repeat-length on the merged dimension needs to be 1. These sanity checks are performed
// in the constructor of BlockwiseGenericTensorSliceCopy_v1.
template <index_t BlockSize,
typename SrcDesc,
typename DstDesc,
typename SliceLengths,
typename SubLengths,
typename ThreadClusterLengths,
typename ThreadClusterArrangeOrder,
typename SrcDimAccessOrder,
typename DstDimAccessOrder,
index_t SrcVectorAccessDim,
index_t DstVectorAccessDim,
index_t SrcDataPerAccess,
index_t DstDataPerAccess>
struct BlockwiseGenericTensorSliceCopy_v1
{
static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
static constexpr index_t nOriginalDimSrc =
SrcDesc::GetOriginalTensorDescriptor().GetNumOfDimension();
static constexpr index_t nOriginalDimDst =
DstDesc::GetOriginalTensorDescriptor().GetNumOfDimension();
// per-thread offset
index_t mThreadSrcOffset;
index_t mThreadDstOffset;
// "mThreadSrcOriginalMultiId", "mThreadSrcPartialOffsets, "mThreadDstOriginalMultiId",
// "mThreadDstPartialOffsets" are always calculated inside constructor, and would be
// updated if slicing-window is moved. However, they will not be used if you always move
// the slicing-window along a non-merged dimension. In that case, compiler should be
// able to remove these calculation.
// TODO: make sure compiler would actually remove them in that case
// partial offset in each (merged) dimension
Array<index_t, nDim> mThreadSrcPartialOffsets;
Array<index_t, nDim> mThreadDstPartialOffsets;
// multi-id of original tensor
Array<index_t, nOriginalDimSrc> mThreadSrcOriginalMultiId;
Array<index_t, nOriginalDimDst> mThreadDstOriginalMultiId;
__device__ BlockwiseGenericTensorSliceCopy_v1(Array<index_t, nDim> src_block_data_id_begin,
Array<index_t, nDim> dst_block_data_id_begin)
{
// check NDim consistency
static_assert(
nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() &&
nDim == SliceLengths::GetSize() && nDim == SubLengths::GetSize() &&
nDim == ThreadClusterLengths::GetSize() &&
nDim == ThreadClusterArrangeOrder::GetSize() &&
nDim == SrcDimAccessOrder::GetSize() && nDim == DstDimAccessOrder::GetSize(),
"wrong");
// check thread arrange order and read/write access order are valid
static_assert(is_valid_sequence_map<ThreadClusterArrangeOrder>::value &&
is_valid_sequence_map<SrcDimAccessOrder>::value &&
is_valid_sequence_map<DstDimAccessOrder>::value,
"wrong!");
// thread cluster
constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
// BlockSize
static_assert(BlockSize == thread_cluster_desc.GetElementSize(), "wrong! BlockSize");
// divide work
constexpr auto data_per_cluster_per_dims = SubLengths{} * ThreadClusterLengths{};
static_for<0, nDim, 1>{}([&](auto IDim) {
static_assert(SliceLengths::Get(IDim) % data_per_cluster_per_dims.Get(IDim) == 0,
"wrong! cannot evenly divide sliced tensor into cluster");
});
constexpr auto repeat_lengths = SliceLengths{} / data_per_cluster_per_dims;
// additional check for merged dimension
static_for<0, nDim, 1>{}([&](auto IDim_) {
// src
static_if<SrcDesc::ContainMultipleOriginalDimensions(IDim_)>{}([&](auto) {
constexpr auto IDim = decltype(IDim_){};
// on a merged dimension that contains multiple original dimensions,
// the length of the last original dimension needs to be evenly divisible by its
// sub-length,
// so each thread is effectively reading a normal (not merged) tensor
constexpr auto idim_last_original_src =
SrcDesc::GetContainedOriginalDimensions(IDim).Back();
static_assert(
SrcDesc::GetOriginalTensorDescriptor().GetLength(idim_last_original_src) %
SubLengths::Get(IDim) ==
0,
"wrong!");
// merged dimension should have repeat_lengths = 1
static_assert(repeat_lengths[IDim] == 1,
"wrong! repeat_lengths shoud be 1 on merged dimension");
});
// dst
static_if<DstDesc::ContainMultipleOriginalDimensions(IDim_)>{}([&](auto) {
constexpr auto IDim = decltype(IDim_){};
// on a merged dimension that contains multiple original dimensions,
// the length of the last original dimension needs to be evenly divisible by its
// sub-length,
// so each thread is effectively writing a normal (not merged) tensor
constexpr auto idim_last_original_dst =
DstDesc::GetContainedOriginalDimensions(IDim).Back();
static_assert(
DstDesc::GetOriginalTensorDescriptor().GetLength(idim_last_original_dst) %
SubLengths::Get(IDim) ==
0,
"wrong!");
// merged dimension should have repeat_lengths = 1
static_assert(repeat_lengths[IDim] == 1,
"wrong! repeat_lengths shoud be 1 on merged dimension");
});
});
// calculate mThreadSrcOffset, mThreadDstOffset
const auto thread_cluster_id =
thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
const auto data_cluster_id =
reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{});
const auto thread_data_id_begin = data_cluster_id * SubLengths{};
// original multi-id
mThreadSrcOriginalMultiId = SrcDesc::GetOriginalMultiIndexFromMultiIndex(
src_block_data_id_begin + thread_data_id_begin);
mThreadDstOriginalMultiId = DstDesc::GetOriginalMultiIndexFromMultiIndex(
dst_block_data_id_begin + thread_data_id_begin);
// partial offset on each dimension
static_for<0, nDim, 1>{}([&](auto IDim) {
constexpr auto src_partial_original_dims =
SrcDesc::GetContainedOriginalDimensions(IDim);
constexpr auto src_partial_original_desc =
SrcDesc::GetOriginalTensorDescriptor().Extract(src_partial_original_dims);
mThreadSrcPartialOffsets(IDim) = src_partial_original_desc.GetOffsetFromMultiIndex(
extract_array(mThreadSrcOriginalMultiId, src_partial_original_dims));
});
static_for<0, nDim, 1>{}([&](auto IDim) {
constexpr auto dst_partial_original_dims =
DstDesc::GetContainedOriginalDimensions(IDim);
constexpr auto dst_partial_original_desc =
DstDesc::GetOriginalTensorDescriptor().Extract(dst_partial_original_dims);
mThreadDstPartialOffsets(IDim) = dst_partial_original_desc.GetOffsetFromMultiIndex(
extract_array(mThreadDstOriginalMultiId, dst_partial_original_dims));
});
// complete offset
mThreadSrcOffset = accumulate_on_array(
mThreadSrcPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
mThreadDstOffset = accumulate_on_array(
mThreadDstPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
}
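A rough host-side sketch of the thread-to-data mapping computed above, using made-up lengths and assuming an identity ThreadClusterArrangeOrder (the library's reorder helpers and the merged-dimension handling are deliberately left out):

#include <array>
#include <cstdio>

int main()
{
    const std::array<int, 2> cluster_lengths{4, 8}; // 4 x 8 = 32 threads per block
    const std::array<int, 2> sub_lengths{2, 1};     // each thread owns a 2 x 1 sub-tile

    const int tid = 13;

    // Unflatten the 1D thread id over the cluster lengths (last dim fastest).
    std::array<int, 2> cluster_id{};
    int id = tid;
    for(int d = 1; d >= 0; --d)
    {
        cluster_id[d] = id % cluster_lengths[d];
        id /= cluster_lengths[d];
    }

    // Per-thread data origin inside the block slice = cluster_id * sub_lengths.
    const std::array<int, 2> data_origin{cluster_id[0] * sub_lengths[0],
                                         cluster_id[1] * sub_lengths[1]};

    std::printf("thread %d -> cluster (%d,%d), data origin (%d,%d)\n",
                tid, cluster_id[0], cluster_id[1], data_origin[0], data_origin[1]);
    return 0;
}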
__device__ static constexpr auto GetRegisterBufferDescriptor()
{
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});
return make_ConstantTensorDescriptor_packed(SubLengths{} * repeat_lengths);
}
__device__ static constexpr index_t GetRegisterBufferSize()
{
return GetRegisterBufferDescriptor().GetElementSpace();
}
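A minimal compile-time sketch of this sizing with made-up lengths: the repeat lengths are the slice lengths divided by the data covered by one pass of the whole thread cluster, and the per-thread buffer holds SubLengths * repeat_lengths elements.

#include <cstddef>

// SliceLengths = {8, 32}, SubLengths = {4, 2}, ThreadClusterLengths = {2, 16}
constexpr std::size_t slice[2]   = {8, 32};
constexpr std::size_t sub[2]     = {4, 2};
constexpr std::size_t cluster[2] = {2, 16};

// repeat_lengths = SliceLengths / (SubLengths * ThreadClusterLengths) = {1, 1}
constexpr std::size_t repeat[2] = {slice[0] / (sub[0] * cluster[0]),
                                   slice[1] / (sub[1] * cluster[1])};

// per-thread register buffer: (4 * 1) * (2 * 1) = 8 elements
constexpr std::size_t buffer_size = (sub[0] * repeat[0]) * (sub[1] * repeat[1]);

static_assert(repeat[0] == 1 && repeat[1] == 1, "repeat lengths");
static_assert(buffer_size == 8, "buffer size");

int main() { return 0; }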
template <typename TData>
__device__ void RunLoadRegisterBuffer(const TData* __restrict__ p_src,
TData* __restrict__ p_buffer) const
{
constexpr auto thread_sub_tensor_lengths = SubLengths{};
constexpr auto data_per_cluster_per_dims =
thread_sub_tensor_lengths * ThreadClusterLengths{};
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});
constexpr auto thread_buffer_desc = GetRegisterBufferDescriptor();
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_id) {
constexpr auto src_thread_data_id_begin = repeat_id * data_per_cluster_per_dims;
constexpr auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths;
constexpr index_t src_offset =
SrcDesc::GetOffsetFromMultiIndex(src_thread_data_id_begin);
constexpr index_t buffer_offset =
thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);
#else
ford<decltype(repeat_lengths)>{}([&](auto repeat_id) {
const auto src_thread_data_id_begin = repeat_id * data_per_cluster_per_dims;
const auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths;
const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex(src_thread_data_id_begin);
const index_t buffer_offset =
thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);
#endif
// Position the origin of the per-thread window at the point where the multi-index
// of the SrcDesc (which might be a merged tensor) is all-zero. This threadwise slice copy
// assumes each thread copies a normal (not merged) tensor.
// To satisfy this assumption, the user needs to make sure that, on a merged dimension
// that contains multiple original dimensions, the length of the last original
// dimension is evenly divisible by its sub-length. Also, the repeat-length on
// the merged dimension needs to be 1. These sanity checks are performed in the constructor
// of BlockwiseGenericTensorSliceCopy_v1.
ThreadwiseGenericTensorSliceCopy_v1r2<SrcDesc,
decltype(thread_buffer_desc),
SubLengths,
SrcDimAccessOrder,
SrcVectorAccessDim,
SrcDataPerAccess,
1>(make_zero_array<index_t, nDim>(),
make_zero_array<index_t, nDim>())
.Run(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset);
});
}
template <typename TData>
__device__ void RunStoreRegisterBuffer(const TData* __restrict__ p_buffer,
TData* __restrict__ p_dst) const
{
constexpr auto thread_sub_tensor_lengths = SubLengths{};
constexpr auto data_per_cluster_per_dims =
thread_sub_tensor_lengths * ThreadClusterLengths{};
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});
constexpr auto thread_buffer_desc = GetRegisterBufferDescriptor();
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_id) {
constexpr auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths;
constexpr auto dst_data_id_begin = repeat_id * data_per_cluster_per_dims;
constexpr index_t buffer_offset =
thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);
constexpr index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_id_begin);
#else
ford<decltype(repeat_lengths)>{}([&](auto repeat_id) {
const auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths;
const auto dst_data_id_begin = repeat_id * data_per_cluster_per_dims;
const index_t buffer_offset =
thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);
const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_id_begin);
#endif
// Position the origin of the per-thread window at the point where the multi-index
// of the SrcDesc (might be a merged tensor) is all-zero. This threadwise slice copy
// assumes each thread copies a normal (not merged) tensor.
// To satisfy this assumption, the user needs to make sure that, on a merged dimension
// that contains multiple original dimensions, the length of the last original
// dimension is evenly divisible by its sub-length. Also, the repeat-length on
// the merged dimension needs to be 1. These sanity checks are performed in the constructor
// of BlockwiseGenericTensorSliceCopy_v1.
ThreadwiseGenericTensorSliceCopy_v1r2<decltype(thread_buffer_desc),
DstDesc,
SubLengths,
DstDimAccessOrder,
DstVectorAccessDim,
1,
DstDataPerAccess>(
make_zero_array<index_t, nDim>(), make_zero_array<index_t, nDim>())
.Run(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset);
});
}
template <typename TData>
__device__ void Run(const TData* __restrict__ p_src, TData* __restrict__ p_dst) const
{
TData p_buffer[GetRegisterBufferSize()];
RunLoadRegisterBuffer(p_src, p_buffer);
RunStoreRegisterBuffer(p_buffer, p_dst);
}
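A minimal host-side sketch of the two-phase pattern used by Run(): stage the slice into a small thread-private buffer, then write it out. Plain C++ stands in for the device code; offsets and sizes are made up.

#include <algorithm>
#include <cassert>
#include <vector>

int main()
{
    std::vector<float> src(64, 1.5f), dst(64, 0.0f);
    float thread_buffer[8]; // stand-in for the per-thread register buffer

    const int src_offset = 16, dst_offset = 40;
    std::copy_n(src.data() + src_offset, 8, thread_buffer); // "RunLoadRegisterBuffer"
    std::copy_n(thread_buffer, 8, dst.data() + dst_offset); // "RunStoreRegisterBuffer"

    assert(dst[40] == 1.5f && dst[47] == 1.5f && dst[39] == 0.0f);
    return 0;
}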
// When moving the slicing window along a merged dimension, if the strides of the
// original dimensions contained by the merged dimension are not in descending order,
// then there is no guarantee that the new offset will be larger than the old offset
// for movement in the positive direction (and vice versa for movement in the negative
// direction). As a result, there is the possibility that the offset calculation may
// result in unsigned integer underflow (due to the "-" operation). However, this hazard
// should not happen as long as the user makes sure the slicing window is not moved out
// of the boundary of the tensor being sliced. This function doesn't do a runtime sanity
// check on out-of-bound slicing windows, for performance reasons.
template <index_t IDim_, index_t StepSize, bool PositiveDirection>
__device__ void MoveSlicingWindowOnSourceTensor(
Number<IDim_>, Number<StepSize>, integral_constant<bool, PositiveDirection> direction)
{
constexpr auto IDim = Number<IDim_>{};
static_if<SrcDesc::ContainMultipleOriginalDimensions(IDim)>{}([&](auto) {
// Logic for a merged dimension; it also works for a non-merged dimension, but may be
// unnecessarily complicated for the compiler to remove calculations that are useless
// for a non-merged dimension.
// extract partial original dimensions
constexpr auto src_partial_original_dims =
SrcDesc::GetContainedOriginalDimensions(IDim);
constexpr auto src_partial_original_desc =
SrcDesc::GetOriginalTensorDescriptor().Extract(src_partial_original_dims);
// calculate new partial original multi-id
auto old_src_partial_original_id =
extract_array(mThreadSrcOriginalMultiId, src_partial_original_dims);
auto new_src_partial_original_id =
src_partial_original_desc.UpdateMultiIndexGivenStepSizeOf1dIndex(
old_src_partial_original_id, StepSize, direction);
// update "mThreadSrcOriginalMultiId"
static_for<0, decltype(src_partial_original_dims)::GetSize(), 1>{}([&](auto I) {
constexpr auto IDimOriginal = src_partial_original_dims[I];
mThreadSrcOriginalMultiId(IDimOriginal) = new_src_partial_original_id[I];
});
// calculate new partial offset on this merged dimension
const index_t old_src_partial_offset = mThreadSrcPartialOffsets[IDim];
const index_t new_src_partial_offset =
src_partial_original_desc.GetOffsetFromMultiIndex(new_src_partial_original_id);
// update "mThreadSrcPartialOffsets"
mThreadSrcPartialOffsets(IDim) = new_src_partial_offset;
// update "mThreadSrcOffset", do "+" before "-" to avoid underflow
mThreadSrcOffset = (mThreadSrcOffset + new_src_partial_offset) - old_src_partial_offset;
}).Else([&](auto) {
// Logic for non-merged dimension. If you are never going to move the slicing window on
// a merged dimension, then "mThreadSrcOriginalMultiId" and "mThreadSrcPartialOffsets",
// which are being calculated here, will never be used later. In this case, compiler
// should be able to remove these calculations.
// TODO: make sure compiler would actually remove them in this case.
// It is the user's responsibility to make sure the slicing window will not be moved out
// of the boundary of the tensor being sliced. Otherwise, there might be hazards like
// unsigned integer underflow. There is NO runtime sanity check to prevent the hazard.
constexpr auto IDimOriginal = SrcDesc::GetContainedOriginalDimensions(IDim).Front();
static_if<PositiveDirection>{}([&](auto fwd) {
mThreadSrcOffset += StepSize * fwd(SrcDesc{}).GetStride(IDim);
mThreadSrcOriginalMultiId(IDimOriginal) += StepSize;
mThreadSrcPartialOffsets(IDim) += StepSize * fwd(SrcDesc{}).GetStride(IDim);
}).Else([&](auto fwd) {
mThreadSrcOffset -= StepSize * fwd(SrcDesc{}).GetStride(IDim);
mThreadSrcOriginalMultiId(IDimOriginal) -= StepSize;
mThreadSrcPartialOffsets(IDim) -= StepSize * fwd(SrcDesc{}).GetStride(IDim);
});
});
}
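A minimal host-side sketch of the merged-dimension branch above, with made-up lengths and strides: the partial offset is recomputed from the updated multi-index of the contained original dimensions (rather than adding step * stride), and the "+" is done before the "-" so the intermediate value cannot underflow.

#include <cassert>
#include <cstddef>

int main()
{
    // A merged dimension folding original dims of lengths {3, 4} with strides {100, 7}.
    const std::size_t len1 = 4;
    const std::size_t stride0 = 100, stride1 = 7;

    std::size_t id0 = 1, id1 = 3;                       // current multi-index
    std::size_t offset = id0 * stride0 + id1 * stride1; // 121

    // Advance the window by 2 along the merged dimension; the step carries from id1 into id0.
    const std::size_t step = 2;
    const std::size_t linear = id0 * len1 + id1 + step;                     // 9
    const std::size_t new_id0 = linear / len1, new_id1 = linear % len1;     // (2, 1)
    const std::size_t new_partial = new_id0 * stride0 + new_id1 * stride1;  // 207

    // Do "+" before "-" to avoid unsigned underflow.
    offset = (offset + new_partial) - (id0 * stride0 + id1 * stride1);
    assert(offset == 207);
    return 0;
}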
template <typename T, bool PositiveDirection>
__device__ void
MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
{
static_for<0, nDim, 1>{}([&](auto idim) {
if(step_sizes[idim] != 0)
{
MoveSlicingWindowOnSourceTensor(idim, step_sizes[idim], positive_direction);
}
});
}
};
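A minimal sketch, with made-up numbers, of the two constraints that the constructor of BlockwiseGenericTensorSliceCopy_v1 enforces for a merged dimension: the length of the last contained original dimension must be divisible by the per-thread sub-length, and the repeat length on that dimension must be 1.

#include <cassert>

int main()
{
    const int last_original_length = 16; // e.g. the merged dim folds {N = 2, W = 16}
    const int sub_length           = 4;  // per-thread sub-length on the merged dim
    const int slice_length         = 32; // slice length on the merged dim
    const int cluster_length       = 8;  // thread-cluster length on the merged dim

    assert(last_original_length % sub_length == 0);            // divisibility check
    assert(slice_length / (sub_length * cluster_length) == 1); // repeat length == 1
    return 0;
}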
// This version uses TensorCoordinate
// Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor.
// The memory layout (ordering of dimensions) can be different between src and dst.
template <index_t BlockSize,
typename SrcDesc,
typename DstDesc,
typename SliceLengths,
typename SubLengths,
typename ThreadClusterLengths,
typename ThreadClusterArrangeOrder,
typename SrcDimAccessOrder,
typename DstDimAccessOrder,
index_t SrcVectorAccessDim,
index_t DstVectorAccessDim,
index_t SrcDataPerAccess,
index_t DstDataPerAccess>
struct BlockwiseGenericTensorSliceCopy_v2
{
static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
using Index = MultiIndex<nDim>;
__device__ constexpr BlockwiseGenericTensorSliceCopy_v2(const Index& src_block_slice_origin,
const Index& dst_block_slice_origin)
{
static_assert(
nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() &&
nDim == SliceLengths::GetSize() && nDim == SubLengths::GetSize() &&
nDim == ThreadClusterLengths::GetSize() &&
nDim == ThreadClusterArrangeOrder::GetSize() &&
nDim == SrcDimAccessOrder::GetSize() && nDim == DstDimAccessOrder::GetSize(),
"wrong! nDim not consistent");
static_assert(is_same<SliceLengths, decltype(SubLengths{} * ThreadClusterLengths{})>{},
"wrong! threads should be mapped to cover entire slicing window");
constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
static_assert(BlockSize == thread_cluster_desc.GetElementSize(),
"wrong! BlockSize not consistent with ThreadClusterLengths");
const auto thread_cluster_id =
thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
const auto data_cluster_id =
reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{});
const auto thread_data_id_begin = data_cluster_id * SubLengths{};
mThreadwiseLoad.SetSrcSliceOrigin(src_block_slice_origin + thread_data_id_begin);
mThreadwiseLoad.SetDstSliceOrigin(make_zero_array<index_t, nDim>());
mThreadwiseStore.SetSrcSliceOrigin(make_zero_array<index_t, nDim>());
mThreadwiseStore.SetDstSliceOrigin(dst_block_slice_origin + thread_data_id_begin);
}
__device__ static constexpr index_t GetThreadBufferSize()
{
return ThreadBufferDesc::GetElementSpace();
}
template <typename TData,
address_space_t BlockSrcAddressSpace = address_space_t::generic,
address_space_t ThreadBufferAddressSpace = address_space_t::generic>
__device__ void RunLoadThreadBuffer(const TData* p_block_src, TData* p_thread_buffer) const
{
#if 0
mThreadwiseLoad.Run(p_block_src, p_thread_buffer);
#else // tweaking
mThreadwiseLoad.template Run_optimized_address_calculation<TData,
BlockSrcAddressSpace,
ThreadBufferAddressSpace>(
p_block_src, p_thread_buffer);
#endif
}
template <typename TData,
address_space_t ThreadBufferAddressSpace = address_space_t::generic,
address_space_t BlockDstAddressSpace = address_space_t::generic>
__device__ void RunStoreThreadBuffer(const TData* p_thread_buffer, TData* p_block_dst) const
{
#if 0
mThreadwiseStore.Run(p_thread_buffer, p_block_dst);
#else // tweaking
mThreadwiseStore.template Run_optimized_address_calculation<TData,
ThreadBufferAddressSpace,
BlockDstAddressSpace>(
p_thread_buffer, p_block_dst);
#endif
}
template <typename TData,
address_space_t BlockSrcAddressSpace = address_space_t::generic,
address_space_t BlockDstAddressSpace = address_space_t::generic>
__device__ void Run(const TData* p_block_src, TData* p_block_dst) const
{
TData p_thread_buffer[GetThreadBufferSize()];
RunLoadThreadBuffer<TData, BlockSrcAddressSpace, address_space_t::generic>(p_block_src,
p_thread_buffer);
RunStoreThreadBuffer<TData, address_space_t::generic, BlockDstAddressSpace>(p_thread_buffer,
p_block_dst);
}
template <typename T, bool PositiveDirection>
__device__ void
MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
{
mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction);
}
template <typename T, bool PositiveDirection>
__device__ void
MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
{
mThreadwiseStore.MoveDstSliceWindow(step_sizes, positive_direction);
}
private:
using ThreadBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{}));
using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v2r1<SrcDesc,
ThreadBufferDesc,
SubLengths,
SrcDimAccessOrder,
SrcDimAccessOrder,
SrcVectorAccessDim,
SrcVectorAccessDim,
SrcDataPerAccess,
1>;
using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v2r1<ThreadBufferDesc,
DstDesc,
SubLengths,
DstDimAccessOrder,
DstDimAccessOrder,
DstVectorAccessDim,
DstVectorAccessDim,
1,
DstDataPerAccess>;
ThreadwiseLoad mThreadwiseLoad;
ThreadwiseStore mThreadwiseStore;
};
// this version uses TensorView and TensorCoordinate_deprecated
template <index_t BlockSize,
typename SrcTensor,
typename DstTensor,
typename SliceLengths,
typename SubLengths,
typename ThreadClusterLengths,
typename ThreadClusterArrangeOrder,
typename SrcDimAccessOrder,
typename DstDimAccessOrder,
index_t SrcVectorAccessDim,
index_t DstVectorAccessDim,
index_t SrcDataPerAccess,
index_t DstDataPerAccess>
struct BlockwiseGenericTensorSliceCopy_v3
{
static constexpr index_t nDim = SrcTensor::GetNumOfDimension();
using data_type = remove_cv_t<typename SrcTensor::data_type>;
using SrcCoordinate = typename SrcTensor::coordinate_type;
using DstCoordinate = typename DstTensor::coordinate_type;
__device__ constexpr BlockwiseGenericTensorSliceCopy_v3(SrcTensor src_block,
SrcCoordinate src_block_slice_origin,
DstTensor dst_block,
DstCoordinate dst_block_slice_origin)
: mThreadBuffer{make_TensorView(ThreadBufferDesc{}, mpBuffer)}
{
static_assert(
nDim == SrcTensor::GetNumOfDimension() && nDim == DstTensor::GetNumOfDimension() &&
nDim == SliceLengths::GetSize() && nDim == SubLengths::GetSize() &&
nDim == ThreadClusterLengths::GetSize() &&
nDim == ThreadClusterArrangeOrder::GetSize() &&
nDim == SrcDimAccessOrder::GetSize() && nDim == DstDimAccessOrder::GetSize(),
"wrong! nDim not consistent");
static_assert(is_same<SliceLengths, decltype(SubLengths{} * ThreadClusterLengths{})>{},
"wrong! threads should be mapped to cover entire slicing window");
static_assert(is_same<remove_cv_t<typename SrcTensor::data_type>,
remove_cv_t<typename DstTensor::data_type>>{},
"wrong! type conversion not supported yet");
constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
static_assert(BlockSize == thread_cluster_desc.GetElementSize(),
"wrong! BlockSize not consistent with ThreadClusterLengths");
const auto thread_cluster_id =
thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
const auto data_cluster_id =
reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{});
const auto thread_data_id_begin = data_cluster_id * SubLengths{};
mThreadwiseLoad = ThreadwiseLoad(src_block,
src_block_slice_origin + thread_data_id_begin,
mThreadBuffer,
make_zero_array<index_t, nDim>());
mThreadwiseStore = ThreadwiseStore(mThreadBuffer,
make_zero_array<index_t, nDim>(),
dst_block,
dst_block_slice_origin + thread_data_id_begin);
}
__device__ void RunLoadRegisterBuffer() { mThreadwiseLoad.Run(); }
__device__ void RunStoreRegisterBuffer() const { mThreadwiseStore.Run(); }
__device__ void Run()
{
mThreadwiseLoad.Run();
mThreadwiseStore.Run();
}
template <typename T, bool PositiveDirection>
__device__ void
MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
{
mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction);
}
template <typename T, bool PositiveDirection>
__device__ void
MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
{
mThreadwiseStore.MoveDstSliceWindow(step_sizes, positive_direction);
}
private:
using ThreadBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{}));
using ThreadBufferTensor = NormalTensorView<ThreadBufferDesc, data_type>;
using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v3r1<SrcTensor,
ThreadBufferTensor,
SubLengths,
SrcDimAccessOrder,
SrcDimAccessOrder,
SrcVectorAccessDim,
SrcVectorAccessDim,
SrcDataPerAccess,
1>;
using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v3r1<ThreadBufferTensor,
DstTensor,
SubLengths,
DstDimAccessOrder,
DstDimAccessOrder,
DstVectorAccessDim,
DstVectorAccessDim,
1,
DstDataPerAccess>;
data_type mpBuffer[ThreadBufferDesc::GetElementSpace()];
ThreadBufferTensor mThreadBuffer;
ThreadwiseLoad mThreadwiseLoad;
ThreadwiseStore mThreadwiseStore;
};
} // namespace ck
#endif
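The threadwise copy updated in the following hunk stages data through a "long vector" sized to the least common multiple of SrcDataPerAccess and DstDataPerAccess, so that both the source reads and the destination writes tile the staging buffer exactly. A minimal sketch with made-up values (C++17 for std::lcm):

#include <cassert>
#include <numeric>

int main()
{
    const int src_data_per_access = 4;  // e.g. float4 loads
    const int dst_data_per_access = 2;  // e.g. float2 stores
    const int slice_length        = 16; // length along the vector-access dimension

    const int long_vector_size = std::lcm(src_data_per_access, dst_data_per_access); // 4

    assert(slice_length % long_vector_size == 0);        // slice divides into long vectors
    assert(long_vector_size % src_data_per_access == 0); // whole src accesses per long vector
    assert(long_vector_size % dst_data_per_access == 0); // whole dst accesses per long vector
    return 0;
}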
...@@ -165,7 +165,7 @@ struct BlockwiseTensorSliceReorderCopy_v3 ...@@ -165,7 +165,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
#endif #endif
} }
__device__ static constexpr index_t GetRegisterClipboardSize() __device__ static constexpr index_t GetRegisterBufferSize()
{ {
constexpr auto thread_sub_tensor_lengths = SrcSubLengths{}; constexpr auto thread_sub_tensor_lengths = SrcSubLengths{};
...@@ -183,8 +183,8 @@ struct BlockwiseTensorSliceReorderCopy_v3 ...@@ -183,8 +183,8 @@ struct BlockwiseTensorSliceReorderCopy_v3
return thread_tensor_desc.GetElementSpace(); return thread_tensor_desc.GetElementSpace();
} }
__device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src, __device__ void RunLoadRegisterBuffer(const Float* __restrict__ p_src,
Float* __restrict__ p_clipboard) const Float* __restrict__ p_clipboard) const
{ {
constexpr auto thread_sub_tensor_lengths = SrcSubLengths{}; constexpr auto thread_sub_tensor_lengths = SrcSubLengths{};
...@@ -219,8 +219,8 @@ struct BlockwiseTensorSliceReorderCopy_v3 ...@@ -219,8 +219,8 @@ struct BlockwiseTensorSliceReorderCopy_v3
}); });
} }
__device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard, __device__ void RunStoreRegisterBuffer(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const Float* __restrict__ p_dst) const
{ {
constexpr auto thread_sub_tensor_lengths = SrcSubLengths{}; constexpr auto thread_sub_tensor_lengths = SrcSubLengths{};
...@@ -274,10 +274,10 @@ struct BlockwiseTensorSliceReorderCopy_v3 ...@@ -274,10 +274,10 @@ struct BlockwiseTensorSliceReorderCopy_v3
__device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
{ {
Float p_clipboard[GetRegisterClipboardSize()]; Float p_clipboard[GetRegisterBufferSize()];
RunLoadRegisterClipboard(p_src, p_clipboard); RunLoadRegisterBuffer(p_src, p_clipboard);
RunStoreRegisterClipboard(p_clipboard, p_dst); RunStoreRegisterBuffer(p_clipboard, p_dst);
} }
// this function doesn't do a sanity check on whether the slicing window is out of the boundary // this function doesn't do a sanity check on whether the slicing window is out of the boundary
......
...@@ -2,334 +2,86 @@ ...@@ -2,334 +2,86 @@
#define CK_THREADWISE_GENERIC_TENSOR_SLICE_COPY_HPP #define CK_THREADWISE_GENERIC_TENSOR_SLICE_COPY_HPP
#include "common_header.hpp" #include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp" #include "tensor_descriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp" #include "tensor_descriptor_helper.hpp"
#include "tensor_coordinate.hpp" #include "tensor_coordinate.hpp"
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 #ifndef CK_USE_AMD_INTRINSIC
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0 #define CK_USE_AMD_INTRINSIC 1
#endif #endif
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 #ifndef CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0 #define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1
#endif
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2 0
#endif
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
#endif #endif
namespace ck { namespace ck {
// This threadwise copy allows vector access of src and dst. // This version uses multi-index transformation
// It allows the dimensions of vector access to be different on src and dst.
// It also allows the vector size to be different on src and dst.
// It also allows the order of access to be different on src and dst.
// It uses a register buffer to hold all data moving from src to dst.
// It is designed for copying a small amount of data, and src and dst are
// device memory or LDS.
// When copying a large amount of data, let's hope the compiler will reduce the registers
// used for the buffer.
template <class SrcDesc,
class DstDesc,
class SliceLengths,
class SrcDimAccessOrder,
class DstDimAccessOrder,
index_t SrcVectorAccessDim,
index_t DstVectorAccessDim,
index_t SrcDataPerAccess,
index_t DstDataPerAccess>
struct ThreadwiseGenericTensorSliceCopy_v1r1
{
static constexpr index_t nDim = SliceLengths::GetSize();
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r1(
Array<index_t, nDim> src_slice_origin, Array<index_t, nDim> dst_slice_origin)
: mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
{
static_assert(nDim == SrcDesc::GetNumOfDimension() &&
nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() &&
nDim == SrcDimAccessOrder::GetSize() &&
nDim == DstDimAccessOrder::GetSize(),
"wrong! # of dimensions not the same");
static_assert(is_valid_sequence_map<SrcDimAccessOrder>::value &&
is_valid_sequence_map<DstDimAccessOrder>::value,
"wrong! map is not valid");
static_assert(SliceLengths{}[SrcVectorAccessDim] % SrcDataPerAccess == 0 &&
SliceLengths{}[DstVectorAccessDim] % DstDataPerAccess == 0,
"wrong! cannot evenly divide");
// check vectorized memory access
constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
static_if<!SrcDesc::ContainMultipleOriginalDimensions(src_vector_access_dim)>{}(
[&](auto fwd) {
static_assert(
(fwd(SrcDesc{}).GetStride(src_vector_access_dim) == 1 || SrcDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
})
.Else([&](auto fwd) {
static_assert(
(fwd(SrcDesc{}).GetLastOriginalDimensionStride(src_vector_access_dim) == 1 ||
SrcDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
});
static_if<!DstDesc::ContainMultipleOriginalDimensions(dst_vector_access_dim)>{}(
[&](auto fwd) {
static_assert(
(fwd(DstDesc{}).GetStride(dst_vector_access_dim) == 1 || DstDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
})
.Else([&](auto fwd) {
static_assert(
(fwd(DstDesc{}).GetLastOriginalDimensionStride(dst_vector_access_dim) == 1 ||
DstDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
});
}
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r1()
: ThreadwiseGenericTensorSliceCopy_v1r1(make_zero_array<index_t, nDim>(),
make_zero_array<index_t, nDim>())
{
}
__device__ void SetSrcSliceOrigin(Array<index_t, nDim> src_slice_origin)
{
mSrcSliceOrigin = src_slice_origin;
}
__device__ void SetDstSliceOrigin(Array<index_t, nDim> dst_slice_origin)
{
mDstSliceOrigin = dst_slice_origin;
}
template <class TData>
__device__ void Run(const TData* p_src, TData* p_dst) const
{
constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});
TData p_buffer_[buffer_desc.GetElementSpace()];
TData* p_buffer = p_buffer_;
// copy data from src into buffer
{
using vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
constexpr auto src_access_lengths = SliceLengths::Modify(
src_vector_access_dim,
SliceLengths::Get(src_vector_access_dim) / src_data_per_access);
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
static_ford<decltype(src_access_lengths), SrcDimAccessOrder>{}([&](auto src_access_id) {
constexpr auto src_data_begin_id = src_access_id.Modify(
src_vector_access_dim,
src_access_id[src_vector_access_dim] * src_data_per_access);
const index_t src_offset =
SrcDesc::GetOffsetFromMultiIndex(mSrcSliceOrigin + src_data_begin_id);
// load vector from src
const vector_t vector_data = *reinterpret_cast<const vector_t*>(&p_src[src_offset]);
// unpack vector into buffer
static_for<0, SrcDataPerAccess, 1>{}([&](auto i) {
constexpr auto scalar_id =
typename uniform_sequence_gen<nDim, 0>::type{}.Modify(src_vector_access_dim,
i);
constexpr index_t buffer_offset =
buffer_desc.GetOffsetFromMultiIndex(src_data_begin_id + scalar_id);
p_buffer[buffer_offset] = reinterpret_cast<const TData*>(&vector_data)[i];
});
});
#else
ford<decltype(src_access_lengths), SrcDimAccessOrder>{}([&](auto src_access_id) {
auto src_data_begin_id = src_access_id;
src_data_begin_id(src_vector_access_dim) =
src_access_id[src_vector_access_dim] * src_data_per_access;
const index_t src_offset =
SrcDesc::GetOffsetFromMultiIndex(mSrcSliceOrigin + src_data_begin_id);
// load vector from src
const vector_t vector_data = *reinterpret_cast<const vector_t*>(&p_src[src_offset]);
// unpack vector into buffer
for(index_t i = 0; i < SrcDataPerAccess; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(src_vector_access_dim) = i;
const index_t buffer_offset =
buffer_desc.GetOffsetFromMultiIndex(src_data_begin_id + scalar_id);
p_buffer[buffer_offset] = reinterpret_cast<const TData*>(&vector_data)[i];
}
});
#endif
}
// copy data from buffer to dst
{
using vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;
constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};
constexpr auto dst_access_lengths = SliceLengths::Modify(
dst_vector_access_dim,
SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access);
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
static_ford<decltype(dst_access_lengths), DstDimAccessOrder>{}([&](auto dst_access_id) {
constexpr auto dst_data_begin_id = dst_access_id.Modify(
dst_vector_access_dim,
dst_access_id[dst_vector_access_dim] * dst_data_per_access);
vector_t vector_data;
// pack vector from buffer
static_for<0, DstDataPerAccess, 1>{}([&](auto i) {
constexpr auto scalar_id =
typename uniform_sequence_gen<nDim, 0>::type{}.Modify(dst_vector_access_dim,
i);
constexpr index_t buffer_offset =
buffer_desc.GetOffsetFromMultiIndex(dst_data_begin_id + scalar_id);
reinterpret_cast<TData*>(&vector_data)[i] = p_buffer[buffer_offset];
});
const index_t dst_offset =
DstDesc::GetOffsetFromMultiIndex(mDstSliceOrigin + dst_data_begin_id);
// store vector into dst
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) = vector_data;
});
#else
ford<decltype(dst_access_lengths), DstDimAccessOrder>{}([&](auto dst_access_id) {
auto dst_data_begin_id = dst_access_id;
dst_data_begin_id(dst_vector_access_dim) =
dst_access_id[dst_vector_access_dim] * dst_data_per_access;
vector_t vector_data;
// pack vector from buffer
for(index_t i = 0; i < DstDataPerAccess; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(dst_vector_access_dim) = i;
const index_t buffer_offset =
buffer_desc.GetOffsetFromMultiIndex(dst_data_begin_id + scalar_id);
reinterpret_cast<TData*>(&vector_data)[i] = p_buffer[buffer_offset];
}
const index_t dst_offset =
DstDesc::GetOffsetFromMultiIndex(mDstSliceOrigin + dst_data_begin_id);
// store vector into dst
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) = vector_data;
});
#endif
}
}
private:
Array<index_t, nDim> mSrcSliceOrigin;
Array<index_t, nDim> mDstSliceOrigin;
};
// This threadwise copy allows vector access of src and dst. // This threadwise copy allows vector access of src and dst.
// It allows the vector size to be different on src and dst. // It allows the vector size to be different on src and dst.
// The dimensions of vector access should be the same on src and dst. // The dimensions of vector access should be the same on src and dst.
// The dimension access order should be the same on src and dst. // The dimension access order should be the same on src and dst.
// It is designed for cases where one of src and dst is a register, and // It is designed for cases where one of src and dst is a register, and
// the other is device memory or LDS // the other is device memory or LDS
template <class SrcDesc, template <typename SrcDesc,
class DstDesc, typename DstDesc,
class SliceLengths, typename SliceLengths,
class DimAccessOrder, typename DimAccessOrder,
index_t VectorAccessDim, index_t VectorAccessDim,
index_t SrcDataPerAccess, index_t SrcDataPerAccess,
index_t DstDataPerAccess> index_t DstDataPerAccess>
struct ThreadwiseGenericTensorSliceCopy_v1r2 struct ThreadwiseGenericTensorSliceCopy_v4r2
{ {
static constexpr index_t nDim = SliceLengths::GetSize(); static constexpr index_t nDim = SliceLengths::Size();
using Index = MultiIndex<nDim>;
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r2( using SrcCoord = typename TensorCoordinate<SrcDesc>::type;
Array<index_t, nDim> src_slice_origin, Array<index_t, nDim> dst_slice_origin) using DstCoord = typename TensorCoordinate<DstDesc>::type;
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v4r2(const Index& src_slice_origin,
const Index& dst_slice_origin)
: mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin) : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
{ {
static_assert(nDim == SrcDesc::GetNumOfDimension() && static_assert(nDim == SrcDesc::GetNumOfDimension() &&
nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() && nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::Size() &&
nDim == DimAccessOrder::GetSize(), nDim == DimAccessOrder::Size(),
"wrong! # of dimensions not the same"); "wrong! # of dimensions not the same");
static_assert(is_valid_sequence_map<DimAccessOrder>::value, "wrong! map is not valid"); static_assert(is_valid_sequence_map<DimAccessOrder>{}, "wrong! map is not valid");
static_assert( static_assert(
SliceLengths{}[VectorAccessDim] % math::lcm(SrcDataPerAccess, DstDataPerAccess) == 0, SliceLengths{}[VectorAccessDim] % math::lcm(SrcDataPerAccess, DstDataPerAccess) == 0,
"wrong! cannot evenly divide"); "wrong! cannot evenly divide");
// check vectorized memory access // TODO: sanity-check if vectorized memory access is allowed on src and dst
constexpr auto vector_access_dim = Number<VectorAccessDim>{};
static_if<!SrcDesc::ContainMultipleOriginalDimensions(vector_access_dim)>{}([&](auto fwd) {
static_assert(
(fwd(SrcDesc{}).GetStride(vector_access_dim) == 1 || SrcDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
}).Else([&](auto fwd) {
static_assert((fwd(SrcDesc{}).GetLastOriginalDimensionStride(vector_access_dim) == 1 ||
SrcDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
});
static_if<!DstDesc::ContainMultipleOriginalDimensions(vector_access_dim)>{}([&](auto fwd) {
static_assert(
(fwd(DstDesc{}).GetStride(vector_access_dim) == 1 || DstDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
}).Else([&](auto fwd) {
static_assert((fwd(DstDesc{}).GetLastOriginalDimensionStride(vector_access_dim) == 1 ||
DstDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
});
} }
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r2() __device__ constexpr ThreadwiseGenericTensorSliceCopy_v4r2()
: ThreadwiseGenericTensorSliceCopy_v1r2(make_zero_array<index_t, nDim>(), : ThreadwiseGenericTensorSliceCopy_v4r2(make_zero_array<index_t, nDim>(),
make_zero_array<index_t, nDim>()) make_zero_array<index_t, nDim>())
{ {
} }
__device__ void SetSrcSliceOrigin(Array<index_t, nDim> src_slice_origin) __device__ void SetSrcSliceOrigin(SrcCoord src_slice_origin)
{ {
mSrcSliceOrigin = src_slice_origin; mSrcSliceOrigin = src_slice_origin;
} }
__device__ void SetDstSliceOrigin(Array<index_t, nDim> dst_slice_origin) __device__ void SetDstSliceOrigin(DstCoord dst_slice_origin)
{ {
mDstSliceOrigin = dst_slice_origin; mDstSliceOrigin = dst_slice_origin;
} }
template <class TData> // Will do padding check on src data: Read 0 if src data is in padding area.
__device__ void Run(const TData* p_src, TData* p_dst) const // Will do padding check on dst data: No write if dst data is in padding area.
template <typename SrcData,
typename DstData,
address_space_t SrcAddressSpace = address_space_t::generic,
address_space_t DstAddressSpace = address_space_t::generic>
__device__ void Run(const SrcData* p_src, DstData* p_dst) const
{ {
using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType; using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType; using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;
constexpr auto vector_access_dim = Number<VectorAccessDim>{}; constexpr auto vector_access_dim = Number<VectorAccessDim>{};
...@@ -341,603 +93,442 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2 ...@@ -341,603 +93,442 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2
constexpr auto long_vector_access_lengths = SliceLengths::Modify( constexpr auto long_vector_access_lengths = SliceLengths::Modify(
vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size); vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 ford<decltype(long_vector_access_lengths), DimAccessOrder>{}([&](
static_ford<decltype(long_vector_access_lengths), DimAccessOrder>{}([&](
auto long_vector_access_id) { auto long_vector_access_id) {
// data id w.r.t slicing-window // data id w.r.t slicing-window
constexpr auto long_vector_data_begin_id = long_vector_access_id.Modify( auto long_vector_data_begin_id = long_vector_access_id;
vector_access_dim, long_vector_access_id[vector_access_dim] * long_vector_size); long_vector_data_begin_id(vector_access_dim) =
long_vector_size * long_vector_access_id[vector_access_dim];
// buffer to hold a long-vector
TData p_long_vector[long_vector_size];
// load data from src to the long-vector buffer
static_for<0, long_vector_size / src_data_per_access, 1>{}([&](auto i) {
constexpr auto scalar_id = typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
vector_access_dim, i * src_data_per_access);
const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex( // buffer to hold a src long-vector
mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id)); SrcData p_src_long_vector[long_vector_size];
constexpr index_t buffer_offset = i * src_data_per_access; // zero out buffer
for(index_t i = 0; i < long_vector_size; ++i)
{
p_src_long_vector[i] = 0;
}
*reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) = // load data from src to the long-vector buffer
*reinterpret_cast<const src_vector_t*>(&p_src[src_offset]); for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
}); {
auto scalar_id = make_zero_array<index_t, nDim>();
// store data from the long-vector buffer to dst scalar_id(vector_access_dim) = i * src_data_per_access;
static_for<0, long_vector_size / dst_data_per_access, 1>{}([&](auto i) {
constexpr auto scalar_id = typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
vector_access_dim, i * dst_data_per_access);
constexpr index_t buffer_offset = i * dst_data_per_access; const index_t buffer_offset = i * src_data_per_access;
const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex( const auto src_coord = mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id);
mDstSliceOrigin + (long_vector_data_begin_id + scalar_id));
*reinterpret_cast<dst_vector_t*>(&p_dst[dst_offset]) = // Check src vector's padding situation, only check the first data in this src
*reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]); // vector. It's the user's responsibility to make sure all data in the src vector
}); // has
}); // the same padding situation
if(src_coord.IsUpperIndexMappedToValidOffset())
{
static_if<SrcAddressSpace == address_space_t::global>{}([&](auto) {
#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
__buffer_load<SrcData, SrcDataPerAccess>(
p_src, 0, src_coord.GetOffset());
#else #else
ford<decltype(long_vector_access_lengths), DimAccessOrder>{}( *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
[&](auto long_vector_access_id) { *reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]);
#endif
}).Else([&](auto) {
// src can be all kinds of memory-space.
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
*reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]);
});
}
}
// data id w.r.t slicing-window // SrcData to DstData conversion
auto long_vector_data_begin_id = long_vector_access_id; DstData p_dst_long_vector[long_vector_size];
long_vector_data_begin_id(vector_access_dim) =
long_vector_size * long_vector_access_id[vector_access_dim];
// buffer to hold a long-vector for(index_t i = 0; i < long_vector_size; ++i)
TData p_long_vector[long_vector_size]; {
p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);
}
// load data from src to the long-vector buffer // store data from the long-vector buffer to dst
for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i) for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
{ {
auto scalar_id = make_zero_array<index_t, nDim>(); auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(vector_access_dim) = i * src_data_per_access; scalar_id(vector_access_dim) = i * dst_data_per_access;
const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex( const index_t buffer_offset = i * dst_data_per_access;
mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id));
const index_t buffer_offset = i * src_data_per_access; const auto dst_coord = mDstSliceOrigin + (long_vector_data_begin_id + scalar_id);
*reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) = // Check dst vector's padding situation, only check the first data in this dst
*reinterpret_cast<const src_vector_t*>(&p_src[src_offset]); // vector. It's the user's responsibility to make sure all data in the dst vector
} // has
// the same padding situation
// store data from the long-vector buffer to dst if(dst_coord.IsUpperIndexMappedToValidOffset())
for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
{ {
auto scalar_id = make_zero_array<index_t, nDim>(); static_if<DstAddressSpace == address_space_t::global>{}([&](auto) {
scalar_id(vector_access_dim) = i * dst_data_per_access; #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
__buffer_store<DstData, DstDataPerAccess>(
const index_t buffer_offset = i * dst_data_per_access; *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]),
p_dst,
const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex( 0,
mDstSliceOrigin + (long_vector_data_begin_id + scalar_id)); dst_coord.GetOffset());
#else
*reinterpret_cast<dst_vector_t*>(&p_dst[dst_offset]) = *reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
*reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]); *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
}
});
#endif #endif
}).Else([&](auto) {
// dst can be all kinds of memory-space
*reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
*reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
});
}
}
});
} }
private: // Modify Length to 1, if Mask is set to false
Array<index_t, nDim> mSrcSliceOrigin; // Used for isolating linear dimension from non-linear dimensions
Array<index_t, nDim> mDstSliceOrigin; template <index_t... Lengths, index_t... Mask>
}; __device__ static constexpr auto mask_lengths(Sequence<Lengths...>, Sequence<Mask...>)
template <class SrcDesc,
class DstDesc,
class SrcCoordinate,
class DstCoordinate,
class SliceLengths>
struct ThreadwiseGenericTensorSliceCopy_v2
{
static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v2(SrcCoordinate src_slice_origin,
DstCoordinate dst_slice_origin)
: mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
{
}
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v2()
: ThreadwiseGenericTensorSliceCopy_v2(make_zero_array<index_t, nDim>(),
make_zero_array<index_t, nDim>())
{
}
__device__ void SetSrcSliceOrigin(SrcCoordinate src_slice_origin)
{
mSrcSliceOrigin = src_slice_origin;
}
__device__ void SetDstSliceOrigin(DstCoordinate dst_slice_origin)
{ {
mDstSliceOrigin = dst_slice_origin; return Sequence<(Mask ? Lengths : 1)...>{};
} }
template <class TDesc, class Seq> // p_src must be global-memory, p_dst can be any memory-space.
struct IsolateMergedDimSliceLengthsHack // User should make sure p_src is a block-invariant pointer, because
{ // buffer_load is used for loading from global-memory into register buffer.
template <class IDim> // Will do padding check on src data: Read 0 if src data is in padding area.
__device__ constexpr index_t operator()(IDim idim) const // Will do padding check on dst data: No write if dst data is in padding area.
{ // This version is optimized for address calculation of src tensor
return TDesc::ContainMultipleOriginalDimensions(idim) ? Seq{}[idim] : 1; template <typename SrcData,
} typename DstData,
}; address_space_t SrcAddressSpace = address_space_t::generic,
address_space_t DstAddressSpace = address_space_t::generic>
template <class TData> __device__ void Run_optimized_src_address_calculation(const SrcData* p_src,
__device__ void Run(const TData* p_src, TData* p_dst) const DstData* p_dst) const
{ {
constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{}); using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;
TData p_buffer_[buffer_desc.GetElementSpace()]; constexpr auto vector_access_dim = Number<VectorAccessDim>{};
TData* p_buffer = p_buffer_;
// hacks to isolate merged dimension from normal dimensions, and calculate their offset constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
// seperately constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};
// SrcMergedDimSliceLengthsHack has entry same as SliceLengths on src merged dimensions,
// but 1 on normal dimensions;
// SrcNormalDimSliceLengthsHack has entry same as SliceLengths on src normal dimensions,
// but 1 on merged dimensions;
using SrcMergedDimSliceLengthsHack =
typename sequence_gen<SliceLengths::GetSize(),
IsolateMergedDimSliceLengthsHack<SrcDesc, SliceLengths>>::type;
using SrcNormalDimSliceLengthsHack = constexpr auto long_vector_size = Number<math::lcm(SrcDataPerAccess, DstDataPerAccess)>{};
decltype((SliceLengths{} + Number<1>{}) - SrcMergedDimSliceLengthsHack{});
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2 constexpr auto long_vector_access_lengths = SliceLengths::Modify(
static_ford<SrcMergedDimSliceLengthsHack>{}([&](auto merged_dim_data_id_) { vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
constexpr auto merged_dim_data_id = decltype(merged_dim_data_id_){};
const TData* p_src_tmp = p_src + (mSrcSliceOrigin + merged_dim_data_id).GetOffset(); // separate linear dimensions from non-linear dimensions
constexpr auto src_linear_dim_mask = SrcDesc::GetLinearDimensionMask();
constexpr auto src_nonlinear_dim_mask = SrcDesc::GetNonLinearDimensionMask();
static_ford<SrcNormalDimSliceLengthsHack>{}([&](auto normal_dim_data_id_) { static_assert(src_linear_dim_mask.At(VectorAccessDim) ||
constexpr auto normal_dim_data_id = decltype(normal_dim_data_id_){}; long_vector_size == SrcDataPerAccess,
"Warning! VectorAccessDim is not SrcDesc's linear dimension, performance "
"would drop");
constexpr index_t buffer_offset = // separate steps into linear and non-linear components, accoording to src tensor
buffer_desc.GetOffsetFromMultiIndex(merged_dim_data_id + normal_dim_data_id); constexpr auto linear_long_vector_access_lengths =
mask_lengths(long_vector_access_lengths, src_linear_dim_mask);
constexpr index_t src_normal_offset = constexpr auto nonlinear_long_vector_access_lengths =
SrcDesc::GetOffsetFromMultiIndex(normal_dim_data_id); mask_lengths(long_vector_access_lengths, src_nonlinear_dim_mask);
p_buffer[buffer_offset] = p_src_tmp[src_normal_offset]; // loop over src's non-linear dimensions
}); ford<decltype(nonlinear_long_vector_access_lengths)>{}([&](
}); auto nonlinear_dim_long_vector_access_id) {
#else
ford<SrcMergedDimSliceLengthsHack>{}([&](auto merged_dim_data_id) {
const TData* p_src_tmp = p_src + (mSrcSliceOrigin + merged_dim_data_id).GetOffset();
ford<SrcNormalDimSliceLengthsHack>{}([&](auto normal_dim_data_id) { // calculate step-sizes along src's nonlinear dimensions
const index_t buffer_offset = auto nonlinear_dim_data_steps = nonlinear_dim_long_vector_access_id;
buffer_desc.GetOffsetFromMultiIndex(merged_dim_data_id + normal_dim_data_id); nonlinear_dim_data_steps(vector_access_dim) =
long_vector_size * nonlinear_dim_long_vector_access_id[vector_access_dim];
const index_t src_normal_offset = // move src cooridnate along nonlinear dimensions
SrcDesc::GetOffsetFromMultiIndex(normal_dim_data_id); // this coordinate contains run-time per-thread offset
const auto src_nonlinear_coord = mSrcSliceOrigin + nonlinear_dim_data_steps;
p_buffer[buffer_offset] = p_src_tmp[src_normal_offset]; // loop over src's linear dimensions
}); ford<decltype(linear_long_vector_access_lengths)>{}([&](
}); auto linear_dim_long_vector_access_id) {
#endif
// DstMergedDimSliceLengthsHack has entry same as SliceLengths on dst merged dimensions, // step-sizes along src's linear dimensions
// but 1 on normal dimensions; auto linear_dim_data_steps = linear_dim_long_vector_access_id;
// DstNormalDimSliceLengthsHack has entry same as SliceLengths on dst normal dimensions, linear_dim_data_steps(vector_access_dim) =
// but 1 on merged dimensions; long_vector_size * linear_dim_long_vector_access_id[vector_access_dim];
using DstMergedDimSliceLengthsHack =
typename sequence_gen<SliceLengths::GetSize(),
IsolateMergedDimSliceLengthsHack<DstDesc, SliceLengths>>::type;
using DstNormalDimSliceLengthsHack = // buffer to hold a long-vector
decltype((SliceLengths{} + Number<1>{}) - DstMergedDimSliceLengthsHack{}); SrcData p_src_long_vector[long_vector_size];
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2 // zero out buffer
static_ford<DstMergedDimSliceLengthsHack>{}([&](auto merged_dim_data_id_) { for(index_t i = 0; i < long_vector_size; ++i)
constexpr auto merged_dim_data_id = decltype(merged_dim_data_id_){}; {
p_src_long_vector[i] = 0;
}
TData* p_dst_tmp = p_dst + (mDstSliceOrigin + merged_dim_data_id).GetOffset(); // Loop over VectorAccessDim, and load data from src to the
// long-vector buffer.
// If VectorAccessDim is src's linear dimension, then src's
// offset-diff due to this looping is known at compile-time. If
// VectorAccessDim is src's nonlinear dimension, then src's
// offset-diff due to this looping is only known at run-time. For best
// performance, VectorAccessDim, should be src's linear dimension
for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(vector_access_dim) = i * src_data_per_access;
static_ford<DstNormalDimSliceLengthsHack>{}([&](auto normal_dim_data_id_) { const index_t buffer_offset = i * src_data_per_access;
constexpr auto normal_dim_data_id = decltype(normal_dim_data_id_){};
constexpr index_t buffer_offset = // move src cooridnate along linear dimensions
buffer_desc.GetOffsetFromMultiIndex(merged_dim_data_id + normal_dim_data_id); const auto src_coord =
src_nonlinear_coord + (linear_dim_data_steps + scalar_id);
constexpr index_t dst_normal_offset = // this is src compile-time offset
DstDesc::GetOffsetFromMultiIndex(normal_dim_data_id); // TODO: is this good implementation?
const index_t src_linear_offset =
src_coord.GetOffset() - src_nonlinear_coord.GetOffset();
p_dst_tmp[dst_normal_offset] = p_buffer[buffer_offset]; // Check src vector's padding situation, only check the first data in
}); // this src vector. It's user's responsiblity to make sure all data in
}); // the src vector has the same padding situation
if(src_coord.IsUpperIndexMappedToValidOffset())
{
static_if<SrcAddressSpace == address_space_t::global>{}([&](auto) {
#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
__buffer_load<SrcData, SrcDataPerAccess>(
p_src, src_nonlinear_coord.GetOffset(), src_linear_offset);
#else #else
ford<DstMergedDimSliceLengthsHack>{}([&](auto merged_dim_data_id) { *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
TData* p_dst_tmp = p_dst + (mDstSliceOrigin + merged_dim_data_id).GetOffset(); *reinterpret_cast<const src_vector_t*>(
&p_src[src_nonlinear_coord.GetOffset() + src_linear_offset]);
ford<DstNormalDimSliceLengthsHack>{}([&](auto normal_dim_data_id) {
const index_t buffer_offset =
buffer_desc.GetOffsetFromMultiIndex(merged_dim_data_id + normal_dim_data_id);
const index_t dst_normal_offset =
DstDesc::GetOffsetFromMultiIndex(normal_dim_data_id);
p_dst_tmp[dst_normal_offset] = p_buffer[buffer_offset];
});
});
#endif #endif
} }).Else([&](auto) {
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
*reinterpret_cast<const src_vector_t*>(
&p_src[src_nonlinear_coord.GetOffset() + src_linear_offset]);
});
}
}
// T can be Sequence or Array // SrcData to DstData conversion
template <class T, bool PositiveDirection> DstData p_dst_long_vector[long_vector_size];
__device__ void MoveSrcSlicingWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
{
static_if<PositiveDirection>{}([&](auto) {
mSrcSliceOrigin += step_sizes;
}).Else([&](auto) { mSrcSliceOrigin -= step_sizes; });
}
template <class T, bool PositiveDirection> for(index_t i = 0; i < long_vector_size; ++i)
__device__ void MoveDstSlicingWindow(T step_sizes, integral_constant<bool, PositiveDirection>) {
{ p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);
static_if<PositiveDirection>{}([&](auto) { }
mDstSliceOrigin += step_sizes;
}).Else([&](auto) { mDstSliceOrigin -= step_sizes; });
}
private:
SrcCoordinate mSrcSliceOrigin;
DstCoordinate mDstSliceOrigin;
};
// This threadwise copy allow vector access of src and dst. // store data from the long-vector buffer to dst
// It allows the dimensions of vector access to be different on src and dst. for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
// It also allows the vector size to be different on src and dst. {
// It also allows order of access to be different on src and dst. auto scalar_id = make_zero_array<index_t, nDim>();
// It use register as buffer to hold all data moving from src to dst. scalar_id(vector_access_dim) = i * dst_data_per_access;
// It is designed for copying small amount of data, and src and dst are
// device memory or LDS.
// When copying large amout of data, let's hope compiler will reduce register
// used for the buffer.
template <class SrcDesc,
class DstDesc,
class SrcCoordinate,
class DstCoordinate,
class SliceLengths,
class SrcDimAccessOrder,
class DstDimAccessOrder,
index_t SrcVectorAccessDim,
index_t DstVectorAccessDim,
index_t SrcDataPerAccess,
index_t DstDataPerAccess>
struct ThreadwiseGenericTensorSliceCopy_v2r1
{
static constexpr index_t nDim = SliceLengths::GetSize();
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1(SrcCoordinate src_slice_origin, const index_t buffer_offset = i * dst_data_per_access;
DstCoordinate dst_slice_origin)
: mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
{
static_assert(nDim == SrcDesc::GetNumOfDimension() &&
nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() &&
nDim == SrcDimAccessOrder::GetSize() &&
nDim == DstDimAccessOrder::GetSize(),
"wrong! # of dimensions not the same");
static_assert(is_valid_sequence_map<SrcDimAccessOrder>::value && // dst offset is calculated here, without explicitly separating into
is_valid_sequence_map<DstDimAccessOrder>::value, // compile-time and per-thread component
"wrong! map is not valid"); const auto dst_coord = mDstSliceOrigin + (nonlinear_dim_data_steps +
linear_dim_data_steps + scalar_id);
static_assert(SliceLengths{}[SrcVectorAccessDim] % SrcDataPerAccess == 0 &&
SliceLengths{}[DstVectorAccessDim] % DstDataPerAccess == 0,
"wrong! cannot evenly divide");
// check vectorized memory access
constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
static_if<!SrcDesc::ContainMultipleOriginalDimensions(src_vector_access_dim)>{}(
[&](auto fwd) {
static_assert(
(fwd(SrcDesc{}).GetStride(src_vector_access_dim) == 1 || SrcDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
})
.Else([&](auto fwd) {
static_assert(
(fwd(SrcDesc{}).GetLastOriginalDimensionStride(src_vector_access_dim) == 1 ||
SrcDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
});
static_if<!DstDesc::ContainMultipleOriginalDimensions(dst_vector_access_dim)>{}( // Check dst vector's padding situation, only check the first data in
[&](auto fwd) { // this dst vector. It's user's responsiblity to make sure all data in
static_assert( // the dst vector has the same padding situation
(fwd(DstDesc{}).GetStride(dst_vector_access_dim) == 1 || DstDataPerAccess == 1), if(dst_coord.IsUpperIndexMappedToValidOffset())
"wrong! vectorized access is allowed only if stride == 1"); {
}) *reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
.Else([&](auto fwd) { *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
static_assert( }
(fwd(DstDesc{}).GetLastOriginalDimensionStride(dst_vector_access_dim) == 1 || }
DstDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
}); });
});
} }
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1() // p_src could be any memory space, d_dst must be global memory.
: ThreadwiseGenericTensorSliceCopy_v2r1(make_zero_array<index_t, nDim>(), // User should make sure p_dst is a block-invariant pointer, because
make_zero_array<index_t, nDim>()) // buffer_load is used for storing data from regsiter buffer into global-memory.
{ // Will do padding check on src data: Read 0 if src data is in padding area.
} // Will do padding check on dst data: No write if dst data is in paddin area.
// This version is optimized for address calculation of dst tensor
__device__ void SetSrcSliceOrigin(SrcCoordinate src_slice_origin) template <typename SrcData,
{ typename DstData,
mSrcSliceOrigin = src_slice_origin; address_space_t SrcAddressSpace = address_space_t::generic,
} address_space_t DstAddressSpace = address_space_t::generic>
__device__ void Run_optimized_dst_address_calculation(const SrcData* p_src,
__device__ void SetDstSliceOrigin(DstCoordinate dst_slice_origin) DstData* p_dst) const
{
mDstSliceOrigin = dst_slice_origin;
}
template <class TDesc, class Lengths>
struct IsolateMergedDimLengths
{ {
template <class IDim> using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
__device__ constexpr index_t operator()(IDim idim) const using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;
{
return TDesc::ContainMultipleOriginalDimensions(idim) ? Lengths{}[idim] : 1;
}
};
template <class TData>
__device__ void Run(const TData* p_src, TData* p_dst) const
{
constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});
TData p_buffer_[buffer_desc.GetElementSpace()];
TData* p_buffer = p_buffer_;
// copy data from src into buffer
{
using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
constexpr auto src_access_lengths = SliceLengths::Modify(
src_vector_access_dim,
SliceLengths::Get(src_vector_access_dim) / src_data_per_access);
// Offset w.r.t merged dimensions need to be calculated at run-time. Offset w.r.t
// normal dimensions is known at compile time.
// Below is a hack to isolate merged dimension id from normal dimension id, so the
// corresponding offset can be calculated seperately at run-time and compile-time.
// src_merged_dim_access_lengths has the same value as src_access_lengths on src's
// merged dimensions, and has value = 1 on normal dimensions;
// src_merged_dim_access_lengths has the same value as src_access_lengths on src's
// normal dimensions, and has value = 1 on merged dimensions;
constexpr auto src_merged_dim_access_lengths = typename sequence_gen<
nDim,
IsolateMergedDimLengths<SrcDesc, decltype(src_access_lengths)>>::type{};
constexpr auto src_normal_dim_access_lengths = constexpr auto vector_access_dim = Number<VectorAccessDim>{};
src_access_lengths + Number<1>{} - src_merged_dim_access_lengths;
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
// offset w.r.t. merged dimension need to be computed at run-time constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};
static_ford<decltype(src_merged_dim_access_lengths), SrcDimAccessOrder>{}([&](
auto src_merged_dim_access_id_) {
constexpr auto src_merged_dim_access_id = decltype(src_merged_dim_access_id_){}; constexpr auto long_vector_size = Number<math::lcm(SrcDataPerAccess, DstDataPerAccess)>{};
constexpr auto src_merged_dim_data_id = src_merged_dim_access_id.Modify( constexpr auto long_vector_access_lengths = SliceLengths::Modify(
src_vector_access_dim, vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access);
const TData* p_src_tmp = // separate linear dimensions from non-linear dimensions
p_src + (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset(); constexpr auto dst_linear_dim_mask = DstDesc::GetLinearDimensionMask();
constexpr auto dst_nonlinear_dim_mask = DstDesc::GetNonLinearDimensionMask();
// offset w.r.t. normal dimension can be computed at compile-time static_assert(dst_linear_dim_mask.At(VectorAccessDim) ||
static_ford<decltype(src_normal_dim_access_lengths), SrcDimAccessOrder>{}([&]( long_vector_size == DstDataPerAccess,
auto src_normal_dim_access_id_) { "Warning! VectorAccessDim is not DstDesc's linear dimension, performance "
"would drop");
constexpr auto src_normal_dim_access_id = decltype(src_normal_dim_access_id_){}; // separate steps into linear and non-linear components, accoording to dst tensor
constexpr auto linear_long_vector_access_lengths =
mask_lengths(long_vector_access_lengths, dst_linear_dim_mask);
constexpr auto src_normal_dim_data_id = src_normal_dim_access_id.Modify( constexpr auto nonlinear_long_vector_access_lengths =
src_vector_access_dim, mask_lengths(long_vector_access_lengths, dst_nonlinear_dim_mask);
src_normal_dim_access_id[src_vector_access_dim] * src_data_per_access);
constexpr index_t src_normal_offset = // loop over dst's non-linear dimensions
SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id); ford<decltype(nonlinear_long_vector_access_lengths)>{}([&](
auto nonlinear_dim_long_vector_access_id) {
// load vector from src // calculate step-sizes along dst's nonlinear dimensions
const src_vector_t vector_data = auto nonlinear_dim_data_steps = nonlinear_dim_long_vector_access_id;
*reinterpret_cast<const src_vector_t*>(&p_src_tmp[src_normal_offset]); nonlinear_dim_data_steps(vector_access_dim) =
long_vector_size * nonlinear_dim_long_vector_access_id[vector_access_dim];
// unpack vector into buffer // move dst cooridnate along nonlinear dimensions
static_for<0, SrcDataPerAccess, 1>{}([&](auto i) { // this coordinate contains run-time per-thread offset
constexpr auto scalar_id = const auto dst_nonlinear_coord = mDstSliceOrigin + nonlinear_dim_data_steps;
typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
src_vector_access_dim, i);
constexpr index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex( // loop over dst's linear dimensions
src_merged_dim_data_id + src_normal_dim_data_id + scalar_id); ford<decltype(linear_long_vector_access_lengths)>{}([&](
auto linear_dim_long_vector_access_id) {
p_buffer[buffer_offset] = reinterpret_cast<const TData*>(&vector_data)[i]; // step-sizes along dst's linear dimensions
}); auto linear_dim_data_steps = linear_dim_long_vector_access_id;
}); linear_dim_data_steps(vector_access_dim) =
}); long_vector_size * linear_dim_long_vector_access_id[vector_access_dim];
#else
ford<decltype(src_merged_dim_access_lengths), SrcDimAccessOrder>{}([&](
auto src_merged_dim_access_id) {
auto src_merged_dim_data_id = src_merged_dim_access_id; // buffer to hold a long-vector
src_merged_dim_data_id(src_vector_access_dim) = SrcData p_src_long_vector[long_vector_size];
src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access;
const TData* p_src_tmp =
p_src + (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset();
// these should be compile-time known // zero out buffer
ford<decltype(src_normal_dim_access_lengths), SrcDimAccessOrder>{}([&]( for(index_t i = 0; i < long_vector_size; ++i)
auto src_normal_dim_access_id) { {
p_src_long_vector[i] = 0;
}
auto src_normal_dim_data_id = src_normal_dim_access_id; // Loop over VectorAccessDim, and load data from src to the
src_normal_dim_data_id(src_vector_access_dim) = // long-vector buffer.
src_normal_dim_access_id[src_vector_access_dim] * src_data_per_access; // If VectorAccessDim is dst's linear dimension, then dst's
// offset-diff due to this looping is known at compile-time. If
// VectorAccessDim is dst's nonlinear dimension, then dst's
// offset-diff due to this looping is only known at run-time. For best
// performance, VectorAccessDim, should be dst's linear dimension
for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(vector_access_dim) = i * src_data_per_access;
const index_t src_normal_offset = const index_t buffer_offset = i * src_data_per_access;
SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id);
// load vector from src // src offset is calculated here, without explicitly separating into
const src_vector_t vector_data = // compile-time and per-thread component
*reinterpret_cast<const src_vector_t*>(&p_src_tmp[src_normal_offset]); const auto src_coord = mSrcSliceOrigin + (nonlinear_dim_data_steps +
linear_dim_data_steps + scalar_id);
// unpack vector into buffer // Check src vector's padding situation, only check the first data in
for(index_t i = 0; i < SrcDataPerAccess; ++i) // this src vector. It's user's responsiblity to make sure all data in
// the src vector has the same padding situation
if(src_coord.IsUpperIndexMappedToValidOffset())
{ {
auto scalar_id = make_zero_array<index_t, nDim>(); *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
scalar_id(src_vector_access_dim) = i; *reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]);
const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
src_merged_dim_data_id + src_normal_dim_data_id + scalar_id);
p_buffer[buffer_offset] = reinterpret_cast<const TData*>(&vector_data)[i];
} }
}); }
});
#endif
}
// copy data from buffer into dst
{
using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;
constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};
constexpr auto dst_access_lengths = SliceLengths::Modify(
dst_vector_access_dim,
SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access);
constexpr auto dst_merged_dim_access_lengths = typename sequence_gen<
nDim,
IsolateMergedDimLengths<DstDesc, decltype(dst_access_lengths)>>::type{};
constexpr auto dst_normal_dim_access_lengths =
dst_access_lengths + Number<1>{} - dst_merged_dim_access_lengths;
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
// offset w.r.t. merged dimension need to be computed at run-time
static_ford<decltype(dst_merged_dim_access_lengths), DstDimAccessOrder>{}([&](
auto dst_merged_dim_access_id_) {
constexpr auto dst_merged_dim_access_id = decltype(dst_merged_dim_access_id_){};
constexpr auto dst_merged_dim_data_id = dst_merged_dim_access_id.Modify(
dst_vector_access_dim,
dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access);
TData* p_dst_tmp = p_dst + (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset();
// offset w.r.t. normal dimension can be computed at compile-time
static_ford<decltype(dst_normal_dim_access_lengths), DstDimAccessOrder>{}([&](
auto dst_normal_dim_access_id_) {
constexpr auto dst_normal_dim_access_id = decltype(dst_normal_dim_access_id_){};
constexpr auto dst_normal_dim_data_id = dst_normal_dim_access_id.Modify(
dst_vector_access_dim,
dst_normal_dim_access_id[dst_vector_access_dim] * dst_data_per_access);
dst_vector_t vector_data;
// pack vector from buffer
static_for<0, DstDataPerAccess, 1>{}([&](auto i) {
constexpr auto scalar_id =
typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
dst_vector_access_dim, i);
constexpr index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id);
reinterpret_cast<TData*>(&vector_data)[i] = p_buffer[buffer_offset];
});
constexpr index_t dst_normal_offset = // SrcData to DstData conversion
DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id); DstData p_dst_long_vector[long_vector_size];
// write vector into dst for(index_t i = 0; i < long_vector_size; ++i)
*reinterpret_cast<dst_vector_t*>(&p_dst_tmp[dst_normal_offset]) = vector_data; {
}); p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);
}); }
#else
// offset w.r.t. merged dimension need to be computed at run-time
ford<decltype(dst_merged_dim_access_lengths), DstDimAccessOrder>{}([&](
auto dst_merged_dim_access_id) {
auto dst_merged_dim_data_id = dst_merged_dim_access_id;
dst_merged_dim_data_id(dst_vector_access_dim) =
dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access;
TData* p_dst_tmp = p_dst + (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset(); // store data from the long-vector buffer to dst
for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(vector_access_dim) = i * dst_data_per_access;
// offset w.r.t. normal dimension can be computed at compile-time const index_t buffer_offset = i * dst_data_per_access;
ford<decltype(dst_normal_dim_access_lengths), DstDimAccessOrder>{}([&](
auto dst_normal_dim_access_id) {
auto dst_normal_dim_data_id = dst_normal_dim_access_id; // move dst cooridnate along linear dimensions
dst_normal_dim_data_id(dst_vector_access_dim) = const auto dst_coord =
dst_normal_dim_access_id[dst_vector_access_dim] * dst_data_per_access; dst_nonlinear_coord + (linear_dim_data_steps + scalar_id);
dst_vector_t vector_data; // this is dst compile-time offset
// TODO: is this good implementation?
const index_t dst_linear_offset =
dst_coord.GetOffset() - dst_nonlinear_coord.GetOffset();
// pack vector from buffer // Check dst vector's padding situation, only check the first data in
for(index_t i = 0; i < DstDataPerAccess; ++i) // this dst vector. It's user's responsiblity to make sure all data in
// the dst vector has the same padding situation
if(dst_coord.IsUpperIndexMappedToValidOffset())
{ {
auto scalar_id = make_zero_array<index_t, nDim>(); static_if<DstAddressSpace == address_space_t::global>{}([&](auto) {
scalar_id(dst_vector_access_dim) = i; #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
__buffer_store<DstData, DstDataPerAccess>(
const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex( *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]),
dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id); p_dst,
dst_nonlinear_coord.GetOffset(),
reinterpret_cast<TData*>(&vector_data)[i] = p_buffer[buffer_offset]; dst_linear_offset);
#else
*reinterpret_cast<dst_vector_t*>(
&p_dst[dst_nonlinear_coord.GetOffset() + dst_linear_offset]) =
*reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
#endif
}).Else([&](auto) {
*reinterpret_cast<dst_vector_t*>(
&p_dst[dst_nonlinear_coord.GetOffset() + dst_linear_offset]) =
*reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
});
} }
}
const index_t dst_normal_offset =
DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id);
// write vector into dst
*reinterpret_cast<dst_vector_t*>(&p_dst_tmp[dst_normal_offset]) = vector_data;
});
}); });
#endif });
}
} }
// T can be Sequence or Array template <typename T, bool PositiveDirection>
template <class T, bool PositiveDirection> __device__ void MoveSrcSliceWindow(const T& step_sizes_,
__device__ void MoveSrcSlicingWindow(T step_sizes, integral_constant<bool, PositiveDirection>) integral_constant<bool, PositiveDirection>)
{ {
const auto step_sizes = to_array(step_sizes_);
static_if<PositiveDirection>{}([&](auto) { static_if<PositiveDirection>{}([&](auto) {
mSrcSliceOrigin += step_sizes; mSrcSliceOrigin += to_array(step_sizes);
}).Else([&](auto) { mSrcSliceOrigin -= step_sizes; }); }).Else([&](auto) { mSrcSliceOrigin -= step_sizes; });
} }
template <class T, bool PositiveDirection> template <typename T, bool PositiveDirection>
__device__ void MoveDstSlicingWindow(T step_sizes, integral_constant<bool, PositiveDirection>) __device__ void MoveDstSliceWindow(const T& step_sizes_,
integral_constant<bool, PositiveDirection>)
{ {
const auto step_sizes = to_array(step_sizes_);
static_if<PositiveDirection>{}([&](auto) { static_if<PositiveDirection>{}([&](auto) {
mDstSliceOrigin += step_sizes; mDstSliceOrigin += step_sizes;
}).Else([&](auto) { mDstSliceOrigin -= step_sizes; }); }).Else([&](auto) { mDstSliceOrigin -= step_sizes; });
} }
private: private:
SrcCoordinate mSrcSliceOrigin; SrcCoord mSrcSliceOrigin;
DstCoordinate mDstSliceOrigin; DstCoord mDstSliceOrigin;
}; };
} // namespace ck } // namespace ck
#ifndef CK_THREADWISE_GENERIC_TENSOR_SLICE_COPY_DEPRECATED_HPP
#define CK_THREADWISE_GENERIC_TENSOR_SLICE_COPY_DEPRECATED_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
#include "tensor_view.hpp"
#include "tensor_coordinate_deprecated.hpp"
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
#endif
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
#endif
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
#endif
#ifndef CK_USE_AMD_INTRINSIC
#define CK_USE_AMD_INTRINSIC 1
#endif
#ifndef CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1
#endif
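// Example (illustrative, not from the original source): because the macros above are
// only defined when not already set, the AMD buffer-intrinsic paths can be toggled from
// the compiler command line instead of editing this header, e.g.
//
//   hipcc -DCK_USE_AMD_INTRINSIC=0 -c kernel.cpp    // file name is hypothetical
//
// which forces the fallback reinterpret_cast load/store paths used below.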
namespace ck {
// This threadwise copy allows vector access of src and dst.
// It allows the dimensions of vector access to be different on src and dst.
// It also allows the vector size to be different on src and dst.
// It also allows the order of access to be different on src and dst.
// It uses registers as a buffer to hold all data moving from src to dst.
// It is designed for copying small amounts of data, where src and dst are
// device memory or LDS.
// When copying large amounts of data, let's hope the compiler will reduce the
// registers used for the buffer.
// A usage sketch follows this struct definition.
template <typename SrcDesc,
typename DstDesc,
typename SliceLengths,
typename SrcDimAccessOrder,
typename DstDimAccessOrder,
index_t SrcVectorAccessDim,
index_t DstVectorAccessDim,
index_t SrcDataPerAccess,
index_t DstDataPerAccess>
struct ThreadwiseGenericTensorSliceCopy_v1r1
{
static constexpr index_t nDim = SliceLengths::GetSize();
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r1(
Array<index_t, nDim> src_slice_origin, Array<index_t, nDim> dst_slice_origin)
: mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
{
static_assert(nDim == SrcDesc::GetNumOfDimension() &&
nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() &&
nDim == SrcDimAccessOrder::GetSize() &&
nDim == DstDimAccessOrder::GetSize(),
"wrong! # of dimensions not the same");
static_assert(is_valid_sequence_map<SrcDimAccessOrder>::value &&
is_valid_sequence_map<DstDimAccessOrder>::value,
"wrong! map is not valid");
static_assert(SliceLengths{}[SrcVectorAccessDim] % SrcDataPerAccess == 0 &&
SliceLengths{}[DstVectorAccessDim] % DstDataPerAccess == 0,
"wrong! cannot evenly divide");
// check vectorized memory access
constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
static_if<!SrcDesc::ContainMultipleOriginalDimensions(src_vector_access_dim)>{}(
[&](auto fwd) {
static_assert(
(fwd(SrcDesc{}).GetStride(src_vector_access_dim) == 1 || SrcDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
})
.Else([&](auto fwd) {
static_assert(
(fwd(SrcDesc{}).GetLastOriginalDimensionStride(src_vector_access_dim) == 1 ||
SrcDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
});
static_if<!DstDesc::ContainMultipleOriginalDimensions(dst_vector_access_dim)>{}(
[&](auto fwd) {
static_assert(
(fwd(DstDesc{}).GetStride(dst_vector_access_dim) == 1 || DstDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
})
.Else([&](auto fwd) {
static_assert(
(fwd(DstDesc{}).GetLastOriginalDimensionStride(dst_vector_access_dim) == 1 ||
DstDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
});
}
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r1()
: ThreadwiseGenericTensorSliceCopy_v1r1(make_zero_array<index_t, nDim>(),
make_zero_array<index_t, nDim>())
{
}
__device__ void SetSrcSliceOrigin(Array<index_t, nDim> src_slice_origin)
{
mSrcSliceOrigin = src_slice_origin;
}
__device__ void SetDstSliceOrigin(Array<index_t, nDim> dst_slice_origin)
{
mDstSliceOrigin = dst_slice_origin;
}
template <typename TData>
__device__ void Run(const TData* p_src, TData* p_dst) const
{
constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});
TData p_buffer_[buffer_desc.GetElementSpace()];
TData* p_buffer = p_buffer_;
// copy data from src into buffer
{
using vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
constexpr auto src_access_lengths = SliceLengths::Modify(
src_vector_access_dim,
SliceLengths::Get(src_vector_access_dim) / src_data_per_access);
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
static_ford<decltype(src_access_lengths), SrcDimAccessOrder>{}([&](auto src_access_id) {
constexpr auto src_data_begin_id = src_access_id.Modify(
src_vector_access_dim,
src_access_id[src_vector_access_dim] * src_data_per_access);
const index_t src_offset =
SrcDesc::GetOffsetFromMultiIndex(mSrcSliceOrigin + src_data_begin_id);
// load vector from src
const vector_t vector_data = *reinterpret_cast<const vector_t*>(&p_src[src_offset]);
// unpack vector into buffer
static_for<0, SrcDataPerAccess, 1>{}([&](auto i) {
constexpr auto scalar_id =
typename uniform_sequence_gen<nDim, 0>::type{}.Modify(src_vector_access_dim,
i);
constexpr index_t buffer_offset =
buffer_desc.GetOffsetFromMultiIndex(src_data_begin_id + scalar_id);
p_buffer[buffer_offset] = reinterpret_cast<const TData*>(&vector_data)[i];
});
});
#else
ford<decltype(src_access_lengths), SrcDimAccessOrder>{}([&](auto src_access_id) {
auto src_data_begin_id = src_access_id;
src_data_begin_id(src_vector_access_dim) =
src_access_id[src_vector_access_dim] * src_data_per_access;
const index_t src_offset =
SrcDesc::GetOffsetFromMultiIndex(mSrcSliceOrigin + src_data_begin_id);
// load vector from src
const vector_t vector_data = *reinterpret_cast<const vector_t*>(&p_src[src_offset]);
// unpack vector into buffer
for(index_t i = 0; i < SrcDataPerAccess; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(src_vector_access_dim) = i;
const index_t buffer_offset =
buffer_desc.GetOffsetFromMultiIndex(src_data_begin_id + scalar_id);
p_buffer[buffer_offset] = reinterpret_cast<const TData*>(&vector_data)[i];
}
});
#endif
}
// copy data from buffer to dst
{
using vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;
constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};
constexpr auto dst_access_lengths = SliceLengths::Modify(
dst_vector_access_dim,
SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access);
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
static_ford<decltype(dst_access_lengths), DstDimAccessOrder>{}([&](auto dst_access_id) {
constexpr auto dst_data_begin_id = dst_access_id.Modify(
dst_vector_access_dim,
dst_access_id[dst_vector_access_dim] * dst_data_per_access);
vector_t vector_data{};
// pack vector from buffer
static_for<0, DstDataPerAccess, 1>{}([&](auto i) {
constexpr auto scalar_id =
typename uniform_sequence_gen<nDim, 0>::type{}.Modify(dst_vector_access_dim,
i);
constexpr index_t buffer_offset =
buffer_desc.GetOffsetFromMultiIndex(dst_data_begin_id + scalar_id);
reinterpret_cast<TData*>(&vector_data)[i] = p_buffer[buffer_offset];
});
const index_t dst_offset =
DstDesc::GetOffsetFromMultiIndex(mDstSliceOrigin + dst_data_begin_id);
// store vector into dst
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) = vector_data;
});
#else
ford<decltype(dst_access_lengths), DstDimAccessOrder>{}([&](auto dst_access_id) {
auto dst_data_begin_id = dst_access_id;
dst_data_begin_id(dst_vector_access_dim) =
dst_access_id[dst_vector_access_dim] * dst_data_per_access;
vector_t vector_data{};
// pack vector from buffer
for(index_t i = 0; i < DstDataPerAccess; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(dst_vector_access_dim) = i;
const index_t buffer_offset =
buffer_desc.GetOffsetFromMultiIndex(dst_data_begin_id + scalar_id);
reinterpret_cast<TData*>(&vector_data)[i] = p_buffer[buffer_offset];
}
const index_t dst_offset =
DstDesc::GetOffsetFromMultiIndex(mDstSliceOrigin + dst_data_begin_id);
// store vector into dst
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) = vector_data;
});
#endif
}
}
private:
Array<index_t, nDim> mSrcSliceOrigin;
Array<index_t, nDim> mDstSliceOrigin;
};
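// Usage sketch for ThreadwiseGenericTensorSliceCopy_v1r1 (illustrative only, not part of
// the original header): copies a packed 2x4 slice with 4-wide vector access along
// dimension 1. The function name, descriptor shape and access parameters below are
// assumptions chosen for the example.
__device__ void example_threadwise_copy_v1r1(const float* p_src, float* p_dst)
{
    constexpr auto desc = make_ConstantTensorDescriptor_packed(Sequence<2, 4>{});

    using Copy = ThreadwiseGenericTensorSliceCopy_v1r1<decltype(desc), // SrcDesc
                                                       decltype(desc), // DstDesc
                                                       Sequence<2, 4>, // SliceLengths
                                                       Sequence<0, 1>, // SrcDimAccessOrder
                                                       Sequence<0, 1>, // DstDimAccessOrder
                                                       1,              // SrcVectorAccessDim
                                                       1,              // DstVectorAccessDim
                                                       4,              // SrcDataPerAccess
                                                       4>;             // DstDataPerAccess

    // slice origins at {0, 0}; Run() stages the whole slice through a register buffer
    Copy copy(make_zero_array<index_t, 2>(), make_zero_array<index_t, 2>());
    copy.Run(p_src, p_dst);
}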
// This threadwise copy allows vector access of src and dst.
// It allows the vector size to be different on src and dst.
// The dimension of vector access should be the same on src and dst.
// The dimension access order should be the same on src and dst.
// It is designed for cases where one of src and dst is a register buffer, and
// the other is device memory or LDS.
// A usage sketch follows this struct definition.
template <typename SrcDesc,
typename DstDesc,
typename SliceLengths,
typename DimAccessOrder,
index_t VectorAccessDim,
index_t SrcDataPerAccess,
index_t DstDataPerAccess>
struct ThreadwiseGenericTensorSliceCopy_v1r2
{
static constexpr index_t nDim = SliceLengths::GetSize();
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r2(
Array<index_t, nDim> src_slice_origin, Array<index_t, nDim> dst_slice_origin)
: mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
{
static_assert(nDim == SrcDesc::GetNumOfDimension() &&
nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() &&
nDim == DimAccessOrder::GetSize(),
"wrong! # of dimensions not the same");
static_assert(is_valid_sequence_map<DimAccessOrder>::value, "wrong! map is not valid");
static_assert(
SliceLengths{}[VectorAccessDim] % math::lcm(SrcDataPerAccess, DstDataPerAccess) == 0,
"wrong! cannot evenly divide");
// check vectorized memory access
constexpr auto vector_access_dim = Number<VectorAccessDim>{};
static_if<!SrcDesc::ContainMultipleOriginalDimensions(vector_access_dim)>{}([&](auto fwd) {
static_assert(
(fwd(SrcDesc{}).GetStride(vector_access_dim) == 1 || SrcDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
}).Else([&](auto fwd) {
static_assert((fwd(SrcDesc{}).GetLastOriginalDimensionStride(vector_access_dim) == 1 ||
SrcDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
});
static_if<!DstDesc::ContainMultipleOriginalDimensions(vector_access_dim)>{}([&](auto fwd) {
static_assert(
(fwd(DstDesc{}).GetStride(vector_access_dim) == 1 || DstDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
}).Else([&](auto fwd) {
static_assert((fwd(DstDesc{}).GetLastOriginalDimensionStride(vector_access_dim) == 1 ||
DstDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
});
}
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r2()
: ThreadwiseGenericTensorSliceCopy_v1r2(make_zero_array<index_t, nDim>(),
make_zero_array<index_t, nDim>())
{
}
__device__ void SetSrcSliceOrigin(Array<index_t, nDim> src_slice_origin)
{
mSrcSliceOrigin = src_slice_origin;
}
__device__ void SetDstSliceOrigin(Array<index_t, nDim> dst_slice_origin)
{
mDstSliceOrigin = dst_slice_origin;
}
template <typename TData>
__device__ void Run(const TData* p_src, TData* p_dst) const
{
using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;
constexpr auto vector_access_dim = Number<VectorAccessDim>{};
constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};
constexpr auto long_vector_size = Number<math::lcm(SrcDataPerAccess, DstDataPerAccess)>{};
constexpr auto long_vector_access_lengths = SliceLengths::Modify(
vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2
static_ford<decltype(long_vector_access_lengths), DimAccessOrder>{}([&](
auto long_vector_access_id) {
// data id w.r.t slicing-window
constexpr auto long_vector_data_begin_id = long_vector_access_id.Modify(
vector_access_dim, long_vector_access_id[vector_access_dim] * long_vector_size);
// buffer to hold a long-vector
TData p_long_vector[long_vector_size];
// load data from src to the long-vector buffer
static_for<0, long_vector_size / src_data_per_access, 1>{}([&](auto i) {
constexpr auto scalar_id = typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
vector_access_dim, i * src_data_per_access);
const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex(
mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id));
constexpr index_t buffer_offset = i * src_data_per_access;
*reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
*reinterpret_cast<const src_vector_t*>(&p_src[src_offset]);
});
// store data from the long-vector buffer to dst
static_for<0, long_vector_size / dst_data_per_access, 1>{}([&](auto i) {
constexpr auto scalar_id = typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
vector_access_dim, i * dst_data_per_access);
constexpr index_t buffer_offset = i * dst_data_per_access;
const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(
mDstSliceOrigin + (long_vector_data_begin_id + scalar_id));
*reinterpret_cast<dst_vector_t*>(&p_dst[dst_offset]) =
*reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
});
});
#else
ford<decltype(long_vector_access_lengths), DimAccessOrder>{}(
[&](auto long_vector_access_id) {
// data id w.r.t slicing-window
auto long_vector_data_begin_id = long_vector_access_id;
long_vector_data_begin_id(vector_access_dim) =
long_vector_size * long_vector_access_id[vector_access_dim];
// buffer to hold a long-vector
TData p_long_vector[long_vector_size];
// load data from src to the long-vector buffer
for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(vector_access_dim) = i * src_data_per_access;
const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex(
mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id));
const index_t buffer_offset = i * src_data_per_access;
*reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
*reinterpret_cast<const src_vector_t*>(&p_src[src_offset]);
}
// store data from the long-vector buffer to dst
for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(vector_access_dim) = i * dst_data_per_access;
const index_t buffer_offset = i * dst_data_per_access;
const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(
mDstSliceOrigin + (long_vector_data_begin_id + scalar_id));
*reinterpret_cast<dst_vector_t*>(&p_dst[dst_offset]) =
*reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
}
});
#endif
}
private:
Array<index_t, nDim> mSrcSliceOrigin;
Array<index_t, nDim> mDstSliceOrigin;
};
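// Usage sketch for ThreadwiseGenericTensorSliceCopy_v1r2 (illustrative only, not part of
// the original header): src is read 4 floats per access and dst is written 2 floats per
// access along the same vector dimension, so the long-vector size is lcm(4, 2) = 4.
// Descriptor shape and parameters are assumptions chosen for the example.
__device__ void example_threadwise_copy_v1r2(const float* p_src, float* p_dst)
{
    constexpr auto desc = make_ConstantTensorDescriptor_packed(Sequence<2, 8>{});

    ThreadwiseGenericTensorSliceCopy_v1r2<decltype(desc), // SrcDesc
                                          decltype(desc), // DstDesc
                                          Sequence<2, 8>, // SliceLengths
                                          Sequence<0, 1>, // DimAccessOrder
                                          1,              // VectorAccessDim
                                          4,              // SrcDataPerAccess
                                          2>              // DstDataPerAccess
        copy(make_zero_array<index_t, 2>(), make_zero_array<index_t, 2>());

    copy.Run(p_src, p_dst);
}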
// This version uses TensorCoordinate_deprecated.
// This threadwise copy allows vector access of src and dst.
// It allows the dimensions of vector access to be different on src and dst.
// It also allows the vector size to be different on src and dst.
// It also allows the order of access to be different on src and dst.
// It uses registers as a buffer to hold all data moving from src to dst.
// It is designed for copying small amounts of data, where src and dst are
// device memory or LDS.
// When copying large amounts of data, let's hope the compiler will reduce the
// registers used for the buffer.
// A usage sketch follows this struct definition.
template <typename SrcDesc,
typename DstDesc,
typename SliceLengths,
typename SrcDimAccessOrder,
typename DstDimAccessOrder,
index_t SrcVectorAccessDim,
index_t DstVectorAccessDim,
index_t SrcDataPerAccess,
index_t DstDataPerAccess>
struct ThreadwiseGenericTensorSliceCopy_v2r1
{
static constexpr index_t nDim = SliceLengths::GetSize();
using Index = MultiIndex<nDim>;
using SrcCoordinate = typename TensorCoordinate_deprecated<SrcDesc>::type;
using DstCoordinate = typename TensorCoordinate_deprecated<DstDesc>::type;
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1(const Index& src_slice_origin,
const Index& dst_slice_origin)
: mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
{
static_assert(nDim == SrcDesc::GetNumOfDimension() &&
nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() &&
nDim == SrcDimAccessOrder::GetSize() &&
nDim == DstDimAccessOrder::GetSize(),
"wrong! # of dimensions not the same");
static_assert(is_valid_sequence_map<SrcDimAccessOrder>::value &&
is_valid_sequence_map<DstDimAccessOrder>::value,
"wrong! map is not valid");
static_assert(SliceLengths{}[SrcVectorAccessDim] % SrcDataPerAccess == 0 &&
SliceLengths{}[DstVectorAccessDim] % DstDataPerAccess == 0,
"wrong! cannot evenly divide");
// check vectorized memory access
constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
static_if<!SrcDesc::ContainMultipleOriginalDimensions(src_vector_access_dim)>{}(
[&](auto fwd) {
static_assert(
(fwd(SrcDesc{}).GetStride(src_vector_access_dim) == 1 || SrcDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
})
.Else([&](auto fwd) {
static_assert(
(fwd(SrcDesc{}).GetLastOriginalDimensionStride(src_vector_access_dim) == 1 ||
SrcDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
});
static_if<!DstDesc::ContainMultipleOriginalDimensions(dst_vector_access_dim)>{}(
[&](auto fwd) {
static_assert(
(fwd(DstDesc{}).GetStride(dst_vector_access_dim) == 1 || DstDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
})
.Else([&](auto fwd) {
static_assert(
(fwd(DstDesc{}).GetLastOriginalDimensionStride(dst_vector_access_dim) == 1 ||
DstDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
});
}
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1()
: ThreadwiseGenericTensorSliceCopy_v2r1(make_zero_array<index_t, nDim>(),
make_zero_array<index_t, nDim>())
{
}
__device__ void SetSrcSliceOrigin(SrcCoordinate src_slice_origin)
{
mSrcSliceOrigin = src_slice_origin;
}
__device__ void SetDstSliceOrigin(DstCoordinate dst_slice_origin)
{
mDstSliceOrigin = dst_slice_origin;
}
template <typename TDesc, class Lengths>
struct IsolateMergedDimLengths
{
template <typename IDim>
__device__ constexpr index_t operator()(IDim idim) const
{
return TDesc::ContainMultipleOriginalDimensions(idim) ? Lengths{}[idim] : 1;
}
};
template <typename TData,
address_space_t SrcAddressSpace = address_space_t::generic,
address_space_t DstAddressSpace = address_space_t::generic>
__device__ void Run(const TData* p_src, TData* p_dst) const
{
constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});
TData p_buffer_[buffer_desc.GetElementSpace()];
TData* p_buffer = p_buffer_;
// copy data from src into buffer
{
using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
constexpr auto src_access_lengths = SliceLengths::Modify(
src_vector_access_dim,
SliceLengths::Get(src_vector_access_dim) / src_data_per_access);
// Offsets w.r.t. merged dimensions need to be calculated at run-time, while offsets
// w.r.t. normal dimensions are known at compile time.
// Below is a hack to isolate merged-dimension ids from normal-dimension ids, so the
// corresponding offsets can be calculated separately at run-time and compile-time.
// src_merged_dim_access_lengths has the same value as src_access_lengths on src's
// merged dimensions, and has value = 1 on normal dimensions;
// src_normal_dim_access_lengths has the same value as src_access_lengths on src's
// normal dimensions, and has value = 1 on merged dimensions;
constexpr auto src_merged_dim_access_lengths = typename sequence_gen<
nDim,
IsolateMergedDimLengths<SrcDesc, decltype(src_access_lengths)>>::type{};
constexpr auto src_normal_dim_access_lengths =
src_access_lengths + Number<1>{} - src_merged_dim_access_lengths;
ford<decltype(src_merged_dim_access_lengths), SrcDimAccessOrder>{}([&](
auto src_merged_dim_access_id) {
auto src_merged_dim_data_id = src_merged_dim_access_id;
src_merged_dim_data_id(src_vector_access_dim) =
src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access;
// offset w.r.t. merged dimensions needs to be computed at run-time,
const index_t src_merged_offset =
(mSrcSliceOrigin + src_merged_dim_data_id).GetOffset();
ford<decltype(src_normal_dim_access_lengths), SrcDimAccessOrder>{}([&](
auto src_normal_dim_access_id) {
auto src_normal_dim_data_id = src_normal_dim_access_id;
src_normal_dim_data_id(src_vector_access_dim) =
src_normal_dim_access_id[src_vector_access_dim] * src_data_per_access;
// offset w.r.t. normal dimension is known at compile-time
const index_t src_normal_offset =
SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id);
src_vector_t vector_data;
// Read vector from src.
// 1. The source-code version can take src from all kinds of memory-space.
// 2. The inline-asm versions using global_load or buffer_load can only take
//    src from global-memory.
//
// Comment for loading from global-memory:
// When
//   1) using source code, in order for the compiler to emit the optimal
//      load instruction, or
//   2) using inline asm (global_load or buffer_load), in order
//      for the inline asm to be valid,
// the following assumptions need to be satisfied:
//   1. p_src needs to be block-invariant (assumption)
//   2. src_normal_offset must be calculated at compile time (guaranteed)
//   3. src_merged_offset can be a runtime value (no assumption imposed)
static_if<SrcAddressSpace == address_space_t::global>{}([&](auto) {
#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
vector_data = __buffer_load<TData, SrcDataPerAccess>(
p_src, src_merged_offset, src_normal_offset);
#else
vector_data = *reinterpret_cast<const src_vector_t*>(
&p_src[src_normal_offset + src_merged_offset]);
#endif
}).Else([&](auto) {
// src can be all kinds of memory-space.
vector_data = *reinterpret_cast<const src_vector_t*>(
&p_src[src_normal_offset + src_merged_offset]);
});
// unpack vector into buffer
for(index_t i = 0; i < SrcDataPerAccess; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(src_vector_access_dim) = i;
const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
src_merged_dim_data_id + src_normal_dim_data_id + scalar_id);
p_buffer[buffer_offset] = reinterpret_cast<const TData*>(&vector_data)[i];
}
});
});
}
// copy data from buffer into dst
{
using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;
constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};
constexpr auto dst_access_lengths = SliceLengths::Modify(
dst_vector_access_dim,
SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access);
constexpr auto dst_merged_dim_access_lengths = typename sequence_gen<
nDim,
IsolateMergedDimLengths<DstDesc, decltype(dst_access_lengths)>>::type{};
constexpr auto dst_normal_dim_access_lengths =
dst_access_lengths + Number<1>{} - dst_merged_dim_access_lengths;
ford<decltype(dst_merged_dim_access_lengths), DstDimAccessOrder>{}(
[&](auto dst_merged_dim_access_id) {
auto dst_merged_dim_data_id = dst_merged_dim_access_id;
dst_merged_dim_data_id(dst_vector_access_dim) =
dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access;
// offset w.r.t. merged dimensions needs to be computed at run-time,
const index_t dst_merged_offset =
(mDstSliceOrigin + dst_merged_dim_data_id).GetOffset();
ford<decltype(dst_normal_dim_access_lengths), DstDimAccessOrder>{}([&](
auto dst_normal_dim_access_id) {
auto dst_normal_dim_data_id = dst_normal_dim_access_id;
dst_normal_dim_data_id(dst_vector_access_dim) =
dst_normal_dim_access_id[dst_vector_access_dim] * dst_data_per_access;
dst_vector_t vector_data;
// pack vector from buffer
for(index_t i = 0; i < DstDataPerAccess; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(dst_vector_access_dim) = i;
const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id);
reinterpret_cast<TData*>(&vector_data)[i] = p_buffer[buffer_offset];
}
// offset w.r.t. normal dimension is known at compile-time
const index_t dst_normal_offset =
DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id);
// Write vector into dst.
// 1. The source-code version can take dst from all kinds of memory-space.
// 2. The inline-asm versions using global_store or buffer_store can only take
//    dst from global-memory.
//
// Comment for storing into global-memory:
// When
//   1) using source code, in order for the compiler to emit the optimal
//      store instruction, or
//   2) using inline asm (global_store or buffer_store), in order
//      for the inline asm to be valid,
// the following assumptions need to be satisfied:
//   1. p_dst needs to be block-invariant (assumption)
//   2. dst_normal_offset must be calculated at compile time (guaranteed)
//   3. dst_merged_offset can be a runtime value (no assumption imposed)
static_if<DstAddressSpace == address_space_t::global>{}([&](auto) {
#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
__buffer_store<TData, DstDataPerAccess>(
vector_data, p_dst, dst_merged_offset, dst_normal_offset);
#else
*reinterpret_cast<dst_vector_t*>(
&p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
#endif
}).Else([&](auto) {
// dst can be all kinds of memory-space
*reinterpret_cast<dst_vector_t*>(
&p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
});
});
});
}
}
// T can be Sequence or Array
template <typename T, bool PositiveDirection>
__device__ void MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
{
static_if<PositiveDirection>{}([&](auto) {
mSrcSliceOrigin += step_sizes;
}).Else([&](auto) { mSrcSliceOrigin -= step_sizes; });
}
template <typename T, bool PositiveDirection>
__device__ void MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
{
static_if<PositiveDirection>{}([&](auto) {
mDstSliceOrigin += step_sizes;
}).Else([&](auto) { mDstSliceOrigin -= step_sizes; });
}
private:
SrcCoordinate mSrcSliceOrigin;
DstCoordinate mDstSliceOrigin;
};
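// Usage sketch for the deprecated ThreadwiseGenericTensorSliceCopy_v2r1 (illustrative
// only, not part of the original header): same slice shape on src and dst, with src
// declared as global memory so the buffer_load path can be taken when the AMD
// intrinsics are enabled. Shape and parameters are assumptions chosen for the example.
__device__ void example_threadwise_copy_v2r1_deprecated(const float* p_src, float* p_dst)
{
    constexpr auto desc = make_ConstantTensorDescriptor_packed(Sequence<2, 4>{});

    ThreadwiseGenericTensorSliceCopy_v2r1<decltype(desc), // SrcDesc
                                          decltype(desc), // DstDesc
                                          Sequence<2, 4>, // SliceLengths
                                          Sequence<0, 1>, // SrcDimAccessOrder
                                          Sequence<0, 1>, // DstDimAccessOrder
                                          1,              // SrcVectorAccessDim
                                          1,              // DstVectorAccessDim
                                          4,              // SrcDataPerAccess
                                          4>              // DstDataPerAccess
        copy(make_zero_array<index_t, 2>(), make_zero_array<index_t, 2>());

    copy.Run<float, address_space_t::global, address_space_t::generic>(p_src, p_dst);
}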
// This version uses TensorView and TensorCoordinate_deprecated.
// A usage sketch follows this struct definition.
template <typename SrcTensor,
typename DstTensor,
typename SliceLengths,
typename SrcDimAccessOrder,
typename DstDimAccessOrder,
index_t SrcVectorAccessDim,
index_t DstVectorAccessDim,
index_t SrcDataPerAccess,
index_t DstDataPerAccess>
struct ThreadwiseGenericTensorSliceCopy_v3r1
{
static constexpr index_t nDim = SrcTensor::GetNumOfDimension();
using data_type = remove_cv_t<typename SrcTensor::data_type>;
using SrcCoordinate = typename SrcTensor::coordinate_type;
using DstCoordinate = typename DstTensor::coordinate_type;
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v3r1(SrcTensor src,
SrcCoordinate src_slice_origin,
DstTensor dst,
DstCoordinate dst_slice_origin)
: mSrc{src},
mDst{dst},
mSrcSlice{src.Slice(src_slice_origin, SliceLengths{})},
mDstSlice{dst.Slice(dst_slice_origin, SliceLengths{})}
{
static_assert(nDim == SrcTensor::GetNumOfDimension() &&
nDim == DstTensor::GetNumOfDimension() &&
nDim == SliceLengths::GetSize() && nDim == SrcDimAccessOrder::GetSize() &&
nDim == DstDimAccessOrder::GetSize(),
"wrong! # of dimensions not the same");
static_assert(is_valid_sequence_map<SrcDimAccessOrder>::value &&
is_valid_sequence_map<DstDimAccessOrder>::value,
"wrong! map is not valid");
static_assert(is_same<remove_cv_t<typename SrcTensor::data_type>,
remove_cv_t<typename DstTensor::data_type>>{},
"wrong! type conversion is not supported yet");
static_assert(decltype(mSrcSlice)::IsVectorizationAllowed(Number<SrcVectorAccessDim>{},
Number<SrcDataPerAccess>{}) &&
decltype(mDstSlice)::IsVectorizationAllowed(Number<DstVectorAccessDim>{},
Number<DstDataPerAccess>{}),
"wrong! vectorized access is not allowed");
}
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v3r1()
: ThreadwiseGenericTensorSliceCopy_v3r1(
SrcTensor{}, SrcCoordinate{}, DstTensor{}, DstCoordinate{})
{
}
__device__ void Run() const
{
// buffer
constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SrcTensor::GetLengths());
data_type p_buffer[buffer_desc.GetElementSpace()];
auto buffer = make_TensorView(buffer_desc, p_buffer);
// copy data from src into buffer
{
using src_vector_t = typename vector_type<data_type, SrcDataPerAccess>::MemoryType;
constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
auto src_slice_vectorized =
mSrcSlice.Vectorize(src_vector_access_dim, src_data_per_access);
ford<decltype(src_slice_vectorized.GetLengths()), SrcDimAccessOrder>{}(
[&](auto src_vector_id) {
// load vector from src
const src_vector_t vector_data = src_slice_vectorized[src_vector_id];
// unpack vector into buffer
auto src_scalar_id = src_vector_id;
src_scalar_id(src_vector_access_dim) *= src_data_per_access;
for(index_t i = 0; i < SrcDataPerAccess; ++i)
{
auto id = make_zero_array<index_t, nDim>();
id(src_vector_access_dim) = i;
buffer(src_scalar_id + id) =
reinterpret_cast<const data_type*>(&vector_data)[i];
}
});
}
// copy data from buffer into dst
{
using dst_vector_t = typename vector_type<data_type, DstDataPerAccess>::MemoryType;
constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};
auto dst_slice_vectorized =
mDstSlice.Vectorize(dst_vector_access_dim, dst_data_per_access);
ford<decltype(dst_slice_vectorized.GetLengths()), DstDimAccessOrder>{}(
[&](auto dst_vector_id) {
dst_vector_t vector_data{};
// pack vector from buffer
auto dst_scalar_id = dst_vector_id;
dst_scalar_id(dst_vector_access_dim) *= dst_data_per_access;
for(index_t i = 0; i < DstDataPerAccess; ++i)
{
auto id = make_zero_array<index_t, nDim>();
id(dst_vector_access_dim) = i;
reinterpret_cast<data_type*>(&vector_data)[i] = buffer[dst_scalar_id + id];
}
// write vector into dst
dst_slice_vectorized(dst_vector_id) = vector_data;
});
}
}
// T can be Sequence or Array
template <typename T, bool PositiveDirection>
__device__ void MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
{
mSrc.MoveSliceWindow(mSrcSlice, step_sizes, integral_constant<bool, PositiveDirection>{});
}
template <typename T, bool PositiveDirection>
__device__ void MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
{
mDst.MoveSliceWindow(mDstSlice, step_sizes, integral_constant<bool, PositiveDirection>{});
}
private:
using SrcSlice = decltype(SrcTensor{}.Slice(make_zero_array<index_t, nDim>(), SliceLengths{}));
using DstSlice = decltype(DstTensor{}.Slice(make_zero_array<index_t, nDim>(), SliceLengths{}));
SrcTensor mSrc;
DstTensor mDst;
SrcSlice mSrcSlice;
DstSlice mDstSlice;
};
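// Usage sketch for ThreadwiseGenericTensorSliceCopy_v3r1 (illustrative only;
// "in_global_view", "in_thread_view", "src_origin" and "dst_origin" are hypothetical
// TensorViews/coordinates, not defined in this header):
//
//   using Copy = ThreadwiseGenericTensorSliceCopy_v3r1<decltype(in_global_view),
//                                                      decltype(in_thread_view),
//                                                      Sequence<1, 8>, // SliceLengths
//                                                      Sequence<0, 1>, // SrcDimAccessOrder
//                                                      Sequence<0, 1>, // DstDimAccessOrder
//                                                      1,              // SrcVectorAccessDim
//                                                      1,              // DstVectorAccessDim
//                                                      4,              // SrcDataPerAccess
//                                                      4>;             // DstDataPerAccess
//
//   Copy copy(in_global_view, src_origin, in_thread_view, dst_origin);
//   copy.Run(); // src -> thread buffer -> dst, vectorized along dimension 1
//   copy.MoveSrcSliceWindow(Sequence<0, 8>{}, integral_constant<bool, true>{});
//   copy.Run();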
} // namespace ck
#endif
#ifndef CK_AMD_INTRINSIC_HPP
#define CK_AMD_INTRINSIC_HPP
#include "vector_type.hpp"
namespace ck {
__device__ float __llvm_amdgcn_buffer_load(int32x4_t rsrc,
uint32_t vindex,
uint32_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load");
__device__ vector_type<float, 2>::MemoryType
__llvm_amdgcn_buffer_loadx2(int32x4_t rsrc,
uint32_t vindex,
uint32_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load.dwordx2");
__device__ vector_type<float, 4>::MemoryType
__llvm_amdgcn_buffer_loadx4(int32x4_t rsrc,
uint32_t vindex,
uint32_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load.dwordx4");
__device__ void __llvm_amdgcn_buffer_store(float vdata,
int32x4_t rsrc,
uint32_t vindex,
uint32_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.store");
__device__ void __llvm_amdgcn_buffer_storex2(vector_type<float, 2>::MemoryType vdata,
int32x4_t rsrc,
uint32_t vindex,
uint32_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.store.dwordx2");
__device__ void __llvm_amdgcn_buffer_storex4(vector_type<float, 4>::MemoryType vdata,
int32x4_t rsrc,
uint32_t vindex,
uint32_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.store.dwordx4");
// buffer_load and buffer_store
template <typename T, index_t VectorSize>
__device__ typename vector_type<T, VectorSize>::MemoryType __buffer_load(
const T* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset);
template <typename T, index_t VectorSize>
__device__ void __buffer_store(const typename vector_type<T, VectorSize>::MemoryType& src,
T* p_dst_block,
uint32_t dst_thread_data_offset,
uint32_t dst_const_data_offset);
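// Each specialization below packs a 128-bit buffer resource descriptor into an
// int32x4_t before calling the corresponding llvm.amdgcn.buffer.* intrinsic:
//   dwords 0-1: 64-bit base address of the buffer
//   dword  2  : number of records (set to -1, i.e. maximum range)
//   dword  3  : buffer resource configuration word (0x00027000)
// The thread offset and the constant offset are given in elements and are converted
// to byte offsets before being passed to the intrinsic.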
template <>
__device__ float __buffer_load<float, 1>(const float* p_src_block,
uint32_t src_thread_data_offset,
uint32_t src_const_data_offset)
{
#if 0
float dst;
uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float);
int32x4_t src_block_setting{0};
    // dwords 0-1: 64-bit base address of the buffer
    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
    // dword 2: number of records (set to -1, i.e. maximum range)
    reinterpret_cast<int*>(&src_block_setting)[2] = -1;
    // dword 3: buffer resource configuration word
    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
asm volatile("\n \
buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \
s_waitcnt 0 \n \
"
: "=v"(dst)
: "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset));
return dst;
#else
float dst;
uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float);
int32x4_t src_block_setting{0};
    // dwords 0-1: 64-bit base address of the buffer
    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
    // dword 2: number of records (set to -1, i.e. maximum range)
    reinterpret_cast<int*>(&src_block_setting)[2] = -1;
    // dword 3: buffer resource configuration word
    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
dst = __llvm_amdgcn_buffer_load(
src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
return dst;
#endif
}
template <>
__device__ vector_type<float, 2>::MemoryType __buffer_load<float, 2>(
const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset)
{
#if 0
vector_type<float, 2>::MemoryType dst;
uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float);
int32x4_t src_block_setting{0};
    // dwords 0-1: 64-bit base address of the buffer
    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
    // dword 2: number of records (set to -1, i.e. maximum range)
    reinterpret_cast<int*>(&src_block_setting)[2] = -1;
    // dword 3: buffer resource configuration word
    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
asm volatile("\n \
buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \
s_waitcnt 0 \n \
"
: "=v"(dst)
: "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset));
return dst;
#else
vector_type<float, 2>::MemoryType dst;
uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float);
int32x4_t src_block_setting{0};
    // dwords 0-1: 64-bit base address of the buffer
    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
    // dword 2: number of records (set to -1, i.e. maximum range)
    reinterpret_cast<int*>(&src_block_setting)[2] = -1;
    // dword 3: buffer resource configuration word
    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
dst = __llvm_amdgcn_buffer_loadx2(
src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
return dst;
#endif
}
template <>
__device__ vector_type<float, 4>::MemoryType __buffer_load<float, 4>(
const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset)
{
#if 0
vector_type<float, 4>::MemoryType dst;
uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float);
int32x4_t src_block_setting{0};
    // dwords 0-1: 64-bit base address of the buffer
    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
    // dword 2: number of records (set to -1, i.e. maximum range)
    reinterpret_cast<int*>(&src_block_setting)[2] = -1;
    // dword 3: buffer resource configuration word
    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
asm volatile("\n \
buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \
s_waitcnt 0 \n \
"
: "=v"(dst)
: "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset));
return dst;
#else
vector_type<float, 4>::MemoryType dst;
uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float);
int32x4_t src_block_setting{0};
    // dwords 0-1: 64-bit base address of the buffer
    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
    // dword 2: number of records (set to -1, i.e. maximum range)
    reinterpret_cast<int*>(&src_block_setting)[2] = -1;
    // dword 3: buffer resource configuration word
    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
dst = __llvm_amdgcn_buffer_loadx4(
src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
return dst;
#endif
}
template <>
__device__ void __buffer_store<float, 1>(const float& src,
float* p_dst_block,
uint32_t dst_thread_data_offset,
uint32_t dst_const_data_offset)
{
#if 0
uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
int32x4_t dst_block_setting{0};
    // dwords 0-1: 64-bit base address of the buffer
    *reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
    // dword 2: number of records (set to -1, i.e. maximum range)
    reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
    // dword 3: buffer resource configuration word
    reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;
asm volatile("\n \
buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \
"
:
: "s"(dst_block_setting),
"v"(src),
"v"(dst_thread_addr_offset),
"s"(dst_const_addr_offset));
#else
uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
int32x4_t dst_block_setting{0};
    // dwords 0-1: 64-bit base address of the buffer
    *reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
    // dword 2: number of records (set to -1, i.e. maximum range)
    reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
    // dword 3: buffer resource configuration word
    reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;
__llvm_amdgcn_buffer_store(
src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false);
#endif
}
template <>
__device__ void __buffer_store<float, 2>(const vector_type<float, 2>::MemoryType& src,
float* p_dst_block,
uint32_t dst_thread_data_offset,
uint32_t dst_const_data_offset)
{
#if 0
uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
int32x4_t dst_block_setting{0};
    // dwords 0-1: 64-bit base address of the buffer
    *reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
    // dword 2: number of records (set to -1, i.e. maximum range)
    reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
    // dword 3: buffer resource configuration word
    reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;
asm volatile("\n \
buffer_store_dwordx2 %1, %2, %0, %3 offen offset:0 \n \
"
:
: "s"(dst_block_setting),
"v"(src),
"v"(dst_thread_addr_offset),
"s"(dst_const_addr_offset));
#else
uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
int32x4_t dst_block_setting{0};
    // dwords 0-1: 64-bit base address of the buffer
    *reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
    // dword 2: number of records (set to -1, i.e. maximum range)
    reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
    // dword 3: buffer resource configuration word
    reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;
__llvm_amdgcn_buffer_storex2(
src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false);
#endif
}
template <>
__device__ void __buffer_store<float, 4>(const vector_type<float, 4>::MemoryType& src,
float* p_dst_block,
uint32_t dst_thread_data_offset,
uint32_t dst_const_data_offset)
{
#if 0
uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
int32x4_t dst_block_setting{0};
    // dwords 0-1: 64-bit base address of the buffer
    *reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
    // dword 2: number of records (set to -1, i.e. maximum range)
    reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
    // dword 3: buffer resource configuration word
    reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;
asm volatile("\n \
buffer_store_dwordx4 %1, %2, %0, %3 offen offset:0 \n \
"
:
: "s"(dst_block_setting),
"v"(src),
"v"(dst_thread_addr_offset),
"s"(dst_const_addr_offset));
#else
uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
int32x4_t dst_block_setting{0};
    // dwords 0-1: 64-bit base address of the buffer
    *reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
    // dword 2: number of records (set to -1, i.e. maximum range)
    reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
    // dword 3: buffer resource configuration word
    reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;
__llvm_amdgcn_buffer_storex4(
src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false);
#endif
}
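// Usage sketch for the wrappers above (illustrative only; "p_in_block", "p_out_block"
// and the offsets are hypothetical):
//
//   // load 4 floats starting at element (thread_offset + const_offset) of p_in_block
//   auto v = __buffer_load<float, 4>(p_in_block, thread_offset, const_offset);
//   // store them to another block-invariant pointer in global memory
//   __buffer_store<float, 4>(v, p_out_block, thread_offset, const_offset);
//
// Both offsets are element counts (not bytes), and the block pointers are assumed to
// be block-invariant pointers into global memory.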
} // namespace ck
#endif