Commit 2ffa2708 authored by Chao Liu's avatar Chao Liu
Browse files

adding dynamic tensor descriptor

parent c1eaba27
...@@ -553,7 +553,7 @@ struct DummyDynamicTransform ...@@ -553,7 +553,7 @@ struct DummyDynamicTransform
auto in_gemmk_gemmn_coord = make_dynamic_tensor_coordinate(in_gemmk_gemmn_global_desc, idx); auto in_gemmk_gemmn_coord = make_dynamic_tensor_coordinate(in_gemmk_gemmn_global_desc, idx);
for(index_t iter = 0; iter < 100; ++iter) for(index_t iter = 0; iter < niter; ++iter)
{ {
constexpr auto gemmk1_gemmn0 = MultiIndex<2>{1, 0}; constexpr auto gemmk1_gemmn0 = MultiIndex<2>{1, 0};
...@@ -574,7 +574,11 @@ struct DummyDynamicTransform ...@@ -574,7 +574,11 @@ struct DummyDynamicTransform
1, 1,
p_out_global, p_out_global,
in_gemmk_gemmn_coord.GetOffset(), in_gemmk_gemmn_coord.GetOffset(),
#if 0
in_gemmk_gemmn_coord.IsOffsetValidAssumingUpperIndexIsValid(), in_gemmk_gemmn_coord.IsOffsetValidAssumingUpperIndexIsValid(),
#else
true,
#endif
in_gemmk_gemmn_global_desc.GetElementSpace()); in_gemmk_gemmn_global_desc.GetElementSpace());
} }
} }
......
...@@ -35,8 +35,8 @@ struct DynamicPassThrough ...@@ -35,8 +35,8 @@ struct DynamicPassThrough
} }
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx> template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ static void CalculateLowerIndexDiff(LowIdxDiff& idx_low_diff, __host__ __device__ static void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_up_diff, const UpIdxDiff& idx_diff_up,
const LowIdx& /* idx_low_old */, const LowIdx& /* idx_low_old */,
const UpIdx& /* idx_up_old */) const UpIdx& /* idx_up_old */)
{ {
...@@ -44,7 +44,7 @@ struct DynamicPassThrough ...@@ -44,7 +44,7 @@ struct DynamicPassThrough
UpIdx::Size() == 1, UpIdx::Size() == 1,
"wrong! inconsistent # of dimension"); "wrong! inconsistent # of dimension");
idx_low_diff(0) = idx_up_diff[0]; idx_diff_low(0) = idx_diff_up[0];
} }
__host__ __device__ static constexpr bool IsLinearTransform() { return true; } __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
...@@ -86,7 +86,8 @@ struct DynamicLeftPad ...@@ -86,7 +86,8 @@ struct DynamicLeftPad
__host__ __device__ constexpr auto GetUpperLengths() const { return UpperIndex{up_length_}; } __host__ __device__ constexpr auto GetUpperLengths() const { return UpperIndex{up_length_}; }
template <typename LowIdx, typename UpIdx> template <typename LowIdx, typename UpIdx>
__host__ __device__ void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up) const __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
const UpIdx& idx_up) const
{ {
static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1, static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1,
"wrong! inconsistent # of dimension"); "wrong! inconsistent # of dimension");
...@@ -95,8 +96,8 @@ struct DynamicLeftPad ...@@ -95,8 +96,8 @@ struct DynamicLeftPad
} }
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx> template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ static void CalculateLowerIndexDiff(LowIdxDiff& idx_low_diff, __host__ __device__ static void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_up_diff, const UpIdxDiff& idx_diff_up,
const LowIdx& /* idx_low_old */, const LowIdx& /* idx_low_old */,
const UpIdx& /* idx_up_old */) const UpIdx& /* idx_up_old */)
{ {
...@@ -104,7 +105,7 @@ struct DynamicLeftPad ...@@ -104,7 +105,7 @@ struct DynamicLeftPad
UpIdx::Size() == 1, UpIdx::Size() == 1,
"wrong! inconsistent # of dimension"); "wrong! inconsistent # of dimension");
idx_low_diff(0) = idx_up_diff[0]; idx_diff_low(0) = idx_diff_up[0];
} }
__host__ __device__ static constexpr bool IsLinearTransform() { return true; } __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
...@@ -159,8 +160,8 @@ struct DynamicRightPad ...@@ -159,8 +160,8 @@ struct DynamicRightPad
} }
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx> template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ static void CalculateLowerIndexDiff(LowIdxDiff& idx_low_diff, __host__ __device__ static void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_up_diff, const UpIdxDiff& idx_diff_up,
const LowIdx& /* idx_low_old */, const LowIdx& /* idx_low_old */,
const UpIdx& /* idx_up_old */) const UpIdx& /* idx_up_old */)
{ {
...@@ -168,7 +169,7 @@ struct DynamicRightPad ...@@ -168,7 +169,7 @@ struct DynamicRightPad
UpIdx::Size() == 1, UpIdx::Size() == 1,
"wrong! inconsistent # of dimension"); "wrong! inconsistent # of dimension");
idx_low_diff(0) = idx_up_diff[0]; idx_diff_low(0) = idx_diff_up[0];
} }
__host__ __device__ static constexpr bool IsLinearTransform() { return true; } __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
...@@ -218,7 +219,8 @@ struct DynamicEmbed ...@@ -218,7 +219,8 @@ struct DynamicEmbed
__host__ __device__ constexpr auto GetUpperLengths() const { return up_lengths_; } __host__ __device__ constexpr auto GetUpperLengths() const { return up_lengths_; }
template <typename LowIdx, typename UpIdx> template <typename LowIdx, typename UpIdx>
__host__ __device__ void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up) const __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
const UpIdx& idx_up) const
{ {
static_assert(LowIdx::Size() == 1 && UpIdx::Size() == NDimUp, static_assert(LowIdx::Size() == 1 && UpIdx::Size() == NDimUp,
"wrong! inconsistent # of dimension"); "wrong! inconsistent # of dimension");
...@@ -233,21 +235,21 @@ struct DynamicEmbed ...@@ -233,21 +235,21 @@ struct DynamicEmbed
} }
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx> template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ void CalculateLowerIndexDiff(LowIdxDiff& idx_low_diff, __host__ __device__ constexpr void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_up_diff, const UpIdxDiff& idx_diff_up,
const LowIdx& /* idx_low_old */, const LowIdx& /* idx_low_old */,
const UpIdx& /* idx_up_old */) const const UpIdx& /* idx_up_old */) const
{ {
static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == NDimUp && static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == NDimUp &&
LowIdx::Size() == 1 && UpIdx::Size() == NDimUp, LowIdx::Size() == 1 && UpIdx::Size() == NDimUp,
"wrong! inconsistent # of dimension"); "wrong! inconsistent # of dimension");
idx_low_diff(0) = 0; idx_diff_low(0) = 0;
#pragma unroll #pragma unroll
for(index_t i = 0; i < NDimUp; ++i) for(index_t i = 0; i < NDimUp; ++i)
{ {
idx_low_diff(0) += idx_up_diff[i] * coefficients_[i]; idx_diff_low(0) += idx_diff_up[i] * coefficients_[i];
} }
} }
...@@ -299,7 +301,8 @@ struct DynamicMerge ...@@ -299,7 +301,8 @@ struct DynamicMerge
__host__ __device__ constexpr auto GetUpperLengths() const { return UpperIndex{up_length_}; } __host__ __device__ constexpr auto GetUpperLengths() const { return UpperIndex{up_length_}; }
template <typename LowIdx, typename UpIdx> template <typename LowIdx, typename UpIdx>
__host__ __device__ void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up) const __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
const UpIdx& idx_up) const
{ {
static_assert(LowIdx::Size() == NDimLow && UpIdx::Size() == 1, static_assert(LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
"wrong! inconsistent # of dimension"); "wrong! inconsistent # of dimension");
...@@ -316,36 +319,49 @@ struct DynamicMerge ...@@ -316,36 +319,49 @@ struct DynamicMerge
idx_low(NDimLow - 1) = tmp; idx_low(NDimLow - 1) = tmp;
} }
// idx_low_diff depends on idx_low_old, so idx_low need to be up-to-date // idx_diff_low depends on idx_low_old, so idx_low need to be up-to-date
// If idx_up_diff is known at compile-time, many calculations can be optimized // If idx_diff_up is known at compile-time, many calculations can be optimized
// away by compiler // away by compiler
// This function assume idx_low_old is not out-of-bound // This function assume idx_low_old is not out-of-bound
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx> template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ void CalculateLowerIndexDiff(LowIdxDiff& idx_low_diff, __host__ __device__ constexpr void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_up_diff, const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old, const LowIdx& idx_low_old,
const UpIdx& /* idx_up_old */) const const UpIdx& /* idx_up_old */) const
{ {
static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 && static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
LowIdx::Size() == NDimLow && UpIdx::Size() == 1, LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
"wrong! inconsistent # of dimension"); "wrong! inconsistent # of dimension");
// CalculateLowerIndex(idx_low_diff_const) has multiple integer divisions. #if 1
// I only want to do this check, if idx_diff_up is know at compile-time
if(idx_diff_up[0] == 0)
{
#pragma unroll
for(index_t i = 0; i < NDimLow; ++i)
{
idx_diff_low(i) = 0;
}
return;
}
#endif
// CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions.
// However, // However,
// 1) If idx_up_diff is known at compile-time, then idx_low_diff_const // 1) If idx_diff_up is known at compile-time, then idx_diff_low_const
// can be calculated at compile-time. // can be calculated at compile-time.
// 2) If idx_up_diff is not known at compile-time, but its value // 2) If idx_diff_up is not known at compile-time, but its value
// doesn't change during the whole kernel execution, then // doesn't change during the whole kernel execution, then
// idx_low_diff_const also // idx_diff_low_const also
// doesn't change during the whole kernel execution. Compiler generated // doesn't change during the whole kernel execution. Compiler generated
// ISA should // ISA should
// only caclculate idx_low_diff_const once and save it durinng the whole // only caclculate idx_diff_low_const once and save it durinng the whole
// kernel execution // kernel execution
// If neither 1) nor 2) is satisfied, then the calculation will also be // If neither 1) nor 2) is satisfied, then the calculation will also be
// computed at // computed at
// run-time each time this function is called, and can be very expensive. // run-time each time this function is called, and can be very expensive.
LowerIndex idx_low_diff_const; LowerIndex idx_diff_low_const;
CalculateLowerIndex(idx_low_diff_const, idx_up_diff); CalculateLowerIndex(idx_diff_low_const, idx_diff_up);
// do carry check on each low dimension in reversed order // do carry check on each low dimension in reversed order
// do not need to check the first dimension // do not need to check the first dimension
...@@ -354,29 +370,29 @@ struct DynamicMerge ...@@ -354,29 +370,29 @@ struct DynamicMerge
#pragma unroll #pragma unroll
for(index_t i = NDimLow - 1; i > 0; --i) for(index_t i = NDimLow - 1; i > 0; --i)
{ {
// this should be saved as well // this should be saved in SGPR as well
index_t idx_low_length_minus_idx_low_diff_const = index_t idx_low_length_minus_idx_diff_low_const =
low_lengths_[i] - idx_low_diff_const[i]; low_lengths_[i] - idx_diff_low_const[i];
#if 0 #if 0
index_t idx_low_length_plus_idx_low_diff_const = index_t idx_low_length_plus_idx_diff_low_const =
low_lengths_[i] + idx_low_diff_const[i]; low_lengths_[i] + idx_diff_low_const[i];
#endif #endif
index_t idx_low_tmp = idx_low_old[i] + carry; index_t idx_low_tmp = idx_low_old[i] + carry;
bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_low_diff_const; bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const;
#if 0 #if 0
bool do_borrow = idx_low_tmp < -idx_low_diff_const[i]; bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
#endif #endif
idx_low_diff(i) = idx_diff_low(i) =
do_carry ? -idx_low_length_minus_idx_low_diff_const : idx_low_diff_const[i]; do_carry ? -idx_low_length_minus_idx_diff_low_const : idx_diff_low_const[i];
#if 0 #if 0
idx_low_diff(i) = idx_diff_low(i) =
do_borrow ? idx_low_length_plus_idx_low_diff_const : idx_low_diff[i]; do_borrow ? idx_low_length_plus_idx_diff_low_const : idx_diff_low[i];
#endif #endif
idx_low_diff(i) += carry; idx_diff_low(i) += carry;
carry = do_carry ? 1 : 0; carry = do_carry ? 1 : 0;
#if 0 #if 0
...@@ -384,7 +400,7 @@ struct DynamicMerge ...@@ -384,7 +400,7 @@ struct DynamicMerge
#endif #endif
} }
idx_low_diff(0) = idx_low_diff_const[0] + carry; idx_diff_low(0) = idx_diff_low_const[0] + carry;
} }
__host__ __device__ static constexpr bool IsLinearTransform() { return false; } __host__ __device__ static constexpr bool IsLinearTransform() { return false; }
...@@ -431,7 +447,8 @@ struct DynamicUnMerge ...@@ -431,7 +447,8 @@ struct DynamicUnMerge
__host__ __device__ constexpr auto GetUpperLengths() const { return up_lengths_; } __host__ __device__ constexpr auto GetUpperLengths() const { return up_lengths_; }
template <typename LowIdx, typename UpIdx> template <typename LowIdx, typename UpIdx>
__host__ __device__ void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up) const __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
const UpIdx& idx_up) const
{ {
idx_low(0) = idx_up[NDimUp]; idx_low(0) = idx_up[NDimUp];
...@@ -443,12 +460,12 @@ struct DynamicUnMerge ...@@ -443,12 +460,12 @@ struct DynamicUnMerge
} }
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx> template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ void CalculateLowerIndexDiff(LowIdxDiff& idx_low_diff, __host__ __device__ constexpr void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_up_diff, const UpIdxDiff& idx_diff_up,
const LowIdx& /* idx_low_old */, const LowIdx& /* idx_low_old */,
const UpIdx& /* idx_up_old */) const const UpIdx& /* idx_up_old */) const
{ {
CalculateLowerIndex(idx_low_diff, idx_up_diff); CalculateLowerIndex(idx_diff_low, idx_diff_up);
} }
__host__ __device__ static constexpr bool IsLinearTransform() { return true; } __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
...@@ -486,7 +503,8 @@ struct DynamicFreeze ...@@ -486,7 +503,8 @@ struct DynamicFreeze
__host__ __device__ constexpr auto GetUpperLengths() const { return UpperIndex{}; } __host__ __device__ constexpr auto GetUpperLengths() const { return UpperIndex{}; }
template <typename LowIdx, typename UpIdx> template <typename LowIdx, typename UpIdx>
__host__ __device__ void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up) const __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
const UpIdx& idx_up) const
{ {
static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1, static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1,
"wrong! inconsistent # of dimension"); "wrong! inconsistent # of dimension");
...@@ -495,12 +513,12 @@ struct DynamicFreeze ...@@ -495,12 +513,12 @@ struct DynamicFreeze
} }
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx> template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ static void CalculateLowerIndexDiff(LowIdxDiff& idx_low_diff, __host__ __device__ static void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_up_diff, const UpIdxDiff& idx_diff_up,
const LowIdx& /* idx_low_old */, const LowIdx& /* idx_low_old */,
const UpIdx& /* idx_up_old */) const UpIdx& /* idx_up_old */)
{ {
idx_low_diff(0) = index_t{0}; idx_diff_low(0) = index_t{0};
} }
__host__ __device__ static constexpr bool IsLinearTransform() { return true; } __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
......
#ifndef CK_DYNAMIC_TENSOR_DESCRIPTOR_HELPER_V2_HPP
#define CK_DYNAMIC_TENSOR_DESCRIPTOR_HELPER_V2_HPP
#include "common_header.hpp"
#include "dynamic_tensor_descriptor_v2.hpp"
namespace ck {
// Builds a new tensor descriptor by stacking additional multi-index transforms on
// top of an existing ("lower") descriptor. Visible dimension ids of the lower
// descriptor are remapped to hidden dimension ids before composing.
//
// NOTE(review): this function is clearly work-in-progress and does not compile as
// written — see the unfinished statement and undeclared names flagged below.
// It is documented rather than rewritten because the intended design (how
// new_transforms / new_*_dim_idss are derived from the arguments) cannot be
// inferred from this file alone — confirm with the author before completing it.
template <typename LowerTensorDescriptor,
typename Transforms,
typename LowerVisibleDimensionLowerVisibleIdss,
typename UpperVisibleDimensionUpperVisibleIdss>
__host__ __device__ constexpr auto
transform_dynamic_tensor_descriptor_v2(const LowerTensorDescriptor& low_tensor_desc,
const Transforms& transforms,
LowerVisibleDimensionLowerVisibleIdss,
UpperVisibleDimensionUpperVisibleIdss)
{
// convert lower visible dimension idss (tuple of sequences) to hidden dimension idss (tuple of sequences)
constexpr auto low_visible_dimension_hidden_idss = transform_tuples(
// convert lower visible dimension ids (a sequence) to hidden dimension ids (a sequence)
[](auto low_visible_dim_ids) {
return transform_sequences(
// convert lower visible dimension id to hidden dimension id
[](auto low_visible_dim_id) {
// NOTE(review): low_tensor_desc is captured implicitly here but the
// lambda has an empty capture list — presumably this should read the
// ids from LowerTensorDescriptor's type instead; confirm
return low_tensor_desc.GetVisibleDimensionIds()[low_visible_dim_id];
},
low_visible_dim_ids);
},
LowerVisibleDimensionLowerVisibleIdss{});
// NOTE(review): unfinished statement — no initializer, no semicolon
constexpr auto up_visible_dims_
// NOTE(review): "old_tensor_desc", "new_transforms", "new_low_dim_idss",
// "new_up_dim_idss" are undeclared — presumably meant low_tensor_desc /
// transforms and ids derived from the template parameters; confirm
const auto all_transforms = merge_tuples(old_tensor_desc.GetTransforms(), new_transforms);
constexpr auto all_low_dim_idss =
merge_tuples(old_tensor_desc.GetLowerDimensionIdss(), new_low_dim_idss);
constexpr auto all_up_dim_idss =
merge_tuples(old_tensor_desc.GetUpperDimensionIdss(), new_up_dim_idss);
// NOTE(review): missing semicolon; also "new_visible_dim_ids" is presumably the
// concatenation of the upper visible ids, not the raw upper dim idss — confirm
constexpr auto new_visible_dim_ids = new_up_dim_idss
return DynamicTensorDescriptor_v2<decltype(all_transforms),
decltype(all_low_dim_idss),
decltype(all_up_dim_idss),
decltype(new_visible_dim_ids)>{
all_transforms, old_tensor_desc.GetElementSpaceSize()};
}
} // namespace ck
#endif
#ifndef CK_DYNAMIC_TENSOR_DESCRIPTOR_V2_HPP
#define CK_DYNAMIC_TENSOR_DESCRIPTOR_V2_HPP
#include "common_header.hpp"
#include "dynamic_multi_index_transform.hpp"
namespace ck {
template <index_t NDimHidden, typename VisibleDimensionIds>
struct DynamicTensorCoordinate_v2;
template <index_t NTransform, index_t NDimVisible>
struct DynamicTensorCoordinateStep_v2;
// Transforms: Tuple<transforms...>
// LowerDimensionIdss : Tuple<Sequence<...>, ...>
// UpperDimensionIdss : Tuple<Sequence<...>, ...>
// VisibleDimensionIds> : Sequence<...>
template <typename Transforms,
          typename LowerDimensionIdss,
          typename UpperDimensionIdss,
          typename VisibleDimensionIds>
struct DynamicTensorDescriptor_v2
{
    // all counts are compile-time constants derived from the template arguments
    constexpr static index_t ntransform_   = GetNumOfTransform();
    constexpr static index_t ndim_visible_ = GetNumOfVisibleDimension();
    constexpr static index_t ndim_hidden_  = GetNumOfHiddenDimension();

    using VisibleIndex = MultiIndex<ndim_visible_>;
    using HiddenIndex  = MultiIndex<ndim_hidden_>;

    // element_space_size: size of the underlying flat buffer; it becomes the
    // length of hidden dimension 0 (the offset dimension)
    __host__ __device__ explicit constexpr DynamicTensorDescriptor_v2(const Transforms& transforms,
                                                                      index_t element_space_size)
        : transforms_{transforms},
          hidden_lengths_{InitializeHiddenLengths(transforms_, element_space_size)},
          visible_lengths_{hidden_lengths_} // fixed typo: was "visble_lengths_"
    {
        // fixed typo: the member is "ntransform_", not "ntransforms_"
        static_assert(Transforms::Size() == ntransform_ &&
                          LowerDimensionIdss::Size() == ntransform_ &&
                          UpperDimensionIdss::Size() == ntransform_,
                      "wrong! inconsistent # of transformations");

        // TODO check dependency of dimensions is valid
    }

    // fixed: a static member function cannot be const-qualified
    __host__ __device__ static constexpr index_t GetNumOfDimension()
    {
        return GetNumOfVisibleDimension();
    }

    __host__ __device__ constexpr index_t GetLength(index_t idim) const
    {
        return visible_lengths_[idim];
    }

    __host__ __device__ constexpr const auto& GetLengths() const { return visible_lengths_; }

    // maybe this result should be saved as a member variable
    __host__ __device__ constexpr index_t GetElementSize() const
    {
        return reduce_on_array(GetLengths(), math::multiplies<index_t>{}, index_t{1});
    }

    // hidden dimension 0 is the offset dimension, whose length is the element space size
    __host__ __device__ constexpr index_t GetElementSpaceSize() const { return hidden_lengths_[0]; }

    template <typename Idx>
    __host__ __device__ constexpr index_t CalculateOffset(const Idx& idx) const
    {
        static_assert(Idx::Size() == GetNumOfDimension(), "wrong! inconsistent # of dimension");

        return make_tensor_coordinate_v2(*this, idx).GetOffset();
    }

    // NOTE(review): these accessors are private but are also used by free-function
    // templates (e.g. transform_dynamic_tensor_descriptor_v2) that are not the
    // declared friends — presumably they should become public; confirm intent
    private:
    __host__ __device__ static constexpr index_t GetNumOfVisibleDimension()
    {
        return VisibleDimensionIds::Size();
    }

    // hidden dimension count = number of distinct dimension ids appearing on
    // either side of any transform
    __host__ __device__ static constexpr index_t GetNumOfHiddenDimension()
    {
        // fixed typos: "LowerDimsionIdss" / "UpperDimsionIdss"
        constexpr auto all_low_dim_ids = unpack(
            [](auto&&... xs) constexpr { return merge_sequences(xs...); }, LowerDimensionIdss{});

        constexpr auto all_up_dim_ids = unpack(
            [](auto&&... xs) constexpr { return merge_sequences(xs...); }, UpperDimensionIdss{});

        // fixed typo: was "merge_sequenses"
        constexpr auto all_dim_ids = merge_sequences(all_low_dim_ids, all_up_dim_ids);

        // fixed: missing "typename"; the alias already names ::type, so the
        // return statement must not apply ::type again (was "uniqie_...::type")
        using unique_sort_all_dim_ids = typename sequence_unique_sort<decltype(all_dim_ids),
                                                                      math::less<index_t>,
                                                                      math::equal<index_t>>::type;

        return unique_sort_all_dim_ids::Size();
    }

    __host__ __device__ static constexpr index_t GetNumOfTransform() { return Transforms::Size(); }

    __host__ __device__ constexpr const auto& GetTransforms() const { return transforms_; }

    __host__ __device__ static constexpr auto GetLowerDimensionIdss()
    {
        return LowerDimensionIdss{};
    }

    __host__ __device__ static constexpr auto GetUpperDimensionIdss()
    {
        return UpperDimensionIdss{};
    }

    // fixed: return type was index_t but a Sequence object is returned
    __host__ __device__ static constexpr auto GetVisibleDimensionIds()
    {
        return VisibleDimensionIds{};
    }

    // Walk the transform stack bottom-up, propagating each transform's upper
    // lengths into the hidden-length array.
    __host__ __device__ static constexpr auto InitializeHiddenLengths(const Transforms& transforms,
                                                                      index_t element_space_size)
    {
        HiddenIndex lengths_hidden = make_zero_multi_index<ndim_hidden_>();

        // this is the original tensor element space size
        lengths_hidden(0) = element_space_size;

        // lengths for all other hidden dimensions
        static_for<0, ntransform_, 1>{}([&](auto itran) {
            const auto& tran = transforms.At(itran);

            constexpr auto up_dim_ids = UpperDimensionIdss{}.At(itran);

            // fixed: the picked view must be mutable (it is written below), and the
            // loop previously referenced undeclared "lengths_low"/"lengths_low_pick"
            auto lengths_up_pick = pick_array_element(lengths_hidden, up_dim_ids);

#pragma unroll
            for(index_t i = 0; i < up_dim_ids.Size(); ++i)
            {
                lengths_up_pick(i) = tran.GetUpperLengths()[i];
            }
        });

        return lengths_hidden;
    }

    // private member variables
    const Transforms transforms_;
    // TODO maybe hidden_lengths_ should use reference_wrapper to save space on stack?
    const HiddenIndex hidden_lengths_;
    // visible_lengths_ contains a reference to hidden_lengths_
    const ArrayElementPicker<HiddenIndex, VisibleDimensionIds> visible_lengths_;

    // friend functions for making and updating tensor coordinate
    __host__
        __device__ friend constexpr DynamicTensorCoordinate_v2<ndim_hidden_, VisibleDimensionIds>
        make_tensor_coordinate_v2(const DynamicTensorDescriptor_v2& /* tensor_desc */,
                                  const VisibleIndex& /* idx_visible */);

    __host__ __device__ friend constexpr DynamicTensorCoordinateStep_v2<ntransform_, ndim_visible_>
    make_tensor_coordinate_step_v2(const DynamicTensorDescriptor_v2& /* tensor_desc */,
                                   const VisibleIndex& /* idx_diff_visible */);

    __host__ __device__ friend void move_tensor_coordinate_v2(
        const DynamicTensorDescriptor_v2& /* tensor_desc */,
        DynamicTensorCoordinate_v2<ndim_hidden_, VisibleDimensionIds>& /* coord */,
        const DynamicTensorCoordinateStep_v2<ntransform_, ndim_visible_>& /* coord_step */);
};
// A point inside a tensor: the full hidden index plus a picked view of the
// visible components. Hidden dimension 0 holds the linearized offset.
template <index_t NDimHidden, typename VisibleDimensionIds>
struct DynamicTensorCoordinate_v2
{
    // fixed: was missing "static" (non-static constexpr data member is invalid)
    // and referenced undeclared "VisbleDimension"
    static constexpr index_t ndim_visible_ = VisibleDimensionIds::Size();

    using HiddenIndex  = MultiIndex<NDimHidden>;
    using VisibleIndex = MultiIndex<ndim_visible_>;

    __host__ __device__ explicit constexpr DynamicTensorCoordinate_v2(const HiddenIndex& idx_hidden)
        : idx_hidden_{idx_hidden}, idx_visible_{idx_hidden_}
    {
    }

    // fixed: the result of GetVisibleIndex() was computed but not returned
    __host__ __device__ constexpr const auto& GetIndex() const { return GetVisibleIndex(); }

    // hidden dimension 0 is the linearized offset into the element space
    __host__ __device__ constexpr index_t GetOffset() const { return idx_hidden_[0]; }

    private:
    __host__ __device__ constexpr const auto& GetHiddenIndex() const { return idx_hidden_; }
    __host__ __device__ auto& GetHiddenIndex() { return idx_hidden_; }

    __host__ __device__ constexpr const auto& GetVisibleIndex() const { return idx_visible_; }
    __host__ __device__ auto& GetVisibleIndex() { return idx_visible_; }

    // private member variables
    HiddenIndex idx_hidden_;
    // idx_visible_ contains a reference to idx_hidden_
    ArrayElementPicker<HiddenIndex, VisibleDimensionIds> idx_visible_;

    // friend functions for making and updating tensor coordinate
    template <typename TensorDesc>
    __host__ __device__ friend constexpr DynamicTensorCoordinate_v2
    make_tensor_coordinate_v2(const TensorDesc& /* tensor_desc */,
                              const VisibleIndex& /* idx_visible */);

    template <typename TensorDesc>
    __host__ __device__ friend void move_tensor_coordinate_v2(
        const TensorDesc& /* tensor_desc */,
        DynamicTensorCoordinate_v2& /* coord */,
        const DynamicTensorCoordinateStep_v2<TensorDesc::GetNumOfTransform(),
                                             ndim_visible_>& /* coord_step */);
};
// A precomputed coordinate step: the visible index diff plus one flag per
// transform saying whether that transform must be re-evaluated for this step.
template <index_t NTransform, index_t NDimVisible>
struct DynamicTensorCoordinateStep_v2
{
    using VisibleIndex = MultiIndex<NDimVisible>;

    __host__ __device__ explicit constexpr DynamicTensorCoordinateStep_v2(
        const VisibleIndex& idx_diff_visible, const Array<bool, NTransform>& do_transforms)
        : idx_diff_visible_{idx_diff_visible}, do_transforms_{do_transforms}
    {
    }

    // added: move_tensor_coordinate_v2 reads the visible index diff through this
    // accessor, which was missing from the original declaration
    __host__ __device__ constexpr const auto& GetVisibleIndexDiff() const
    {
        return idx_diff_visible_;
    }

    private:
    const VisibleIndex idx_diff_visible_;
    // true for each transform whose upper index diff has a non-zero component
    const Array<bool, NTransform> do_transforms_;

    // friend functions for updating tensor coordinate
    template <typename TensorDesc>
    __host__ __device__ friend constexpr DynamicTensorCoordinateStep_v2
    make_tensor_coordinate_step_v2(const TensorDesc& /* tensor_desc */,
                                   const VisibleIndex& /* idx_visible */);

    template <typename TensorDesc, index_t NDimHidden, typename VisibleDimensionIds>
    __host__ __device__ friend void move_tensor_coordinate_v2(
        const TensorDesc& /* tensor_desc */,
        DynamicTensorCoordinate_v2<NDimHidden, VisibleDimensionIds>& /* coord */,
        const DynamicTensorCoordinateStep_v2& /* coord_step */);
};
// Builds a coordinate from a visible index by pushing it down through every
// transform (top to bottom) to fill in the full hidden index.
template <typename TensorDesc, typename VisibleIndex>
__host__ __device__ constexpr auto make_tensor_coordinate_v2(const TensorDesc& tensor_desc,
                                                             const VisibleIndex& idx_visible)
{
    static_assert(tensor_desc.GetNumOfDimension() == idx_visible.Size(),
                  "wrong! # of dimension inconsistent");

    // fixed: was GetNumOfTransformation(); the descriptor declares GetNumOfTransform()
    constexpr index_t ntransform   = tensor_desc.GetNumOfTransform();
    constexpr index_t ndim_hidden  = tensor_desc.GetNumOfHiddenDimension();
    constexpr index_t ndim_visible = tensor_desc.GetNumOfVisibleDimension();

    MultiIndex<ndim_hidden> idx_hidden;

    auto idx_visible_pick = pick_array_element(idx_hidden, tensor_desc.GetVisibleDimensionIds());

// initialize visible index
// fixed: loop header was malformed ("for(index_t i < n; i < n, ++i)")
#pragma unroll
    for(index_t i = 0; i < ndim_visible; ++i)
    {
        idx_visible_pick(i) = idx_visible[i];
    }

    // calculate hidden index, walking the transforms from top (visible) to bottom
    static_for<ntransform - 1, -1, -1>{}([&](auto itran) {
        // fixed: referenced member names (transforms_, LowerDimensionIdss,
        // idx_hidden_, dim_up/dim_low) that are not in scope in a free function
        const auto& tran         = tensor_desc.GetTransforms().At(itran);
        constexpr auto dims_low  = tensor_desc.GetLowerDimensionIdss().At(itran);
        constexpr auto dims_up   = tensor_desc.GetUpperDimensionIdss().At(itran);

        const auto idx_up = pick_array_element(idx_hidden, dims_up);
        auto idx_low      = pick_array_element(idx_hidden, dims_up);

        // fixed: argument order was swapped — the lower index is the output
        tran.CalculateLowerIndex(idx_low, idx_up);
    });

    // better to use std::move?
    // fixed: class template arguments cannot be deduced from a HiddenIndex alone
    return DynamicTensorCoordinate_v2<ndim_hidden, decltype(tensor_desc.GetVisibleDimensionIds())>{
        idx_hidden};
}
// Precomputes, for a given visible index diff, which transforms actually need
// to be evaluated when the coordinate is later moved by that diff.
template <typename TensorDesc, typename VisibleIndex>
__host__ __device__ constexpr auto
make_tensor_coordinate_step_v2(const TensorDesc& tensor_desc, const VisibleIndex& idx_diff_visible)
{
    // fixed: referenced undeclared "idx_visible" (parameter is idx_diff_visible)
    static_assert(tensor_desc.GetNumOfDimension() == idx_diff_visible.Size(),
                  "wrong! # of dimension inconsistent");

    // fixed: was GetNumOfTransformation(); the descriptor declares GetNumOfTransform()
    constexpr index_t ntransform   = tensor_desc.GetNumOfTransform();
    constexpr index_t ndim_hidden  = tensor_desc.GetNumOfHiddenDimension();
    constexpr index_t ndim_visible = tensor_desc.GetNumOfVisibleDimension();

    Array<bool, ntransform> do_transforms  = {false};
    Array<bool, ndim_hidden> non_zero_diff = {false};

    auto non_zero_diff_pick_visible =
        pick_array_element(non_zero_diff, tensor_desc.GetVisibleDimensionIds());

// seed the propagation with the visible components that actually change
// fixed: loop header was malformed ("for(index_t i < n; i < n, ++i)")
#pragma unroll
    for(index_t i = 0; i < ndim_visible; ++i)
    {
        non_zero_diff_pick_visible(i) = (idx_diff_visible[i] != 0);
    }

    // propagate "non-zero diff" from top (visible) to bottom through each transform
    static_for<ntransform - 1, -1, -1>{}([&](auto itran) {
        // fixed typos: "tensor_desec" and "tensor_Desc"
        const auto& tran        = tensor_desc.GetTransforms().At(itran);
        constexpr auto dims_low = tensor_desc.GetLowerDimensionIdss().At(itran);
        constexpr auto dims_up  = tensor_desc.GetUpperDimensionIdss().At(itran);

        const auto non_zero_diff_pick_up = pick_array_element(non_zero_diff, dims_up);
        auto non_zero_diff_pick_low      = pick_array_element(non_zero_diff, dims_low);

        // if any of upper index diff components is non-zero, then
        // 1) Need to do this transform
        // 2) all components of lower index diff will assume to be non-zero and need to be
        // computed
        const bool idx_diff_up_has_non_zero =
            reduce_on_array(non_zero_diff_pick_up, [](auto a, auto b) { return a or b; }, false);

        do_transforms(itran) = idx_diff_up_has_non_zero;

#pragma unroll
        for(index_t i = 0; i < dims_low.Size(); ++i)
        {
            non_zero_diff_pick_low(i) = idx_diff_up_has_non_zero;
        }
    });

    // fixed: the function must return a coordinate step (the diff plus the
    // flags), not the raw flag array
    return DynamicTensorCoordinateStep_v2<ntransform, ndim_visible>{idx_diff_visible,
                                                                    do_transforms};
}
// Moves a coordinate by a precomputed step: pushes the visible index diff down
// through every transform and accumulates the resulting lower-index diffs into
// the coordinate's hidden index.
template <typename TensorDesc, typename TensorCoord, typename TensorCoordStep>
__host__ __device__ void move_tensor_coordinate_v2(const TensorDesc& tensor_desc,
                                                   TensorCoord& coord,
                                                   const TensorCoordStep& coord_step)
{
    constexpr index_t ndim_hidden  = tensor_desc.GetNumOfHiddenDimension();
    constexpr index_t ndim_visible = tensor_desc.GetNumOfVisibleDimension();
    constexpr index_t ntransform   = tensor_desc.GetNumOfTransform();

    // this is what needs to be calculated
    auto idx_diff_hidden = make_zero_multi_index<ndim_hidden>();

    // fixed: pick must be mutable — it is written in the loop below
    auto idx_diff_visible_pick =
        pick_array_element(idx_diff_hidden, tensor_desc.GetVisibleDimensionIds());

// initialize visible index diff
// fixed: used undeclared "ndim_visible_" (local is ndim_visible)
#pragma unroll
    for(index_t i = 0; i < ndim_visible; ++i)
    {
        idx_diff_visible_pick(i) = coord_step.GetVisibleIndexDiff()[i];
    }

    // this is what needs to be updated
    auto& idx_hidden = coord.GetHiddenIndex();

    // update hidden index, walking the transforms from top (visible) to bottom
    // NOTE(review): coord_step's per-transform flags are not consulted here yet;
    // presumably transforms with an all-zero upper diff could be skipped — confirm
    static_for<ntransform - 1, -1, -1>{}([&](auto itran) {
        // fixed: was GetTransformations(); the descriptor declares GetTransforms()
        const auto& tran = tensor_desc.GetTransforms().At(itran);

        constexpr auto dims_low = tensor_desc.GetLowerDimensionIdss().At(itran);
        constexpr auto dims_up  = tensor_desc.GetUpperDimensionIdss().At(itran);

        // fixed: "dim_up"/"dim_low" were undeclared, and the outputs
        // (idx_low, idx_diff_low) must be mutable picks
        const auto idx_up = pick_array_element(idx_hidden, dims_up);
        auto idx_low      = pick_array_element(idx_hidden, dims_low);

        const auto idx_diff_up = pick_array_element(idx_diff_hidden, dims_up);
        auto idx_diff_low      = pick_array_element(idx_diff_hidden, dims_low);

        tran.CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low, idx_up);

        // update idx_low
        idx_low += idx_diff_low;
    });
}
// Assuming the visible index itself is in range, checks whether every transform
// that can reject an upper index maps this coordinate's upper index to a valid
// lower index.
template <typename TensorDesc, typename TensorCoord>
__host__ __device__ constexpr bool coordinate_has_valid_offset_assuming_visible_index_is_valid(
    const TensorDesc& tensor_desc, const TensorCoord& coord)
{
    constexpr index_t ntransform = tensor_desc.GetNumOfTransform();

    const auto& idx_hidden = coord.GetHiddenIndex();

    bool is_valid = true;

    // walk the transforms from top to bottom; one invalid mapping poisons the result
    static_for<ntransform - 1, -1, -1>{}([&](auto itran) {
        const auto tran = tensor_desc.GetTransforms().At(itran);

        // only query transforms that do not always map a valid upper index to a
        // valid lower index
        if constexpr(!decltype(tran)::IsValidUpperIndexAlwaysMappedToValidLowerIndex())
        {
            const auto idx_up =
                pick_array_element(idx_hidden, tensor_desc.GetUpperDimensionIdss().At(itran));

            is_valid = is_valid && tran.IsValidUpperIndexMappedToValidLowerIndex(idx_up);
        }
    });

    return is_valid;
}
// Full validity check: the visible index must be within the tensor's lengths,
// and every non-trivial transform must map it to a valid lower index.
template <typename TensorDesc, typename TensorCoord>
__host__ __device__ constexpr bool coordinate_has_valid_offset(const TensorDesc& tensor_desc,
                                                               const TensorCoord& coord)
{
    // check visible index against each dimension's bounds
    const auto& idx_visible = coord.GetVisibleIndex();

    bool within_bounds = true;

#pragma unroll
    for(index_t i = 0; i < tensor_desc.GetNumOfDimension(); ++i)
    {
        const bool dim_ok = idx_visible[i] >= 0 && idx_visible[i] < tensor_desc.GetLength(i);

        within_bounds = within_bounds && dim_ok;
    }

    // check other hidden index
    return within_bounds &&
           coordinate_has_valid_offset_assuming_visible_index_is_valid(tensor_desc, coord);
}
} // namespace ck
#endif
...@@ -14,6 +14,12 @@ __host__ __device__ constexpr auto make_multi_index(Xs... xs) ...@@ -14,6 +14,12 @@ __host__ __device__ constexpr auto make_multi_index(Xs... xs)
return MultiIndex<sizeof...(Xs)>(xs...); return MultiIndex<sizeof...(Xs)>(xs...);
} }
// Returns a MultiIndex of NSize components, all initialized to zero.
template <index_t NSize>
__host__ __device__ constexpr auto make_zero_multi_index()
{
    // fixed: the zero array was constructed but never returned, leaving the
    // (auto-deduced) function with no return value
    return make_zero_array<index_t, NSize>();
}
template <index_t Length> template <index_t Length>
struct PassThrough struct PassThrough
{ {
......
...@@ -133,7 +133,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc, ...@@ -133,7 +133,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
constexpr index_t WeiBlockCopySrcDataPerRead_E = 2; constexpr index_t WeiBlockCopySrcDataPerRead_E = 2;
constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
#elif 0 #elif 1
// cdata = 64, BlockSize = 256, 128x128x8 // cdata = 64, BlockSize = 256, 128x128x8
constexpr index_t BlockSize = 256; constexpr index_t BlockSize = 256;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment