Commit c1ed17f8 authored by Chao Liu

improving index calculation: change to UpdateIndexDiff()

parent 77c81617
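In short: each coordinate transform used to expose CalculateLowerIndexDiff() / CalculateLowerIndexDiff_hack(), and the caller then applied the returned difference to the lower index itself; this commit folds both steps into a single UpdateLowerIndex() that also advances the lower index. A minimal, hedged sketch of the calling-convention change, using plain ints instead of the library's MultiIndex/Number<> types; everything except the two member-function names is illustrative only:

#include <cstdio>

struct ToyPassThrough
{
    // old style: only the lower-index difference was produced; the caller applied it
    static void CalculateLowerIndexDiff(int& idx_diff_low, int idx_diff_up)
    {
        idx_diff_low = idx_diff_up;
    }

    // new style (this commit): the difference is produced and the lower index is
    // updated inside the transform, in one call
    static void UpdateLowerIndex(int& idx_diff_low, int idx_diff_up, int& idx_low)
    {
        idx_diff_low = idx_diff_up;
        idx_low += idx_diff_low;
    }
};

int main()
{
    int idx_low = 3, idx_diff_low = 0;

    // old calling convention: two steps at every call site
    ToyPassThrough::CalculateLowerIndexDiff(idx_diff_low, /*idx_diff_up=*/2);
    idx_low += idx_diff_low;

    // new calling convention: one step, the caller no longer applies the diff itself
    ToyPassThrough::UpdateLowerIndex(idx_diff_low, /*idx_diff_up=*/2, idx_low);

    std::printf("idx_low = %d\n", idx_low); // 7
    return 0;
}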
@@ -157,7 +157,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
         const index_t GemmM0 = GemmM / GemmM1;
         const index_t GemmN0 = GemmN / GemmN1;

-#if 1 // debug
         const auto out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc =
             transform_dynamic_tensor_descriptor(
                 out_gemmm_gemmn_global_desc,
@@ -165,16 +164,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
                     DynamicUnMerge<2>{make_multi_index(GemmN0, GemmN1)}),
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}));
-#else
-        const auto out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc =
-            transform_dynamic_tensor_descriptor(
-                out_gemmm_gemmn_global_desc,
-                make_tuple(
-                    HackSemiDynamicUnMerge<3, Sequence<GemmM1>>{make_multi_index(1, GemmM0)},
-                    HackSemiDynamicUnMerge<3, Sequence<GemmN1>>{make_multi_index(1, GemmN0)}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}));
-#endif

         // GEMM
         using gridwise_gemm = GridwiseDynamicGemm_km_kn_mn_v1<
......
@@ -13,11 +13,6 @@ struct DynamicPassThrough
     const UpperIndex up_lengths_;

-#if 0
-    __host__ __device__ constexpr DynamicPassThrough(const DynamicPassThrough&) = default;
-    __host__ __device__ constexpr DynamicPassThrough(DynamicPassThrough&&) = default;
-#else
     __host__ __device__ constexpr DynamicPassThrough(const DynamicPassThrough& other)
         : up_lengths_{other.up_lengths_}
     {
@@ -27,7 +22,6 @@ struct DynamicPassThrough
         : up_lengths_{other.up_lengths_}
     {
     }
-#endif

     __host__ __device__ constexpr DynamicPassThrough(const index_t& low_length)
         : up_lengths_{make_multi_index(low_length)}
@@ -51,31 +45,26 @@ struct DynamicPassThrough
         idx_low(Number<0>{}) = idx_up[Number<0>{}];
     }

-    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
-    __host__ __device__ static void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
-                                                            const UpIdxDiff& idx_diff_up,
-                                                            const LowIdx& /* idx_low_old */,
-                                                            const UpIdx& /* idx_up_old */)
-    {
-        static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
-                          UpIdx::Size() == 1,
-                      "wrong! inconsistent # of dimension");
-
-        idx_diff_low(Number<0>{}) = idx_diff_up[Number<0>{}];
-    }
-
     template <typename LowIdxDiff,
               typename UpIdxDiff,
               typename LowIdx,
               typename UpIdx,
               index_t Hack>
-    __host__ __device__ static void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
-                                                                 const UpIdxDiff& idx_diff_up,
-                                                                 const LowIdx& idx_low_old,
-                                                                 const UpIdx& idx_up_old,
-                                                                 Number<Hack>)
+    __host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
+                                                     const UpIdxDiff& idx_diff_up,
+                                                     LowIdx& idx_low,
+                                                     const UpIdx& idx_up_new,
+                                                     Number<Hack>)
     {
-        CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
+        static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
+                          UpIdx::Size() == 1,
+                      "wrong! inconsistent # of dimension");
+
+        constexpr auto I0 = Number<0>{};
+
+        idx_diff_low(I0) = idx_diff_up[I0];
+
+        idx_low += idx_diff_low;
     }

     __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
@@ -103,11 +92,6 @@ struct DynamicPad
     const index_t left_pad_;
     const index_t right_pad_;

-#if 0
-    __host__ __device__ constexpr DynamicPad(const DynamicPad&) = default;
-    __host__ __device__ constexpr DynamicPad(DynamicPad&&) = default;
-#else
     __host__ __device__ constexpr DynamicPad(const DynamicPad& other)
         : up_lengths_{other.up_lengths_}, left_pad_{other.left_pad_}, right_pad_{other.right_pad_}
     {
@@ -117,7 +101,6 @@ struct DynamicPad
         : up_lengths_{other.up_lengths_}, left_pad_{other.left_pad_}, right_pad_{other.right_pad_}
     {
     }
-#endif

     __host__ __device__ constexpr DynamicPad(const index_t& low_length,
                                              const index_t& left_pad,
@@ -146,32 +129,26 @@ struct DynamicPad
         idx_low(Number<0>{}) = idx_up[Number<0>{}] - left_pad_;
     }

-    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
-    __host__ __device__ static constexpr void
-    CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
-                            const UpIdxDiff& idx_diff_up,
-                            const LowIdx& /* idx_low_old */,
-                            const UpIdx& /* idx_up_old */)
-    {
-        static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
-                          UpIdx::Size() == 1,
-                      "wrong! inconsistent # of dimension");
-
-        idx_diff_low(Number<0>{}) = idx_diff_up[Number<0>{}];
-    }
-
     template <typename LowIdxDiff,
               typename UpIdxDiff,
               typename LowIdx,
               typename UpIdx,
               index_t Hack>
-    __host__ __device__ static void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
-                                                                 const UpIdxDiff& idx_diff_up,
-                                                                 const LowIdx& idx_low_old,
-                                                                 const UpIdx& idx_up_old,
-                                                                 Number<Hack>)
+    __host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
+                                                     const UpIdxDiff& idx_diff_up,
+                                                     LowIdx& idx_low,
+                                                     const UpIdx& idx_up_new,
+                                                     Number<Hack>)
     {
-        CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
+        static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
+                          UpIdx::Size() == 1,
+                      "wrong! inconsistent # of dimension");
+
+        constexpr auto I0 = Number<0>{};
+
+        idx_diff_low(I0) = idx_diff_up[I0];
+
+        idx_low += idx_diff_low;
     }

     __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
@@ -199,11 +176,6 @@ struct DynamicLeftPad
     const UpperIndex up_lengths_;
     const index_t left_pad_;

-#if 0
-    __host__ __device__ constexpr DynamicLeftPad(const DynamicLeftPad&) = default;
-    __host__ __device__ constexpr DynamicLeftPad(DynamicLeftPad&&) = default;
-#else
     __host__ __device__ constexpr DynamicLeftPad(const DynamicLeftPad& other)
         : up_lengths_{other.up_lengths_}, left_pad_{other.left_pad_}
     {
@@ -213,7 +185,6 @@ struct DynamicLeftPad
         : up_lengths_{other.up_lengths_}, left_pad_{other.left_pad_}
     {
     }
-#endif

     __host__ __device__ constexpr DynamicLeftPad(const index_t& low_length, const index_t& left_pad)
         : up_lengths_{make_multi_index(low_length + left_pad)}, left_pad_{left_pad}
@@ -238,32 +209,26 @@ struct DynamicLeftPad
         idx_low(Number<0>{}) = idx_up[Number<0>{}] - left_pad_;
     }

-    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
-    __host__ __device__ static constexpr void
-    CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
-                            const UpIdxDiff& idx_diff_up,
-                            const LowIdx& /* idx_low_old */,
-                            const UpIdx& /* idx_up_old */)
-    {
-        static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
-                          UpIdx::Size() == 1,
-                      "wrong! inconsistent # of dimension");
-
-        idx_diff_low(Number<0>{}) = idx_diff_up[Number<0>{}];
-    }
-
     template <typename LowIdxDiff,
               typename UpIdxDiff,
               typename LowIdx,
               typename UpIdx,
               index_t Hack>
-    __host__ __device__ static void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
-                                                                 const UpIdxDiff& idx_diff_up,
-                                                                 const LowIdx& idx_low_old,
-                                                                 const UpIdx& idx_up_old,
-                                                                 Number<Hack>)
+    __host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
+                                                     const UpIdxDiff& idx_diff_up,
+                                                     LowIdx& idx_low,
+                                                     const UpIdx& idx_up_new,
+                                                     Number<Hack>)
     {
-        CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
+        static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
+                          UpIdx::Size() == 1,
+                      "wrong! inconsistent # of dimension");
+
+        constexpr auto I0 = Number<0>{};
+
+        idx_diff_low(I0) = idx_diff_up[I0];
+
+        idx_low += idx_diff_low;
     }

     __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
@@ -291,11 +256,6 @@ struct DynamicRightPad
     const index_t low_length_;
    const index_t right_pad_;

-#if 0
-    __host__ __device__ constexpr DynamicRightPad(const DynamicRightPad&) = default;
-    __host__ __device__ constexpr DynamicRightPad(DynamicRightPad&&) = default;
-#else
     __host__ __device__ constexpr DynamicRightPad(const DynamicRightPad& other)
         : up_lengths_{other.up_lengths_},
           low_length_{other.low_length_},
@@ -309,7 +269,6 @@ struct DynamicRightPad
           right_pad_{other.right_pad_}
     {
     }
-#endif

     __host__ __device__ constexpr DynamicRightPad(const index_t& low_length,
                                                   const index_t& right_pad)
@@ -339,32 +298,26 @@ struct DynamicRightPad
         idx_low(Number<0>{}) = idx_up[Number<0>{}];
     }

-    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
-    __host__ __device__ static constexpr void
-    CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
-                            const UpIdxDiff& idx_diff_up,
-                            const LowIdx& /* idx_low_old */,
-                            const UpIdx& /* idx_up_old */)
-    {
-        static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
-                          UpIdx::Size() == 1,
-                      "wrong! inconsistent # of dimension");
-
-        idx_diff_low(Number<0>{}) = idx_diff_up[Number<0>{}];
-    }
-
     template <typename LowIdxDiff,
               typename UpIdxDiff,
               typename LowIdx,
               typename UpIdx,
               index_t Hack>
-    __host__ __device__ static void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
-                                                                 const UpIdxDiff& idx_diff_up,
-                                                                 const LowIdx& idx_low_old,
-                                                                 const UpIdx& idx_up_old,
-                                                                 Number<Hack>)
+    __host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
+                                                     const UpIdxDiff& idx_diff_up,
+                                                     LowIdx& idx_low,
+                                                     const UpIdx& idx_up_new,
+                                                     Number<Hack>)
     {
-        CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
+        static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
+                          UpIdx::Size() == 1,
+                      "wrong! inconsistent # of dimension");
+
+        constexpr auto I0 = Number<0>{};
+
+        idx_diff_low(I0) = idx_diff_up[I0];
+
+        idx_low += idx_diff_low;
     }

     __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
@@ -392,11 +345,6 @@ struct DynamicEmbed
     const UpperIndex up_lengths_;
     const UpperIndex coefficients_;

-#if 0
-    __host__ __device__ constexpr DynamicEmbed(const DynamicEmbed&) = default;
-    __host__ __device__ constexpr DynamicEmbed(DynamicEmbed&&) = default;
-#else
     __host__ __device__ constexpr DynamicEmbed(const DynamicEmbed& other)
         : up_lengths_{other.up_lengths_}, coefficients_{other.coefficients_}
     {
@@ -406,7 +354,7 @@ struct DynamicEmbed
         : up_lengths_{other.up_lengths_}, coefficients_{other.coefficients_}
     {
     }
-#endif

     __host__ __device__ constexpr DynamicEmbed(const UpperIndex& up_lengths,
                                                const UpperIndex& coefficients)
         : up_lengths_{up_lengths}, coefficients_{coefficients}
@@ -447,11 +395,16 @@ struct DynamicEmbed
         });
     }

-    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
-    __host__ __device__ constexpr void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
-                                                               const UpIdxDiff& idx_diff_up,
-                                                               const LowIdx& /* idx_low_old */,
-                                                               const UpIdx& /* idx_up_old */) const
+    template <typename LowIdxDiff,
+              typename UpIdxDiff,
+              typename LowIdx,
+              typename UpIdx,
+              index_t Hack>
+    __host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
+                                              const UpIdxDiff& idx_diff_up,
+                                              LowIdx& idx_low,
+                                              const UpIdx& idx_up_new,
+                                              Number<Hack>) const
     {
         static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == NDimUp &&
                           LowIdx::Size() == 1 && UpIdx::Size() == NDimUp,
@@ -461,20 +414,8 @@ struct DynamicEmbed
         static_for<0, NDimUp, 1>{}(
             [&](auto i) { idx_diff_low(Number<0>{}) += idx_diff_up[i] * coefficients_[i]; });
-    }

-    template <typename LowIdxDiff,
-              typename UpIdxDiff,
-              typename LowIdx,
-              typename UpIdx,
-              index_t Hack>
-    __host__ __device__ constexpr void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
-                                                                    const UpIdxDiff& idx_diff_up,
-                                                                    const LowIdx& idx_low_old,
-                                                                    const UpIdx& idx_up_old,
-                                                                    Number<Hack>) const
-    {
-        CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
+
+        idx_low += idx_diff_low;
     }

     __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
@@ -502,11 +443,6 @@ struct DynamicMerge
     const LowerIndex low_lengths_scan_;
     const UpperIndex up_lengths_;

-#if 0
-    __host__ __device__ constexpr DynamicMerge(const DynamicMerge&) = default;
-    __host__ __device__ constexpr DynamicMerge(DynamicMerge&&) = default;
-#else
     __host__ __device__ constexpr DynamicMerge(const DynamicMerge& other)
         : low_lengths_{other.low_lengths_},
           low_lengths_scan_{other.low_lengths_scan_},
@@ -520,7 +456,6 @@ struct DynamicMerge
           up_lengths_{other.up_lengths_}
     {
     }
-#endif

     __host__ __device__ constexpr DynamicMerge(const LowerIndex& low_lengths)
         : low_lengths_{low_lengths},
@@ -562,101 +497,15 @@ struct DynamicMerge
         idx_low(Number<NDimLow - 1>{}) = tmp;
     }
// idx_diff_low depends on idx_low_old, so idx_low need to be up-to-date
// If idx_diff_up is known at compile-time, many calculations can be optimized
// away by compiler
// This function assume idx_low_old is not out-of-bound
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ constexpr void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& /* idx_up_old */) const
{
static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
// CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions.
// However,
// 1) If idx_diff_up is known at compile-time, then idx_diff_low_const
// can be calculated at compile-time.
// 2) If idx_diff_up is not known at compile-time, but its value
// doesn't change during the whole kernel execution, then
// idx_diff_low_const also
// doesn't change during the whole kernel execution. Compiler generated
// ISA should
// only caclculate idx_diff_low_const once and save it durinng the whole
// kernel execution
// If neither 1) nor 2) is satisfied, then the calculation will also be
// computed at
// run-time each time this function is called, and can be very expensive.
LowerIndex idx_diff_low_const;
#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
CalculateLowerIndex(idx_diff_low_const, idx_diff_up);
#else
index_t tmp = idx_diff_up[Number<0>{}];
static_for<0, NDimLow - 1, 1>{}([&](auto i) {
idx_diff_low_const(i) = tmp / low_lengths_scan_[i];
tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
});
// Hack: this force result into SGPR. Need to make sure the result is thread invariant
idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);
#endif
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
index_t carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
// this should be saved in SGPR as well
index_t idx_low_length_minus_idx_diff_low_const =
low_lengths_[i] - idx_diff_low_const[i];
#if 1
index_t idx_low_length_plus_idx_diff_low_const =
low_lengths_[i] + idx_diff_low_const[i];
#endif
index_t idx_low_tmp = idx_low_old[i] + carry;
bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const;
#if 1
bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
#endif
idx_diff_low(i) =
do_carry ? -idx_low_length_minus_idx_diff_low_const : idx_diff_low_const[i];
#if 1
idx_diff_low(i) = do_borrow ? idx_low_length_plus_idx_diff_low_const : idx_diff_low[i];
#endif
idx_diff_low(i) += carry;
carry = do_carry ? 1 : 0;
#if 1
carry = do_borrow ? -1 : carry;
#endif
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
}
// idx_diff_low depends on idx_low_old, so idx_low need to be up-to-date
//
// If idx_diff_up is known at compile-time, many calculations can be optimized
// away by compiler
// This function assume idx_low_old is not out-of-bound
// this version save computation but use more register
     template <typename LowIdxDiff,
               typename UpIdxDiff,
               typename LowIdx,
               typename UpIdx,
               index_t Hack>
-    __host__ __device__ constexpr void CalculateLowerIndexDiff_hack_1(LowIdxDiff& idx_diff_low,
-                                                                      const UpIdxDiff& idx_diff_up,
-                                                                      const LowIdx& idx_low_old,
-                                                                      const UpIdx& /* idx_up_old */,
-                                                                      Number<Hack>) const
+    __host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
+                                              const UpIdxDiff& idx_diff_up,
+                                              LowIdx& idx_low,
+                                              const UpIdx& idx_up_new,
+                                              Number<Hack>) const
     {
         static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
@@ -675,11 +524,9 @@ struct DynamicMerge
         // only caclculate idx_diff_low_const once and save it durinng the whole
         // kernel execution
         // If neither 1) nor 2) is satisfied, then the calculation will also be
-        // computed at
-        // run-time each time this function is called, and can be very expensive.
+        // computed at run-time each time this function is called, and can be
+        // very expensive.
         LowerIndex idx_diff_low_const;
-        LowerIndex idx_low_length_minus_idx_diff_low_const;
-        LowerIndex idx_low_length_plus_idx_diff_low_const;

 #if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
         index_t tmp = idx_diff_up[Number<0>{}];
@@ -690,12 +537,6 @@ struct DynamicMerge
         });
         idx_diff_low_const(Number<NDimLow - 1>{}) = tmp;
-
-        static_for<0, NDimLow, 1>{}([&](auto i) {
-            idx_low_length_minus_idx_diff_low_const(i) = low_lengths_[i] - idx_diff_low_const[i];
-            idx_low_length_plus_idx_diff_low_const(i) = low_lengths_[i] + idx_diff_low_const[i];
-        });
 #else
         // Hack: this force result into SGPR. Need to make sure the result is thread invariant
         index_t tmp = idx_diff_up[Number<0>{}];
@@ -706,389 +547,70 @@ struct DynamicMerge
         });
         idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);
-
-        static_for<0, NDimLow, 1>{}([&](auto i) {
-            idx_low_length_minus_idx_diff_low_const(i) =
-                __builtin_amdgcn_readfirstlane(low_lengths_[i] - idx_diff_low_const[i]);
-            idx_low_length_plus_idx_diff_low_const(i) =
-                __builtin_amdgcn_readfirstlane(low_lengths_[i] + idx_diff_low_const[i]);
-        });
 #endif

         if constexpr(Hack == 1)
         {
             // do carry check on each low dimension in reversed order
             // do not need to check the first dimension
index_t carry = 0; bool do_carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
index_t idx_low_tmp = idx_low_old[i] + carry;
bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
idx_diff_low(i) =
do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
idx_diff_low(i) += carry;
carry = do_carry ? 1 : 0;
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
}
else if constexpr(Hack == 2)
{
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
index_t borrow = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) { static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
index_t idx_low_tmp = idx_low_old[i] - borrow; idx_diff_low(i) = idx_diff_low_const[i] + do_carry;
bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
idx_diff_low(i) =
do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low_const[i];
idx_diff_low(i) -= borrow; index_t idx_low_tmp = idx_low[i] + idx_diff_low[i];
borrow = do_borrow ? 1 : 0; do_carry = idx_low_tmp >= low_lengths_[i];
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] - borrow; // TODO: use exec-mask inline asm
} if(do_carry)
else
{ {
// do carry check on each low dimension in reversed order idx_diff_low(i) -= low_lengths_[i];
// do not need to check the first dimension
index_t carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
index_t idx_low_tmp = idx_low_old[i] + carry;
bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
idx_diff_low(i) =
do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
idx_diff_low(i) =
do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low[i];
idx_diff_low(i) += carry;
carry = do_carry ? 1 : 0;
carry = do_borrow ? -1 : carry;
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
} }
}
// idx_diff_low depends on idx_low_old, so idx_low need to be up-to-date
// If idx_diff_up is known at compile-time, many calculations can be optimized
// away by compiler
// This function assume idx_low_old is not out-of-bound
// this version use less register but more computation
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ constexpr void CalculateLowerIndexDiff_hack_2(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& /* idx_up_old */,
Number<Hack>) const
{
static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
// CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions. idx_low(i) += idx_diff_low[i];
// However,
// 1) If idx_diff_up is known at compile-time, then idx_diff_low_const
// can be calculated at compile-time.
// 2) If idx_diff_up is not known at compile-time, but its value
// doesn't change during the whole kernel execution, then
// idx_diff_low_const also
// doesn't change during the whole kernel execution. Compiler generated
// ISA should
// only caclculate idx_diff_low_const once and save it durinng the whole
// kernel execution
// If neither 1) nor 2) is satisfied, then the calculation will also be
// computed at
// run-time each time this function is called, and can be very expensive.
LowerIndex idx_diff_low_const;
LowerIndex idx_low_length_minus_idx_diff_low_const;
LowerIndex idx_low_length_plus_idx_diff_low_const;
#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
index_t tmp = idx_diff_up[Number<0>{}];
static_for<0, NDimLow - 1, 1>{}([&](auto i) {
idx_diff_low_const(i) = tmp / low_lengths_scan_[i];
tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
});
idx_diff_low_const(Number<NDimLow - 1>{}) = tmp;
#else
// Hack: this force result into SGPR. Need to make sure the result is thread invariant
index_t tmp = idx_diff_up[Number<0>{}];
static_for<0, NDimLow - 1, 1>{}([&](auto i) {
idx_diff_low_const(i) = __builtin_amdgcn_readfirstlane(tmp / low_lengths_scan_[i]);
tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
}); });
idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp); constexpr auto I0 = Number<0>{};
#endif
if constexpr(Hack == 1)
{
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
bool do_carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
idx_diff_low(i) = idx_diff_low_const[i] + do_carry;
index_t idx_low_tmp = idx_low_old[i] + idx_diff_low_const[i] + do_carry;
do_carry = idx_low_tmp >= low_lengths_[i];
idx_diff_low(i) = do_carry ? idx_diff_low(i) - low_lengths_[i] : idx_diff_low[i]; idx_diff_low(I0) = idx_diff_low_const[I0] + do_carry;
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + do_carry; idx_low(I0) += idx_diff_low[I0];
} }
else if constexpr(Hack == 2) else if constexpr(Hack == 2)
{ {
// do carry check on each low dimension in reversed order // do borrow check on each low dimension in reversed order
// do not need to check the first dimension // do not need to check the first dimension
bool do_borrow = 0; bool do_borrow = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) { static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
idx_diff_low(i) = idx_diff_low_const[i] - do_borrow; idx_diff_low(i) = idx_diff_low_const[i] - do_borrow;
index_t idx_low_tmp = idx_low_old[i] + idx_diff_low_const[i] - do_borrow; index_t idx_low_tmp = idx_low[i] + idx_diff_low[i];
do_borrow = idx_low_tmp < 0; do_borrow = idx_low_tmp < 0;
idx_diff_low(i) = do_borrow ? idx_diff_low(i) + low_lengths_[i] : idx_diff_low[i]; // TODO: use exec-mask inline asm
}); if(do_borrow)
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] - do_borrow;
}
else
{ {
#if 0 idx_diff_low(i) += low_lengths_[i];
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
index_t carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
idx_diff_low(i) = idx_diff_low_const[i] + carry;
bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
idx_diff_low(i) =
do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
idx_diff_low(i) =
do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low[i];
idx_diff_low(i) += carry;
carry = do_carry ? 1 : 0;
carry = do_borrow ? -1 : carry;
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
#endif
}
} }
// idx_diff_low depends on idx_low_old, so idx_low need to be up-to-date idx_low(i) += idx_diff_low[i];
// If idx_diff_up is known at compile-time, many calculations can be optimized
// away by compiler
// This function assume idx_low_old is not out-of-bound
// this version use less register but more computation
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ constexpr void CalculateLowerIndexDiff_hack_3(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& /* idx_up_old */,
Number<Hack>) const
{
static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
// CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions.
// However,
// 1) If idx_diff_up is known at compile-time, then idx_diff_low_const
// can be calculated at compile-time.
// 2) If idx_diff_up is not known at compile-time, but its value
// doesn't change during the whole kernel execution, then
// idx_diff_low_const also
// doesn't change during the whole kernel execution. Compiler generated
// ISA should
// only caclculate idx_diff_low_const once and save it durinng the whole
// kernel execution
// If neither 1) nor 2) is satisfied, then the calculation will also be
// computed at
// run-time each time this function is called, and can be very expensive.
LowerIndex idx_diff_low_const;
LowerIndex idx_low_length_minus_idx_diff_low_const;
LowerIndex idx_low_length_plus_idx_diff_low_const;
#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
index_t tmp = idx_diff_up[Number<0>{}];
static_for<0, NDimLow - 1, 1>{}([&](auto i) {
idx_diff_low_const(i) = tmp / low_lengths_scan_[i];
tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
});
idx_diff_low_const(Number<NDimLow - 1>{}) = tmp;
#else
// Hack: this force result into SGPR. Need to make sure the result is thread invariant
index_t tmp = idx_diff_up[Number<0>{}];
static_for<0, NDimLow - 1, 1>{}([&](auto i) {
idx_diff_low_const(i) = __builtin_amdgcn_readfirstlane(tmp / low_lengths_scan_[i]);
tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
});
idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);
#endif
if constexpr(Hack == 1)
{
#if 1
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
bool do_carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
idx_diff_low(i) = idx_diff_low_const[i] + do_carry;
index_t idx_low_tmp = idx_low_old[i] + idx_diff_low_const[i] + do_carry;
do_carry = idx_low_tmp >= low_lengths_[i];
idx_diff_low(i) = do_carry ? idx_diff_low(i) - low_lengths_[i] : idx_diff_low[i];
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + do_carry;
#else
LowerIndex idx_low_new = idx_low_old;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
auto i_m1 = i - Number<1>{};
int64_t exec_mask;
int64_t do_carry;
idx_low_new(i) = idx_diff_low_const[i] + idx_low_old[i];
asm volatile(
"\n \
s_mov_b64 %0, exec \n \
v_cmpx_le_u32_e64 %1, %4, %2 \n \
v_subrev_u32 %2, %4, %2\n \
v_add_u32 %3, %3, 1\n \
s_mov_b64 exec, %0\n \
"
: "=s"(exec_mask), "=s"(do_carry), "=v"(idx_low_new(i)), "=v"(idx_low_new(i_m1))
: "s"(low_lengths_[i]), "2"(idx_low_new[i]), "3"(idx_low_new[i_m1]));
idx_diff_low(i) = idx_low_new[i] - idx_low_old[i];
}); });
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
idx_low_new(I0) += idx_diff_low_const[I0];
idx_diff_low(I0) = idx_low_new[I0] - idx_low_old[I0];
#endif
}
else if constexpr(Hack == 2)
{
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
bool do_borrow = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) { idx_diff_low(I0) = idx_diff_low_const[I0] - do_borrow;
idx_diff_low(i) = idx_diff_low_const[i] - do_borrow;
index_t idx_low_tmp = idx_low_old[i] + idx_diff_low_const[i] - do_borrow; idx_low(I0) += idx_diff_low[I0];
do_borrow = idx_low_tmp < 0;
idx_diff_low(i) = do_borrow ? idx_diff_low(i) + low_lengths_[i] : idx_diff_low[i];
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] - do_borrow;
} }
else else
{ {
#if 0 // not implemented
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
index_t carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
idx_diff_low(i) = idx_diff_low_const[i] + carry;
bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
idx_diff_low(i) =
do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
idx_diff_low(i) =
do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low[i];
idx_diff_low(i) += carry;
carry = do_carry ? 1 : 0;
carry = do_borrow ? -1 : carry;
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
#endif
} }
} }
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ constexpr void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old,
Number<Hack>) const
{
#if 0
// this version save computation but use more register
CalculateLowerIndexDiff_hack_1(
idx_diff_low, idx_diff_up, idx_low_old, idx_up_old, Number<Hack>{});
#elif 1
// this version use less register but more computation
CalculateLowerIndexDiff_hack_2(
idx_diff_low, idx_diff_up, idx_low_old, idx_up_old, Number<Hack>{});
#elif 1
// this version use less register but more computation
CalculateLowerIndexDiff_hack_3(
idx_diff_low, idx_diff_up, idx_low_old, idx_up_old, Number<Hack>{});
#endif
}
__host__ __device__ static constexpr bool IsLinearTransform() { return false; } __host__ __device__ static constexpr bool IsLinearTransform() { return false; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex() __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
@@ -1156,27 +678,20 @@ struct DynamicUnMerge
         }
     }

-    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
-    __host__ __device__ constexpr void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
-                                                               const UpIdxDiff& idx_diff_up,
-                                                               const LowIdx& /* idx_low_old */,
-                                                               const UpIdx& /* idx_up_old */) const
-    {
-        CalculateLowerIndex(idx_diff_low, idx_diff_up);
-    }
-
     template <typename LowIdxDiff,
               typename UpIdxDiff,
               typename LowIdx,
               typename UpIdx,
               index_t Hack>
-    __host__ __device__ constexpr void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
-                                                                    const UpIdxDiff& idx_diff_up,
-                                                                    const LowIdx& idx_low_old,
-                                                                    const UpIdx& idx_up_old,
-                                                                    Number<Hack>) const
+    __host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
+                                              const UpIdxDiff& idx_diff_up,
+                                              LowIdx& idx_low,
+                                              const UpIdx& idx_up_new,
+                                              Number<Hack>) const
     {
-        CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
+        CalculateLowerIndex(idx_diff_low, idx_diff_up);
+
+        idx_low += idx_diff_low;
     }

     __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
@@ -1221,139 +736,18 @@ struct DynamicFreeze
         idx_low(Number<0>{}) = low_idx_;
     }

-    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
-    __host__ __device__ static void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
-                                                            const UpIdxDiff& idx_diff_up,
-                                                            const LowIdx& /* idx_low_old */,
-                                                            const UpIdx& /* idx_up_old */)
-    {
-        idx_diff_low(Number<0>{}) = index_t{Number<0>{}};
-    }
-
     template <typename LowIdxDiff,
               typename UpIdxDiff,
               typename LowIdx,
               typename UpIdx,
               index_t Hack>
-    __host__ __device__ static void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
-                                                                 const UpIdxDiff& idx_diff_up,
-                                                                 const LowIdx& idx_low_old,
-                                                                 const UpIdx& idx_up_old,
-                                                                 Number<Hack>)
+    __host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
+                                                     const UpIdxDiff& idx_diff_up,
+                                                     LowIdx& idx_low,
+                                                     const UpIdx& idx_up_new,
+                                                     Number<Hack>)
     {
-        CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
+        idx_diff_low(Number<0>{}) = index_t{Number<0>{}};
     }
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
{
return true;
}
template <typename UpIdx>
__host__ __device__ static constexpr bool
IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */)
{
return true;
}
};
#if 0
template <index_t NDimUp, typename StaticPartialUpLengths>
struct HackSemiDynamicUnMerge
{
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<NDimUp>;
const UpperIndex up_lengths_;
const UpperIndex up_lengths_scan_;
static constexpr index_t NDimUpStatic = StaticPartialUpLengths::Size();
static constexpr index_t NDimUpDynamic = NDimUp - NDimUpStatic;
const MultiIndex<NDimUpDynamic> dynamic_partial_up_lengths_;
const MultiIndex<NDimUpDynamic> dynamic_partial_up_lengths_scan_;
static constexpr auto static_partial_up_lengths_ = StaticPartialUpLengths{};
static constexpr auto static_partial_up_lengths_scan_ = reverse_exclusive_scan_sequence(
static_partial_up_lengths_, math::multiplies<index_t>(), Number<1>{});
__host__ __device__ constexpr HackSemiDynamicUnMerge(
const MultiIndex<NDimUpDynamic>& dynamic_partial_up_lengths)
: dynamic_partial_up_lengths_{dynamic_partial_up_lengths},
dynamic_partial_up_lengths_scan_{
container_reverse_exclusive_scan(dynamic_partial_up_lengths,
math::multiplies<index_t>(),
static_partial_up_lengths_scan_[Number<0>{}])}
{
static_assert(NDimUpDynamic + NDimUpStatic == NDimUp,
"wrong! inconsisitent # of dimensions");
}
__host__ __device__ constexpr HackSemiDynamicUnMerge()
: up_lengths_{make_zero_multi_index<NDimUpDynamic>()},
up_lengths_scan_{make_zero_multi_index<NDimUpStatic>()}
{
static_assert(NDimUpDynamic + NDimUpStatic == NDimUp,
"wrong! inconsisitent # of dimensions");
}
__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return NDimUp; }
__host__ __device__ constexpr const auto GetUpperLengths() const
{
UpperIndex up_lengths;
static_for<0, NDimUpDynamic, 1>{}(
[&](auto i) { up_lengths(i) = dynamic_partial_up_lengths_[i]; });
static_for<0, NDimUpStatic, 1>{}([&](auto i) {
up_lengths(i + Number<NDimUpDynamic>{}) = static_partial_up_lengths_[i];
});
return up_lengths;
}
template <typename LowIdx, typename UpIdx>
__host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
const UpIdx& idx_up) const
{
idx_low(Number<0>{}) = idx_up[Number<NDimUp - 1>{}];
static_for<0, NDimUpDynamic, 1>{}([&](auto i) {
idx_low(Number<0>{}) += idx_up[i] * dynamic_partial_up_lengths_scan_[i];
});
static_for<NDimUpDynamic, NDimUp - 1, 1>{}([&](auto i) {
idx_low(Number<0>{}) +=
idx_up[i] * static_partial_up_lengths_scan_[i + Number<NDimUpDynamic>{}];
});
}
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ constexpr void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& /* idx_low_old */,
const UpIdx& /* idx_up_old */) const
{
CalculateLowerIndex(idx_diff_low, idx_diff_up);
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ constexpr void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old,
Number<Hack>) const
{
CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
} }
     __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
@@ -1370,7 +764,6 @@ struct HackSemiDynamicUnMerge
         return true;
     }
 };
-#endif

 } // namespace ck
 #endif
@@ -502,27 +502,18 @@ __host__ __device__ constexpr void move_dynamic_tensor_coordinate(const TensorDe
         constexpr auto dims_low = TensorDesc::GetLowerDimensionIdss().At(itran);
         constexpr auto dims_up = TensorDesc::GetUpperDimensionIdss().At(itran);

-        const auto idx_up = get_container_subset(idx_hidden, dims_up);
+        const auto idx_up_new = get_container_subset(idx_hidden, dims_up);
         auto idx_low = get_container_subset(idx_hidden, dims_low);

         const auto idx_diff_up = get_container_subset(idx_diff_hidden, dims_up);

         MultiIndex<dims_low.Size()> idx_diff_low;

-        // calculate idx_diff_low
-#if 0 // hack
-        tran.CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low, idx_up);
-#else
-        // HACK: control CalculateLowerIndexDiff for DynamicMerge using ing hack
+        // HACK: control UpdateLowerIndex for DynamicMerge using hack
         // TODO remove hack
         constexpr index_t Hack =
             decltype(coord_step.hack_calculate_lower_index_diff_)::At(itran);
-        tran.CalculateLowerIndexDiff_hack(
-            idx_diff_low, idx_diff_up, idx_low, idx_up, Number<Hack>{});
-#endif
-
-        // update idx_low
-        idx_low += idx_diff_low;
+        tran.UpdateLowerIndex(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number<Hack>{});

         set_container_subset(idx_diff_hidden, dims_low, idx_diff_low);
         set_container_subset(idx_hidden, dims_low, idx_low);
......
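For context, the caller in move_dynamic_tensor_coordinate now only slices the hidden index, lets each transform update its lower indices in place, and writes the results back. A hedged sketch of that per-transform pattern, with plain ints standing in for the container/subset helpers; only the role of UpdateLowerIndex is taken from the diff, the rest is illustrative:

#include <cstdio>

// toy pass-through transform that follows the new single-call interface
static void update_lower_index_passthrough(int& idx_diff_low, int idx_diff_up, int& idx_low, int /*idx_up_new*/)
{
    idx_diff_low = idx_diff_up;
    idx_low += idx_diff_low;
}

int main()
{
    // "hidden" index: slot 0 is the upper dim, slot 1 the lower dim of the toy transform
    int idx_hidden[2]      = {5, 5};
    int idx_diff_hidden[2] = {3, 0}; // requested step in the upper dim

    // slice out the subsets this transform works on (stand-in for get_container_subset)
    int idx_up_new  = idx_hidden[0] + idx_diff_hidden[0]; // upper index already moved
    int idx_low     = idx_hidden[1];
    int idx_diff_up = idx_diff_hidden[0];
    int idx_diff_low;

    // one call replaces the old CalculateLowerIndexDiff + "idx_low += idx_diff_low" pair
    update_lower_index_passthrough(idx_diff_low, idx_diff_up, idx_low, idx_up_new);

    // write the results back, as set_container_subset does in the real code
    idx_diff_hidden[1] = idx_diff_low;
    idx_hidden[0]      = idx_up_new;
    idx_hidden[1]      = idx_low;

    std::printf("hidden index = {%d, %d}\n", idx_hidden[0], idx_hidden[1]); // {8, 8}
    return 0;
}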
@@ -384,13 +384,8 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
         Float,
         decltype(c_m0_m1_n0_n1_thread_desc),
         decltype(c_m0_m1_n0_n1_global_desc),
-#if 1 // debug
         Sequence<MRepeat, MPerThread, NRepeat, NPerThread>,
         CThreadTransferSrcDstAccessOrder,
-#else
-        Sequence<1, 1, 2, 4>,
-        Sequence<0, 1, 2, 3>,
-#endif
         CThreadTransferSrcDstVectorDim,
         1,
         CThreadTransferDstScalarPerVector,
......
@@ -922,7 +922,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
             src_desc, make_multi_index(1, 0), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0>{});
         const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step_hack(
             src_desc, make_multi_index(-1, 0), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0>{});
-#elif 0
+#elif 1
         // for non-padded input tensor
         const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step_hack(
             src_desc, make_multi_index(0, 1), Sequence<0, 0, 0, 0, 0, 0, 1>{});
@@ -1067,7 +1067,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
         // for padded input tensor
         const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
             src_desc, adjusted_step_idx, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2>{});
-#elif 0
+#elif 1
         // for non-paded input tensor
         const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
             src_desc, adjusted_step_idx, Sequence<0, 0, 0, 0, 0, 1, 2>{});
......
@@ -152,7 +152,7 @@ __device__ float amd_buffer_load<float, 1>(const float* p_src_wave,
     return __llvm_amdgcn_buffer_load_f32(
         src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);

 #else
-#if 1 // debug
+#if 0 // debug
     float tmp = __llvm_amdgcn_buffer_load_f32(
         src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);
......
@@ -201,7 +201,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
     printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);

     constexpr auto conv_driver =
-#if 1 // debug
+#if 1
         DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
 #else
         DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
......
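The non-trivial case is DynamicMerge, where a step in the merged upper dimension has to be split across several lower dimensions with carry (Hack == 1) or borrow (Hack == 2) propagation. Below is a hedged, stand-alone illustration of the carry path of the new UpdateLowerIndex, with plain arrays in place of MultiIndex and without the readfirstlane hack; the concrete lengths and indices are made up for the example:

#include <cstdio>

int main()
{
    constexpr int ndim_low        = 3;
    const int low_lengths[3]      = {4, 3, 5}; // merged 4 x 3 x 5 -> length-60 upper dim
    const int low_lengths_scan[2] = {15, 5};   // {3*5, 5}: reverse-exclusive product scan

    int idx_low[3]  = {1, 2, 4}; // current lower index, assumed not out of bound
    int idx_diff_up = 7;         // step in the merged upper dimension

    // split the upper-dim step into per-dimension increments ("idx_diff_low_const")
    int idx_diff_low_const[3];
    int tmp = idx_diff_up;
    for(int i = 0; i < ndim_low - 1; ++i)
    {
        idx_diff_low_const[i] = tmp / low_lengths_scan[i];
        tmp -= idx_diff_low_const[i] * low_lengths_scan[i];
    }
    idx_diff_low_const[ndim_low - 1] = tmp; // {0, 1, 2} for a step of 7

    // carry check from the last dimension towards the first, as in the new UpdateLowerIndex
    int idx_diff_low[3];
    bool do_carry = false;
    for(int i = ndim_low - 1; i > 0; --i)
    {
        idx_diff_low[i]     = idx_diff_low_const[i] + (do_carry ? 1 : 0);
        int idx_low_tmp     = idx_low[i] + idx_diff_low[i];
        do_carry            = idx_low_tmp >= low_lengths[i];
        if(do_carry)
            idx_diff_low[i] -= low_lengths[i]; // wrap this dimension, carry into the next
        idx_low[i] += idx_diff_low[i];
    }
    idx_diff_low[0] = idx_diff_low_const[0] + (do_carry ? 1 : 0);
    idx_low[0] += idx_diff_low[0];

    // old index {1, 2, 4} is offset 1*15 + 2*5 + 4 = 29; 29 + 7 = 36 = {2, 1, 1}
    std::printf("new idx_low = {%d, %d, %d}\n", idx_low[0], idx_low[1], idx_low[2]);
    return 0;
}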