Commit c1ed17f8 authored by Chao Liu

improving index calculation: change to UpdateLowerIndex()

parent 77c81617
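
The core of this change: each index transform previously exposed CalculateLowerIndexDiff() plus a CalculateLowerIndexDiff_hack() variant that only computed the lower-index difference, and the caller then applied idx_low += idx_diff_low itself. These are folded into a single UpdateLowerIndex() that computes the difference and advances the lower index in one call. A minimal sketch of the new shape, using the pass-through transform as the simplest case (abridged from the diff below, not the complete code):

template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx, index_t Hack>
__host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
                                                 const UpIdxDiff& idx_diff_up,
                                                 LowIdx& idx_low,
                                                 const UpIdx& idx_up_new,
                                                 Number<Hack>)
{
    // pass-through: the lower diff equals the upper diff
    idx_diff_low(Number<0>{}) = idx_diff_up[Number<0>{}];

    // the transform now also advances the lower index itself,
    // instead of leaving "idx_low += idx_diff_low" to the caller
    idx_low += idx_diff_low;
}
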
......@@ -157,7 +157,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
const index_t GemmM0 = GemmM / GemmM1;
const index_t GemmN0 = GemmN / GemmN1;
#if 1 // debug
const auto out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc =
transform_dynamic_tensor_descriptor(
out_gemmm_gemmn_global_desc,
......@@ -165,16 +164,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
DynamicUnMerge<2>{make_multi_index(GemmN0, GemmN1)}),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}));
#else
const auto out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc =
transform_dynamic_tensor_descriptor(
out_gemmm_gemmn_global_desc,
make_tuple(
HackSemiDynamicUnMerge<3, Sequence<GemmM1>>{make_multi_index(1, GemmM0)},
HackSemiDynamicUnMerge<3, Sequence<GemmN1>>{make_multi_index(1, GemmN0)}),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}));
#endif
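
For orientation (not part of the diff): the DynamicUnMerge<2> transforms above split the flat GemmM and GemmN dimensions of the output descriptor into (GemmM0, GemmM1) and (GemmN0, GemmN1), and this commit drops the commented-out HackSemiDynamicUnMerge alternative. The index map assumed here is the usual unmerge; a hypothetical standalone illustration for the N side (the real library computes the same thing inside DynamicUnMerge::CalculateLowerIndex):

#include <cstdint>
using index_t = int32_t;

// upper (n0, n1) with lengths (GemmN0, GemmN1) -> lower flat index n
index_t unmerge_lower_index(index_t n0, index_t n1, index_t GemmN1)
{
    return n0 * GemmN1 + n1;
}
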
// GEMM
using gridwise_gemm = GridwiseDynamicGemm_km_kn_mn_v1<
......
......@@ -13,11 +13,6 @@ struct DynamicPassThrough
const UpperIndex up_lengths_;
#if 0
__host__ __device__ constexpr DynamicPassThrough(const DynamicPassThrough&) = default;
__host__ __device__ constexpr DynamicPassThrough(DynamicPassThrough&&) = default;
#else
__host__ __device__ constexpr DynamicPassThrough(const DynamicPassThrough& other)
: up_lengths_{other.up_lengths_}
{
......@@ -27,7 +22,6 @@ struct DynamicPassThrough
: up_lengths_{other.up_lengths_}
{
}
#endif
__host__ __device__ constexpr DynamicPassThrough(const index_t& low_length)
: up_lengths_{make_multi_index(low_length)}
......@@ -51,31 +45,26 @@ struct DynamicPassThrough
idx_low(Number<0>{}) = idx_up[Number<0>{}];
}
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ static void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& /* idx_low_old */,
const UpIdx& /* idx_up_old */)
{
static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
idx_diff_low(Number<0>{}) = idx_diff_up[Number<0>{}];
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ static void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
__host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old,
LowIdx& idx_low,
const UpIdx& idx_up_new,
Number<Hack>)
{
CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
constexpr auto I0 = Number<0>{};
idx_diff_low(I0) = idx_diff_up[I0];
idx_low += idx_diff_low;
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
......@@ -103,11 +92,6 @@ struct DynamicPad
const index_t left_pad_;
const index_t right_pad_;
#if 0
__host__ __device__ constexpr DynamicPad(const DynamicPad&) = default;
__host__ __device__ constexpr DynamicPad(DynamicPad&&) = default;
#else
__host__ __device__ constexpr DynamicPad(const DynamicPad& other)
: up_lengths_{other.up_lengths_}, left_pad_{other.left_pad_}, right_pad_{other.right_pad_}
{
......@@ -117,7 +101,6 @@ struct DynamicPad
: up_lengths_{other.up_lengths_}, left_pad_{other.left_pad_}, right_pad_{other.right_pad_}
{
}
#endif
__host__ __device__ constexpr DynamicPad(const index_t& low_length,
const index_t& left_pad,
......@@ -146,32 +129,26 @@ struct DynamicPad
idx_low(Number<0>{}) = idx_up[Number<0>{}] - left_pad_;
}
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ static constexpr void
CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& /* idx_low_old */,
const UpIdx& /* idx_up_old */)
{
static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
idx_diff_low(Number<0>{}) = idx_diff_up[Number<0>{}];
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ static void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
__host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old,
LowIdx& idx_low,
const UpIdx& idx_up_new,
Number<Hack>)
{
CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
constexpr auto I0 = Number<0>{};
idx_diff_low(I0) = idx_diff_up[I0];
idx_low += idx_diff_low;
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
......@@ -199,11 +176,6 @@ struct DynamicLeftPad
const UpperIndex up_lengths_;
const index_t left_pad_;
#if 0
__host__ __device__ constexpr DynamicLeftPad(const DynamicLeftPad&) = default;
__host__ __device__ constexpr DynamicLeftPad(DynamicLeftPad&&) = default;
#else
__host__ __device__ constexpr DynamicLeftPad(const DynamicLeftPad& other)
: up_lengths_{other.up_lengths_}, left_pad_{other.left_pad_}
{
......@@ -213,7 +185,6 @@ struct DynamicLeftPad
: up_lengths_{other.up_lengths_}, left_pad_{other.left_pad_}
{
}
#endif
__host__ __device__ constexpr DynamicLeftPad(const index_t& low_length, const index_t& left_pad)
: up_lengths_{make_multi_index(low_length + left_pad)}, left_pad_{left_pad}
......@@ -238,32 +209,26 @@ struct DynamicLeftPad
idx_low(Number<0>{}) = idx_up[Number<0>{}] - left_pad_;
}
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ static constexpr void
CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& /* idx_low_old */,
const UpIdx& /* idx_up_old */)
{
static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
idx_diff_low(Number<0>{}) = idx_diff_up[Number<0>{}];
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ static void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
__host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old,
LowIdx& idx_low,
const UpIdx& idx_up_new,
Number<Hack>)
{
CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
constexpr auto I0 = Number<0>{};
idx_diff_low(I0) = idx_diff_up[I0];
idx_low += idx_diff_low;
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
......@@ -291,11 +256,6 @@ struct DynamicRightPad
const index_t low_length_;
const index_t right_pad_;
#if 0
__host__ __device__ constexpr DynamicRightPad(const DynamicRightPad&) = default;
__host__ __device__ constexpr DynamicRightPad(DynamicRightPad&&) = default;
#else
__host__ __device__ constexpr DynamicRightPad(const DynamicRightPad& other)
: up_lengths_{other.up_lengths_},
low_length_{other.low_length_},
......@@ -309,7 +269,6 @@ struct DynamicRightPad
right_pad_{other.right_pad_}
{
}
#endif
__host__ __device__ constexpr DynamicRightPad(const index_t& low_length,
const index_t& right_pad)
......@@ -339,32 +298,26 @@ struct DynamicRightPad
idx_low(Number<0>{}) = idx_up[Number<0>{}];
}
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ static constexpr void
CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& /* idx_low_old */,
const UpIdx& /* idx_up_old */)
{
static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
idx_diff_low(Number<0>{}) = idx_diff_up[Number<0>{}];
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ static void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
__host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old,
LowIdx& idx_low,
const UpIdx& idx_up_new,
Number<Hack>)
{
CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
constexpr auto I0 = Number<0>{};
idx_diff_low(I0) = idx_diff_up[I0];
idx_low += idx_diff_low;
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
......@@ -392,11 +345,6 @@ struct DynamicEmbed
const UpperIndex up_lengths_;
const UpperIndex coefficients_;
#if 0
__host__ __device__ constexpr DynamicEmbed(const DynamicEmbed&) = default;
__host__ __device__ constexpr DynamicEmbed(DynamicEmbed&&) = default;
#else
__host__ __device__ constexpr DynamicEmbed(const DynamicEmbed& other)
: up_lengths_{other.up_lengths_}, coefficients_{other.coefficients_}
{
......@@ -406,7 +354,7 @@ struct DynamicEmbed
: up_lengths_{other.up_lengths_}, coefficients_{other.coefficients_}
{
}
#endif
__host__ __device__ constexpr DynamicEmbed(const UpperIndex& up_lengths,
const UpperIndex& coefficients)
: up_lengths_{up_lengths}, coefficients_{coefficients}
......@@ -447,11 +395,16 @@ struct DynamicEmbed
});
}
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ constexpr void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& /* idx_low_old */,
const UpIdx& /* idx_up_old */) const
LowIdx& idx_low,
const UpIdx& idx_up_new,
Number<Hack>) const
{
static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == NDimUp &&
LowIdx::Size() == 1 && UpIdx::Size() == NDimUp,
......@@ -461,20 +414,8 @@ struct DynamicEmbed
static_for<0, NDimUp, 1>{}(
[&](auto i) { idx_diff_low(Number<0>{}) += idx_diff_up[i] * coefficients_[i]; });
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ constexpr void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old,
Number<Hack>) const
{
CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
idx_low += idx_diff_low;
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
......@@ -502,11 +443,6 @@ struct DynamicMerge
const LowerIndex low_lengths_scan_;
const UpperIndex up_lengths_;
#if 0
__host__ __device__ constexpr DynamicMerge(const DynamicMerge&) = default;
__host__ __device__ constexpr DynamicMerge(DynamicMerge&&) = default;
#else
__host__ __device__ constexpr DynamicMerge(const DynamicMerge& other)
: low_lengths_{other.low_lengths_},
low_lengths_scan_{other.low_lengths_scan_},
......@@ -520,7 +456,6 @@ struct DynamicMerge
up_lengths_{other.up_lengths_}
{
}
#endif
__host__ __device__ constexpr DynamicMerge(const LowerIndex& low_lengths)
: low_lengths_{low_lengths},
......@@ -562,101 +497,15 @@ struct DynamicMerge
idx_low(Number<NDimLow - 1>{}) = tmp;
}
// idx_diff_low depends on idx_low_old, so idx_low needs to be up-to-date
// If idx_diff_up is known at compile-time, many calculations can be optimized
// away by the compiler
// This function assumes idx_low_old is not out-of-bound
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ constexpr void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& /* idx_up_old */) const
{
static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
// CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions.
// However,
// 1) If idx_diff_up is known at compile-time, then idx_diff_low_const
// can be calculated at compile-time.
// 2) If idx_diff_up is not known at compile-time, but its value
// doesn't change during the whole kernel execution, then
// idx_diff_low_const also
// doesn't change during the whole kernel execution. Compiler generated
// ISA should
// only calculate idx_diff_low_const once and save it during the whole
// kernel execution
// If neither 1) nor 2) is satisfied, then the calculation will also be
// computed at
// run-time each time this function is called, and can be very expensive.
LowerIndex idx_diff_low_const;
#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
CalculateLowerIndex(idx_diff_low_const, idx_diff_up);
#else
index_t tmp = idx_diff_up[Number<0>{}];
static_for<0, NDimLow - 1, 1>{}([&](auto i) {
idx_diff_low_const(i) = tmp / low_lengths_scan_[i];
tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
});
// Hack: this forces the result into an SGPR. Need to make sure the result is thread-invariant
idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);
#endif
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
index_t carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
// this should be saved in SGPR as well
index_t idx_low_length_minus_idx_diff_low_const =
low_lengths_[i] - idx_diff_low_const[i];
#if 1
index_t idx_low_length_plus_idx_diff_low_const =
low_lengths_[i] + idx_diff_low_const[i];
#endif
index_t idx_low_tmp = idx_low_old[i] + carry;
bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const;
#if 1
bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
#endif
idx_diff_low(i) =
do_carry ? -idx_low_length_minus_idx_diff_low_const : idx_diff_low_const[i];
#if 1
idx_diff_low(i) = do_borrow ? idx_low_length_plus_idx_diff_low_const : idx_diff_low[i];
#endif
idx_diff_low(i) += carry;
carry = do_carry ? 1 : 0;
#if 1
carry = do_borrow ? -1 : carry;
#endif
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
}
// idx_diff_low depends on idx_low_old, so idx_low needs to be up-to-date
//
// If idx_diff_up is known at compile-time, many calculations can be optimized
// away by the compiler
// This function assumes idx_low_old is not out-of-bound
// this version saves computation but uses more registers
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ constexpr void CalculateLowerIndexDiff_hack_1(LowIdxDiff& idx_diff_low,
__host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& /* idx_up_old */,
LowIdx& idx_low,
const UpIdx& idx_up_new,
Number<Hack>) const
{
static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
......@@ -675,11 +524,9 @@ struct DynamicMerge
// only calculate idx_diff_low_const once and save it during the whole
// kernel execution
// If neither 1) nor 2) is satisfied, then the calculation will also be
// computed at
// run-time each time this function is called, and can be very expensive.
// computed at run-time each time this function is called, and can be
// very expensive.
LowerIndex idx_diff_low_const;
LowerIndex idx_low_length_minus_idx_diff_low_const;
LowerIndex idx_low_length_plus_idx_diff_low_const;
#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
index_t tmp = idx_diff_up[Number<0>{}];
......@@ -690,12 +537,6 @@ struct DynamicMerge
});
idx_diff_low_const(Number<NDimLow - 1>{}) = tmp;
static_for<0, NDimLow, 1>{}([&](auto i) {
idx_low_length_minus_idx_diff_low_const(i) = low_lengths_[i] - idx_diff_low_const[i];
idx_low_length_plus_idx_diff_low_const(i) = low_lengths_[i] + idx_diff_low_const[i];
});
#else
// Hack: this forces the result into an SGPR. Need to make sure the result is thread-invariant
index_t tmp = idx_diff_up[Number<0>{}];
......@@ -706,389 +547,70 @@ struct DynamicMerge
});
idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);
static_for<0, NDimLow, 1>{}([&](auto i) {
idx_low_length_minus_idx_diff_low_const(i) =
__builtin_amdgcn_readfirstlane(low_lengths_[i] - idx_diff_low_const[i]);
idx_low_length_plus_idx_diff_low_const(i) =
__builtin_amdgcn_readfirstlane(low_lengths_[i] + idx_diff_low_const[i]);
});
#endif
if constexpr(Hack == 1)
{
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
index_t carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
index_t idx_low_tmp = idx_low_old[i] + carry;
bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
idx_diff_low(i) =
do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
idx_diff_low(i) += carry;
carry = do_carry ? 1 : 0;
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
}
else if constexpr(Hack == 2)
{
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
index_t borrow = 0;
bool do_carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
index_t idx_low_tmp = idx_low_old[i] - borrow;
bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
idx_diff_low(i) =
do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low_const[i];
idx_diff_low(i) = idx_diff_low_const[i] + do_carry;
idx_diff_low(i) -= borrow;
index_t idx_low_tmp = idx_low[i] + idx_diff_low[i];
borrow = do_borrow ? 1 : 0;
});
do_carry = idx_low_tmp >= low_lengths_[i];
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] - borrow;
}
else
// TODO: use exec-mask inline asm
if(do_carry)
{
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
index_t carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
index_t idx_low_tmp = idx_low_old[i] + carry;
bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
idx_diff_low(i) =
do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
idx_diff_low(i) =
do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low[i];
idx_diff_low(i) += carry;
carry = do_carry ? 1 : 0;
carry = do_borrow ? -1 : carry;
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
idx_diff_low(i) -= low_lengths_[i];
}
}
// idx_diff_low depends on idx_low_old, so idx_low needs to be up-to-date
// If idx_diff_up is known at compile-time, many calculations can be optimized
// away by the compiler
// This function assumes idx_low_old is not out-of-bound
// this version uses fewer registers but more computation
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ constexpr void CalculateLowerIndexDiff_hack_2(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& /* idx_up_old */,
Number<Hack>) const
{
static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
// CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions.
// However,
// 1) If idx_diff_up is known at compile-time, then idx_diff_low_const
// can be calculated at compile-time.
// 2) If idx_diff_up is not known at compile-time, but its value
// doesn't change during the whole kernel execution, then
// idx_diff_low_const also
// doesn't change during the whole kernel execution. Compiler generated
// ISA should
// only calculate idx_diff_low_const once and save it during the whole
// kernel execution
// If neither 1) nor 2) is satisfied, then the calculation will also be
// computed at
// run-time each time this function is called, and can be very expensive.
LowerIndex idx_diff_low_const;
LowerIndex idx_low_length_minus_idx_diff_low_const;
LowerIndex idx_low_length_plus_idx_diff_low_const;
#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
index_t tmp = idx_diff_up[Number<0>{}];
static_for<0, NDimLow - 1, 1>{}([&](auto i) {
idx_diff_low_const(i) = tmp / low_lengths_scan_[i];
tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
});
idx_diff_low_const(Number<NDimLow - 1>{}) = tmp;
#else
// Hack: this forces the result into an SGPR. Need to make sure the result is thread-invariant
index_t tmp = idx_diff_up[Number<0>{}];
static_for<0, NDimLow - 1, 1>{}([&](auto i) {
idx_diff_low_const(i) = __builtin_amdgcn_readfirstlane(tmp / low_lengths_scan_[i]);
tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
idx_low(i) += idx_diff_low[i];
});
idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);
#endif
if constexpr(Hack == 1)
{
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
bool do_carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
idx_diff_low(i) = idx_diff_low_const[i] + do_carry;
index_t idx_low_tmp = idx_low_old[i] + idx_diff_low_const[i] + do_carry;
do_carry = idx_low_tmp >= low_lengths_[i];
constexpr auto I0 = Number<0>{};
idx_diff_low(i) = do_carry ? idx_diff_low(i) - low_lengths_[i] : idx_diff_low[i];
});
idx_diff_low(I0) = idx_diff_low_const[I0] + do_carry;
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + do_carry;
idx_low(I0) += idx_diff_low[I0];
}
else if constexpr(Hack == 2)
{
// do carry check on each low dimension in reversed order
// do borrow check on each low dimension in reversed order
// do not need to check the first dimension
bool do_borrow = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
idx_diff_low(i) = idx_diff_low_const[i] - do_borrow;
index_t idx_low_tmp = idx_low_old[i] + idx_diff_low_const[i] - do_borrow;
index_t idx_low_tmp = idx_low[i] + idx_diff_low[i];
do_borrow = idx_low_tmp < 0;
idx_diff_low(i) = do_borrow ? idx_diff_low(i) + low_lengths_[i] : idx_diff_low[i];
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] - do_borrow;
}
else
// TODO: use exec-mask inline asm
if(do_borrow)
{
#if 0
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
index_t carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
idx_diff_low(i) = idx_diff_low_const[i] + carry;
bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
idx_diff_low(i) =
do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
idx_diff_low(i) =
do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low[i];
idx_diff_low(i) += carry;
carry = do_carry ? 1 : 0;
carry = do_borrow ? -1 : carry;
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
#endif
}
idx_diff_low(i) += low_lengths_[i];
}
// idx_diff_low depends on idx_low_old, so idx_low needs to be up-to-date
// If idx_diff_up is known at compile-time, many calculations can be optimized
// away by the compiler
// This function assumes idx_low_old is not out-of-bound
// this version uses fewer registers but more computation
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ constexpr void CalculateLowerIndexDiff_hack_3(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& /* idx_up_old */,
Number<Hack>) const
{
static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
// CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions.
// However,
// 1) If idx_diff_up is known at compile-time, then idx_diff_low_const
// can be calculated at compile-time.
// 2) If idx_diff_up is not known at compile-time, but its value
// doesn't change during the whole kernel execution, then
// idx_diff_low_const also
// doesn't change during the whole kernel execution. Compiler generated
// ISA should
// only calculate idx_diff_low_const once and save it during the whole
// kernel execution
// If neither 1) nor 2) is satisfied, then the calculation will also be
// computed at
// run-time each time this function is called, and can be very expensive.
LowerIndex idx_diff_low_const;
LowerIndex idx_low_length_minus_idx_diff_low_const;
LowerIndex idx_low_length_plus_idx_diff_low_const;
#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
index_t tmp = idx_diff_up[Number<0>{}];
static_for<0, NDimLow - 1, 1>{}([&](auto i) {
idx_diff_low_const(i) = tmp / low_lengths_scan_[i];
tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
});
idx_diff_low_const(Number<NDimLow - 1>{}) = tmp;
#else
// Hack: this forces the result into an SGPR. Need to make sure the result is thread-invariant
index_t tmp = idx_diff_up[Number<0>{}];
static_for<0, NDimLow - 1, 1>{}([&](auto i) {
idx_diff_low_const(i) = __builtin_amdgcn_readfirstlane(tmp / low_lengths_scan_[i]);
tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
});
idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);
#endif
if constexpr(Hack == 1)
{
#if 1
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
bool do_carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
idx_diff_low(i) = idx_diff_low_const[i] + do_carry;
index_t idx_low_tmp = idx_low_old[i] + idx_diff_low_const[i] + do_carry;
do_carry = idx_low_tmp >= low_lengths_[i];
idx_diff_low(i) = do_carry ? idx_diff_low(i) - low_lengths_[i] : idx_diff_low[i];
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + do_carry;
#else
LowerIndex idx_low_new = idx_low_old;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
auto i_m1 = i - Number<1>{};
int64_t exec_mask;
int64_t do_carry;
idx_low_new(i) = idx_diff_low_const[i] + idx_low_old[i];
asm volatile(
"\n \
s_mov_b64 %0, exec \n \
v_cmpx_le_u32_e64 %1, %4, %2 \n \
v_subrev_u32 %2, %4, %2\n \
v_add_u32 %3, %3, 1\n \
s_mov_b64 exec, %0\n \
"
: "=s"(exec_mask), "=s"(do_carry), "=v"(idx_low_new(i)), "=v"(idx_low_new(i_m1))
: "s"(low_lengths_[i]), "2"(idx_low_new[i]), "3"(idx_low_new[i_m1]));
idx_diff_low(i) = idx_low_new[i] - idx_low_old[i];
idx_low(i) += idx_diff_low[i];
});
constexpr auto I0 = Number<0>{};
idx_low_new(I0) += idx_diff_low_const[I0];
idx_diff_low(I0) = idx_low_new[I0] - idx_low_old[I0];
#endif
}
else if constexpr(Hack == 2)
{
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
bool do_borrow = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
idx_diff_low(i) = idx_diff_low_const[i] - do_borrow;
idx_diff_low(I0) = idx_diff_low_const[I0] - do_borrow;
index_t idx_low_tmp = idx_low_old[i] + idx_diff_low_const[i] - do_borrow;
do_borrow = idx_low_tmp < 0;
idx_diff_low(i) = do_borrow ? idx_diff_low(i) + low_lengths_[i] : idx_diff_low[i];
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] - do_borrow;
idx_low(I0) += idx_diff_low[I0];
}
else
{
#if 0
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
index_t carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
idx_diff_low(i) = idx_diff_low_const[i] + carry;
bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
idx_diff_low(i) =
do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
idx_diff_low(i) =
do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low[i];
idx_diff_low(i) += carry;
carry = do_carry ? 1 : 0;
carry = do_borrow ? -1 : carry;
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
#endif
// not implemented
}
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ constexpr void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old,
Number<Hack>) const
{
#if 0
// this version saves computation but uses more registers
CalculateLowerIndexDiff_hack_1(
idx_diff_low, idx_diff_up, idx_low_old, idx_up_old, Number<Hack>{});
#elif 1
// this version uses fewer registers but more computation
CalculateLowerIndexDiff_hack_2(
idx_diff_low, idx_diff_up, idx_low_old, idx_up_old, Number<Hack>{});
#elif 1
// this version uses fewer registers but more computation
CalculateLowerIndexDiff_hack_3(
idx_diff_low, idx_diff_up, idx_low_old, idx_up_old, Number<Hack>{});
#endif
}
__host__ __device__ static constexpr bool IsLinearTransform() { return false; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
......@@ -1156,27 +678,20 @@ struct DynamicUnMerge
}
}
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ constexpr void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& /* idx_low_old */,
const UpIdx& /* idx_up_old */) const
{
CalculateLowerIndex(idx_diff_low, idx_diff_up);
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ constexpr void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
__host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old,
LowIdx& idx_low,
const UpIdx& idx_up_new,
Number<Hack>) const
{
CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
CalculateLowerIndex(idx_diff_low, idx_diff_up);
idx_low += idx_diff_low;
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
......@@ -1221,139 +736,18 @@ struct DynamicFreeze
idx_low(Number<0>{}) = low_idx_;
}
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ static void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& /* idx_low_old */,
const UpIdx& /* idx_up_old */)
{
idx_diff_low(Number<0>{}) = index_t{Number<0>{}};
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ static void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
__host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old,
LowIdx& idx_low,
const UpIdx& idx_up_new,
Number<Hack>)
{
CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
{
return true;
}
template <typename UpIdx>
__host__ __device__ static constexpr bool
IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */)
{
return true;
}
};
#if 0
template <index_t NDimUp, typename StaticPartialUpLengths>
struct HackSemiDynamicUnMerge
{
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<NDimUp>;
const UpperIndex up_lengths_;
const UpperIndex up_lengths_scan_;
static constexpr index_t NDimUpStatic = StaticPartialUpLengths::Size();
static constexpr index_t NDimUpDynamic = NDimUp - NDimUpStatic;
const MultiIndex<NDimUpDynamic> dynamic_partial_up_lengths_;
const MultiIndex<NDimUpDynamic> dynamic_partial_up_lengths_scan_;
static constexpr auto static_partial_up_lengths_ = StaticPartialUpLengths{};
static constexpr auto static_partial_up_lengths_scan_ = reverse_exclusive_scan_sequence(
static_partial_up_lengths_, math::multiplies<index_t>(), Number<1>{});
__host__ __device__ constexpr HackSemiDynamicUnMerge(
const MultiIndex<NDimUpDynamic>& dynamic_partial_up_lengths)
: dynamic_partial_up_lengths_{dynamic_partial_up_lengths},
dynamic_partial_up_lengths_scan_{
container_reverse_exclusive_scan(dynamic_partial_up_lengths,
math::multiplies<index_t>(),
static_partial_up_lengths_scan_[Number<0>{}])}
{
static_assert(NDimUpDynamic + NDimUpStatic == NDimUp,
"wrong! inconsisitent # of dimensions");
}
__host__ __device__ constexpr HackSemiDynamicUnMerge()
: up_lengths_{make_zero_multi_index<NDimUpDynamic>()},
up_lengths_scan_{make_zero_multi_index<NDimUpStatic>()}
{
static_assert(NDimUpDynamic + NDimUpStatic == NDimUp,
"wrong! inconsisitent # of dimensions");
}
__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return NDimUp; }
__host__ __device__ constexpr const auto GetUpperLengths() const
{
UpperIndex up_lengths;
static_for<0, NDimUpDynamic, 1>{}(
[&](auto i) { up_lengths(i) = dynamic_partial_up_lengths_[i]; });
static_for<0, NDimUpStatic, 1>{}([&](auto i) {
up_lengths(i + Number<NDimUpDynamic>{}) = static_partial_up_lengths_[i];
});
return up_lengths;
}
template <typename LowIdx, typename UpIdx>
__host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
const UpIdx& idx_up) const
{
idx_low(Number<0>{}) = idx_up[Number<NDimUp - 1>{}];
static_for<0, NDimUpDynamic, 1>{}([&](auto i) {
idx_low(Number<0>{}) += idx_up[i] * dynamic_partial_up_lengths_scan_[i];
});
static_for<NDimUpDynamic, NDimUp - 1, 1>{}([&](auto i) {
idx_low(Number<0>{}) +=
idx_up[i] * static_partial_up_lengths_scan_[i + Number<NDimUpDynamic>{}];
});
}
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ constexpr void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& /* idx_low_old */,
const UpIdx& /* idx_up_old */) const
{
CalculateLowerIndex(idx_diff_low, idx_diff_up);
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ constexpr void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old,
Number<Hack>) const
{
CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
idx_diff_low(Number<0>{}) = index_t{Number<0>{}};
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
......@@ -1370,7 +764,6 @@ struct HackSemiDynamicUnMerge
return true;
}
};
#endif
} // namespace ck
#endif
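
For context on the carry/borrow handling in DynamicMerge::UpdateLowerIndex above: the per-dimension lower diff produced by a constant upper step is itself constant, but whether it overflows (carries) or underflows (borrows) into the next slower dimension depends on the current lower index, which is why the update has to read idx_low. A standalone sketch of the Hack == 1 (forward step, carry-only) case for a two-dimensional merge; the names and the plain-C++ setting are illustrative only:

#include <array>
#include <cstdint>

using index_t = int32_t;

// Merge of two lower dimensions with lengths {L0, L1}. An upper step d maps to
// the constant per-dimension diff {d / L1, d % L1}; a single carry is resolved
// against the current lower index (this assumes the step wraps each dimension
// at most once, as the library code does).
void update_merged_index(std::array<index_t, 2>& idx_low, index_t d, index_t L1)
{
    std::array<index_t, 2> idx_diff_low{d / L1, d % L1};

    const bool do_carry = idx_low[1] + idx_diff_low[1] >= L1;

    if(do_carry)
        idx_diff_low[1] -= L1; // wrap the fast dimension

    idx_diff_low[0] += do_carry ? 1 : 0; // propagate the carry

    idx_low[1] += idx_diff_low[1];
    idx_low[0] += idx_diff_low[0];
}

// Example: lengths {4, 8}, idx_low = {1, 6}, d = 3
//   -> diff {0, 3}, and 6 + 3 >= 8 triggers a carry, so idx_low becomes {2, 1}
//   (flat index 1*8 + 6 = 14 advances to 17 = 2*8 + 1)
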
......@@ -502,27 +502,18 @@ __host__ __device__ constexpr void move_dynamic_tensor_coordinate(const TensorDe
constexpr auto dims_low = TensorDesc::GetLowerDimensionIdss().At(itran);
constexpr auto dims_up = TensorDesc::GetUpperDimensionIdss().At(itran);
const auto idx_up = get_container_subset(idx_hidden, dims_up);
const auto idx_up_new = get_container_subset(idx_hidden, dims_up);
auto idx_low = get_container_subset(idx_hidden, dims_low);
const auto idx_diff_up = get_container_subset(idx_diff_hidden, dims_up);
MultiIndex<dims_low.Size()> idx_diff_low;
// calculate idx_diff_low
#if 0 // hack
tran.CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low, idx_up);
#else
// HACK: control CalculateLowerIndexDiff for DynamicMerge using ing hack
// HACK: control UpdateLowerIndex for DynamicMerge using hack
// TODO remove hack
constexpr index_t Hack =
decltype(coord_step.hack_calculate_lower_index_diff_)::At(itran);
tran.CalculateLowerIndexDiff_hack(
idx_diff_low, idx_diff_up, idx_low, idx_up, Number<Hack>{});
#endif
// update idx_low
idx_low += idx_diff_low;
tran.UpdateLowerIndex(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number<Hack>{});
set_container_subset(idx_diff_hidden, dims_low, idx_diff_low);
set_container_subset(idx_hidden, dims_low, idx_low);
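
A note on the Hack value threaded through here: judging from the Hack == 1 and Hack == 2 branches of DynamicMerge::UpdateLowerIndex above, 1 selects the carry-only path for non-negative steps, 2 selects the borrow-only path for non-positive steps, and the general case is left unimplemented. The value per transform comes from the coordinate step, as in the ThreadwiseDynamicTensorSliceTransfer_v3 hunks further down; a hypothetical pair of step definitions (the descriptor and the hack sequence values are placeholders):

// forward step along the fastest dimension -> request the carry-only path
const auto step_p1 = make_dynamic_tensor_coordinate_step_hack(
    src_desc, make_multi_index(0, 1), Sequence<0, 0, 0, 0, 0, 0, 1>{});

// backward step -> request the borrow-only path
const auto step_m1 = make_dynamic_tensor_coordinate_step_hack(
    src_desc, make_multi_index(0, -1), Sequence<0, 0, 0, 0, 0, 0, 2>{});
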
......
......@@ -384,13 +384,8 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
Float,
decltype(c_m0_m1_n0_n1_thread_desc),
decltype(c_m0_m1_n0_n1_global_desc),
#if 1 // debug
Sequence<MRepeat, MPerThread, NRepeat, NPerThread>,
CThreadTransferSrcDstAccessOrder,
#else
Sequence<1, 1, 2, 4>,
Sequence<0, 1, 2, 3>,
#endif
CThreadTransferSrcDstVectorDim,
1,
CThreadTransferDstScalarPerVector,
......
......@@ -922,7 +922,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
src_desc, make_multi_index(1, 0), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0>{});
const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step_hack(
src_desc, make_multi_index(-1, 0), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0>{});
#elif 0
#elif 1
// for non-padded input tensor
const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step_hack(
src_desc, make_multi_index(0, 1), Sequence<0, 0, 0, 0, 0, 0, 1>{});
......@@ -1067,7 +1067,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
// for padded input tensor
const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
src_desc, adjusted_step_idx, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2>{});
#elif 0
#elif 1
// for non-padded input tensor
const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
src_desc, adjusted_step_idx, Sequence<0, 0, 0, 0, 0, 1, 2>{});
......
......@@ -152,7 +152,7 @@ __device__ float amd_buffer_load<float, 1>(const float* p_src_wave,
return __llvm_amdgcn_buffer_load_f32(
src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#else
#if 1 // debug
#if 0 // debug
float tmp = __llvm_amdgcn_buffer_load_f32(
src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);
......
......@@ -201,7 +201,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
constexpr auto conv_driver =
#if 1 // debug
#if 1
DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
#else
DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
......