"projects/git@developer.sourcefind.cn:wangsen/mineru.git" did not exist on "0b8c614280ce7a4f72c77f48474e62061012aab8"
Commit e444281c authored by Chao Liu's avatar Chao Liu
Browse files

initial implementation for magic number division and DynamicMerge_v2_magic_division that uses it

parent 6d4aefcd
......@@ -467,29 +467,10 @@ struct DynamicEmbed
}
};
#if 1
// Implementation of "Merge" transformation primitive that uses regular division to do lowering of
// multi-index and uses carry-and-borrow check to do lowering of multi-index delta
// Functor that maps a compile-time position Number<I> to the magic-division multiplier of the
// I-th entry of a default-constructed LowLengths.
template <typename LowLengths>
struct lambda_merge_generate_magic_division_calculate_magic_multiplier
{
    template <index_t I>
    __host__ __device__ constexpr auto operator()(Number<I> dim) const
    {
        // divisor is the I-th lower length
        const auto divisor = LowLengths{}[dim];

        return magic_division::CalculateMagicMultiplier(divisor);
    }
};
// Functor that maps a compile-time position Number<I> to the magic-division shift of the
// I-th entry of a default-constructed LowLengths.
template <typename LowLengths>
struct lambda_merge_generate_magic_division_calculate_magic_shift
{
    template <index_t I>
    __host__ __device__ constexpr auto operator()(Number<I> dim) const
    {
        // divisor is the I-th lower length
        const auto divisor = LowLengths{}[dim];

        return magic_division::CalculateMagicShift(divisor);
    }
};
template <typename LowLengths>
struct DynamicMerge
struct DynamicMerge_v1_carry_check
{
static constexpr index_t NDimLow = LowLengths::Size();
......@@ -499,35 +480,19 @@ struct DynamicMerge
using LowLengthsScan = decltype(
container_reverse_exclusive_scan(LowLengths{}, math::multiplies_v2{}, Number<1>{}));
using LowLengthsMagicDivisorMultipiler = decltype(generate_tuple(
lambda_merge_generate_magic_division_calculate_magic_multiplier<LowLengths>{},
Number<NDimLow>{}));
using LowLengthsMagicDivisorShift = decltype(
generate_tuple(lambda_merge_generate_magic_division_calculate_magic_shift<LowLengths>{},
Number<NDimLow>{}));
using UpLengths =
decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies_v2{}, Number<1>{})));
LowLengths low_lengths_;
LowLengthsScan low_lengths_scan_;
LowLengthsMagicDivisorMultipiler low_lengths_magic_divisor_multiplier_;
LowLengthsMagicDivisorShift low_lengths_magic_divisor_shift_;
UpLengths up_lengths_;
__host__ __device__ constexpr DynamicMerge() = default;
__host__ __device__ constexpr DynamicMerge_v1_carry_check() = default;
__host__ __device__ constexpr DynamicMerge(const LowLengths& low_lengths)
__host__ __device__ constexpr DynamicMerge_v1_carry_check(const LowLengths& low_lengths)
: low_lengths_{low_lengths},
low_lengths_scan_{
container_reverse_exclusive_scan(low_lengths, math::multiplies_v2{}, Number<1>{})},
low_lengths_magic_divisor_multiplier_{generate_tuple(
[&](auto i) { return magic_division::CalculateMagicMultiplier(low_lengths[i]); },
Number<NDimLow>{})},
low_lengths_magic_divisor_shift_{generate_tuple(
[&](auto i) { return magic_division::CalculateMagicShift(low_lengths[i]); },
Number<NDimLow>{})},
up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies_v2{}, Number<1>{}))}
{
static_assert(LowerIndex::Size() == NDimLow, "wrong!");
......@@ -548,7 +513,6 @@ struct DynamicMerge
index_t tmp = idx_up[Number<0>{}];
#if 1
// normal division
static_for<0, NDimLow - 1, 1>{}([&](auto i) {
idx_low(i) = tmp / this->low_lengths_scan_[i];
......@@ -556,19 +520,6 @@ struct DynamicMerge
});
idx_low(Number<NDimLow - 1>{}) = tmp;
#else
// magic division
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
index_t tmp2 =
magic_division::DoMagicDivision(tmp,
this->low_lengths_magic_divisor_multiplier_[i],
this->low_lengths_magic_divisor_shift_[i]);
idx_low(i) = tmp - tmp2 * this->low_lengths_[i];
tmp = tmp2;
});
idx_low(Number<0>{}) = tmp;
#endif
}
template <typename LowIdxDiff,
......@@ -1030,7 +981,7 @@ struct DynamicMerge
__host__ __device__ void Print() const
{
printf("{");
printf("DynamicMerge, ");
printf("DynamicMerge_v1_carry_check, ");
printf("low_lengths_ ");
print_multi_index(low_lengths_);
printf("low_lengths_scan_ ");
......@@ -1040,29 +991,41 @@ struct DynamicMerge
printf("}");
}
};
#else
template <typename LowLengths>
struct lambda_merge_generate_magic_division_calculate_magic_multiplier
struct lambda_merge_generate_MagicDivision_calculate_magic_multiplier
{
template <index_t I>
__host__ __device__ constexpr auto operator()(Number<I> i) const
{
return magic_division::CalculateMagicMultiplier(LowLengths{}[i]);
return MagicDivision::CalculateMagicMultiplier(LowLengths{}[i]);
}
};
template <typename LowLengths>
struct lambda_merge_generate_magic_division_calculate_magic_shift
struct lambda_merge_generate_MagicDivision_calculate_magic_shift
{
template <index_t I>
__host__ __device__ constexpr auto operator()(Number<I> i) const
{
return magic_division::CalculateMagicShift(LowLengths{}[i]);
return MagicDivision::CalculateMagicShift(LowLengths{}[i]);
}
};
// Implementation of "Merge" transformation primitive that uses magic-number-division to do lowering
// of both multi-index and delta of multi-index
// Caution:
// 1. The magic number division implementation being used would produce correct result if the
// dividend is uint32_t and its value is within the 31-bit value range of uint32_t.
// 2. The magic number division for int32_t dividend has not been implemented, the int32_t
// dividend would be bit-wise interpreted as uint32_t and magic number division implementation for
// uint32_t is then used.
// 3. For Merge primitive, upper-index is the dividend.
// 4. When upper-index is uint32_t, its value needs to be within 31-bit range.
// 5. When upper-index is int32_t type (when index_t is int32_t), its value needs to be
// non-negative.
template <typename LowLengths>
struct DynamicMerge
struct DynamicMerge_v2_magic_division
{
static constexpr index_t NDimLow = LowLengths::Size();
......@@ -1072,12 +1035,12 @@ struct DynamicMerge
using UpLengths =
decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies_v2{}, Number<1>{})));
using LowLengthsMagicDivisorMultipiler = decltype(generate_tuple(
lambda_merge_generate_magic_division_calculate_magic_multiplier<LowLengths>{},
Number<NDimLow>{}));
using LowLengthsMagicDivisorMultipiler = decltype(
generate_tuple(lambda_merge_generate_MagicDivision_calculate_magic_multiplier<LowLengths>{},
Number<NDimLow>{}));
using LowLengthsMagicDivisorShift = decltype(
generate_tuple(lambda_merge_generate_magic_division_calculate_magic_shift<LowLengths>{},
generate_tuple(lambda_merge_generate_MagicDivision_calculate_magic_shift<LowLengths>{},
Number<NDimLow>{}));
LowLengths low_lengths_;
......@@ -1085,15 +1048,15 @@ struct DynamicMerge
LowLengthsMagicDivisorShift low_lengths_magic_divisor_shift_;
UpLengths up_lengths_;
__host__ __device__ constexpr DynamicMerge() = default;
__host__ __device__ constexpr DynamicMerge_v2_magic_division() = default;
__host__ __device__ constexpr DynamicMerge(const LowLengths& low_lengths)
__host__ __device__ constexpr DynamicMerge_v2_magic_division(const LowLengths& low_lengths)
: low_lengths_{low_lengths},
low_lengths_magic_divisor_multiplier_{generate_tuple(
[&](auto i) { return magic_division::CalculateMagicMultiplier(low_lengths[i]); },
[&](auto i) { return MagicDivision::CalculateMagicMultiplier(low_lengths[i]); },
Number<NDimLow>{})},
low_lengths_magic_divisor_shift_{generate_tuple(
[&](auto i) { return magic_division::CalculateMagicShift(low_lengths[i]); },
[&](auto i) { return MagicDivision::CalculateMagicShift(low_lengths[i]); },
Number<NDimLow>{})},
up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies_v2{}, Number<1>{}))}
{
......@@ -1115,11 +1078,11 @@ struct DynamicMerge
index_t tmp = idx_up[Number<0>{}];
static_for<NDimLow - 1, 0, -1>{}([&idx_low, &tmp, this](auto i) {
static_for<NDimLow - 1, 0, -1>{}([&, this](auto i) {
index_t tmp2 =
magic_division::DoMagicDivision(tmp,
this->low_lengths_magic_divisor_multiplier_[i],
this->low_lengths_magic_divisor_shift_[i]);
MagicDivision::DoMagicDivision(tmp,
this->low_lengths_magic_divisor_multiplier_[i],
this->low_lengths_magic_divisor_shift_[i]);
idx_low(i) = tmp - tmp2 * this->low_lengths_[i];
tmp = tmp2;
});
......@@ -1142,11 +1105,25 @@ struct DynamicMerge
LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
auto idx_low_old = idx_low;
index_t tmp = idx_up_new[Number<0>{}];
static_for<NDimLow - 1, 0, -1>{}([&, this](auto i) {
index_t tmp2 =
MagicDivision::DoMagicDivision(tmp,
this->low_lengths_magic_divisor_multiplier_[i],
this->low_lengths_magic_divisor_shift_[i]);
index_t idx_low_old = idx_low[i];
idx_low(i) = tmp - tmp2 * this->low_lengths_[i];
tmp = tmp2;
idx_diff_low(i) = idx_low[i] - idx_low_old;
});
CalculateLowerIndex(idx_low, idx_up_new);
idx_diff_low(Number<0>{}) = tmp - idx_low(Number<0>{});
idx_diff_low = idx_low - idx_low_old;
idx_low(Number<0>{}) = tmp;
}
__host__ __device__ static constexpr bool IsLinearTransform() { return false; }
......@@ -1174,7 +1151,7 @@ struct DynamicMerge
__host__ __device__ void Print() const
{
printf("{");
printf("DynamicMerge, ");
printf("DynamicMerge_v2_magic_division, ");
printf("low_lengths_ ");
print_multi_index(low_lengths_);
printf("low_lengths_magic_divisor_multiplier_ ");
......@@ -1186,7 +1163,6 @@ struct DynamicMerge
printf("}");
}
};
#endif
template <typename UpLengths, bool Use24BitIntegerCalculation>
struct DynamicUnMerge
......
......@@ -53,7 +53,11 @@ __host__ __device__ constexpr auto make_embed_transform(const UpLengths& up_leng
// Creates the "Merge" transformation for the given lower lengths.
// The implementation is selected at compile time by CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION:
//   zero (or undefined) -> DynamicMerge_v1_carry_check
//   non-zero            -> DynamicMerge_v2_magic_division
// Exactly one return statement survives preprocessing, so the deduced return type is
// unambiguous.
template <typename LowLengths>
__host__ __device__ constexpr auto make_merge_transform(const LowLengths& low_lengths)
{
#if !CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION
    return DynamicMerge_v1_carry_check<LowLengths>{low_lengths};
#else
    return DynamicMerge_v2_magic_division<LowLengths>{low_lengths};
#endif
}
template <typename UpLengths, bool Use24BitIntegerCalculation = false>
......
......@@ -115,6 +115,9 @@
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE 1
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 0
// merge transformation use magic number division
#define CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION 1
// hack: have underlying assumption that need to be satisfied, otherwise it's a bug
// hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be
// thread-invariant, otherwise it's a bug
......
......@@ -10,7 +10,17 @@
namespace ck {
// magic number division
struct magic_division
// Caution:
// 1. For uint32_t as dividend: magic number division implementation being used would produce
// correct result if the dividend is uint32_t and its value is within 31-bit value range.
// 2. For int32_t as dividend: magic number division for int32_t dividend has not been
// implemented, the int32_t dividend would be bit-wise interpreted as uint32_t and magic number
// division implementation for uint32_t is then used. Therefore, dividend value needs to be
// non-negative.
// TODO:
// 1. Implement magic number division for int32_t
// 2. Implement magic number division for uint32_t with 32-bit value range
struct MagicDivision
{
// uint32_t
__host__ __device__ static constexpr auto CalculateMagicNumbers(uint32_t divisor)
......@@ -100,13 +110,25 @@ struct magic_division
return CalculateMagicShift(integral_constant<uint32_t, Divisor>{});
}
// magic division for uint32_t
// Computes ((high 32 bits of dividend * multiplier) + dividend) >> shift.
// The addition is carried out in 32 bits.
__host__ __device__ static constexpr uint32_t
DoMagicDivision(uint32_t dividend, uint32_t multiplier, uint32_t shift)
{
    // full 64-bit product, of which only the upper half is kept
    const uint64_t product = static_cast<uint64_t>(dividend) * static_cast<uint64_t>(multiplier);
    const uint32_t product_hi = static_cast<uint32_t>(product >> 32);

    return (product_hi + dividend) >> shift;
}
// HACK: magic division for int32_t
// HACK: dividend_i32 is converted to uint32_t and the uint32_t algorithm is applied, so
// dividend_i32 needs to be non-negative for the result to be correct.
// The conversion is a static_cast (modular int32_t -> uint32_t conversion) instead of a
// union-based pun: reading an inactive union member is not allowed in a constant expression,
// which would defeat the constexpr qualifier on this function.
// TODO: figure out how to do magic number division for int32_t as dividend
__host__ __device__ static constexpr int32_t
DoMagicDivision(int32_t dividend_i32, uint32_t multiplier, uint32_t shift)
{
    const uint32_t dividend_u32 = static_cast<uint32_t>(dividend_i32);

    // upper 32 bits of the 64-bit product dividend * multiplier
    const uint32_t tmp =
        (static_cast<uint64_t>(dividend_u32) * static_cast<uint64_t>(multiplier)) >> 32;

    // 32-bit unsigned add and shift; the result is converted back to int32_t on return
    return (tmp + dividend_u32) >> shift;
}
};
} // namespace ck
......
......@@ -42,5 +42,19 @@ struct is_known_at_compile_time<integral_constant<T, X>>
static constexpr bool value = true;
};
// Reinterprets the object representation of x (type X) as a value of type Y.
// Only participates in overload resolution when X and Y have the same size.
// NOTE(review): the value is obtained by writing one union member and reading the other;
// this relies on the compiler supporting union-based type punning — confirm for each
// toolchain, and note that such a read is not a constant expression.
template <typename Y,
          typename X,
          typename std::enable_if<sizeof(X) == sizeof(Y), bool>::type = false>
__host__ __device__ constexpr Y as_type(X x)
{
    union Punned
    {
        X source; // initialized from x
        Y target; // read back as the result
    };

    return Punned{x}.target;
}
} // namespace ck
#endif
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment