Commit 77c81617 authored by Chao Liu

improving index calculation

parent f2f35201
@@ -34,7 +34,7 @@ template <index_t BlockSize,
          index_t GemmBBlockTransferSrcScalarPerVector_GemmN,
          index_t GemmBBlockTransferDstScalarPerVector_GemmN,
          index_t GemmCThreadTransferDstScalarPerVector_GemmN1>
struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
{
    template <typename... Wei, typename... In, typename... Out>
    __host__ void Run(const DynamicTensorDescriptor<Wei...>& wei_k_c_y_x_global_desc,
@@ -96,18 +96,11 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw
        // input tensor
        const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor(
            in_n_c_hi_wi_global_desc,
            make_tuple(DynamicPassThrough{N},
                       DynamicPassThrough{C},
                       DynamicPad{Hi, InLeftPadH, InRightPadH},
                       DynamicPad{Wi, InLeftPadW, InRightPadW}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
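        // Note: a single DynamicPad per spatial dimension now carries both the left and the right
        // pad, so the upper length becomes Hi + InLeftPadH + InRightPadH (and likewise for W);
        // the previous version composed two transforms (DynamicLeftPad followed by
        // DynamicRightPad) to build the same padded descriptor.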
@@ -164,6 +157,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw
        const index_t GemmM0 = GemmM / GemmM1;
        const index_t GemmN0 = GemmN / GemmN1;

#if 1 // debug
        const auto out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc =
            transform_dynamic_tensor_descriptor(
                out_gemmm_gemmn_global_desc,
@@ -171,6 +165,16 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw
                    DynamicUnMerge<2>{make_multi_index(GemmN0, GemmN1)}),
                make_tuple(Sequence<0>{}, Sequence<1>{}),
                make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}));
#else
const auto out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc =
transform_dynamic_tensor_descriptor(
out_gemmm_gemmn_global_desc,
make_tuple(
HackSemiDynamicUnMerge<3, Sequence<GemmM1>>{make_multi_index(1, GemmM0)},
HackSemiDynamicUnMerge<3, Sequence<GemmN1>>{make_multi_index(1, GemmN0)}),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}));
#endif
        // GEMM
        using gridwise_gemm = GridwiseDynamicGemm_km_kn_mn_v1<
...
@@ -93,6 +93,103 @@ struct DynamicPassThrough
    }
};
template <bool SkipIsValidCheck = false>
struct DynamicPad
{
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<1>;
const UpperIndex up_lengths_;
const index_t left_pad_;
const index_t right_pad_;
#if 0
__host__ __device__ constexpr DynamicPad(const DynamicPad&) = default;
__host__ __device__ constexpr DynamicPad(DynamicPad&&) = default;
#else
__host__ __device__ constexpr DynamicPad(const DynamicPad& other)
: up_lengths_{other.up_lengths_}, left_pad_{other.left_pad_}, right_pad_{other.right_pad_}
{
}
__host__ __device__ constexpr DynamicPad(DynamicPad&& other)
: up_lengths_{other.up_lengths_}, left_pad_{other.left_pad_}, right_pad_{other.right_pad_}
{
}
#endif
__host__ __device__ constexpr DynamicPad(const index_t& low_length,
const index_t& left_pad,
const index_t& right_pad)
: up_lengths_{make_multi_index(low_length + left_pad + right_pad)},
left_pad_{left_pad},
right_pad_{right_pad}
{
}
__host__ __device__ constexpr DynamicPad() : up_lengths_{0}, left_pad_{0}, right_pad_{0} {}
__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; }
__host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; }
template <typename LowIdx, typename UpIdx>
__host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
const UpIdx& idx_up) const
{
static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
idx_low(Number<0>{}) = idx_up[Number<0>{}] - left_pad_;
}
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ static constexpr void
CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& /* idx_low_old */,
const UpIdx& /* idx_up_old */)
{
static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
idx_diff_low(Number<0>{}) = idx_diff_up[Number<0>{}];
}
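    // Since idx_low = idx_up - left_pad_, the constant left_pad_ cancels when differencing, so
    // the lower-index diff is just the upper-index diff; this is also why the transform reports
    // IsLinearTransform() == true.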
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ static void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old,
Number<Hack>)
{
CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
{
return SkipIsValidCheck;
}
template <typename UpIdx>
__host__ __device__ constexpr bool
IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& idx_up) const
{
return SkipIsValidCheck || ((idx_up[Number<0>{}] >= left_pad_) &&
(idx_up[Number<0>{}] < up_lengths_[Number<0>{}] - right_pad_));
}
};
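// Minimal usage sketch (illustrative numbers): DynamicPad{Hi, InLeftPadH, InRightPadH} exposes an
// upper length of Hi + InLeftPadH + InRightPadH. With Hi = 4, left_pad = 1, right_pad = 2 the
// upper index runs over [0, 7), idx_low = idx_up - 1, and only 1 <= idx_up < 5 maps to a valid
// lower index unless SkipIsValidCheck is set.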
template <bool SkipIsValidCheck = false>
struct DynamicLeftPad
{
@@ -546,15 +643,17 @@ struct DynamicMerge
    }

    // idx_diff_low depends on idx_low_old, so idx_low needs to be up-to-date
    //
    // If idx_diff_up is known at compile-time, many calculations can be optimized
    // away by the compiler
    // This function assumes idx_low_old is not out of bounds
    // this version saves computation but uses more registers
    template <typename LowIdxDiff,
              typename UpIdxDiff,
              typename LowIdx,
              typename UpIdx,
              index_t Hack>
    __host__ __device__ constexpr void CalculateLowerIndexDiff_hack_1(LowIdxDiff& idx_diff_low,
                                                                      const UpIdxDiff& idx_diff_up,
                                                                      const LowIdx& idx_low_old,
                                                                      const UpIdx& /* idx_up_old */,
@@ -579,7 +678,10 @@ struct DynamicMerge
        // computed at
        // run-time each time this function is called, and can be very expensive.
        LowerIndex idx_diff_low_const;
LowerIndex idx_low_length_minus_idx_diff_low_const;
LowerIndex idx_low_length_plus_idx_diff_low_const;
#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
        index_t tmp = idx_diff_up[Number<0>{}];

        static_for<0, NDimLow - 1, 1>{}([&](auto i) {
@@ -587,10 +689,6 @@ struct DynamicMerge
            tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
        });
        idx_diff_low_const(Number<NDimLow - 1>{}) = tmp;

        static_for<0, NDimLow, 1>{}([&](auto i) {
@@ -600,6 +698,13 @@ struct DynamicMerge
        });
#else
        // Hack: this forces the result into an SGPR. Need to make sure the result is thread-invariant
index_t tmp = idx_diff_up[Number<0>{}];
static_for<0, NDimLow - 1, 1>{}([&](auto i) {
idx_diff_low_const(i) = __builtin_amdgcn_readfirstlane(tmp / low_lengths_scan_[i]);
tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
});
        idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);

        static_for<0, NDimLow, 1>{}([&](auto i) {
@@ -633,6 +738,27 @@ struct DynamicMerge
            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
        }
        else if constexpr(Hack == 2)
{
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
index_t borrow = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
index_t idx_low_tmp = idx_low_old[i] - borrow;
bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
idx_diff_low(i) =
do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low_const[i];
idx_diff_low(i) -= borrow;
borrow = do_borrow ? 1 : 0;
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] - borrow;
}
else
{ {
// do carry check on each low dimension in reversed order // do carry check on each low dimension in reversed order
// do not need to check the first dimension // do not need to check the first dimension
...@@ -641,26 +767,127 @@ struct DynamicMerge ...@@ -641,26 +767,127 @@ struct DynamicMerge
static_for<NDimLow - 1, 0, -1>{}([&](auto i) { static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
index_t idx_low_tmp = idx_low_old[i] + carry; index_t idx_low_tmp = idx_low_old[i] + carry;
bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
bool do_borrow = idx_low_tmp < -idx_diff_low_const[i]; bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
idx_diff_low(i) =
do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
idx_diff_low(i) = idx_diff_low(i) =
do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low[i]; do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low[i];
idx_diff_low(i) += carry; idx_diff_low(i) += carry;
carry = do_carry ? 1 : 0;
carry = do_borrow ? -1 : carry; carry = do_borrow ? -1 : carry;
}); });
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry; idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
} }
}
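    // Illustrative example for the Hack == 1 (carry) path above, assuming
    // idx_low_length_minus_idx_diff_low_const[i] = low_lengths_[i] - idx_diff_low_const[i]
    // as the name suggests: with low_lengths_ = {4, 8}, idx_low_old = {2, 6} and
    // idx_diff_up = 3 (so idx_diff_low_const = {0, 3}), dimension 1 sees 6 >= 8 - 3 and wraps,
    // giving idx_diff_low = {0 + 1, 3 - 8} = {1, -5} and a new lower index of {3, 1},
    // i.e. linear offset 22 + 3 = 25 = 3 * 8 + 1.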
    // idx_diff_low depends on idx_low_old, so idx_low needs to be up-to-date
    // If idx_diff_up is known at compile-time, many calculations can be optimized
    // away by the compiler
    // This function assumes idx_low_old is not out of bounds
    // this version uses fewer registers but more computation
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ constexpr void CalculateLowerIndexDiff_hack_2(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& /* idx_up_old */,
Number<Hack>) const
{
static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
        // CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions.
        // However,
        //   1) If idx_diff_up is known at compile-time, then idx_diff_low_const
        //      can be calculated at compile-time.
        //   2) If idx_diff_up is not known at compile-time, but its value
        //      doesn't change during the whole kernel execution, then
        //      idx_diff_low_const also doesn't change during the whole kernel
        //      execution. The compiler-generated ISA should only calculate
        //      idx_diff_low_const once and keep it for the whole kernel execution.
        // If neither 1) nor 2) is satisfied, then the calculation will be
        // computed at run-time each time this function is called, and can be
        // very expensive.
LowerIndex idx_diff_low_const;
LowerIndex idx_low_length_minus_idx_diff_low_const;
LowerIndex idx_low_length_plus_idx_diff_low_const;
#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
index_t tmp = idx_diff_up[Number<0>{}];
static_for<0, NDimLow - 1, 1>{}([&](auto i) {
idx_diff_low_const(i) = tmp / low_lengths_scan_[i];
tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
});
idx_diff_low_const(Number<NDimLow - 1>{}) = tmp;
#else
        // Hack: this forces the result into an SGPR. Need to make sure the result is thread-invariant
index_t tmp = idx_diff_up[Number<0>{}];
static_for<0, NDimLow - 1, 1>{}([&](auto i) {
idx_diff_low_const(i) = __builtin_amdgcn_readfirstlane(tmp / low_lengths_scan_[i]);
tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
});
idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);
#endif
if constexpr(Hack == 1)
{
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
bool do_carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
idx_diff_low(i) = idx_diff_low_const[i] + do_carry;
index_t idx_low_tmp = idx_low_old[i] + idx_diff_low_const[i] + do_carry;
do_carry = idx_low_tmp >= low_lengths_[i];
idx_diff_low(i) = do_carry ? idx_diff_low(i) - low_lengths_[i] : idx_diff_low[i];
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + do_carry;
}
else if constexpr(Hack == 2)
{
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
bool do_borrow = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
idx_diff_low(i) = idx_diff_low_const[i] - do_borrow;
index_t idx_low_tmp = idx_low_old[i] + idx_diff_low_const[i] - do_borrow;
do_borrow = idx_low_tmp < 0;
idx_diff_low(i) = do_borrow ? idx_diff_low(i) + low_lengths_[i] : idx_diff_low[i];
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] - do_borrow;
}
        else
        {
#if 0
            // do carry check on each low dimension in reversed order
            // do not need to check the first dimension
            index_t carry = 0;

            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
                idx_diff_low(i) = idx_diff_low_const[i] + carry;

                bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
                bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
@@ -677,9 +904,191 @@ struct DynamicMerge
            });

            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
#endif
        }
    }
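    // Register/recompute trade-off between the two variants: _hack_1 precomputes the wrap
    // thresholds (idx_low_length_minus/plus_idx_diff_low_const) once and compares idx_low_old
    // against them, while _hack_2 recomputes idx_low_old[i] + idx_diff_low_const[i] plus or minus
    // the carry on every dimension and compares it against low_lengths_[i] or 0, keeping fewer
    // values live.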
    // idx_diff_low depends on idx_low_old, so idx_low needs to be up-to-date
    // If idx_diff_up is known at compile-time, many calculations can be optimized
    // away by the compiler
    // This function assumes idx_low_old is not out of bounds
    // this version uses fewer registers but more computation
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ constexpr void CalculateLowerIndexDiff_hack_3(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& /* idx_up_old */,
Number<Hack>) const
{
static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
        // CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions.
        // However,
        //   1) If idx_diff_up is known at compile-time, then idx_diff_low_const
        //      can be calculated at compile-time.
        //   2) If idx_diff_up is not known at compile-time, but its value
        //      doesn't change during the whole kernel execution, then
        //      idx_diff_low_const also doesn't change during the whole kernel
        //      execution. The compiler-generated ISA should only calculate
        //      idx_diff_low_const once and keep it for the whole kernel execution.
        // If neither 1) nor 2) is satisfied, then the calculation will be
        // computed at run-time each time this function is called, and can be
        // very expensive.
LowerIndex idx_diff_low_const;
LowerIndex idx_low_length_minus_idx_diff_low_const;
LowerIndex idx_low_length_plus_idx_diff_low_const;
#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
index_t tmp = idx_diff_up[Number<0>{}];
static_for<0, NDimLow - 1, 1>{}([&](auto i) {
idx_diff_low_const(i) = tmp / low_lengths_scan_[i];
tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
});
idx_diff_low_const(Number<NDimLow - 1>{}) = tmp;
#else
        // Hack: this forces the result into an SGPR. Need to make sure the result is thread-invariant
index_t tmp = idx_diff_up[Number<0>{}];
static_for<0, NDimLow - 1, 1>{}([&](auto i) {
idx_diff_low_const(i) = __builtin_amdgcn_readfirstlane(tmp / low_lengths_scan_[i]);
tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
});
idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);
#endif
if constexpr(Hack == 1)
{
#if 1
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
bool do_carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
idx_diff_low(i) = idx_diff_low_const[i] + do_carry;
index_t idx_low_tmp = idx_low_old[i] + idx_diff_low_const[i] + do_carry;
do_carry = idx_low_tmp >= low_lengths_[i];
idx_diff_low(i) = do_carry ? idx_diff_low(i) - low_lengths_[i] : idx_diff_low[i];
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + do_carry;
#else
LowerIndex idx_low_new = idx_low_old;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
auto i_m1 = i - Number<1>{};
int64_t exec_mask;
int64_t do_carry;
idx_low_new(i) = idx_diff_low_const[i] + idx_low_old[i];
asm volatile(
"\n \
s_mov_b64 %0, exec \n \
v_cmpx_le_u32_e64 %1, %4, %2 \n \
v_subrev_u32 %2, %4, %2\n \
v_add_u32 %3, %3, 1\n \
s_mov_b64 exec, %0\n \
"
: "=s"(exec_mask), "=s"(do_carry), "=v"(idx_low_new(i)), "=v"(idx_low_new(i_m1))
: "s"(low_lengths_[i]), "2"(idx_low_new[i]), "3"(idx_low_new[i_m1]));
idx_diff_low(i) = idx_low_new[i] - idx_low_old[i];
});
constexpr auto I0 = Number<0>{};
idx_low_new(I0) += idx_diff_low_const[I0];
idx_diff_low(I0) = idx_low_new[I0] - idx_low_old[I0];
#endif
}
else if constexpr(Hack == 2)
{
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
bool do_borrow = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
idx_diff_low(i) = idx_diff_low_const[i] - do_borrow;
index_t idx_low_tmp = idx_low_old[i] + idx_diff_low_const[i] - do_borrow;
do_borrow = idx_low_tmp < 0;
idx_diff_low(i) = do_borrow ? idx_diff_low(i) + low_lengths_[i] : idx_diff_low[i];
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] - do_borrow;
}
else
{
#if 0
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
index_t carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
idx_diff_low(i) = idx_diff_low_const[i] + carry;
bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
idx_diff_low(i) =
do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
idx_diff_low(i) =
do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low[i];
idx_diff_low(i) += carry;
carry = do_carry ? 1 : 0;
carry = do_borrow ? -1 : carry;
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
#endif
}
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ constexpr void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old,
Number<Hack>) const
{
#if 0
        // this version saves computation but uses more registers
CalculateLowerIndexDiff_hack_1(
idx_diff_low, idx_diff_up, idx_low_old, idx_up_old, Number<Hack>{});
#elif 1
        // this version uses fewer registers but more computation
CalculateLowerIndexDiff_hack_2(
idx_diff_low, idx_diff_up, idx_low_old, idx_up_old, Number<Hack>{});
#elif 1
        // this version uses fewer registers but more computation
CalculateLowerIndexDiff_hack_3(
idx_diff_low, idx_diff_up, idx_low_old, idx_up_old, Number<Hack>{});
#endif
}
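    // With the preprocessor arms above (#if 0 / #elif 1 / #elif 1), this dispatcher currently
    // forwards to CalculateLowerIndexDiff_hack_2; the second #elif 1 is unreachable, so
    // CalculateLowerIndexDiff_hack_3 is effectively disabled here.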
    __host__ __device__ static constexpr bool IsLinearTransform() { return false; }

    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
@@ -695,7 +1104,7 @@ struct DynamicMerge
    }
};
template <index_t NDimUp, bool Use24BitIntegerCalculation = false>
struct DynamicUnMerge
{
    using LowerIndex = MultiIndex<1>;
@@ -726,12 +1135,26 @@ struct DynamicUnMerge
    template <typename LowIdx, typename UpIdx>
    __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
                                                           const UpIdx& idx_up) const
{
if constexpr(!Use24BitIntegerCalculation)
        {
            idx_low(Number<0>{}) = idx_up[Number<NDimUp - 1>{}];

            static_for<0, NDimUp - 1, 1>{}(
                [&](auto i) { idx_low(Number<0>{}) += idx_up[i] * up_lengths_scan_[i]; });
        }
else
{
idx_low(Number<0>{}) = idx_up[Number<NDimUp - 1>{}];
static_for<0, NDimUp - 1, 1>{}([&](auto i) {
idx_low(Number<0>{}) =
(0x00ffffff & idx_low[Number<0>{}]) +
(0x00ffffff & idx_up[i]) * (0x00ffffff & up_lengths_scan_[i]);
});
}
}
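    // Assumption: the 0x00ffffff masks are presumably there so the backend can use 24-bit
    // integer multiply/mad instructions (v_mul_u32_u24-style ops on AMD GCN); this is only
    // valid when every upper index and up_lengths_scan_ entry fits in 24 bits.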
    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
    __host__ __device__ constexpr void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
@@ -836,5 +1259,118 @@ struct DynamicFreeze
    }
};
#if 0
template <index_t NDimUp, typename StaticPartialUpLengths>
struct HackSemiDynamicUnMerge
{
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<NDimUp>;
const UpperIndex up_lengths_;
const UpperIndex up_lengths_scan_;
static constexpr index_t NDimUpStatic = StaticPartialUpLengths::Size();
static constexpr index_t NDimUpDynamic = NDimUp - NDimUpStatic;
const MultiIndex<NDimUpDynamic> dynamic_partial_up_lengths_;
const MultiIndex<NDimUpDynamic> dynamic_partial_up_lengths_scan_;
static constexpr auto static_partial_up_lengths_ = StaticPartialUpLengths{};
static constexpr auto static_partial_up_lengths_scan_ = reverse_exclusive_scan_sequence(
static_partial_up_lengths_, math::multiplies<index_t>(), Number<1>{});
__host__ __device__ constexpr HackSemiDynamicUnMerge(
const MultiIndex<NDimUpDynamic>& dynamic_partial_up_lengths)
: dynamic_partial_up_lengths_{dynamic_partial_up_lengths},
dynamic_partial_up_lengths_scan_{
container_reverse_exclusive_scan(dynamic_partial_up_lengths,
math::multiplies<index_t>(),
static_partial_up_lengths_scan_[Number<0>{}])}
{
static_assert(NDimUpDynamic + NDimUpStatic == NDimUp,
"wrong! inconsisitent # of dimensions");
}
__host__ __device__ constexpr HackSemiDynamicUnMerge()
: up_lengths_{make_zero_multi_index<NDimUpDynamic>()},
up_lengths_scan_{make_zero_multi_index<NDimUpStatic>()}
{
static_assert(NDimUpDynamic + NDimUpStatic == NDimUp,
"wrong! inconsisitent # of dimensions");
}
__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return NDimUp; }
__host__ __device__ constexpr const auto GetUpperLengths() const
{
UpperIndex up_lengths;
static_for<0, NDimUpDynamic, 1>{}(
[&](auto i) { up_lengths(i) = dynamic_partial_up_lengths_[i]; });
static_for<0, NDimUpStatic, 1>{}([&](auto i) {
up_lengths(i + Number<NDimUpDynamic>{}) = static_partial_up_lengths_[i];
});
return up_lengths;
}
template <typename LowIdx, typename UpIdx>
__host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
const UpIdx& idx_up) const
{
idx_low(Number<0>{}) = idx_up[Number<NDimUp - 1>{}];
static_for<0, NDimUpDynamic, 1>{}([&](auto i) {
idx_low(Number<0>{}) += idx_up[i] * dynamic_partial_up_lengths_scan_[i];
});
static_for<NDimUpDynamic, NDimUp - 1, 1>{}([&](auto i) {
idx_low(Number<0>{}) +=
idx_up[i] * static_partial_up_lengths_scan_[i + Number<NDimUpDynamic>{}];
});
}
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ constexpr void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& /* idx_low_old */,
const UpIdx& /* idx_up_old */) const
{
CalculateLowerIndex(idx_diff_low, idx_diff_up);
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ constexpr void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old,
Number<Hack>) const
{
CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
{
return true;
}
template <typename UpIdx>
__host__ __device__ static constexpr bool
IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */)
{
return true;
}
};
#endif
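// Intent of the disabled block above (compiled out by #if 0): HackSemiDynamicUnMerge splits the
// un-merge lengths into a runtime part and a compile-time StaticPartialUpLengths tail so part of
// the reverse scan folds into constants; the disabled #else branch of the driver uses it as
// HackSemiDynamicUnMerge<3, Sequence<GemmM1>>{make_multi_index(1, GemmM0)}.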
} // namespace ck
#endif
@@ -96,162 +96,39 @@ struct BlockwiseDynamicTensorSliceTransfer_v4
        }
    }

    __device__ void RunRead_hack(const SrcDesc& src_desc, const SrcData* p_src)
    {
        if(BlockSize == thread_cluster_desc_.GetElementSize() or
           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
        {
            threadwise_transfer_.RunRead_hack(src_desc, p_src);
        }
    }

    __device__ void RunWrite(const DstDesc& dst_desc, DstData* p_dst)
    {
        if(BlockSize == thread_cluster_desc_.GetElementSize() or
           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
        {
            threadwise_transfer_.RunWrite(dst_desc, p_dst);
        }
    }

    __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, const Index& step)
    {
        if(BlockSize == thread_cluster_desc_.GetElementSize() or
           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
        {
            threadwise_transfer_.MoveSrcSliceWindow(src_desc, step);
        }
    }

    __device__ void MoveSrcSliceWindow_hack(const SrcDesc& src_desc, const Index& step)
    {
        if(BlockSize == thread_cluster_desc_.GetElementSize() or
           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
        {
            threadwise_transfer_.MoveSrcSliceWindow_hack(src_desc, step);
        }
    }
@@ -268,7 +145,7 @@ struct BlockwiseDynamicTensorSliceTransfer_v4_hack
        make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});

    using ThreadwiseTransfer =
        ThreadwiseDynamicTensorSliceTransfer_v3<ThreadSliceLengths,
                                                DstInMemOp,
                                                SrcData,
                                                DstData,
...
@@ -166,7 +166,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
        // B matrix blockwise copy
        auto b_blockwise_copy =
            BlockwiseDynamicTensorSliceTransfer_v4<BlockSize,
                                                   InMemoryDataOperation::Set,
                                                   Sequence<KPerBlock, NPerBlock>,
                                                   BBlockTransferThreadSliceLengths_K_N,
@@ -258,16 +258,14 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
        constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock, 0);
        constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock, 0);
        // LDS double buffer: preload data into LDS
        {
            a_blockwise_copy.RunRead(a_k_m_global_desc, p_a_global);
            b_blockwise_copy.RunRead_hack(b_k_n_global_desc, p_b_global);

            a_blockwise_copy.RunWrite(a_k_m_block_desc, p_a_block_double);
            b_blockwise_copy.RunWrite(b_k_n_block_desc, p_b_block_double);
        }
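        // Note: only the B-matrix copy goes through the _hack read/move variants here, presumably
        // because B reads through the transformed (padded) input descriptor, while the A-matrix
        // copy keeps the plain RunRead/MoveSrcSliceWindow.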
        if constexpr(HasMainKBlockLoop)
        {
@@ -285,13 +283,14 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
            {
                // even iteration
                a_blockwise_copy.MoveSrcSliceWindow(a_k_m_global_desc, a_block_slice_copy_step);
                b_blockwise_copy.MoveSrcSliceWindow_hack(b_k_n_global_desc,
                                                         b_block_slice_copy_step);

                __syncthreads();

                // LDS double buffer: load next data from device mem
                a_blockwise_copy.RunRead(a_k_m_global_desc, p_a_global);
                b_blockwise_copy.RunRead_hack(b_k_n_global_desc, p_b_global);

                // LDS double buffer: GEMM on current data
                blockwise_gemm.Run(p_a_block_even, p_b_block_even, p_c_thread);
@@ -302,13 +301,14 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
                // odd iteration
                a_blockwise_copy.MoveSrcSliceWindow(a_k_m_global_desc, a_block_slice_copy_step);
                b_blockwise_copy.MoveSrcSliceWindow_hack(b_k_n_global_desc,
                                                         b_block_slice_copy_step);

                __syncthreads();

                // LDS double buffer: load next data from device mem
                a_blockwise_copy.RunRead(a_k_m_global_desc, p_a_global);
                b_blockwise_copy.RunRead_hack(b_k_n_global_desc, p_b_global);

                // LDS double buffer: GEMM on current data
                blockwise_gemm.Run(p_a_block_odd, p_b_block_odd, p_c_thread);
@@ -326,13 +326,13 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
            if constexpr(HasDoubleTailKBlockLoop) // if there are 2 iterations left
            {
                a_blockwise_copy.MoveSrcSliceWindow(a_k_m_global_desc, a_block_slice_copy_step);
                b_blockwise_copy.MoveSrcSliceWindow_hack(b_k_n_global_desc, b_block_slice_copy_step);

                __syncthreads();

                // LDS double buffer: load last data from device mem
                a_blockwise_copy.RunRead(a_k_m_global_desc, p_a_global);
                b_blockwise_copy.RunRead_hack(b_k_n_global_desc, p_b_global);

                // LDS double buffer: GEMM on 2nd-last data
                blockwise_gemm.Run(p_a_block_double, p_b_block_double, p_c_thread);
@@ -384,8 +384,13 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
            Float,
            decltype(c_m0_m1_n0_n1_thread_desc),
            decltype(c_m0_m1_n0_n1_global_desc),
#if 1 // debug
            Sequence<MRepeat, MPerThread, NRepeat, NPerThread>,
            CThreadTransferSrcDstAccessOrder,
#else
            Sequence<1, 1, 2, 4>,
            Sequence<0, 1, 2, 3>,
#endif
            CThreadTransferSrcDstVectorDim,
            1,
            CThreadTransferDstScalarPerVector,
@@ -402,7 +407,8 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
                m_thread_data_on_global % M1,
                n_thread_data_on_global / N1,
                n_thread_data_on_global % N1))
            .Run_hack(
                c_m0_m1_n0_n1_thread_desc, p_c_thread, c_m0_m1_n0_n1_global_desc, p_c_global);
    }
}
...
@@ -25,7 +25,7 @@ template <typename SrcData,
          index_t SrcScalarStrideInVector,
          index_t DstScalarStrideInVector,
          bool SrcResetCoordinateAfterRun, // control whether to move back src coordinate after each
                                           // Run(), will be fused with MoveSrcSliceWindow to
                                           // save addr computation
          bool DstResetCoordinateAfterRun> // control whether to move back dst coordinate after each
                                           // RunWrite(), will be fused with MoveDstSliceWindow to
@@ -333,7 +333,307 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r2
            back_step(i) = (SliceLengths{}[i - Number<1>{}] % 2 == 0) ? 0 : (1 - SliceLengths{}[i]);
        });

        return back_step;
}
__device__ void
Run_hack(const SrcDesc& src_desc, const SrcData* p_src, const DstDesc& dst_desc, DstData* p_dst)
{
if constexpr(remove_reference_t<SrcDesc>::GetNumOfDimension() == 2)
{
            // TODO use constexpr for coordinate-step to make sure the compiler behaves correctly
const auto src_step_0_p1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 1));
const auto src_step_0_m1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, -1));
const auto src_step_p1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(1, 0));
const auto src_step_m1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-1, 0));
const auto dst_step_0_p1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1));
const auto dst_step_0_m1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1));
const auto dst_step_p1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0));
const auto dst_step_m1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0));
constexpr index_t Len0 = SliceLengths{}[0];
constexpr index_t Len1 = SliceLengths{}[1];
#pragma unroll
for(index_t iter0 = 0; iter0 < Len0; ++iter0)
{
#pragma unroll
for(index_t iter1 = 0; iter1 < Len1; ++iter1)
{
// do work
transfer_data<SrcData,
1,
SrcAddressSpace,
DstAddressSpace,
DstInMemOp,
SrcScalarStrideInVector,
DstScalarStrideInVector>(
p_src,
src_slice_origin_.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(
src_desc, src_slice_origin_),
src_desc.GetElementSpaceSize(),
p_dst,
dst_slice_origin_.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(
dst_desc, dst_slice_origin_),
dst_desc.GetElementSpaceSize());
// move dim1 iterator
if(iter1 < Len1 - 1)
{
bool forward_dim1 = (iter0 % 2 == 0);
if(forward_dim1)
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_p1);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_p1);
}
else
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_m1);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_m1);
}
}
}
// move dim0 iterator
if(iter0 < Len0 - 1)
{
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_p1_0);
move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_p1_0);
}
}
}
else if constexpr(remove_reference_t<SrcDesc>::GetNumOfDimension() == 4)
{
            // TODO use constexpr for coordinate-step to make sure the compiler behaves correctly
#if 0
const auto src_step_0_0_0_p1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 0, 1));
const auto src_step_0_0_0_m1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 0, -1));
const auto src_step_0_0_p1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 1, 0));
const auto src_step_0_0_m1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, -1, 0));
const auto src_step_0_p1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1, 0, 0));
const auto src_step_0_m1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1, 0, 0));
const auto src_step_p1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0, 0, 0));
const auto src_step_m1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0, 0, 0));
const auto dst_step_0_0_0_p1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 0, 0, 1));
const auto dst_step_0_0_0_m1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 0, 0, -1));
const auto dst_step_0_0_p1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 0, 1, 0));
const auto dst_step_0_0_m1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 0, -1, 0));
const auto dst_step_0_p1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1, 0, 0));
const auto dst_step_0_m1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1, 0, 0));
const auto dst_step_p1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0, 0, 0));
const auto dst_step_m1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0, 0, 0));
#else
// hack for output tensor
const auto src_step_0_0_0_p1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 0, 1));
const auto src_step_0_0_0_m1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 0, -1));
const auto src_step_0_0_p1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 1, 0));
const auto src_step_0_0_m1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, -1, 0));
            const auto src_step_0_p1_0_0 =
                make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 1, 0, 0));
            const auto src_step_0_m1_0_0 =
                make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, -1, 0, 0));

            const auto src_step_p1_0_0_0 =
                make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(1, 0, 0, 0));
            const auto src_step_m1_0_0_0 =
                make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-1, 0, 0, 0));
const auto dst_step_0_0_0_p1 = make_dynamic_tensor_coordinate_step_hack(
dst_desc, make_multi_index(0, 0, 0, 1), Sequence<0, 0, 1, 0, 0>{});
const auto dst_step_0_0_0_m1 = make_dynamic_tensor_coordinate_step_hack(
dst_desc, make_multi_index(0, 0, 0, -1), Sequence<0, 0, 2, 0, 0>{});
const auto dst_step_0_0_p1_0 = make_dynamic_tensor_coordinate_step_hack(
dst_desc, make_multi_index(0, 0, 1, 0), Sequence<0, 0, 1, 0, 0>{});
const auto dst_step_0_0_m1_0 = make_dynamic_tensor_coordinate_step_hack(
dst_desc, make_multi_index(0, 0, -1, 0), Sequence<0, 0, 2, 0, 0>{});
const auto dst_step_0_p1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1, 0, 0));
const auto dst_step_0_m1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1, 0, 0));
const auto dst_step_p1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0, 0, 0));
const auto dst_step_m1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0, 0, 0));
#endif
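            // Assumption about the extra Sequence argument: the entries appear to be per-transform
            // hints for the output descriptor, using 1 where the step only moves forward, 2 where
            // it only moves backward, and 0 elsewhere, mirroring the padded-input hints used by
            // RunRead_hack in this file.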
constexpr index_t Len0 = SliceLengths{}[0];
constexpr index_t Len1 = SliceLengths{}[1];
constexpr index_t Len2 = SliceLengths{}[2];
constexpr index_t Len3 = SliceLengths{}[3];
#pragma unroll
for(index_t iter0 = 0; iter0 < Len0; ++iter0)
{
#pragma unroll
for(index_t iter1 = 0; iter1 < Len1; ++iter1)
{
#pragma unroll
for(index_t iter2 = 0; iter2 < Len2; ++iter2)
{
#pragma unroll
for(index_t iter3 = 0; iter3 < Len3; ++iter3)
{
// do work
transfer_data<SrcData,
1,
SrcAddressSpace,
DstAddressSpace,
DstInMemOp,
SrcScalarStrideInVector,
DstScalarStrideInVector>(
p_src,
src_slice_origin_.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(
src_desc, src_slice_origin_),
src_desc.GetElementSpaceSize(),
p_dst,
dst_slice_origin_.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(
dst_desc, dst_slice_origin_),
dst_desc.GetElementSpaceSize());
                        // move dim3 iterator
if(iter3 < Len3 - 1)
{
bool forward_dim3 = (iter2 % 2 == 0);
if(forward_dim3)
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_0_0_p1);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_0_0_p1);
}
else
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_0_0_m1);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_0_0_m1);
}
}
}
                    // move dim2 iterator
if(iter2 < Len2 - 1)
{
bool forward_dim2 = (iter1 % 2 == 0);
if(forward_dim2)
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_0_p1_0);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_0_p1_0);
}
else
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_0_m1_0);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_0_m1_0);
}
}
}
// move dim1 iterator
if(iter1 < Len1 - 1)
{
bool forward_dim1 = (iter0 % 2 == 0);
if(forward_dim1)
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_p1_0_0);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_p1_0_0);
}
else
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_m1_0_0);
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_m1_0_0);
}
}
}
// move dim0 iterator:
if(iter0 < Len0 - 1)
{
// move forward in dim0
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_p1_0_0_0);
move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_p1_0_0_0);
}
}
}
// move src and dst coordinate back to their origins
if constexpr(SrcResetCoordinateAfterRun)
{
const auto src_back_step =
make_dynamic_tensor_coordinate_step(src_desc, GetCoordinateBackStep());
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_back_step);
}
if constexpr(DstResetCoordinateAfterRun)
{
const auto dst_back_step =
make_dynamic_tensor_coordinate_step(dst_desc, GetCoordinateBackStep());
move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_back_step);
}
    }
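    // Traversal note: Run_hack walks the slice in a back-and-forth ("snake") order: each inner
    // dimension flips direction with the parity of the next outer iteration, so every move
    // between visited elements is a single +/-1 coordinate step. For a 2x3 slice the visit order
    // is (0,0) (0,1) (0,2) (1,2) (1,1) (1,0).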
    // src_slice_origin_step_idx needs to be known at compile-time, for performance reason
@@ -591,132 +891,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
        }
    }

    __device__ void RunRead_hack(const SrcDesc& src_desc, const SrcData* p_src)
    {
static_assert(remove_reference_t<SrcDesc>::GetNumOfDimension() == 2, static_assert(remove_reference_t<SrcDesc>::GetNumOfDimension() == 2,
"wrong! hardcoded for 2D tensor"); "wrong! hardcoded for 2D tensor");
...@@ -736,44 +911,18 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3_hack ...@@ -736,44 +911,18 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3_hack
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(1, 0)); make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(1, 0));
const auto src_step_m1_0 = const auto src_step_m1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-1, 0)); make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-1, 0));
#elif 0
const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step_hack(
src_desc,
make_multi_index(0, 1),
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{});
const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step_hack(
src_desc,
make_multi_index(0, -1),
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{});
const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step_hack(
src_desc,
make_multi_index(1, 0),
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{});
const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step_hack(
src_desc,
make_multi_index(-1, 0),
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{});
#elif 1
// for padded input tensor
const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step_hack(
src_desc,
make_multi_index(0, 1),
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1>{});
const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step_hack(
src_desc,
make_multi_index(0, -1),
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2>{});
const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step_hack(
src_desc,
make_multi_index(1, 0),
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0>{});
const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step_hack(
src_desc,
make_multi_index(-1, 0),
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0>{});
#elif 1
#elif 1
// for padded input tensor
const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step_hack(
src_desc, make_multi_index(0, 1), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1>{});
const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step_hack(
src_desc, make_multi_index(0, -1), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2>{});
const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step_hack(
src_desc, make_multi_index(1, 0), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0>{});
const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step_hack(
src_desc, make_multi_index(-1, 0), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0>{});
#elif 0
// for non-padded input tensor
const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step_hack(
src_desc, make_multi_index(0, 1), Sequence<0, 0, 0, 0, 0, 0, 1>{});
...@@ -806,12 +955,23 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3_hack
static_assert(SrcAddressSpace == AddressSpace::Global,
"wrong! hardcoded to use buffer_load, src must be global mem");
#if 0 // debug
buffer_(Number<buffer_offset>{}) = amd_buffer_load<SrcData, 1>(
p_src,
src_slice_origin_.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(
src_desc, src_slice_origin_),
src_desc.GetElementSpaceSize());
#else
SrcData tmp = amd_buffer_load<SrcData, 1>(
p_src, src_slice_origin_.GetOffset(), true, src_desc.GetElementSpaceSize());
const bool is_valid =
coordinate_has_valid_offset_assuming_visible_index_is_valid(
src_desc, src_slice_origin_);
buffer_(Number<buffer_offset>{}) = is_valid ? tmp : SrcData{0};
#endif
// move dim1 iterator
if constexpr(iter1.value < Len1 - 1)
...@@ -847,83 +1007,6 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3_hack
}
}
__device__ void RunWrite(const DstDesc& dst_desc, DstData* p_dst)
{
static_assert(remove_reference_t<DstDesc>::GetNumOfDimension() == 2,
"wrong! hardcoded for 2D tensor");
// hardcoded for 2D
// TODO implement N-D
if constexpr(remove_reference_t<SrcDesc>::GetNumOfDimension() == 2)
{
// TODO use constexpr for coordinate-step to make sure compiler behave correctly
const auto dst_step_0_p1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1));
const auto dst_step_0_m1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1));
const auto dst_step_p1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0));
const auto dst_step_m1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0));
constexpr index_t Len0 = SliceLengths{}[0];
constexpr index_t Len1 = SliceLengths{}[1];
static_for<0, Len0, 1>{}([&](auto iter0) {
static_for<0, Len1, 1>{}([&](auto iter1) {
// step direction
constexpr bool forward_dim1 = (iter0.value % 2 == 0);
constexpr index_t i0 = iter0;
constexpr index_t i1 = forward_dim1 ? iter1.value : Len1 - iter1.value - 1;
// do work
constexpr index_t buffer_offset =
buffer_desc_.CalculateOffset(make_multi_index(i0, i1));
// hardcoding for ds_write
// TODO refactor transfer_data() to encapsulate this
static_assert(DstAddressSpace == AddressSpace::Lds &&
DstInMemOp == InMemoryDataOperation::Set,
"wrong! hardcoded for ds_write");
p_dst[dst_slice_origin_.GetOffset()] = buffer_[Number<buffer_offset>{}];
// move dim1 iterator
if constexpr(iter1.value < Len1 - 1)
{
if constexpr(forward_dim1)
{
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_p1);
}
else
{
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_m1);
}
}
});
// move dim0 iterator
if constexpr(iter0.value < Len0 - 1)
{
move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_p1_0);
}
});
}
// move dst coordinate back to its slice origin
if constexpr(DstResetCoordinateAfterRun)
{
const auto dst_back_step =
make_dynamic_tensor_coordinate_step(dst_desc, GetCoordinateBackStep());
move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_back_step);
}
}
__device__ static constexpr auto GetCoordinateBackStep()
{
MultiIndex<nDim> back_step;
...@@ -947,18 +1030,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3_hack
: src_slice_origin_step_idx + GetCoordinateBackStep();
// is it OK to construct a new step every time?
#if 0 // hack
const auto adjusted_step = make_dynamic_tensor_coordinate_step(
src_desc, adjusted_step_idx);
#elif 1
// for padded input tensor
const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
src_desc, adjusted_step_idx, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2>{});
#elif 1
// for non-padded input tensor
const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
src_desc, adjusted_step_idx, Sequence<0, 0, 0, 0, 0, 1, 2>{});
#endif
const auto adjusted_step = make_dynamic_tensor_coordinate_step(src_desc, adjusted_step_idx);
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, adjusted_step);
}
...@@ -978,6 +1050,32 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3_hack
move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, adjusted_step);
}
// src_slice_origin_step_idx needs to be known at compile-time, for performance reasons
__device__ void MoveSrcSliceWindow_hack(const SrcDesc& src_desc,
const Index& src_slice_origin_step_idx)
{
// if the src coordinate was not reset by RunRead(), the step needs to be adjusted here
const auto adjusted_step_idx = SrcResetCoordinateAfterRun
? src_slice_origin_step_idx
: src_slice_origin_step_idx + GetCoordinateBackStep();
// is it OK to construct a new step every time?
#if 0 // hack
const auto adjusted_step = make_dynamic_tensor_coordinate_step(
src_desc, adjusted_step_idx);
#elif 1
// for padded input tensor
const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
src_desc, adjusted_step_idx, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2>{});
#elif 0
// for non-padded input tensor
const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
src_desc, adjusted_step_idx, Sequence<0, 0, 0, 0, 0, 1, 2>{});
#endif
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, adjusted_step);
}
private:
static constexpr auto buffer_desc_ =
make_dynamic_naive_tensor_descriptor_packed<nDim>(to_multi_index(SliceLengths{}));
...@@ -989,5 +1087,6 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3_hack
SrcCoord src_slice_origin_;
DstCoord dst_slice_origin_;
};
} // namespace ck
#endif
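Editor's note: the RunRead/RunWrite loops in this transfer walk the 2D slice in a snake order — dim1 runs forward on even iterations of dim0 and backward on odd ones — so moving between consecutive elements never needs more than a single +1/-1 coordinate step, and the thread-buffer offset is computed directly from the packed buffer descriptor (points 2–4 of the comment at the top of the struct). A minimal host-side sketch of that visiting order, with illustrative names that are not the library's API:

```cpp
#include <cstdio>

// Visit a Len0 x Len1 slice in snake order: dim1 runs forward on even rows and
// backward on odd rows, so consecutive visits differ by exactly one +/-1 step.
template <int Len0, int Len1, typename F>
void snake_visit(F visit)
{
    for(int i0 = 0; i0 < Len0; ++i0)
    {
        const bool forward_dim1 = (i0 % 2 == 0);

        for(int iter1 = 0; iter1 < Len1; ++iter1)
        {
            const int i1 = forward_dim1 ? iter1 : Len1 - iter1 - 1;
            visit(i0, i1);
        }
    }
}

int main()
{
    // Offset into a packed 4x3 thread buffer, computed directly as i0 * 3 + i1.
    snake_visit<4, 3>(
        [](int i0, int i1) { std::printf("(%d, %d) -> offset %d\n", i0, i1, i0 * 3 + i1); });
    return 0;
}
```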
...@@ -152,10 +152,15 @@ __device__ float amd_buffer_load<float, 1>(const float* p_src_wave,
return __llvm_amdgcn_buffer_load_f32(
src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#else
#if 1 // debug
float tmp = __llvm_amdgcn_buffer_load_f32(
src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);
return src_thread_data_valid ? tmp : float(0);
#else
return __llvm_amdgcn_buffer_load_f32(
src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);
#endif
#endif
}
......
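Editor's note: both here and in the RunRead change above, out-of-range handling moves from the buffer descriptor's address clamping to an explicit select — the load is issued unconditionally and the result is replaced with zero whenever the coordinate is invalid (for example, an element inside the padding region). A schematic host-side analogue of that masked-load pattern (plain C++; it assumes the offset itself is always addressable, which is what the device code relies on):

```cpp
#include <cassert>
#include <cstdio>
#include <vector>

// Schematic analogue of the masked load: always read at a legal offset,
// then select zero when the logical coordinate is invalid (e.g. padding).
float masked_load(const std::vector<float>& src, int offset, bool is_valid)
{
    assert(offset >= 0 && offset < static_cast<int>(src.size())); // element space covers the offset

    const float tmp = src[static_cast<std::size_t>(offset)]; // unconditional load
    return is_valid ? tmp : 0.0f;                            // zero-fill invalid elements
}

int main()
{
    const std::vector<float> src{1.0f, 2.0f, 3.0f};
    std::printf("%f %f\n", masked_load(src, 1, true), masked_load(src, 1, false)); // 2.0 0.0
    return 0;
}
```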
...@@ -87,7 +87,7 @@
// thread-invariant, otherwise it's a bug
// TODO: separate index calculation into "compile-time", "global", "block", "wave", "thread"
#ifndef CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
#define CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 0
#define CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 1
#endif
// workaround: put all workaround here
......
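Editor's note: flipping this switch to 1 lets the merge-transform index-difference calculation mark quantities that are the same for every lane with readfirstlane, nudging the compiler to keep them in scalar registers instead of recomputing them per lane. A hedged sketch of the underlying idea — the wrapper below is purely illustrative; only the __builtin_amdgcn_readfirstlane builtin on AMDGCN targets is assumed:

```cpp
// Illustrative helper: wrap a value that is known to be wavefront-uniform so the
// compiler can keep it in a scalar register (SGPR). Using it on a divergent value
// is a bug -- that is what "thread-invariant, otherwise it's a bug" warns about.
inline int make_wave_uniform(int x)
{
#if defined(__AMDGCN__)
    return __builtin_amdgcn_readfirstlane(x); // broadcast lane 0's value to all lanes
#else
    return x; // non-AMDGCN fallback: no-op
#endif
}
```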
...@@ -750,6 +750,13 @@ __host__ __device__ constexpr auto reverse_inclusive_scan_sequence(Seq, Reduce,
return typename sequence_reverse_inclusive_scan<Seq, Reduce, Init>::type{};
}
template <typename Seq, typename Reduce, index_t Init>
__host__ __device__ constexpr auto reverse_exclusive_scan_sequence(Seq, Reduce, Number<Init>)
{
return reverse_inclusive_scan_sequence(Seq::PopFront(), Reduce{}, Number<Init>{})
.PushBack(Number<Init>{});
}
template <typename Seq, typename Reduce, index_t Init>
__host__ __device__ constexpr auto inclusive_scan_sequence(Seq, Reduce, Number<Init>)
{
......
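Editor's note: the new reverse_exclusive_scan_sequence computes, at compile time, a reverse exclusive scan of a Sequence. With multiplication as the reduction and Init = 1, this kind of scan gives the strides of a packed tensor: for lengths {L0, L1, L2} it yields {L1*L2, L2, 1}. A small standalone sketch of the same arithmetic at runtime (plain C++17, independent of the Sequence machinery):

```cpp
#include <array>
#include <cstdio>

// Reverse exclusive scan under multiplication with init = 1:
// for lengths {L0, L1, L2} it produces {L1*L2, L2, 1}, i.e. the strides
// of a packed (contiguous, row-major) tensor.
template <std::size_t N>
constexpr std::array<long, N> reverse_exclusive_scan_mul(const std::array<long, N>& len)
{
    std::array<long, N> out{};
    long running = 1;
    for(std::size_t i = N; i-- > 0;)
    {
        out[i] = running; // exclusive: the current element is not yet included
        running *= len[i];
    }
    return out;
}

int main()
{
    constexpr auto strides = reverse_exclusive_scan_mul<3>({2, 3, 4});
    static_assert(strides[0] == 12 && strides[1] == 4 && strides[2] == 1, "packed strides");
    std::printf("%ld %ld %ld\n", strides[0], strides[1], strides[2]);
    return 0;
}
```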
...@@ -155,6 +155,7 @@ void device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 1;
#elif 1
// cdata = 64, BlockSize = 256, 128x128x8
// b threadwise copy 4x1
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
...@@ -185,6 +186,40 @@ void device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
constexpr index_t GemmBBlockCopySrcDataPerRead_GemmN = 1;
constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = 1;
constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 1;
#elif 1
// cdata = 64, BlockSize = 256, 128x128x8
// b threadwise copy 2x2
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 8;
constexpr index_t GemmMPerThread = 4;
constexpr index_t GemmNPerThread = 4;
constexpr index_t GemmKPerThread = 1;
constexpr index_t GemmMLevel0Cluster = 2;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 8;
constexpr index_t GemmNLevel1Cluster = 8;
constexpr index_t ThreadGemmDataPerReadM = 4;
constexpr index_t ThreadGemmDataPerReadN = 4;
using GemmABlockCopyThreadSliceLengths_GemmK_GemmM = Sequence<4, 1>;
using GemmABlockCopyThreadClusterLengths_GemmK_GemmM = Sequence<2, 128>;
constexpr index_t GemmABlockCopySrcDataPerRead_GemmK = 4;
constexpr index_t GemmABlockCopyDstDataPerWrite_GemmM = 1;
using GemmBBlockCopyThreadSliceLengths_GemmK_GemmN = Sequence<2, 2>;
using GemmBBlockCopyThreadClusterLengths_GemmK_GemmN = Sequence<4, 64>;
constexpr index_t GemmBBlockCopySrcDataPerRead_GemmN = 1;
constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = 1;
constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 1;
#elif 1
// cdata = 64, BlockSize = 256, 128x128x8
......
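Editor's note: the added "b threadwise copy 2x2" configuration keeps the 128x128x8 block tile and the 256-thread block; what changes is how the GemmB block copy is divided among threads — each thread now moves a 2x2 slice and the threads form a 4x64 cluster, so slice times cluster still covers GemmKPerBlock x GemmNPerBlock and the cluster still uses every thread. A tiny compile-time sanity check of that arithmetic (names local to this sketch, values copied from the configuration above):

```cpp
// Consistency of the "b threadwise copy 2x2" tuning block.
constexpr int BlockSize     = 256;
constexpr int GemmKPerBlock = 8;
constexpr int GemmNPerBlock = 128;

constexpr int SliceK   = 2;  // GemmBBlockCopyThreadSliceLengths_GemmK_GemmN = Sequence<2, 2>
constexpr int SliceN   = 2;
constexpr int ClusterK = 4;  // GemmBBlockCopyThreadClusterLengths_GemmK_GemmN = Sequence<4, 64>
constexpr int ClusterN = 64;

static_assert(SliceK * ClusterK == GemmKPerBlock, "per-thread slice x cluster must cover GemmK");
static_assert(SliceN * ClusterN == GemmNPerBlock, "per-thread slice x cluster must cover GemmN");
static_assert(ClusterK * ClusterN == BlockSize, "copy cluster must use exactly one thread per element");
```

The same relation holds for the A block copy: 4x1 slices on a 2x128 cluster cover 8x128 with 256 threads.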
...@@ -41,12 +41,13 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
const auto in_n_c_hi_wi_desc = make_dynamic_naive_tensor_descriptor<4>(
to_multi_index(InDesc::GetLengths()), to_multi_index(InDesc::GetStrides()));
const auto wei_k_c_y_x_desc = make_dynamic_naive_tensor_descriptor<4>(
to_multi_index(WeiDesc::GetLengths()), to_multi_index(WeiDesc::GetStrides()));
const auto out_n_k_ho_wo_desc = make_dynamic_naive_tensor_descriptor<4>(
to_multi_index(OutDesc::GetLengths()), to_multi_index(OutDesc::GetStrides()));
// assume packed tensor
const auto in_n_c_hi_wi_desc =
make_dynamic_naive_tensor_descriptor_packed<4>(to_multi_index(InDesc::GetLengths()));
const auto wei_k_c_y_x_desc =
make_dynamic_naive_tensor_descriptor_packed<4>(to_multi_index(WeiDesc::GetLengths()));
const auto out_n_k_ho_wo_desc =
make_dynamic_naive_tensor_descriptor_packed<4>(to_multi_index(OutDesc::GetLengths()));
const auto conv_strides = to_multi_index(ConvStrides{});
const auto conv_dilations = to_multi_index(ConvDilations{});
...@@ -115,6 +116,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
#elif 1
// cdata = 64, BlockSize = 256, 128x128x8
// b thread copy 4x1
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
...@@ -142,6 +144,37 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN = 1;
constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
#elif 1
// cdata = 64, BlockSize = 256, 128x128x8
// b thread copy 2x2
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 8;
constexpr index_t GemmMPerThread = 4;
constexpr index_t GemmNPerThread = 4;
constexpr index_t GemmKPerThread = 1;
constexpr index_t GemmMLevel0Cluster = 2;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 8;
constexpr index_t GemmNLevel1Cluster = 8;
using GemmABlockTransferThreadSliceLengths_GemmK_GemmM = Sequence<4, 1>;
using GemmABlockTransferThreadClusterLengths_GemmK_GemmM = Sequence<2, 128>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 1;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmM = 1;
using GemmBBlockTransferThreadSliceLengths_GemmK_GemmN = Sequence<2, 2>;
using GemmBBlockTransferThreadClusterLengths_GemmK_GemmN = Sequence<4, 64>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN = 1;
constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
#endif
...@@ -169,7 +202,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
constexpr auto conv_driver =
#if 1 // debug
DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw
DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
#else
DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
#endif
......
...@@ -217,7 +217,7 @@ int main(int argc, char* argv[])
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 1
#elif 0
// 3x3, 35x35, stride 2
constexpr index_t N = 128;
constexpr index_t C = 288;
...@@ -352,7 +352,7 @@ int main(int argc, char* argv[])
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
#elif 1
// 3x3, 28x28
constexpr index_t N = 128;
constexpr index_t C = 128;
......
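Editor's note: with the padded driver selected above, the problem-size switch now exercises the 3x3, 28x28 layer instead of the 35x35 stride-2 one. For reference, the output spatial length the implicit-GEMM formulation works with follows the usual convolution relation; a one-line sketch of that formula (the pad/dilation/stride values in the example are illustrative, not read from this config):

```cpp
// Output length along one spatial dimension of a padded, dilated, strided convolution.
constexpr int conv_out_len(int in_len, int left_pad, int right_pad, int filter, int dilation, int stride)
{
    return (in_len + left_pad + right_pad - dilation * (filter - 1) - 1) / stride + 1;
}

// Example: a 3x3 filter on a 28-wide input with pad 1, dilation 1, stride 1 keeps the size at 28.
static_assert(conv_out_len(28, 1, 1, 3, 1, 1) == 28, "same-size output");
```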