"test/srt/vscode:/vscode.git/clone" did not exist on "094c116f7dc4a5a4d845ce812406e9514a275266"
Commit 44ddcdcb authored by Chao Liu

adding vector load

parent c1ed17f8
@@ -502,10 +502,153 @@ struct DynamicMerge
typename LowIdx,
typename UpIdx,
index_t Hack>
-__host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
__host__ __device__ void UpdateLowerIndex_1(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
LowIdx& idx_low,
-const UpIdx& idx_up_new,
const UpIdx& /* idx_up_new */,
Number<Hack>) const
{
static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
// CalculateLowerIndex(idx_diff_low_const) involves multiple integer divisions.
// However,
// 1) if idx_diff_up is known at compile-time, then idx_diff_low_const can be
//    calculated at compile-time;
// 2) if idx_diff_up is not known at compile-time, but its value does not change
//    during the whole kernel execution, then idx_diff_low_const does not change
//    either, and the compiler-generated ISA should calculate idx_diff_low_const
//    only once and keep it for the whole kernel execution.
// If neither 1) nor 2) is satisfied, the calculation is done at run-time each
// time this function is called, and can be very expensive.
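// Worked example (illustration only): for low_lengths_ = {2, 3, 4} the scan
// strides used below are 12 and 4, so an upper-index step of +5 decomposes into
// idx_diff_low_const = {0, 1, 1}, since 5 = 0*12 + 1*4 + 1.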
LowerIndex idx_diff_low_const;
LowerIndex idx_low_length_minus_idx_diff_low_const;
LowerIndex idx_low_length_plus_idx_diff_low_const;
#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
index_t tmp = idx_diff_up[Number<0>{}];
static_for<0, NDimLow - 1, 1>{}([&](auto i) {
idx_diff_low_const(i) = tmp / low_lengths_scan_[i];
tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
});
idx_diff_low_const(Number<NDimLow - 1>{}) = tmp;
static_for<0, NDimLow, 1>{}([&](auto i) {
idx_low_length_minus_idx_diff_low_const(i) = low_lengths_[i] - idx_diff_low_const[i];
idx_low_length_plus_idx_diff_low_const(i) = low_lengths_[i] + idx_diff_low_const[i];
});
#else
// Hack: this forces the result into an SGPR. Need to make sure the result is thread-invariant.
index_t tmp = idx_diff_up[Number<0>{}];
static_for<0, NDimLow - 1, 1>{}([&](auto i) {
idx_diff_low_const(i) = __builtin_amdgcn_readfirstlane(tmp / low_lengths_scan_[i]);
tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
});
idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);
static_for<0, NDimLow, 1>{}([&](auto i) {
idx_low_length_minus_idx_diff_low_const(i) =
__builtin_amdgcn_readfirstlane(low_lengths_[i] - idx_diff_low_const[i]);
idx_low_length_plus_idx_diff_low_const(i) =
__builtin_amdgcn_readfirstlane(low_lengths_[i] + idx_diff_low_const[i]);
});
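// __builtin_amdgcn_readfirstlane returns the value held by the first active lane,
// so the result is kept in a scalar register (SGPR); this is only correct when
// the value is identical across the wavefront (thread-invariant).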
#endif
if constexpr(Hack == 1)
{
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
index_t carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
index_t idx_low_tmp = idx_low[i] + carry;
bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
idx_diff_low(i) =
do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
idx_diff_low(i) += carry;
carry = do_carry ? 1 : 0;
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
idx_low += idx_diff_low;
}
else if constexpr(Hack == 2)
{
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
index_t borrow = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
index_t idx_low_tmp = idx_low[i] - borrow;
bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
idx_diff_low(i) =
do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low_const[i];
idx_diff_low(i) -= borrow;
borrow = do_borrow ? 1 : 0;
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] - borrow;
idx_low += idx_diff_low;
}
else
{
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
index_t carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
index_t idx_low_tmp = idx_low[i] + carry;
bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
idx_diff_low(i) =
do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
idx_diff_low(i) =
do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low[i];
idx_diff_low(i) += carry;
carry = do_carry ? 1 : 0;
carry = do_borrow ? -1 : carry;
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
idx_low += idx_diff_low;
}
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ void UpdateLowerIndex_2(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
LowIdx& idx_low,
const UpIdx& /* idx_up_new */,
Number<Hack>) const
{
static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
@@ -611,6 +754,24 @@ struct DynamicMerge
}
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
LowIdx& idx_low,
const UpIdx& idx_up_new,
Number<Hack>) const
{
#if 1
UpdateLowerIndex_1(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number<Hack>{});
#else
UpdateLowerIndex_2(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number<Hack>{});
#endif
}
__host__ __device__ static constexpr bool IsLinearTransform() { return false; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
@@ -624,7 +785,7 @@ struct DynamicMerge
{
return true;
}
}; // namespace ck
template <index_t NDimUp, bool Use24BitIntegerCalculation = false>
struct DynamicUnMerge
...
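The carry propagation above is easiest to follow on a concrete case. Below is a minimal standalone sketch (not part of this commit) of the Hack == 1 path of UpdateLowerIndex_1, written with plain arrays and a runtime loop instead of ck's Number/static_for machinery, for low_lengths = {2, 3, 4} and an upper-index step of +5:

#include <cassert>
#include <cstdio>

int main()
{
    constexpr int NDimLow = 3;

    const int low_lengths[NDimLow]        = {2, 3, 4};
    const int idx_diff_low_const[NDimLow] = {0, 1, 1}; // decomposition of the +5 upper step

    int idx_low[NDimLow]      = {0, 2, 3}; // current lower index, linear offset 2*4 + 3 = 11
    int idx_diff_low[NDimLow] = {0, 0, 0};

    // carry check on each low dimension in reverse order; dimension 0 needs no check
    int carry = 0;
    for(int i = NDimLow - 1; i > 0; --i)
    {
        const int idx_low_tmp = idx_low[i] + carry;
        const bool do_carry   = idx_low_tmp >= low_lengths[i] - idx_diff_low_const[i];

        idx_diff_low[i] = do_carry ? -(low_lengths[i] - idx_diff_low_const[i])
                                   : idx_diff_low_const[i];
        idx_diff_low[i] += carry;
        carry = do_carry ? 1 : 0;
    }
    idx_diff_low[0] = idx_diff_low_const[0] + carry;

    for(int i = 0; i < NDimLow; ++i)
        idx_low[i] += idx_diff_low[i];

    // 11 + 5 = 16 = 1*12 + 1*4 + 0
    assert(idx_low[0] == 1 && idx_low[1] == 1 && idx_low[2] == 0);
    printf("idx_low = {%d, %d, %d}\n", idx_low[0], idx_low[1], idx_low[2]);
    return 0;
}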
@@ -152,6 +152,17 @@ __host__ __device__ constexpr auto operator*(const Tuple<Xs...>& x, const Y& y)
return r;
}
// MultiIndex = index_t * MultiIndex
template <typename... Xs>
__host__ __device__ constexpr auto operator*(index_t a, const Tuple<Xs...>& x)
{
constexpr index_t NSize = sizeof...(Xs);
Tuple<Xs...> r;
static_for<0, NSize, 1>{}([&](auto i) { r(i) = a * x[i]; });
return r;
}
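// e.g. 2 * make_multi_index(3, 4) == make_multi_index(6, 8): componentwise scaling
// with the scalar on the left, mirroring the Tuple * scalar overload above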
#endif
} // namespace ck
#endif
@@ -735,13 +735,54 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
__device__ void RunRead(const SrcDesc& src_desc, const SrcData* p_src)
{
// hardcoded for 2D
// TODO: implement N-D
static_assert(remove_reference_t<SrcDesc>::GetNumOfDimension() == 2,
"wrong! hardcoded for 2D tensor");
-// hardcoded for 2D
-// TODO implemente N-D
constexpr auto src_scalar_per_access = [&]() {
Index src_scalar_per_access;
-if constexpr(remove_reference_t<SrcDesc>::GetNumOfDimension() == 2)
static_for<0, nDim, 1>{}([&](auto i) {
if constexpr(i == SrcVectorDim)
{
src_scalar_per_access(i) = SrcScalarPerVector * SrcScalarStrideInVector;
}
else
{
src_scalar_per_access(i) = 1;
}
});
return src_scalar_per_access;
}();
constexpr auto src_scalar_step_in_vector = [&]() {
Index src_scalar_step_in_vector;
static_for<0, nDim, 1>{}([&](auto i) {
if constexpr(i == SrcVectorDim)
{
src_scalar_step_in_vector(i) = 1;
}
else
{
src_scalar_step_in_vector(i) = 0;
}
});
return src_scalar_step_in_vector;
}();
constexpr auto access_lengths = [&]() {
Index access_lengths;
static_for<0, nDim, 1>{}(
[&](auto i) { access_lengths(i) = SliceLengths{}[i] / src_scalar_per_access[i]; });
return access_lengths;
}();
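// Example (illustration only): with SliceLengths = {4, 8}, SrcVectorDim = 1,
// SrcScalarPerVector = 4 and SrcScalarStrideInVector = 1 this yields
// src_scalar_per_access = {1, 4}, src_scalar_step_in_vector = {0, 1} and
// access_lengths = {4, 2}, i.e. 4 x 2 vector accesses instead of 4 x 8 scalar ones.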
// TODO: use constexpr for the coordinate step to make sure the compiler behaves correctly
const auto src_step_0_p1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 1));
@@ -753,56 +794,89 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
const auto src_step_m1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-1, 0));
-constexpr index_t Len0 = SliceLengths{}[0];
-constexpr index_t Len1 = SliceLengths{}[1];
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
-static_for<0, Len0, 1>{}([&](auto iter0) {
-static_for<0, Len1, 1>{}([&](auto iter1) {
static_for<0, access_lengths[I0], 1>{}([&](auto iter0) {
static_for<0, access_lengths[I1], 1>{}([&](auto iter1) {
// step direction
constexpr bool forward_dim1 = (iter0.value % 2 == 0);
constexpr index_t i0 = iter0.value;
-constexpr index_t i1 = forward_dim1 ? iter1.value : Len1 - iter1.value - 1;
constexpr index_t i1 =
forward_dim1 ? iter1.value : access_lengths[I1] - iter1.value - 1;
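// Example: for access_lengths = {4, 2} the visiting order is (0,0) (0,1), (1,1) (1,0),
// (2,0) (2,1), (3,1) (3,0): a zig-zag path, so consecutive accesses differ by a
// single +/-1 step in one dimension.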
// do work
-constexpr index_t buffer_offset =
-buffer_desc_.CalculateOffset(make_multi_index(i0, i1));
// hardcoding for buffer_load
// TODO refactor transfer_data() to encapsulate this
static_assert(SrcAddressSpace == AddressSpace::Global,
"wrong! hardcoded to use buffer_load, src must be global mem");
-buffer_(Number<buffer_offset>{}) = amd_buffer_load<SrcData, 1>(
-p_src,
-src_slice_origin_.GetOffset(),
-coordinate_has_valid_offset_assuming_visible_index_is_valid(
-src_desc, src_slice_origin_),
-src_desc.GetElementSpaceSize());
#if 1 // only works for SrcScalarPerVector == 1
auto src_data = amd_buffer_load<SrcData, 1>(
p_src, src_slice_origin_.GetOffset(), true, src_desc.GetElementSpaceSize());
const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
src_desc, src_slice_origin_);
constexpr index_t buffer_offset =
buffer_desc_.CalculateOffset(make_multi_index(i0, i1));
buffer_(Number<buffer_offset>{}) = is_valid ? src_data : SrcData{0};
#elif 1 // only works for SrcScalarPerVector == 1
auto src_data = amd_buffer_load<SrcData, 1>(
p_src, src_slice_origin_.GetOffset(), true, src_desc.GetElementSpaceSize());
const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
src_desc, src_slice_origin_);
constexpr index_t buffer_offset =
buffer_desc_.CalculateOffset(make_multi_index(i0, i1) * src_scalar_per_access);
buffer_(Number<buffer_offset>{}) = is_valid ? src_data : SrcData{0};
#else
vector_type<SrcData, SrcScalarPerVector> src_vector;
using SrcVectorType = typename vector_type<SrcData, SrcScalarPerVector>::MemoryType;
src_vector.Vector() = amd_buffer_load<SrcData, SrcScalarPerVector>(
p_src, src_slice_origin_.GetOffset(), true, src_desc.GetElementSpaceSize());
const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
src_desc, src_slice_origin_);
src_vector.Vector() = is_valid ? src_vector.Vector() : SrcVectorType{0};
static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
constexpr index_t buffer_offset = buffer_desc_.CalculateOffset(
make_multi_index(i0, i1) * src_scalar_per_access +
i * src_scalar_step_in_vector);
// TODO: can buffer_ use vector access?
buffer_(Number<buffer_offset>{}) = src_vector[i];
});
#endif
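// Note: only the first branch (#if 1) is compiled; it loads one scalar per access and
// zero-fills invalid elements via is_valid. The disabled branches sketch the
// SrcScalarPerVector-wide vector load that scatters src_vector into buffer_.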
// move dim1 iterator
-if constexpr(iter1.value < Len1 - 1)
if constexpr(iter1.value < access_lengths[I1] - 1)
{
if constexpr(forward_dim1)
{
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_p1);
}
else
{
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_m1);
}
}
});
// move dim0 iterator
-if constexpr(iter0.value < Len0 - 1)
if constexpr(iter0.value < access_lengths[I0] - 1)
{
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_p1_0);
}
});
-}
// move src coordinate back to its slice origin
if constexpr(SrcResetCoordinateAfterRun)
@@ -893,13 +967,54 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
__device__ void RunRead_hack(const SrcDesc& src_desc, const SrcData* p_src)
{
// hardcoding for buffer_load
// TODO refactor transfer_data() to encapsulate this
static_assert(remove_reference_t<SrcDesc>::GetNumOfDimension() == 2,
"wrong! hardcoded for 2D tensor");
-// hardcoded for 2D
-// TODO implemente N-D
constexpr auto src_scalar_per_access = [&]() {
Index src_scalar_per_access;
-if constexpr(remove_reference_t<SrcDesc>::GetNumOfDimension() == 2)
static_for<0, nDim, 1>{}([&](auto i) {
if constexpr(i == SrcVectorDim)
{
src_scalar_per_access(i) = SrcScalarPerVector * SrcScalarStrideInVector;
}
else
{
src_scalar_per_access(i) = 1;
}
});
return src_scalar_per_access;
}();
constexpr auto src_scalar_step_in_vector = [&]() {
Index src_scalar_step_in_vector;
static_for<0, nDim, 1>{}([&](auto i) {
if constexpr(i == SrcVectorDim)
{
src_scalar_step_in_vector(i) = 1;
}
else
{
src_scalar_step_in_vector(i) = 0;
}
});
return src_scalar_step_in_vector;
}();
constexpr auto access_lengths = [&]() {
Index access_lengths;
static_for<0, nDim, 1>{}(
[&](auto i) { access_lengths(i) = SliceLengths{}[i] / src_scalar_per_access[i]; });
return access_lengths;
}();
#if 0 // hack
// TODO: use constexpr for the coordinate step to make sure the compiler behaves correctly
const auto src_step_0_p1 =
@@ -911,7 +1026,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(1, 0));
const auto src_step_m1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-1, 0));
-#elif 1
#elif 0
// for padded input tensor
const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step_hack(
src_desc, make_multi_index(0, 1), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1>{});
@@ -935,67 +1050,78 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
src_desc, make_multi_index(-1, 0), Sequence<0, 0, 0, 0, 0, 2, 0>{});
#endif
-constexpr index_t Len0 = SliceLengths{}[0];
-constexpr index_t Len1 = SliceLengths{}[1];
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
-static_for<0, Len0, 1>{}([&](auto iter0) {
-static_for<0, Len1, 1>{}([&](auto iter1) {
static_for<0, access_lengths[I0], 1>{}([&](auto iter0) {
static_for<0, access_lengths[I1], 1>{}([&](auto iter1) {
// step direction
constexpr bool forward_dim1 = (iter0.value % 2 == 0);
constexpr index_t i0 = iter0.value;
-constexpr index_t i1 = forward_dim1 ? iter1.value : Len1 - iter1.value - 1;
constexpr index_t i1 =
forward_dim1 ? iter1.value : access_lengths[I1] - iter1.value - 1;
// do work
-constexpr index_t buffer_offset =
-buffer_desc_.CalculateOffset(make_multi_index(i0, i1));
// hardcoding for buffer_load
// TODO refactor transfer_data() to encapsulate this
static_assert(SrcAddressSpace == AddressSpace::Global,
"wrong! hardcoded to use buffer_load, src must be global mem");
-#if 0 // debug
-buffer_(Number<buffer_offset>{}) = amd_buffer_load<SrcData, 1>(
-p_src,
-src_slice_origin_.GetOffset(),
-coordinate_has_valid_offset_assuming_visible_index_is_valid(
-src_desc, src_slice_origin_),
-src_desc.GetElementSpaceSize());
-#else
-SrcData tmp = amd_buffer_load<SrcData, 1>(
#if 1 // only works for SrcScalarPerVector == 1
auto src_data = amd_buffer_load<SrcData, 1>(
p_src, src_slice_origin_.GetOffset(), true, src_desc.GetElementSpaceSize());
const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
src_desc, src_slice_origin_);
constexpr index_t buffer_offset =
buffer_desc_.CalculateOffset(make_multi_index(i0, i1) * src_scalar_per_access);
buffer_(Number<buffer_offset>{}) = is_valid ? src_data : SrcData{0};
#elif 1
vector_type<SrcData, SrcScalarPerVector> src_vector;
using SrcVectorType = typename vector_type<SrcData, SrcScalarPerVector>::MemoryType;
src_vector.Vector() = amd_buffer_load<SrcData, SrcScalarPerVector>(
p_src, src_slice_origin_.GetOffset(), true, src_desc.GetElementSpaceSize());
const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
src_desc, src_slice_origin_);
src_vector.Vector() = is_valid ? src_vector.Vector() : SrcVectorType{0};
static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
constexpr index_t buffer_offset = buffer_desc_.CalculateOffset(
make_multi_index(i0, i1) * src_scalar_per_access +
i * src_scalar_step_in_vector);
// TODO: can buffer_ use vector access?
buffer_(Number<buffer_offset>{}) = src_vector[i];
});
#endif
// move dim1 iterator
-if constexpr(iter1.value < Len1 - 1)
if constexpr(iter1.value < access_lengths[I1] - 1)
{
if constexpr(forward_dim1)
{
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_p1);
}
else
{
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_m1);
}
}
});
// move dim0 iterator
-if constexpr(iter0.value < Len0 - 1)
if constexpr(iter0.value < access_lengths[I0] - 1)
{
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_p1_0);
}
});
-}
// move src coordinate back to its slice origin
if constexpr(SrcResetCoordinateAfterRun)
@@ -1063,7 +1189,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
#if 0 // hack
const auto adjusted_step = make_dynamic_tensor_coordinate_step(
src_desc, adjusted_step_idx);
-#elif 1
#elif 0
// for padded input tensor
const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
src_desc, adjusted_step_idx, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2>{});
...
@@ -87,7 +87,7 @@
// thread-invariant, otherwise it's a bug
// TODO: separate index calculation into "compile-time", "global", "block", "wave", "thread"
#ifndef CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
-#define CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 1
#define CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 0
#endif
// workaround: put all workaround here
...
@@ -182,11 +182,28 @@ struct vector_type<float, 1>
{
using MemoryType = float;
float data_;
__host__ __device__ static constexpr index_t Size() { return 1; }
__host__ __device__ constexpr const auto& Vector() const { return data_; }
__host__ __device__ constexpr auto& Vector() { return data_; }
template <index_t I>
-__host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)
__host__ __device__ constexpr const auto& operator[](Number<I>) const
{
-static_assert(I < 1, "wrong");
static_assert(I == 0, "wrong!");
-*(reinterpret_cast<float*>(&v) + I) = s;
return data_;
}
template <index_t I>
__host__ __device__ constexpr auto& operator()(Number<I>)
{
static_assert(I == 0, "wrong!");
return data_;
}
};
@@ -222,13 +239,62 @@ struct vector_type<float, 4>
{
using MemoryType = float4_t;
-__host__ __device__ static constexpr index_t GetSize() { return 4; }
union
{
float4_t v;
// anonymous struct: s0..s3 name four consecutive floats overlaying v
struct
{
float s0, s1, s2, s3;
};
} data_;
__host__ __device__ static constexpr index_t Size() { return 4; }
__host__ __device__ constexpr const auto& Vector() const { return data_.v; }
__host__ __device__ constexpr auto& Vector() { return data_.v; }
template <index_t I>
-__host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)
__host__ __device__ constexpr const auto& operator[](Number<I>) const
{
-static_assert(I < 4, "wrong");
static_assert(I >= 0 && I < 4, "wrong!");
-*(reinterpret_cast<float*>(&v) + I) = s;
if constexpr(I == 0)
{
return data_.s0;
}
else if constexpr(I == 1)
{
return data_.s1;
}
else if constexpr(I == 2)
{
return data_.s2;
}
else
{
return data_.s3;
}
}
template <index_t I>
__host__ __device__ constexpr auto& operator()(Number<I>)
{
static_assert(I >= 0 && I < 4, "wrong!");
if constexpr(I == 0)
{
return data_.s0;
}
else if constexpr(I == 1)
{
return data_.s1;
}
else if constexpr(I == 2)
{
return data_.s2;
}
else
{
return data_.s3;
}
}
};
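// Usage sketch (illustration only):
//   vector_type<float, 4> v;
//   v.Vector() = ...;          // whole-vector access, e.g. the result of amd_buffer_load<float, 4>
//   float x = v[Number<2>{}];  // scalar read
//   v(Number<0>{}) = 0.0f;     // scalar write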
...
@@ -929,7 +929,7 @@ void device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = 2;
constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 1;
-#elif 1
#elif 0
// cdata = 64, BlockSize = 64, 64x64x3
constexpr index_t BlockSize = 64;
...
@@ -201,7 +201,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
constexpr auto conv_driver =
-#if 1
#if 0
DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
#else
DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
...
@@ -22,7 +22,7 @@ int main(int argc, char* argv[])
{
using namespace ck;
-#if 0
#if 1
// 3x3, 35x35, stride 2
constexpr index_t N = 128;
constexpr index_t C = 192;
...