Commit b8d64385 authored by Chao Liu

N-D tensor copy for threadwise copy v1r3

parent 39821a90
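Note on the change: the rewrite below replaces the hard-coded 4D copy loop in ThreadwiseDynamicTensorSliceTransfer_v1r3::Run() with an N-dimensional traversal. Access lengths come from SliceLengths divided by the per-dimension vector width, dimensions are visited in DimAccessOrder, and each dimension alternates its sweep direction (snake/zig-zag order), so every access is reachable from the previous one by a single forward or backward iterator step. Below is a minimal standalone sketch of that sweep rule in plain C++ (std types and made-up names only, not the CK implementation):

// Illustrative sketch only (not CK code): snake/zig-zag traversal over an N-D
// access grid. Consecutive accesses differ by +/-1 in exactly one dimension,
// so the tensor coordinate can be moved with a single precomputed iterator step.
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <vector>

int main()
{
    const std::vector<int> lengths = {2, 3, 4}; // example ordered access lengths
    const int ndim = static_cast<int>(lengths.size());

    std::vector<int> idx(ndim, 0); // "ordered" loop index, row-major odometer
    std::vector<int> prev;         // previous data index, used to check the property
    bool done = false;

    while(!done)
    {
        std::vector<int> data(ndim);

        for(int i = 0; i < ndim; ++i)
        {
            // sweep dimension i forward iff the rank of the prefix idx[0..i-1]
            // is even; dimension 0 is always swept forward
            int rank = idx[0];
            for(int j = 1; j < i; ++j)
                rank = rank * lengths[j] + idx[j];
            const bool forward = (i == 0) || (rank % 2 == 0);

            // mirror the index on backward-swept dimensions
            data[i] = forward ? idx[i] : lengths[i] - 1 - idx[i];
        }

        // check: exactly one dimension changed, and only by +/-1
        if(!prev.empty())
        {
            int changed = 0;
            for(int i = 0; i < ndim; ++i)
                changed += std::abs(data[i] - prev[i]);
            assert(changed == 1);
        }
        prev = data;

        printf("access:");
        for(int i = 0; i < ndim; ++i)
            printf(" %d", data[i]);
        printf("\n");

        // odometer-style increment of the ordered index
        int d = ndim - 1;
        while(d >= 0 && ++idx[d] == lengths[d])
        {
            idx[d] = 0;
            --d;
        }
        done = (d < 0);
    }
    return 0;
}

With lengths {2, 3, 4} this prints 24 data indices, and the assert verifies the single-step property that lets Run() in the diff move the destination coordinate with one forward or backward iterator per access.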
@@ -388,6 +388,11 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
         const index_t InRightPadH = in_right_pads[I0];
         const index_t InRightPadW = in_right_pads[I1];

+        if(!(InLeftPadH == 0 && InLeftPadW == 0 && InRightPadH == 0 && InRightPadW == 0))
+        {
+            throw std::runtime_error("wrong! 1x1, stride 1, no padding");
+        }
+
         // weight tensor
 #if 0
         // TODO implement graph optimization of tensor descriptor transformation
...
@@ -269,7 +269,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
             make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}));

         // hack to control index calculation when iterating over b_k_n_global tensor
-#if 0
+#if 1
         // for padded input
         constexpr auto b_k_n_global_iterator_hacks =
             make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0>{},
...
@@ -7,9 +7,31 @@
 namespace ck {

+// TODO: How to fix this? It uses a struct instead of a lambda because a lambda
+// doesn't have a constructor
+template <index_t DstVectorDim, index_t DstScalarPerVector>
+struct lambda_ThreadwiseDynamicTensorSliceTransfer_v1r3_dst_scalar_per_access
+{
+    __host__ __device__ constexpr auto operator()(index_t i) const
+    {
+        return (i == DstVectorDim) ? DstScalarPerVector : 1;
+    }
+};
+
+template <index_t DstVectorDim>
+struct lambda_ThreadwiseDynamicTensorSliceTransfer_v1r3_dst_scalar_step_in_vector
+{
+    __host__ __device__ constexpr auto operator()(index_t i) const
+    {
+        return (i == DstVectorDim) ? 1 : 0;
+    }
+};
+
 // this version is less likely to have scratch memory issue, due to:
 // 1. It does not keep reference to tensor descriptor
 // 2. It does not construct new tensor coordinate for this->Run()
+// Assume src_slice_origin_idx is 0
+// TODO: support non-zero src_slice_origin_idx
 template <typename SrcData,
           typename DstData,
           typename SrcDesc,
@@ -57,153 +79,111 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
     {
         constexpr auto I0 = Number<0>{};
         constexpr auto I1 = Number<1>{};
-        constexpr auto I2 = Number<2>{};
-        constexpr auto I3 = Number<3>{};
-
-        // hardcoded for 4D
-        // TODO implemente N-D
-        static_assert(remove_reference_t<SrcDesc>::GetNumOfDimension() == 4,
-                      "wrong! hardcoded for 4D tensor");
-
-        constexpr auto dst_scalar_per_access = [&]() {
-            Index dst_scalar_per_access;
-
-            static_for<0, nDim, 1>{}([&](auto i) {
-                dst_scalar_per_access(i) = (i == DstVectorDim) ? DstScalarPerVector : 1;
-            });
-
-            return dst_scalar_per_access;
-        }();
-
-        constexpr auto dst_scalar_step_in_vector = [&]() {
-            Index dst_scalar_step_in_vector;
-
-            static_for<0, nDim, 1>{}(
-                [&](auto i) { dst_scalar_step_in_vector(i) = (i == DstVectorDim) ? 1 : 0; });
-
-            return dst_scalar_step_in_vector;
-        }();
-
-        constexpr auto access_lengths = [&]() {
-            Index access_lengths;
-
-            static_for<0, nDim, 1>{}(
-                [&](auto i) { access_lengths(i) = SliceLengths{}[i] / dst_scalar_per_access[i]; });
-
-            return access_lengths;
-        }();
-
-        const auto dst_forward_iterators =
-            make_tuple(make_dynamic_tensor_coordinate_iterator(dst_desc,
-                                                               make_multi_index(1, 0, 0, 0) *
-                                                                   dst_scalar_per_access,
-                                                               dst_iterator_hacks[I0][I0]),
-                       make_dynamic_tensor_coordinate_iterator(dst_desc,
-                                                               make_multi_index(0, 1, 0, 0) *
-                                                                   dst_scalar_per_access,
-                                                               dst_iterator_hacks[I0][I1]),
-                       make_dynamic_tensor_coordinate_iterator(dst_desc,
-                                                               make_multi_index(0, 0, 1, 0) *
-                                                                   dst_scalar_per_access,
-                                                               dst_iterator_hacks[I0][I2]),
-                       make_dynamic_tensor_coordinate_iterator(dst_desc,
-                                                               make_multi_index(0, 0, 0, 1) *
-                                                                   dst_scalar_per_access,
-                                                               dst_iterator_hacks[I0][I3]));
-
-        const auto dst_backward_iterators =
-            make_tuple(make_dynamic_tensor_coordinate_iterator(dst_desc,
-                                                               make_multi_index(-1, 0, 0, 0) *
-                                                                   dst_scalar_per_access,
-                                                               dst_iterator_hacks[I1][I0]),
-                       make_dynamic_tensor_coordinate_iterator(dst_desc,
-                                                               make_multi_index(0, -1, 0, 0) *
-                                                                   dst_scalar_per_access,
-                                                               dst_iterator_hacks[I1][I1]),
-                       make_dynamic_tensor_coordinate_iterator(dst_desc,
-                                                               make_multi_index(0, 0, -1, 0) *
-                                                                   dst_scalar_per_access,
-                                                               dst_iterator_hacks[I1][I2]),
-                       make_dynamic_tensor_coordinate_iterator(dst_desc,
-                                                               make_multi_index(0, 0, 0, -1) *
-                                                                   dst_scalar_per_access,
-                                                               dst_iterator_hacks[I1][I3]));
-
-        // loop over dim0
-        static_for<0,
-                   SliceLengths{}[DimAccessOrder{}[I0]],
-                   dst_scalar_per_access[DimAccessOrder{}[I0]]>{}([&](auto iter0) {
-            constexpr index_t i0 = iter0;
-
-            constexpr bool forward_dim1 =
-                (iter0 / dst_scalar_per_access[DimAccessOrder{}[I0]]) % 2 == 0;
-
-            // loop over dim1
-            static_for<0,
-                       SliceLengths{}[DimAccessOrder{}[I1]],
-                       dst_scalar_per_access[DimAccessOrder{}[I1]]>{}([&](auto iter1) {
-                constexpr index_t i1 =
-                    forward_dim1 ? iter1
-                                 : SliceLengths{}[DimAccessOrder{}[I1]] -
-                                       dst_scalar_per_access[DimAccessOrder{}[I1]] - iter1;
-
-                constexpr bool forward_dim2 =
-                    ((iter0 / dst_scalar_per_access[DimAccessOrder{}[I0]]) *
-                         access_lengths[DimAccessOrder{}[I1]] +
-                     (iter1 / dst_scalar_per_access[DimAccessOrder{}[I1]])) %
-                        2 ==
-                    0;
-
-                // loop over dim2
-                static_for<0,
-                           SliceLengths{}[DimAccessOrder{}[I2]],
-                           dst_scalar_per_access[DimAccessOrder{}[I2]]>{}([&](auto iter2) {
-                    constexpr index_t i2 =
-                        forward_dim2 ? iter2
-                                     : SliceLengths{}[DimAccessOrder{}[I2]] -
-                                           dst_scalar_per_access[DimAccessOrder{}[I2]] - iter2;
-
-                    constexpr bool forward_dim3 =
-                        (((iter0 / dst_scalar_per_access[DimAccessOrder{}[I0]]) *
-                              access_lengths[DimAccessOrder{}[I1]] +
-                          (iter1 / dst_scalar_per_access[DimAccessOrder{}[I1]])) *
-                             access_lengths[DimAccessOrder{}[I2]] +
-                         (iter2 / dst_scalar_per_access[DimAccessOrder{}[I2]])) %
-                            2 ==
-                        0;
-
-                    // loop over dim3
-                    static_for<0,
-                               SliceLengths{}[DimAccessOrder{}[I3]],
-                               dst_scalar_per_access[DimAccessOrder{}[I3]]>{}([&](auto iter3) {
-                        constexpr index_t i3 =
-                            forward_dim3 ? iter3
-                                         : SliceLengths{}[DimAccessOrder{}[I3]] -
-                                               dst_scalar_per_access[DimAccessOrder{}[I3]] - iter3;
-
-                        // do work
+        constexpr auto dst_scalar_per_access = generate_sequence(
+            lambda_ThreadwiseDynamicTensorSliceTransfer_v1r3_dst_scalar_per_access<
+                DstVectorDim,
+                DstScalarPerVector>{},
+            Number<nDim>{});
+
+        constexpr auto dst_scalar_step_in_vector = generate_sequence(
+            lambda_ThreadwiseDynamicTensorSliceTransfer_v1r3_dst_scalar_step_in_vector<
+                DstVectorDim>{},
+            Number<nDim>{});
+
+        constexpr auto access_lengths = SliceLengths{} / dst_scalar_per_access;
+
+        constexpr auto dim_access_order = DimAccessOrder{};
+
+        constexpr auto ordered_access_lengths =
+            container_reorder_given_new2old(access_lengths, dim_access_order);
+
+        // make forward iterators
+        const auto dst_forward_iterators = generate_tuple(
+            [&](auto i) {
+                Index forward_step;
+
+                static_for<0, nDim, 1>{}([&](auto j) {
+                    forward_step(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0;
+                });
+
+                return make_dynamic_tensor_coordinate_iterator(
+                    dst_desc, forward_step, dst_iterator_hacks[I0][i]);
+            },
+            Number<nDim>{});
+
+        // make backward iterators
+        const auto dst_backward_iterators = generate_tuple(
+            [&](auto i) {
+                Index backward_step;
+
+                static_for<0, nDim, 1>{}([&](auto j) {
+                    backward_step(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0;
+                });
+
+                return make_dynamic_tensor_coordinate_iterator(
+                    dst_desc, backward_step, dst_iterator_hacks[I1][i]);
+            },
+            Number<nDim>{});
+
+        // loop over tensor and copy
+        static_ford<decltype(ordered_access_lengths)>{}([&](auto ordered_access_idx) {
+            // judge move forward or move backward
+            constexpr auto forward_sweep = [&]() {
+                StaticallyIndexedArray<bool, nDim> forward_sweep;
+
+                forward_sweep(I0) = true;
+
+                static_for<1, nDim, 1>{}([&](auto i) {
+                    index_t tmp = ordered_access_idx[I0];
+
+                    static_for<0, i, 1>{}([&](auto j) {
+                        tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j];
+                    });
+
+                    forward_sweep(i) = tmp % 2 == 0;
+                });
+
+                return forward_sweep;
+            }();
+
+            // calculate dst data index
+            constexpr auto dst_data_idx = [&]() {
+                Index ordered_idx;
+
+                static_for<0, nDim, 1>{}([&](auto i) {
+                    ordered_idx(i) = forward_sweep[i]
+                                         ? ordered_access_idx[i]
+                                         : ordered_access_lengths[i] - 1 - ordered_access_idx[i];
+                });
+
+                auto dst_data_idx = container_reorder_given_old2new(ordered_idx, dim_access_order) *
+                                    dst_scalar_per_access;
+
+                return dst_data_idx;
+            }();
+
+            // copy data
             // hardcoding for buffer_store
             // TODO refactor transfer_data() to encapsulate this
             static_assert(SrcAddressSpace == AddressSpace::Vgpr &&
                               DstAddressSpace == AddressSpace::Global,
                           "wrong! hardcoded to use buffer_store");

-                        using DstVectorType =
-                            typename vector_type<DstData, DstScalarPerVector>::MemoryType;
-
             vector_type<DstData, DstScalarPerVector> dst_vector;

             // this is hardcoded for src that has compile-time tensor descriptor
             static_for<0, DstScalarPerVector, 1>{}([&](auto i) {
-                // hack: assume src_slice_origin_idx is 0
-                constexpr index_t src_offset = SrcDesc::CalculateOffset(
-                    container_reorder_given_old2new(make_multi_index(i0, i1, i2, i3),
-                                                    DimAccessOrder{}) +
-                    i * dst_scalar_step_in_vector);
+                // assume src_slice_origin_idx is 0
+                // TODO: support non-zero src_slice_origin_idx
+                constexpr index_t src_offset =
+                    SrcDesc::CalculateOffset(dst_data_idx + i * dst_scalar_step_in_vector);

                 dst_vector(i) = p_src[Number<src_offset>{}];
             });

-#if 1
             amd_buffer_store_v2<DstData, DstScalarPerVector>(
                 dst_vector.Vector(),
                 p_dst,
@@ -211,76 +191,52 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
                 coordinate_has_valid_offset_assuming_visible_index_is_valid(
                     dst_desc, dst_slice_origin_coord_),
                 dst_desc.GetElementSpaceSize());
-#else
-                        static_for<0, DstScalarPerVector, 1>{}([&](auto i) {
-                            amd_buffer_store_v2<DstData, 1>(
-                                dst_vector[i],
-                                p_dst,
-                                dst_slice_origin_coord_.GetOffset() + i.value,
-                                coordinate_has_valid_offset_assuming_visible_index_is_valid(
-                                    dst_desc, dst_slice_origin_coord_),
-                                dst_desc.GetElementSpaceSize());
-                        });
-#endif
-
-                        // move along dim3
-                        if constexpr(iter3 < SliceLengths{}[DimAccessOrder{}[I3]] -
-                                                 dst_scalar_per_access[DimAccessOrder{}[I3]])
-                        {
-                            if constexpr(forward_dim3)
-                            {
-                                move_dynamic_tensor_coordinate(
-                                    dst_desc,
-                                    dst_slice_origin_coord_,
-                                    dst_forward_iterators[DimAccessOrder{}[I3]]);
-                            }
-                            else
-                            {
-                                move_dynamic_tensor_coordinate(
-                                    dst_desc,
-                                    dst_slice_origin_coord_,
-                                    dst_backward_iterators[DimAccessOrder{}[I3]]);
-                            }
-                        }
-                    });
-
-                    // move along dim2
-                    if constexpr(iter2 < SliceLengths{}[DimAccessOrder{}[I2]] -
-                                             dst_scalar_per_access[DimAccessOrder{}[I2]])
-                    {
-                        if constexpr(forward_dim2)
-                        {
-                            move_dynamic_tensor_coordinate(
-                                dst_desc,
-                                dst_slice_origin_coord_,
-                                dst_forward_iterators[DimAccessOrder{}[I2]]);
-                        }
-                        else
-                        {
-                            move_dynamic_tensor_coordinate(
-                                dst_desc,
-                                dst_slice_origin_coord_,
-                                dst_backward_iterators[DimAccessOrder{}[I2]]);
-                        }
-                    }
-                });
-
-                // move along dim1
-                if constexpr(iter1 < SliceLengths{}[DimAccessOrder{}[I1]] -
-                                         dst_scalar_per_access[DimAccessOrder{}[I1]])
-                {
-                    if constexpr(forward_dim1)
-                    {
-                        move_dynamic_tensor_coordinate(dst_desc,
-                                                       dst_slice_origin_coord_,
-                                                       dst_forward_iterators[DimAccessOrder{}[I1]]);
-                    }
-                    else
-                    {
-                        move_dynamic_tensor_coordinate(
-                            dst_desc,
-                            dst_slice_origin_coord_,
-                            dst_backward_iterators[DimAccessOrder{}[I1]]);
-                    }
-                }
-            });
-
-            // move along dim0
-            if constexpr(iter0 < SliceLengths{}[DimAccessOrder{}[I0]] -
-                                     dst_scalar_per_access[DimAccessOrder{}[I0]])
-            {
-                move_dynamic_tensor_coordinate(
-                    dst_desc, dst_slice_origin_coord_, dst_forward_iterators[DimAccessOrder{}[I0]]);
-            }
+            constexpr auto move_on_dim = [&]() constexpr
+            {
+                StaticallyIndexedArray<bool, nDim> move_on_dim;
+
+                static_for<0, nDim, 1>{}([&](auto i) {
+                    move_on_dim(i) = ordered_access_idx[i] < ordered_access_lengths[i] - 1;
+
+                    static_for<i + 1, nDim, 1>{}([&](auto j) {
+                        move_on_dim(i) &= ordered_access_idx[j] == ordered_access_lengths[j] - 1;
+                    });
+                });
+
+                return move_on_dim;
+            }
+            ();
+
+            // move
+            static_for<0, nDim, 1>{}([&](auto i) {
+                if constexpr(move_on_dim[i])
+                {
+                    if constexpr(forward_sweep[i])
+                    {
+                        move_dynamic_tensor_coordinate(dst_desc,
+                                                       dst_slice_origin_coord_,
+                                                       dst_forward_iterators[dim_access_order[i]]);
+                    }
+                    else
+                    {
+                        move_dynamic_tensor_coordinate(dst_desc,
+                                                       dst_slice_origin_coord_,
+                                                       dst_backward_iterators[dim_access_order[i]]);
+                    }
+                }
+            });
         });

         // move dst coordinate back to slice origin (or not)
@@ -340,7 +296,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3

     private:
     DstCoord dst_slice_origin_coord_;
-};
+}; // namespace ck

 // this version does following things to avoid "alloca" in LLVM-IR, which would cause scratch memory
 // and sometimes useless instructions
...
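Aside on the helper structs added at the top of this file: generate_sequence needs to re-instantiate the functor from its type alone, and a closure type is not default-constructible before C++20, which is why the named lambda_ThreadwiseDynamicTensorSliceTransfer_v1r3_* structs are used instead of actual lambdas. A rough standalone analogue of that pattern, built on std::index_sequence rather than CK's Sequence/Number (names and signatures here are illustrative, not CK's):

// Illustrative only: generate a compile-time sequence from a default-constructible
// function object, mirroring the dst_scalar_per_access computation in the diff.
#include <cstddef>
#include <type_traits>
#include <utility>

template <std::size_t VectorDim, std::size_t ScalarPerVector>
struct scalar_per_access_func // counterpart of lambda_..._dst_scalar_per_access
{
    constexpr std::size_t operator()(std::size_t i) const
    {
        return (i == VectorDim) ? ScalarPerVector : 1;
    }
};

// build std::index_sequence<F(0), F(1), ..., F(N-1)> from a default-constructible F;
// a closure could not be used for F here, because F{} would not compile pre-C++20
template <typename F, std::size_t... Is>
constexpr auto generate_sequence_impl(std::index_sequence<Is...>)
{
    return std::index_sequence<F{}(Is)...>{};
}

template <typename F, std::size_t N>
constexpr auto generate_sequence(F, std::integral_constant<std::size_t, N>)
{
    return generate_sequence_impl<F>(std::make_index_sequence<N>{});
}

int main()
{
    // 4 dimensions, vector dimension 3, 4 scalars per vector -> {1, 1, 1, 4}
    constexpr auto seq =
        generate_sequence(scalar_per_access_func<3, 4>{}, std::integral_constant<std::size_t, 4>{});

    static_assert(std::is_same<decltype(seq), const std::index_sequence<1, 1, 1, 4>>::value, "");

    (void)seq;
    return 0;
}

Replacing scalar_per_access_func with a lambda fails at F{} because the closure has no usable default constructor, which is the constraint the "TODO: How to fix this?" comment in the diff refers to.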
@@ -73,6 +73,30 @@ __host__ __device__ constexpr auto container_reorder_given_old2new(const Tuple<T
         old_tuple, typename sequence_map_inverse<decltype(old2new)>::type{});
 }

+template <index_t... Is, index_t... IRs>
+__host__ __device__ constexpr auto container_reorder_given_new2old(Sequence<Is...> /* old_seq */,
+                                                                   Sequence<IRs...> /*new2old*/)
+{
+    static_assert(sizeof...(Is) == sizeof...(IRs), "wrong! size not consistent");
+
+    static_assert(is_valid_sequence_map<Sequence<IRs...>>{}, "wrong! invalid reorder map");
+
+    return Sequence<Sequence<Is...>::At(Number<IRs>{})...>{};
+}
+template <index_t... Is, index_t... IRs>
+__host__ __device__ constexpr auto container_reorder_given_old2new(Sequence<Is...> old_seq,
+                                                                   Sequence<IRs...> /* old2new */)
+{
+    static_assert(sizeof...(Is) == sizeof...(IRs), "wrong! size not consistent");
+
+    static_assert(is_valid_sequence_map<Sequence<IRs...>>{}, "wrong! invalid reorder map");
+
+    constexpr auto new2old = typename sequence_map_inverse<Sequence<IRs...>>::type{};
+
+    return container_reorder_given_new2old(old_seq, new2old);
+}
+
 template <typename TData, typename Container, typename Reduce>
 __host__ __device__ constexpr TData container_reduce(const Container& a, Reduce f, TData init)
 {
...
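To make the semantics of the two new Sequence overloads concrete: container_reorder_given_new2old picks element new2old[i] of the old sequence for position i, and container_reorder_given_old2new first inverts the map and then does the same. A small runtime sketch with std::array standing in for ck::Sequence (illustrative names only, not CK code):

// Illustrative only: reorder semantics of the overloads above, on std::array.
// result_new2old[i] = old[new2old[i]]; the old2new variant first inverts the map.
#include <array>
#include <cassert>
#include <cstddef>

template <std::size_t N>
std::array<int, N> reorder_given_new2old(const std::array<int, N>& old_vals,
                                         const std::array<std::size_t, N>& new2old)
{
    std::array<int, N> result{};
    for(std::size_t i = 0; i < N; ++i)
        result[i] = old_vals[new2old[i]];
    return result;
}

template <std::size_t N>
std::array<int, N> reorder_given_old2new(const std::array<int, N>& old_vals,
                                         const std::array<std::size_t, N>& old2new)
{
    // invert the map: old element i ends up at new position old2new[i]
    std::array<std::size_t, N> new2old{};
    for(std::size_t i = 0; i < N; ++i)
        new2old[old2new[i]] = i;

    return reorder_given_new2old(old_vals, new2old);
}

int main()
{
    const std::array<int, 4> old_vals = {10, 20, 30, 40};

    // new2old = {2, 0, 3, 1}: result[i] = old[new2old[i]] -> {30, 10, 40, 20}
    const auto a = reorder_given_new2old<4>(old_vals, {2, 0, 3, 1});
    assert(a[0] == 30 && a[1] == 10 && a[2] == 40 && a[3] == 20);

    // old2new = {1, 3, 0, 2}: old element i moves to new position old2new[i]
    const auto b = reorder_given_old2new<4>(old_vals, {1, 3, 0, 2});
    assert(b == a);

    return 0;
}

Both calls produce {30, 10, 40, 20}, matching Sequence<Sequence<Is...>::At(Number<IRs>{})...> in the new2old overload above.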
@@ -144,7 +144,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
     constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1;
     constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN = 1;

-    constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
+    constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 4;
 #elif 0
     // cdata = 64, BlockSize = 256, 128x128x8
     // b thread copy 2x2
@@ -233,7 +233,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
     printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);

     constexpr auto conv_driver =
-#if 0
+#if 1
         DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
 #elif 0
         DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
...
@@ -52,7 +52,7 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<0, 0>;
     using RightPads = Sequence<0, 0>;

-#elif 0
+#elif 1
     // 3x3, 71x71
     constexpr index_t N = 128;
     constexpr index_t C = 192;
@@ -592,7 +592,7 @@ int main(int argc, char* argv[])
         LeftPads{},
         RightPads{},
         nrepeat);
-#elif 1
+#elif 0
     device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc,
                                                                  in_nchw,
                                                                  wei_kcyx_desc,
...