Commit 97ba755f authored by Chao Liu's avatar Chao Liu
Browse files

refactor

parent 8d460740
...@@ -38,7 +38,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, ...@@ -38,7 +38,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
constexpr index_t X = wei_kcyx_desc.GetLength(I3); constexpr index_t X = wei_kcyx_desc.GetLength(I3);
// reorder weight // reorder weight
auto wei_cyxk_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<C, Y, X, K>{}); auto wei_cyxk_desc = make_ConstantTensorDescriptor_packed(Sequence<C, Y, X, K>{});
ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: "); ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc)); Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
...@@ -51,7 +51,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, ...@@ -51,7 +51,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
std::thread::hardware_concurrency()); std::thread::hardware_concurrency());
// reorder input // reorder input
auto in_chwn_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<C, Hi, Wi, N>{}); auto in_chwn_desc = make_ConstantTensorDescriptor_packed(Sequence<C, Hi, Wi, N>{});
ostream_ConstantTensorDescriptor(in_chwn_desc, std::cout << "in_chwn_desc: "); ostream_ConstantTensorDescriptor(in_chwn_desc, std::cout << "in_chwn_desc: ");
Tensor<T> in_chwn(make_TensorDescriptor(in_chwn_desc)); Tensor<T> in_chwn(make_TensorDescriptor(in_chwn_desc));
...@@ -64,8 +64,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, ...@@ -64,8 +64,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
std::thread::hardware_concurrency()); std::thread::hardware_concurrency());
// output // output
auto out_khwn_desc = auto out_khwn_desc = make_ConstantTensorDescriptor_packed(Sequence<K, Ho, Wo, N>{});
make_ConstantTensorDescriptor_default_rank_packed(Sequence<K, Ho, Wo, N>{});
ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: "); ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: ");
Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc)); Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));
......
...@@ -37,7 +37,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc, ...@@ -37,7 +37,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
constexpr index_t X = wei_kcyx_desc.GetLength(I3); constexpr index_t X = wei_kcyx_desc.GetLength(I3);
// reorder weight // reorder weight
auto wei_cyxk_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<C, Y, X, K>{}); auto wei_cyxk_desc = make_ConstantTensorDescriptor_packed(Sequence<C, Y, X, K>{});
ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: "); ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc)); Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
...@@ -50,8 +50,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc, ...@@ -50,8 +50,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
std::thread::hardware_concurrency()); std::thread::hardware_concurrency());
// output // output
auto out_khwn_desc = auto out_khwn_desc = make_ConstantTensorDescriptor_packed(Sequence<K, Ho, Wo, N>{});
make_ConstantTensorDescriptor_default_rank_packed(Sequence<K, Ho, Wo, N>{});
ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: "); ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: ");
Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc)); Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));
......
...@@ -36,7 +36,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc, ...@@ -36,7 +36,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
constexpr index_t X = wei_kcyx_desc.GetLength(I3); constexpr index_t X = wei_kcyx_desc.GetLength(I3);
// reorder weight // reorder weight
auto wei_cyxk_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<C, Y, X, K>{}); auto wei_cyxk_desc = make_ConstantTensorDescriptor_packed(Sequence<C, Y, X, K>{});
ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: "); ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc)); Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
......
...@@ -36,7 +36,7 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc, ...@@ -36,7 +36,7 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
constexpr index_t X = wei_kcyx_desc.GetLength(I3); constexpr index_t X = wei_kcyx_desc.GetLength(I3);
// reorder weight // reorder weight
auto wei_cyxk_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<C, Y, X, K>{}); auto wei_cyxk_desc = make_ConstantTensorDescriptor_packed(Sequence<C, Y, X, K>{});
ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: "); ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc)); Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
......
...@@ -443,7 +443,7 @@ int main(int argc, char* argv[]) ...@@ -443,7 +443,7 @@ int main(int argc, char* argv[])
constexpr index_t HPad = 0; constexpr index_t HPad = 0;
constexpr index_t WPad = 0; constexpr index_t WPad = 0;
#elif 0 #elif 1
// 3x3 filter, 28x28 image // 3x3 filter, 28x28 image
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 256; constexpr index_t C = 256;
...@@ -455,7 +455,7 @@ int main(int argc, char* argv[]) ...@@ -455,7 +455,7 @@ int main(int argc, char* argv[])
constexpr index_t HPad = 0; constexpr index_t HPad = 0;
constexpr index_t WPad = 0; constexpr index_t WPad = 0;
#elif 1 #elif 0
// 1x1 filter, 28x28 image // 1x1 filter, 28x28 image
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 512; constexpr index_t C = 512;
...@@ -568,8 +568,8 @@ int main(int argc, char* argv[]) ...@@ -568,8 +568,8 @@ int main(int argc, char* argv[])
auto lower_pads = Sequence<HPad, WPad>{}; auto lower_pads = Sequence<HPad, WPad>{};
auto upper_pads = Sequence<HPad, WPad>{}; auto upper_pads = Sequence<HPad, WPad>{};
auto in_nchw_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<N, C, HI, WI>{}); auto in_nchw_desc = make_ConstantTensorDescriptor_packed(Sequence<N, C, HI, WI>{});
auto wei_kcyx_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<K, C, Y, X>{}); auto wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence<K, C, Y, X>{});
auto out_nkhw_desc = get_convolution_with_padding_output_default_4d_tensor_descriptor( auto out_nkhw_desc = get_convolution_with_padding_output_default_4d_tensor_descriptor(
in_nchw_desc, wei_kcyx_desc, lower_pads, upper_pads); in_nchw_desc, wei_kcyx_desc, lower_pads, upper_pads);
......
...@@ -114,7 +114,7 @@ struct ConstantMergedTensorDescriptor ...@@ -114,7 +114,7 @@ struct ConstantMergedTensorDescriptor
__host__ __device__ static Array<index_t, nDim> GetMultiIndexFrom1dIndex(index_t id) __host__ __device__ static Array<index_t, nDim> GetMultiIndexFrom1dIndex(index_t id)
{ {
constexpr auto dummy_desc = make_ConstantTensorDescriptor_default_rank_packed(GetLengths()); constexpr auto dummy_desc = make_ConstantTensorDescriptor_packed(GetLengths());
return dummy_desc.GetMultiIndexFrom1dIndex(id); return dummy_desc.GetMultiIndexFrom1dIndex(id);
} }
...@@ -128,7 +128,7 @@ __host__ __device__ constexpr auto make_ConstantMergedTensorDescriptor(OriginalT ...@@ -128,7 +128,7 @@ __host__ __device__ constexpr auto make_ConstantMergedTensorDescriptor(OriginalT
} }
template <class TDesc> template <class TDesc>
__host__ __device__ void print_ConstantMergedTensorDescriptor(TDesc, const char* s) __host__ __device__ void print_ConstantMergedTensorDescriptor(const char* s, TDesc)
{ {
print_ConstantTensorDescriptor(TDesc::GetOriginalTensorDescriptor(), s); print_ConstantTensorDescriptor(s, TDesc::GetOriginalTensorDescriptor());
} }
...@@ -2,25 +2,23 @@ ...@@ -2,25 +2,23 @@
#include "common.hip.hpp" #include "common.hip.hpp"
template <class Lengths> template <class Lengths>
__host__ __device__ constexpr auto calculate_tensor_strides_default_rank_packed(Lengths) __host__ __device__ constexpr auto calculate_tensor_strides_packed(Lengths)
{ {
return reverse_inclusive_scan_sequence(Lengths{}.PopFront(), mod_conv::multiplies<index_t>{}) return reverse_inclusive_scan_sequence(Lengths{}.PopFront(), mod_conv::multiplies<index_t>{})
.PushBack(Number<1>{}); .PushBack(Number<1>{});
} }
template <class Lengths, index_t Align> template <class Lengths, index_t Align>
__host__ __device__ constexpr auto calculate_tensor_strides_default_rank_aligned(Lengths, __host__ __device__ constexpr auto calculate_tensor_strides_aligned(Lengths, Number<Align>)
Number<Align>)
{ {
constexpr index_t L_back_align = constexpr index_t L_back_align =
Align * mod_conv::integer_divide_ceiler<index_t>{}(Lengths{}.Back(), Align); Align * mod_conv::integer_divide_ceiler<index_t>{}(Lengths{}.Back(), Align);
return calculate_tensor_strides_default_rank_packed( return calculate_tensor_strides_packed(
Lengths{}.Modify(Number<Lengths{}.GetSize() - 1>{}, Number<L_back_align>{})); Lengths{}.Modify(Number<Lengths{}.GetSize() - 1>{}, Number<L_back_align>{}));
} }
// MemoryRanks of dimensions is for conversion from offset to multi-index template <class Lengths, class Strides>
template <class Lengths, class Strides, class MemoryRanks>
struct ConstantTensorDescriptor struct ConstantTensorDescriptor
{ {
using Type = ConstantTensorDescriptor; using Type = ConstantTensorDescriptor;
...@@ -29,15 +27,7 @@ struct ConstantTensorDescriptor ...@@ -29,15 +27,7 @@ struct ConstantTensorDescriptor
__host__ __device__ constexpr ConstantTensorDescriptor() __host__ __device__ constexpr ConstantTensorDescriptor()
{ {
static_assert(Lengths::GetSize() == Strides::GetSize() && static_assert(Lengths::GetSize() == Strides::GetSize(), "nDim not consistent");
Lengths::GetSize() == MemoryRanks::GetSize(),
"nDim not consistent");
#if 0 // require sequence_sort, but it's not implemented yet
static_assert(is_same<typename sequence_sort<MemoryRanks>::SortedSeqType,
typename arithmetic_sequence_gen<0, nDim, 1>::SeqType>::value,
"wrong! invalid MemoryRanks");
#endif
} }
__host__ __device__ static constexpr auto GetOriginalTensorDescriptor() { return Type{}; } __host__ __device__ static constexpr auto GetOriginalTensorDescriptor() { return Type{}; }
...@@ -54,8 +44,6 @@ struct ConstantTensorDescriptor ...@@ -54,8 +44,6 @@ struct ConstantTensorDescriptor
__host__ __device__ static constexpr auto GetStrides() { return Strides{}; } __host__ __device__ static constexpr auto GetStrides() { return Strides{}; }
__host__ __device__ static constexpr auto GetMemoryRanks() { return MemoryRanks{}; }
template <index_t I> template <index_t I>
__host__ __device__ static constexpr index_t GetLength(Number<I>) __host__ __device__ static constexpr index_t GetLength(Number<I>)
{ {
...@@ -68,12 +56,6 @@ struct ConstantTensorDescriptor ...@@ -68,12 +56,6 @@ struct ConstantTensorDescriptor
return Strides{}.Get(Number<I>{}); return Strides{}.Get(Number<I>{});
} }
template <index_t I>
__host__ __device__ static constexpr index_t GetMemoryRank(Number<I>)
{
return MemoryRanks{}.Get(Number<I>{});
}
__host__ __device__ static constexpr bool AreStridesNonAscending() __host__ __device__ static constexpr bool AreStridesNonAscending()
{ {
bool flag = true; bool flag = true;
...@@ -98,20 +80,13 @@ struct ConstantTensorDescriptor ...@@ -98,20 +80,13 @@ struct ConstantTensorDescriptor
return accumulate_on_sequence(Lengths{}, mod_conv::multiplies<index_t>{}, Number<1>{}); return accumulate_on_sequence(Lengths{}, mod_conv::multiplies<index_t>{}, Number<1>{});
} }
// WRONG! ReorderGivenOld2New is broken
template <class Align = Number<1>> template <class Align = Number<1>>
__host__ __device__ static constexpr index_t GetElementSpace(Align align = Align{}) __host__ __device__ static constexpr index_t GetElementSpace(Align align = Align{})
{ {
#if 0 // This is WRONG! align shouldbe applied to the last memory rank, not the last tensor
constexpr auto lengths_in_rank = GetLengths().ReorderGivenOld2New(MemoryRank{}); // dimension
constexpr auto strides_in_rank = GetStrides().ReorderGivenOld2new(MemoryRank{});
constexpr index_t element_space_unaligned = accumulate_on_sequence(
(lengths_in_rank - Number<1>{}) * strides_in_rank, mod_conv::plus<index_t>{}, Number<1>{});
#else // WRONG! align shouldbe applied to the last memory rank, not the last tensor dimension
constexpr index_t element_space_unaligned = accumulate_on_sequence( constexpr index_t element_space_unaligned = accumulate_on_sequence(
(GetLengths() - Number<1>{}) * GetStrides(), mod_conv::plus<index_t>{}, Number<1>{}); (GetLengths() - Number<1>{}) * GetStrides(), mod_conv::plus<index_t>{}, Number<1>{});
#endif
return align.Get() * ((element_space_unaligned + align.Get() - 1) / align.Get()); return align.Get() * ((element_space_unaligned + align.Get() - 1) / align.Get());
} }
...@@ -148,35 +123,13 @@ struct ConstantTensorDescriptor ...@@ -148,35 +123,13 @@ struct ConstantTensorDescriptor
multi_id * GetStrides(), mod_conv::plus<index_t>{}, Number<0>{}); multi_id * GetStrides(), mod_conv::plus<index_t>{}, Number<0>{});
} }
#if 0 // ReorderGivenOld2new is broken
__host__ __device__ static Array<index_t, nDim> GetMultiIndexFromOffset(index_t offset)
{
Array<index_t, nDim> ranked_multi_id;
constexpr auto ranked_strides =
GetStrides().ReorderGivenOld2new(MemoryRanks{}); // check this
// calculate index in each of the dimensions in the order of their rank (not dimension)
static_for<0, nDim - 1, 1>{}([&](auto IDim) {
constexpr index_t idim = IDim.Get();
constexpr index_t stride = ranked_strides.Get(Number<idim>{});
ranked_multi_id[idim] = offset / stride;
offset -= ranked_multi_id[idim] * stride;
});
ranked_multi_id[nDim - 1] = offset / ranked_strides.Get(Number<nDim - 1>{});
return reorder_array_given_new2old(ranked_multi_id, MemoryRanks{}); // check this
}
#endif
__host__ __device__ static Array<index_t, nDim> GetMultiIndexFrom1dIndex(index_t id) __host__ __device__ static Array<index_t, nDim> GetMultiIndexFrom1dIndex(index_t id)
{ {
Array<index_t, nDim> multi_id; Array<index_t, nDim> multi_id;
constexpr auto dummy_strides = calculate_tensor_strides_default_rank_packed(GetLengths()); constexpr auto dummy_strides = calculate_tensor_strides_packed(GetLengths());
// calculate index in each of the dimensions in the order of their dimension (not rank) // calculate index in each of the dimensions in the order of their dimension
static_for<0, nDim - 1, 1>{}([&](auto IDim) { static_for<0, nDim - 1, 1>{}([&](auto IDim) {
constexpr index_t idim = IDim.Get(); constexpr index_t idim = IDim.Get();
constexpr index_t stride = dummy_strides.Get(Number<idim>{}); constexpr index_t stride = dummy_strides.Get(Number<idim>{});
...@@ -267,24 +220,16 @@ struct ConstantTensorDescriptor ...@@ -267,24 +220,16 @@ struct ConstantTensorDescriptor
return new_multi_id; return new_multi_id;
} }
// WRONG! Ranks is broken
template <index_t... IDims> template <index_t... IDims>
__host__ __device__ static constexpr auto Extract(Number<IDims>... extract_dims) __host__ __device__ static constexpr auto Extract(Number<IDims>... extract_dims)
{ {
static_assert(sizeof...(IDims) <= GetNumOfDimension(), static_assert(sizeof...(IDims) <= GetNumOfDimension(),
"wrong! too many number of dimensions to be extracted"); "wrong! too many number of dimensions to be extracted");
using extract_lengths = decltype(Lengths{}.Extract(extract_dims...)); using extract_lengths = decltype(Lengths::Extract(extract_dims...));
using extract_strides = decltype(Strides{}.Extract(extract_dims...)); using extract_strides = decltype(Strides::Extract(extract_dims...));
using extract_ranks = decltype(MemoryRanks{}.Extract(extract_dims...));
#if 0
using new_ranks = typename sequence_sort<extract_ranks>::Original2SortedType;
#else // WRONG! TODO:: implement sequence_sort
using new_ranks = typename arithmetic_sequence_gen<0, sizeof...(IDims), 1>::SeqType;
#endif
return ConstantTensorDescriptor<extract_lengths, extract_strides, new_ranks>{}; return ConstantTensorDescriptor<extract_lengths, extract_strides>{};
} }
template <index_t... IDims> template <index_t... IDims>
...@@ -298,12 +243,8 @@ struct ConstantTensorDescriptor ...@@ -298,12 +243,8 @@ struct ConstantTensorDescriptor
{ {
using leaf_tensor = ConstantTensorDescriptor<Ts...>; using leaf_tensor = ConstantTensorDescriptor<Ts...>;
// memory rank is broken
// TODO: remove memory rank info from tensor descritpor
return ConstantTensorDescriptor<decltype(GetLengths().Append(leaf_tensor::GetLengths())), return ConstantTensorDescriptor<decltype(GetLengths().Append(leaf_tensor::GetLengths())),
decltype(GetStrides().Append(leaf_tensor::GetStrides())), decltype(GetStrides().Append(leaf_tensor::GetStrides()))>{};
decltype(GetMemoryRanks().Append(
leaf_tensor::GetMemoryRanks()))>{};
} }
template <index_t IDim, index_t SliceLen> template <index_t IDim, index_t SliceLen>
...@@ -311,18 +252,9 @@ struct ConstantTensorDescriptor ...@@ -311,18 +252,9 @@ struct ConstantTensorDescriptor
{ {
using slice_lengths = decltype(Lengths{}.Modify(Number<IDim>{}, Number<SliceLen>{})); using slice_lengths = decltype(Lengths{}.Modify(Number<IDim>{}, Number<SliceLen>{}));
return ConstantTensorDescriptor<slice_lengths, Strides, MemoryRanks>{}; return ConstantTensorDescriptor<slice_lengths, Strides>{};
} }
template <index_t Threashold, index_t Delta>
struct f_fold_impl
{
__host__ __device__ constexpr index_t operator()(index_t x) const
{
return x > Threashold ? x + Delta : x;
}
};
template <index_t IDim, index_t... FoldIntervals> template <index_t IDim, index_t... FoldIntervals>
__host__ __device__ static constexpr auto Fold(Number<IDim>, Number<FoldIntervals>...) __host__ __device__ static constexpr auto Fold(Number<IDim>, Number<FoldIntervals>...)
{ {
...@@ -333,7 +265,6 @@ struct ConstantTensorDescriptor ...@@ -333,7 +265,6 @@ struct ConstantTensorDescriptor
constexpr auto unfold_length = GetLength(Number<IDim>{}); constexpr auto unfold_length = GetLength(Number<IDim>{});
constexpr auto unfold_stride = GetStride(Number<IDim>{}); constexpr auto unfold_stride = GetStride(Number<IDim>{});
constexpr auto unfold_rank = GetMemoryRank(Number<IDim>{});
// length of the dimension to be folded needs to be dividable by fold_interval_product, // length of the dimension to be folded needs to be dividable by fold_interval_product,
// otherwise, folding is invalid // otherwise, folding is invalid
...@@ -350,16 +281,6 @@ struct ConstantTensorDescriptor ...@@ -350,16 +281,6 @@ struct ConstantTensorDescriptor
reverse_inclusive_scan_sequence(fold_intervals.PushBack(Number<1>{}), reverse_inclusive_scan_sequence(fold_intervals.PushBack(Number<1>{}),
mod_conv::multiplies<index_t>{}); mod_conv::multiplies<index_t>{});
// folded_ranks
constexpr auto fold_ranks =
typename arithmetic_sequence_gen<unfold_rank,
unfold_rank + fold_intervals.GetSize() + 1,
1>::SeqType{};
// increase the ranks that are larger than unfold_rank
constexpr auto tmp_ranks = transform_sequences(
f_fold_impl<unfold_rank, fold_intervals.GetSize()>{}, GetMemoryRanks());
// left and right // left and right
constexpr auto left = typename arithmetic_sequence_gen<0, IDim, 1>::SeqType{}; constexpr auto left = typename arithmetic_sequence_gen<0, IDim, 1>::SeqType{};
constexpr auto right = constexpr auto right =
...@@ -369,15 +290,8 @@ struct ConstantTensorDescriptor ...@@ -369,15 +290,8 @@ struct ConstantTensorDescriptor
GetLengths().Extract(left).Append(fold_lengths).Append(GetLengths().Extract(right)); GetLengths().Extract(left).Append(fold_lengths).Append(GetLengths().Extract(right));
constexpr auto new_strides = constexpr auto new_strides =
GetStrides().Extract(left).Append(fold_strides).Append(GetStrides().Extract(right)); GetStrides().Extract(left).Append(fold_strides).Append(GetStrides().Extract(right));
constexpr auto new_ranks =
tmp_ranks.Extract(left).Append(fold_ranks).Append(tmp_ranks.Extract(right));
static_assert(new_ranks.GetSize() == new_lengths.GetSize(), "wrong!");
static_assert(fold_ranks.GetSize() == fold_lengths.GetSize(), "wrong!");
return ConstantTensorDescriptor<decltype(new_lengths), return ConstantTensorDescriptor<decltype(new_lengths), decltype(new_strides)>{};
decltype(new_strides),
decltype(new_ranks)>{};
} }
template <index_t Threashold, index_t Delta> template <index_t Threashold, index_t Delta>
...@@ -411,11 +325,6 @@ struct ConstantTensorDescriptor ...@@ -411,11 +325,6 @@ struct ConstantTensorDescriptor
// check if packed // check if packed
static_assert(GetStride(IDim_p1) * GetLength(IDim_p1) == GetStride(IDim), static_assert(GetStride(IDim_p1) * GetLength(IDim_p1) == GetStride(IDim),
"wrong! dimensions to be unfolded need to be packed"); "wrong! dimensions to be unfolded need to be packed");
// check ranks
static_assert(GetMemoryRank(IDim_p1) == GetMemoryRank(IDim) + 1,
"wrong! ranks of dimensions to be unfolded need to be in increasing and "
"continuous ranks");
}); });
#endif #endif
...@@ -426,21 +335,13 @@ struct ConstantTensorDescriptor ...@@ -426,21 +335,13 @@ struct ConstantTensorDescriptor
constexpr auto right = constexpr auto right =
typename arithmetic_sequence_gen<LastUnfoldDim + 1, GetNumOfDimension(), 1>::SeqType{}; typename arithmetic_sequence_gen<LastUnfoldDim + 1, GetNumOfDimension(), 1>::SeqType{};
// unfolded length, stride and rank // unfolded length, stride
constexpr index_t unfold_length = accumulate_on_sequence( constexpr index_t unfold_length = accumulate_on_sequence(
GetLengths().Extract(middle), mod_conv::multiplies<index_t>{}, Number<1>{}); GetLengths().Extract(middle), mod_conv::multiplies<index_t>{}, Number<1>{});
constexpr index_t unfold_stride = GetStride(Number<LastUnfoldDim>{}); constexpr index_t unfold_stride = GetStride(Number<LastUnfoldDim>{});
constexpr index_t unfold_rank = GetMemoryRank(Number<FirstUnfoldDim>{}); // new lengths, strides
// decrease the ranks that are larger than the rank of LastUnfoldDim
constexpr auto tmp_ranks =
transform_sequences(f_unfold_impl<GetMemoryRank(Number<LastUnfoldDim>{}),
LastUnfoldDim - FirstUnfoldDim + 1>{},
GetMemoryRanks());
// new lengths, strides and ranks
constexpr auto new_lengths = GetLengths() constexpr auto new_lengths = GetLengths()
.Extract(left) .Extract(left)
.PushBack(Number<unfold_length>{}) .PushBack(Number<unfold_length>{})
...@@ -451,22 +352,14 @@ struct ConstantTensorDescriptor ...@@ -451,22 +352,14 @@ struct ConstantTensorDescriptor
.PushBack(Number<unfold_stride>{}) .PushBack(Number<unfold_stride>{})
.Append(GetStrides().Extract(right)); .Append(GetStrides().Extract(right));
constexpr auto new_ranks = tmp_ranks.Extract(left) return ConstantTensorDescriptor<decltype(new_lengths), decltype(new_strides)>{};
.PushBack(Number<unfold_rank>{})
.Append(tmp_ranks.Extract(right));
return ConstantTensorDescriptor<decltype(new_lengths),
decltype(new_strides),
decltype(new_ranks)>{};
} }
template <class MapNew2Old> template <class MapNew2Old>
__host__ __device__ static constexpr auto ReorderGivenNew2Old(MapNew2Old) __host__ __device__ static constexpr auto ReorderGivenNew2Old(MapNew2Old)
{ {
return ConstantTensorDescriptor<decltype(Lengths{}.ReorderGivenNew2Old(MapNew2Old{})), return ConstantTensorDescriptor<decltype(Lengths{}.ReorderGivenNew2Old(MapNew2Old{})),
decltype(Strides{}.ReorderGivenNew2Old(MapNew2Old{})), decltype(Strides{}.ReorderGivenNew2Old(MapNew2Old{}))>{};
decltype(
MemoryRanks{}.ReorderGivenNew2Old(MapNew2Old{}))>{};
} }
#if 0 // require sequence_sort, which is not implemented yet #if 0 // require sequence_sort, which is not implemented yet
...@@ -474,358 +367,108 @@ struct ConstantTensorDescriptor ...@@ -474,358 +367,108 @@ struct ConstantTensorDescriptor
__host__ __device__ static constexpr auto ReorderGivenOld2New(MapOld2New) __host__ __device__ static constexpr auto ReorderGivenOld2New(MapOld2New)
{ {
return ConstantTensorDescriptor<decltype(Lengths{}.ReorderGivenOld2New(MapOld2New{})), return ConstantTensorDescriptor<decltype(Lengths{}.ReorderGivenOld2New(MapOld2New{})),
decltype(Strides{}.ReorderGivenOld2New(MapOld2New{})), decltype(Strides{}.ReorderGivenOld2New(MapOld2New{}))>{}
decltype(
MemoryRanks{}.ReorderGivenOld2New(MapOld2New{}))>{};
} }
#endif #endif
}; };
template <class Lengths> template <class Lengths>
__host__ __device__ constexpr auto make_ConstantTensorDescriptor_default_rank_packed(Lengths) __host__ __device__ constexpr auto make_ConstantTensorDescriptor_packed(Lengths)
{ {
using Strides = decltype(calculate_tensor_strides_default_rank_packed(Lengths{})); using Strides = decltype(calculate_tensor_strides_packed(Lengths{}));
using MemoryRanks = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::SeqType; return ConstantTensorDescriptor<Lengths, Strides>{};
return ConstantTensorDescriptor<Lengths, Strides, MemoryRanks>{};
} }
template <class Lengths, class Strides> template <class Lengths, class Strides>
__host__ __device__ constexpr auto make_ConstantTensorDescriptor_default_rank(Lengths, Strides) __host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Strides)
{ {
using MemoryRanks = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::SeqType; return ConstantTensorDescriptor<Lengths, Strides>{};
return ConstantTensorDescriptor<Lengths, Strides, MemoryRanks>{};
} }
template <class Lengths, index_t Align> template <class Lengths, index_t Align>
__host__ __device__ constexpr auto make_ConstantTensorDescriptor_default_rank_aligned(Lengths, __host__ __device__ constexpr auto make_ConstantTensorDescriptor_aligned(Lengths, Number<Align>)
Number<Align>)
{ {
using Strides = using Strides = decltype(calculate_tensor_strides_aligned(Lengths{}, Number<Align>{}));
decltype(calculate_tensor_strides_default_rank_aligned(Lengths{}, Number<Align>{})); return ConstantTensorDescriptor<Lengths, Strides>{};
using MemoryRanks = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::SeqType;
return ConstantTensorDescriptor<Lengths, Strides, MemoryRanks>{};
} }
template <class TDesc> template <index_t... Lengths, index_t... Strides>
__host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s) __host__ __device__ void
print_ConstantTensorDescriptor(const char* s,
ConstantTensorDescriptor<Sequence<Lengths...>, Sequence<Strides...>>)
{ {
constexpr index_t ndim = TDesc::GetNumOfDimension(); constexpr index_t ndim = sizeof...(Lengths);
static_assert(ndim >= 1 && ndim <= 10, "wrong!");
static_if<ndim == 1>{}([&](auto fwd) {
constexpr auto I0 = Number<0>{};
constexpr auto desc = fwd(TDesc{}); static_assert(ndim > 0 && ndim <= 10, "wrong!");
printf("%s dim %u, lengths {%u}, strides {%u}, ranks {%u}\n", static_if<ndim == 1>{}([&](auto) {
s, printf("%s dim %u, lengths {%u}, strides {%u}\n", s, ndim, Lengths..., Strides...);
desc.GetNumOfDimension(),
desc.GetLength(I0),
desc.GetStride(I0),
desc.GetMemoryRank(I0));
}); });
static_if<ndim == 2>{}([&](auto fwd) { static_if<ndim == 2>{}([&](auto) {
constexpr auto I0 = Number<0>{}; printf("%s dim %u, lengths {%u %u}, strides {%u %u}\n", s, ndim, Lengths..., Strides...);
constexpr auto I1 = Number<1>{};
constexpr auto desc = fwd(TDesc{});
printf("%s dim %u, lengths {%u %u}, strides {%u %u}, ranks {%u %u}\n",
s,
desc.GetNumOfDimension(),
desc.GetLength(I0),
desc.GetLength(I1),
desc.GetStride(I0),
desc.GetStride(I1),
desc.GetMemoryRank(I0),
desc.GetMemoryRank(I1));
}); });
static_if<ndim == 3>{}([&](auto fwd) { static_if<ndim == 3>{}([&](auto) {
constexpr auto I0 = Number<0>{}; printf(
constexpr auto I1 = Number<1>{}; "%s dim %u, lengths {%u %u %u}, strides {%u %u %u}\n", s, ndim, Lengths..., Strides...);
constexpr auto I2 = Number<2>{};
constexpr auto desc = fwd(TDesc{});
printf("%s dim %u, lengths {%u %u %u}, strides {%u %u %u}, ranks {%u %u %u}\n",
s,
desc.GetNumOfDimension(),
desc.GetLength(I0),
desc.GetLength(I1),
desc.GetLength(I2),
desc.GetStride(I0),
desc.GetStride(I1),
desc.GetStride(I2),
desc.GetMemoryRank(I0),
desc.GetMemoryRank(I1),
desc.GetMemoryRank(I2));
}); });
static_if<ndim == 4>{}([&](auto fwd) { static_if<ndim == 4>{}([&](auto) {
constexpr auto I0 = Number<0>{}; printf("%s dim %u, lengths {%u %u %u %u}, strides {%u %u %u %u}\n",
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto desc = fwd(TDesc{});
printf("%s dim %u, lengths {%u %u %u %u}, strides {%u %u %u %u}, ranks {%u %u %u %u}\n",
s, s,
desc.GetNumOfDimension(), ndim,
desc.GetLength(I0), Lengths...,
desc.GetLength(I1), Strides...);
desc.GetLength(I2),
desc.GetLength(I3),
desc.GetStride(I0),
desc.GetStride(I1),
desc.GetStride(I2),
desc.GetStride(I3),
desc.GetMemoryRank(I0),
desc.GetMemoryRank(I1),
desc.GetMemoryRank(I2),
desc.GetMemoryRank(I3));
}); });
static_if<ndim == 5>{}([&](auto fwd) { static_if<ndim == 5>{}([&](auto) {
constexpr auto I0 = Number<0>{}; printf("%s dim %u, lengths {%u %u %u %u %u}, strides {%u %u %u %u %u}\n",
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
constexpr auto desc = fwd(TDesc{});
printf("%s dim %u, lengths {%u %u %u %u %u}, strides {%u %u %u %u %u}, ranks {%u %u %u %u "
"%u}\n",
s, s,
desc.GetNumOfDimension(), ndim,
desc.GetLength(I0), Lengths...,
desc.GetLength(I1), Strides...);
desc.GetLength(I2),
desc.GetLength(I3),
desc.GetLength(I4),
desc.GetStride(I0),
desc.GetStride(I1),
desc.GetStride(I2),
desc.GetStride(I3),
desc.GetStride(I4),
desc.GetMemoryRank(I0),
desc.GetMemoryRank(I1),
desc.GetMemoryRank(I2),
desc.GetMemoryRank(I3),
desc.GetMemoryRank(I4));
}); });
static_if<ndim == 6>{}([&](auto fwd) { static_if<ndim == 6>{}([&](auto) {
constexpr auto I0 = Number<0>{}; printf("%s dim %u, lengths {%u %u %u %u %u %u}, strides {%u %u %u %u %u %u}\n",
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
constexpr auto I5 = Number<5>{};
constexpr auto desc = fwd(TDesc{});
printf("%s dim %u, lengths {%u %u %u %u %u %u}, strides {%u %u %u %u %u %u}, ranks {%u %u "
"%u %u %u %u}\n",
s, s,
desc.GetNumOfDimension(), ndim,
desc.GetLength(I0), Lengths...,
desc.GetLength(I1), Strides...);
desc.GetLength(I2),
desc.GetLength(I3),
desc.GetLength(I4),
desc.GetLength(I5),
desc.GetStride(I0),
desc.GetStride(I1),
desc.GetStride(I2),
desc.GetStride(I3),
desc.GetStride(I4),
desc.GetStride(I5),
desc.GetMemoryRank(I0),
desc.GetMemoryRank(I1),
desc.GetMemoryRank(I2),
desc.GetMemoryRank(I3),
desc.GetMemoryRank(I4),
desc.GetMemoryRank(I5));
}); });
static_if<ndim == 7>{}([&](auto fwd) { static_if<ndim == 7>{}([&](auto) {
constexpr auto I0 = Number<0>{}; printf("%s dim %u, lengths {%u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u}\n",
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
constexpr auto I5 = Number<5>{};
constexpr auto I6 = Number<6>{};
constexpr auto desc = fwd(TDesc{});
printf("%s dim %u, lengths {%u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u}, ranks "
"{%u %u %u %u %u %u %u}\n",
s, s,
desc.GetNumOfDimension(), ndim,
desc.GetLength(I0), Lengths...,
desc.GetLength(I1), Strides...);
desc.GetLength(I2),
desc.GetLength(I3),
desc.GetLength(I4),
desc.GetLength(I5),
desc.GetLength(I6),
desc.GetStride(I0),
desc.GetStride(I1),
desc.GetStride(I2),
desc.GetStride(I3),
desc.GetStride(I4),
desc.GetStride(I5),
desc.GetStride(I6),
desc.GetMemoryRank(I0),
desc.GetMemoryRank(I1),
desc.GetMemoryRank(I2),
desc.GetMemoryRank(I3),
desc.GetMemoryRank(I4),
desc.GetMemoryRank(I5),
desc.GetMemoryRank(I6));
}); });
static_if<ndim == 8>{}([&](auto fwd) { static_if<ndim == 8>{}([&](auto) {
constexpr auto I0 = Number<0>{}; printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u %u}\n",
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
constexpr auto I5 = Number<5>{};
constexpr auto I6 = Number<6>{};
constexpr auto I7 = Number<7>{};
constexpr auto desc = fwd(TDesc{});
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u %u}, "
"ranks {%u %u %u %u %u %u %u %u}\n",
s, s,
desc.GetNumOfDimension(), ndim,
desc.GetLength(I0), Lengths...,
desc.GetLength(I1), Strides...);
desc.GetLength(I2),
desc.GetLength(I3),
desc.GetLength(I4),
desc.GetLength(I5),
desc.GetLength(I6),
desc.GetLength(I7),
desc.GetStride(I0),
desc.GetStride(I1),
desc.GetStride(I2),
desc.GetStride(I3),
desc.GetStride(I4),
desc.GetStride(I5),
desc.GetStride(I6),
desc.GetStride(I7),
desc.GetMemoryRank(I0),
desc.GetMemoryRank(I1),
desc.GetMemoryRank(I2),
desc.GetMemoryRank(I3),
desc.GetMemoryRank(I4),
desc.GetMemoryRank(I5),
desc.GetMemoryRank(I6),
desc.GetMemoryRank(I7));
}); });
static_if<ndim == 9>{}([&](auto fwd) { static_if<ndim == 9>{}([&](auto) {
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
constexpr auto I5 = Number<5>{};
constexpr auto I6 = Number<6>{};
constexpr auto I7 = Number<7>{};
constexpr auto I8 = Number<8>{};
constexpr auto desc = fwd(TDesc{});
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u %u " printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u %u "
"%u}, ranks {%u %u %u %u %u %u %u %u %u}\n", "%u}\n",
s, s,
desc.GetNumOfDimension(), ndim,
desc.GetLength(I0), Lengths...,
desc.GetLength(I1), Strides...);
desc.GetLength(I2),
desc.GetLength(I3),
desc.GetLength(I4),
desc.GetLength(I5),
desc.GetLength(I6),
desc.GetLength(I7),
desc.GetLength(I8),
desc.GetStride(I0),
desc.GetStride(I1),
desc.GetStride(I2),
desc.GetStride(I3),
desc.GetStride(I4),
desc.GetStride(I5),
desc.GetStride(I6),
desc.GetStride(I7),
desc.GetStride(I8),
desc.GetMemoryRank(I0),
desc.GetMemoryRank(I1),
desc.GetMemoryRank(I2),
desc.GetMemoryRank(I3),
desc.GetMemoryRank(I4),
desc.GetMemoryRank(I5),
desc.GetMemoryRank(I6),
desc.GetMemoryRank(I7),
desc.GetMemoryRank(I8));
}); });
static_if<ndim == 10>{}([&](auto fwd) { static_if<ndim == 10>{}([&](auto) {
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
constexpr auto I5 = Number<5>{};
constexpr auto I6 = Number<6>{};
constexpr auto I7 = Number<7>{};
constexpr auto I8 = Number<8>{};
constexpr auto I9 = Number<9>{};
constexpr auto desc = fwd(TDesc{});
printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u " printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u "
"%u %u %u}, ranks {%u %u %u %u %u %u %u %u %u %u}\n", "%u %u %u}\n",
s, s,
desc.GetNumOfDimension(), ndim,
desc.GetLength(I0), Lengths...,
desc.GetLength(I1), Strides...);
desc.GetLength(I2),
desc.GetLength(I3),
desc.GetLength(I4),
desc.GetLength(I5),
desc.GetLength(I6),
desc.GetLength(I7),
desc.GetLength(I8),
desc.GetLength(I9),
desc.GetStride(I0),
desc.GetStride(I1),
desc.GetStride(I2),
desc.GetStride(I3),
desc.GetStride(I4),
desc.GetStride(I5),
desc.GetStride(I6),
desc.GetStride(I7),
desc.GetStride(I8),
desc.GetStride(I9),
desc.GetMemoryRank(I0),
desc.GetMemoryRank(I1),
desc.GetMemoryRank(I2),
desc.GetMemoryRank(I3),
desc.GetMemoryRank(I4),
desc.GetMemoryRank(I5),
desc.GetMemoryRank(I6),
desc.GetMemoryRank(I7),
desc.GetMemoryRank(I8),
desc.GetMemoryRank(I9));
}); });
} }
...@@ -13,7 +13,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst ...@@ -13,7 +13,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
constexpr auto dst_desc = DstDesc{}; constexpr auto dst_desc = DstDesc{};
constexpr auto desc = make_ConstantTensorDescriptor_default_rank_packed(dst_desc.GetLengths()); constexpr auto desc = make_ConstantTensorDescriptor_packed(dst_desc.GetLengths());
#if 0 #if 0
if(get_thread_local_1d_id() == 0) if(get_thread_local_1d_id() == 0)
...@@ -108,7 +108,7 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds ...@@ -108,7 +108,7 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds
constexpr auto src_desc = SrcDesc{}; constexpr auto src_desc = SrcDesc{};
constexpr auto dst_desc = DstDesc{}; constexpr auto dst_desc = DstDesc{};
constexpr auto ref_desc = make_ConstantTensorDescriptor_default_rank_packed(SrcOpLengths{}); constexpr auto ref_desc = make_ConstantTensorDescriptor_packed(SrcOpLengths{});
constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize; constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize;
...@@ -259,7 +259,7 @@ struct Blockwise4dTensorCopy1 ...@@ -259,7 +259,7 @@ struct Blockwise4dTensorCopy1
constexpr index_t read_per_d3 = mod_conv::integer_divide_ceil(L3, DataPerRead); constexpr index_t read_per_d3 = mod_conv::integer_divide_ceil(L3, DataPerRead);
constexpr auto ref_desc = constexpr auto ref_desc =
make_ConstantTensorDescriptor_default_rank_packed(Sequence<L0, L1, L2, read_per_d3>{}); make_ConstantTensorDescriptor_packed(Sequence<L0, L1, L2, read_per_d3>{});
constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize; constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize;
...@@ -336,7 +336,7 @@ struct BlockwiseChwnTensorCopyPadded ...@@ -336,7 +336,7 @@ struct BlockwiseChwnTensorCopyPadded
constexpr auto src_desc = SrcDesc{}; constexpr auto src_desc = SrcDesc{};
constexpr auto dst_desc = DstDesc{}; constexpr auto dst_desc = DstDesc{};
constexpr auto ref_desc = make_ConstantTensorDescriptor_default_rank_packed(DstOpLengths{}); constexpr auto ref_desc = make_ConstantTensorDescriptor_packed(DstOpLengths{});
constexpr auto h_global_pad_low = GlobalLowerPads{}.Get(I0); constexpr auto h_global_pad_low = GlobalLowerPads{}.Get(I0);
constexpr auto w_global_pad_low = GlobalLowerPads{}.Get(I1); constexpr auto w_global_pad_low = GlobalLowerPads{}.Get(I1);
...@@ -510,8 +510,7 @@ struct Blockwise4dTensorCopy3 ...@@ -510,8 +510,7 @@ struct Blockwise4dTensorCopy3
} }
} }
constexpr auto thread_cluster_desc = constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(ThreadPerDims{});
make_ConstantTensorDescriptor_default_rank_packed(ThreadPerDims{});
const auto thread_multi_id = const auto thread_multi_id =
thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id()); thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
...@@ -653,7 +652,7 @@ struct Blockwise4dTensorCopy3 ...@@ -653,7 +652,7 @@ struct Blockwise4dTensorCopy3
constexpr index_t nloop_d2 = L2 / thread_per_d2; constexpr index_t nloop_d2 = L2 / thread_per_d2;
constexpr index_t nloop_d3 = mod_conv::integer_divide_ceil(L3, thread_per_d3 * DataPerRead); constexpr index_t nloop_d3 = mod_conv::integer_divide_ceil(L3, thread_per_d3 * DataPerRead);
constexpr auto clipboard_desc = make_ConstantTensorDescriptor_default_rank_packed( constexpr auto clipboard_desc = make_ConstantTensorDescriptor_packed(
Sequence<nloop_d0, nloop_d1, nloop_d2, nloop_d3 * DataPerRead>{}); Sequence<nloop_d0, nloop_d1, nloop_d2, nloop_d3 * DataPerRead>{});
#pragma unroll #pragma unroll
...@@ -720,7 +719,7 @@ struct Blockwise4dTensorCopy3 ...@@ -720,7 +719,7 @@ struct Blockwise4dTensorCopy3
constexpr index_t nloop_d2 = L2 / thread_per_d2; constexpr index_t nloop_d2 = L2 / thread_per_d2;
constexpr index_t nloop_d3 = mod_conv::integer_divide_ceil(L3, thread_per_d3 * DataPerRead); constexpr index_t nloop_d3 = mod_conv::integer_divide_ceil(L3, thread_per_d3 * DataPerRead);
constexpr auto clipboard_desc = make_ConstantTensorDescriptor_default_rank_packed( constexpr auto clipboard_desc = make_ConstantTensorDescriptor_packed(
Sequence<nloop_d0, nloop_d1, nloop_d2, nloop_d3 * DataPerRead>{}); Sequence<nloop_d0, nloop_d1, nloop_d2, nloop_d3 * DataPerRead>{});
#pragma unroll #pragma unroll
......
...@@ -63,7 +63,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 ...@@ -63,7 +63,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
"wrong!"); "wrong!");
// thread cluster // thread cluster
constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_default_rank_packed( constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
DataClusterLengths{}.ReorderGivenNew2Old(ThreadClusterArrangeOrder{})); DataClusterLengths{}.ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
// BlockSize // BlockSize
...@@ -185,7 +185,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 ...@@ -185,7 +185,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{}); constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});
constexpr auto thread_tensor_desc = constexpr auto thread_tensor_desc =
make_ConstantTensorDescriptor_default_rank_packed(SubLengths{} * repeat_lengths); make_ConstantTensorDescriptor_packed(SubLengths{} * repeat_lengths);
return thread_tensor_desc.GetElementSpace(); return thread_tensor_desc.GetElementSpace();
} }
...@@ -199,8 +199,8 @@ struct BlockwiseGenericTensorSliceCopy_v1 ...@@ -199,8 +199,8 @@ struct BlockwiseGenericTensorSliceCopy_v1
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{}); constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});
constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor_default_rank_packed( constexpr auto thread_tensor_desc =
thread_sub_tensor_lengths * repeat_lengths); make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths);
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) { static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){}); constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});
...@@ -237,8 +237,8 @@ struct BlockwiseGenericTensorSliceCopy_v1 ...@@ -237,8 +237,8 @@ struct BlockwiseGenericTensorSliceCopy_v1
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{}); constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});
constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor_default_rank_packed( constexpr auto thread_tensor_desc =
thread_sub_tensor_lengths * repeat_lengths); make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths);
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) { static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){}); constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});
......
...@@ -40,7 +40,7 @@ struct BlockwiseTensorSliceReorderCopy_v3 ...@@ -40,7 +40,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
src_cluster_lengths.ReorderGivenNew2Old(map_thread_cluster_2_src_cluster); src_cluster_lengths.ReorderGivenNew2Old(map_thread_cluster_2_src_cluster);
constexpr auto thread_cluster_desc = constexpr auto thread_cluster_desc =
make_ConstantTensorDescriptor_default_rank_packed(thread_cluster_lengths); make_ConstantTensorDescriptor_packed(thread_cluster_lengths);
// sanity check: data type // sanity check: data type
static_assert(is_same<Float, float>::value, "wrong! only support float for now!\n"); static_assert(is_same<Float, float>::value, "wrong! only support float for now!\n");
...@@ -175,7 +175,7 @@ struct BlockwiseTensorSliceReorderCopy_v3 ...@@ -175,7 +175,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths; constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;
constexpr auto thread_tensor_desc = constexpr auto thread_tensor_desc =
make_ConstantTensorDescriptor_default_rank_packed(thread_tensor_lengths); make_ConstantTensorDescriptor_packed(thread_tensor_lengths);
return thread_tensor_desc.GetElementSpace(); return thread_tensor_desc.GetElementSpace();
} }
...@@ -196,7 +196,7 @@ struct BlockwiseTensorSliceReorderCopy_v3 ...@@ -196,7 +196,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths; constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;
constexpr auto thread_tensor_desc = constexpr auto thread_tensor_desc =
make_ConstantTensorDescriptor_default_rank_packed(thread_tensor_lengths); make_ConstantTensorDescriptor_packed(thread_tensor_lengths);
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) { static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
constexpr auto repeat_multi_id = decltype(repeat_multi_id_){}; constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
...@@ -234,7 +234,7 @@ struct BlockwiseTensorSliceReorderCopy_v3 ...@@ -234,7 +234,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths; constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;
constexpr auto thread_tensor_desc = constexpr auto thread_tensor_desc =
make_ConstantTensorDescriptor_default_rank_packed(thread_tensor_lengths); make_ConstantTensorDescriptor_packed(thread_tensor_lengths);
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) { static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
constexpr auto repeat_multi_id = decltype(repeat_multi_id_){}; constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
......
...@@ -30,7 +30,7 @@ __host__ __device__ constexpr auto get_convolution_output_default_4d_tensor_desc ...@@ -30,7 +30,7 @@ __host__ __device__ constexpr auto get_convolution_output_default_4d_tensor_desc
constexpr auto HO = HI + 1 - Y; constexpr auto HO = HI + 1 - Y;
constexpr auto WO = WI + 1 - X; constexpr auto WO = WI + 1 - X;
return make_ConstantTensorDescriptor_default_rank_packed(Sequence<N, K, HO, WO>{}); return make_ConstantTensorDescriptor_packed(Sequence<N, K, HO, WO>{});
} }
template <class InDesc, class WeiDesc, class LowerPads, class UpperPads> template <class InDesc, class WeiDesc, class LowerPads, class UpperPads>
...@@ -67,7 +67,7 @@ __host__ __device__ constexpr auto get_convolution_with_padding_output_default_4 ...@@ -67,7 +67,7 @@ __host__ __device__ constexpr auto get_convolution_with_padding_output_default_4
constexpr auto HO = HI + HPadLow + HPadUp + 1 - Y; constexpr auto HO = HI + HPadLow + HPadUp + 1 - Y;
constexpr auto WO = WI + WPadLow + WPadUp + 1 - X; constexpr auto WO = WI + WPadLow + WPadUp + 1 - X;
return make_ConstantTensorDescriptor_default_rank_packed(Sequence<N, K, HO, WO>{}); return make_ConstantTensorDescriptor_packed(Sequence<N, K, HO, WO>{});
} }
template <class InDesc, class WeiDesc, class OutDesc> template <class InDesc, class WeiDesc, class OutDesc>
......
...@@ -45,23 +45,23 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw ...@@ -45,23 +45,23 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw
constexpr index_t Y = wei_kcyx_global_desc.GetLength(I2); constexpr index_t Y = wei_kcyx_global_desc.GetLength(I2);
constexpr index_t X = wei_kcyx_global_desc.GetLength(I3); constexpr index_t X = wei_kcyx_global_desc.GetLength(I3);
constexpr auto wei_ke_global_desc = make_ConstantTensorDescriptor_default_rank_packed( constexpr auto wei_ke_global_desc = make_ConstantTensorDescriptor_packed(
Sequence<K, C * Y * X>{}); // 2d view of wei for blockwise copy Sequence<K, C * Y * X>{}); // 2d view of wei for blockwise copy
constexpr index_t HiPerBlock = HoPerBlock + Y - 1; constexpr index_t HiPerBlock = HoPerBlock + Y - 1;
constexpr index_t WiPerBlock = WoPerBlock + X - 1; constexpr index_t WiPerBlock = WoPerBlock + X - 1;
constexpr auto in_nchw_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( constexpr auto in_nchw_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{}, Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{},
Number<InBlockCopyDataPerRead>{}); Number<InBlockCopyDataPerRead>{});
constexpr auto wei_ke_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( constexpr auto wei_ke_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<KPerBlock, CPerBlock * Y * X>{}, Sequence<KPerBlock, CPerBlock * Y * X>{},
Number<WeiBlockCopyDataPerRead>{}); // 2d view of wei for blockwise copy Number<WeiBlockCopyDataPerRead>{}); // 2d view of wei for blockwise copy
constexpr auto wei_kcyx_block_desc = make_ConstantTensorDescriptor_default_rank( constexpr auto wei_kcyx_block_desc =
Sequence<KPerBlock, CPerBlock, Y, X>{}, make_ConstantTensorDescriptor(Sequence<KPerBlock, CPerBlock, Y, X>{},
Sequence<wei_ke_block_desc.GetStride(I0), Y * X, X, 1>{}); Sequence<wei_ke_block_desc.GetStride(I0), Y * X, X, 1>{});
// shared mem // shared mem
constexpr index_t in_block_element_size = constexpr index_t in_block_element_size =
...@@ -82,11 +82,11 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw ...@@ -82,11 +82,11 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw
constexpr index_t HiPerThread = HoPerThread + Y - 1; constexpr index_t HiPerThread = HoPerThread + Y - 1;
constexpr index_t WiPerThread = WoPerThread + X - 1; constexpr index_t WiPerThread = WoPerThread + X - 1;
constexpr auto in_nchw_thread_block_desc = make_ConstantTensorDescriptor_default_rank( constexpr auto in_nchw_thread_block_desc = make_ConstantTensorDescriptor(
Sequence<NPerThread, CPerThread, HiPerThread, WiPerThread>{}, Sequence<NPerThread, CPerThread, HiPerThread, WiPerThread>{},
in_nchw_block_desc.GetStrides()); in_nchw_block_desc.GetStrides());
constexpr auto wei_kcyx_thread_block_desc = make_ConstantTensorDescriptor_default_rank( constexpr auto wei_kcyx_thread_block_desc = make_ConstantTensorDescriptor(
Sequence<KPerThread, CPerThread, Y, X>{}, wei_kcyx_block_desc.GetStrides()); Sequence<KPerThread, CPerThread, Y, X>{}, wei_kcyx_block_desc.GetStrides());
constexpr auto out_nkhw_thread_desc = get_convolution_output_default_4d_tensor_descriptor( constexpr auto out_nkhw_thread_desc = get_convolution_output_default_4d_tensor_descriptor(
......
...@@ -85,7 +85,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn ...@@ -85,7 +85,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock); constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock);
constexpr index_t NBlockWork = mod_conv::integer_divide_ceil(N, NPerBlock); constexpr index_t NBlockWork = mod_conv::integer_divide_ceil(N, NPerBlock);
constexpr auto block_work_desc = make_ConstantTensorDescriptor_default_rank_packed( constexpr auto block_work_desc = make_ConstantTensorDescriptor_packed(
Sequence<KBlockWork, HBlockWork, WBlockWork, NBlockWork>{}); Sequence<KBlockWork, HBlockWork, WBlockWork, NBlockWork>{});
const auto block_work_multi_id = const auto block_work_multi_id =
...@@ -109,7 +109,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn ...@@ -109,7 +109,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
GemmDataPerReadA, GemmDataPerReadA,
GemmDataPerReadB); GemmDataPerReadB);
constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{}, Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{},
Number<InBlockCopyDataPerRead_N>{}); Number<InBlockCopyDataPerRead_N>{});
...@@ -118,12 +118,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn ...@@ -118,12 +118,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0, static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
"GemmDataPerReadB alignment requirement is not meet"); "GemmDataPerReadB alignment requirement is not meet");
constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<CPerBlock, KPerBlock>{}, Sequence<CPerBlock, KPerBlock>{},
Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{}); Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
// tensor view of threadwise output in register // tensor view of threadwise output in register
constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_default_rank_packed( constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed(
Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{}); Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{});
// blockwise copy // blockwise copy
......
...@@ -86,7 +86,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn ...@@ -86,7 +86,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
constexpr index_t HBlockWork = mod_conv::integer_divide_ceil(Ho, HoPerBlock); constexpr index_t HBlockWork = mod_conv::integer_divide_ceil(Ho, HoPerBlock);
constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock); constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock);
constexpr auto block_work_desc = make_ConstantTensorDescriptor_default_rank_packed( constexpr auto block_work_desc = make_ConstantTensorDescriptor_packed(
Sequence<NBlockWork, KBlockWork, HBlockWork, WBlockWork>{}); Sequence<NBlockWork, KBlockWork, HBlockWork, WBlockWork>{});
const auto block_work_multi_id = const auto block_work_multi_id =
...@@ -102,7 +102,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn ...@@ -102,7 +102,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
// global tensor view // global tensor view
constexpr auto wei_c_k_global_desc = constexpr auto wei_c_k_global_desc =
make_ConstantTensorDescriptor_default_rank(Sequence<C, K>{}, Sequence<Y * X * K, 1>{}); make_ConstantTensorDescriptor(Sequence<C, K>{}, Sequence<Y * X * K, 1>{});
// LDS tensor view // LDS tensor view
// be careful of alignment // be careful of alignment
...@@ -111,7 +111,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn ...@@ -111,7 +111,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
GemmDataPerReadA, GemmDataPerReadA,
GemmDataPerReadB); GemmDataPerReadB);
constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{}, Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{},
Number<InBlockReorderDataPerWrite_N>{}); Number<InBlockReorderDataPerWrite_N>{});
...@@ -120,12 +120,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn ...@@ -120,12 +120,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0, static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
"GemmDataPerReadB alignment requirement is not meet"); "GemmDataPerReadB alignment requirement is not meet");
constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<CPerBlock, KPerBlock>{}, Sequence<CPerBlock, KPerBlock>{},
Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{}); Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
// tensor view of threadwise output in register // tensor view of threadwise output in register
constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_default_rank_packed( constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed(
Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{}); Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{});
// blockwise copy // blockwise copy
...@@ -448,10 +448,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn ...@@ -448,10 +448,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
constexpr index_t K1 = KPerBlock / KPerThread; constexpr index_t K1 = KPerBlock / KPerThread;
#if 0 #if 0
constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor_default_rank_packed( constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor_packed(
Sequence<K / (K1 * K2), K1, K2, Ho, Wo / (W1 * W2 * W3), W1, W2, W3, N / N1, N1>{}); Sequence<K / (K1 * K2), K1, K2, Ho, Wo / (W1 * W2 * W3), W1, W2, W3, N / N1, N1>{});
constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor_default_rank_packed( constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor_packed(
Sequence<KPerThread / K2, 1, K2, HoPerThread, 1, W1, 1, W3, 1, N1>{}); Sequence<KPerThread / K2, 1, K2, HoPerThread, 1, W1, 1, W3, 1, N1>{});
#else #else
constexpr auto out_10d_global_desc = constexpr auto out_10d_global_desc =
......
...@@ -86,7 +86,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw ...@@ -86,7 +86,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
constexpr index_t HBlockWork = mod_conv::integer_divide_ceil(Ho, HoPerBlock); constexpr index_t HBlockWork = mod_conv::integer_divide_ceil(Ho, HoPerBlock);
constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock); constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock);
constexpr auto block_work_desc = make_ConstantTensorDescriptor_default_rank_packed( constexpr auto block_work_desc = make_ConstantTensorDescriptor_packed(
Sequence<NBlockWork, KBlockWork, HBlockWork, WBlockWork>{}); Sequence<NBlockWork, KBlockWork, HBlockWork, WBlockWork>{});
const auto block_work_multi_id = const auto block_work_multi_id =
...@@ -110,7 +110,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw ...@@ -110,7 +110,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
GemmDataPerReadA, GemmDataPerReadA,
GemmDataPerReadB); GemmDataPerReadB);
constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{}, Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{},
Number<InBlockReorderDataPerWrite_N>{}); Number<InBlockReorderDataPerWrite_N>{});
...@@ -119,12 +119,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw ...@@ -119,12 +119,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0, static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
"GemmDataPerReadB alignment requirement is not meet"); "GemmDataPerReadB alignment requirement is not meet");
constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<CPerBlock, KPerBlock>{}, Sequence<CPerBlock, KPerBlock>{},
Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{}); Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
// tensor view of threadwise output in register // tensor view of threadwise output in register
constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_default_rank_packed( constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed(
Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{}); Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{});
// blockwise copy // blockwise copy
......
...@@ -83,7 +83,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw ...@@ -83,7 +83,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
constexpr index_t HBlockWork = mod_conv::integer_divide_ceil(Ho, HoPerBlock); constexpr index_t HBlockWork = mod_conv::integer_divide_ceil(Ho, HoPerBlock);
constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock); constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock);
constexpr auto block_work_desc = make_ConstantTensorDescriptor_default_rank_packed( constexpr auto block_work_desc = make_ConstantTensorDescriptor_packed(
Sequence<NBlockWork, KBlockWork, HBlockWork, WBlockWork>{}); Sequence<NBlockWork, KBlockWork, HBlockWork, WBlockWork>{});
const auto block_work_multi_id = const auto block_work_multi_id =
...@@ -99,7 +99,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw ...@@ -99,7 +99,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
// global tensor view // global tensor view
constexpr auto wei_c_k_global_desc = constexpr auto wei_c_k_global_desc =
make_ConstantTensorDescriptor_default_rank(Sequence<C, K>{}, Sequence<Y * X * K, 1>{}); make_ConstantTensorDescriptor(Sequence<C, K>{}, Sequence<Y * X * K, 1>{});
// LDS tensor view // LDS tensor view
// be careful of alignment // be careful of alignment
...@@ -108,7 +108,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw ...@@ -108,7 +108,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
GemmDataPerReadA, GemmDataPerReadA,
GemmDataPerReadB); GemmDataPerReadB);
constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{}, Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{},
Number<InBlockReorderDataPerWrite_N>{}); Number<InBlockReorderDataPerWrite_N>{});
...@@ -117,12 +117,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw ...@@ -117,12 +117,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0, static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
"GemmDataPerReadB alignment requirement is not meet"); "GemmDataPerReadB alignment requirement is not meet");
constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<CPerBlock, KPerBlock>{}, Sequence<CPerBlock, KPerBlock>{},
Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{}); Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
// tensor view of threadwise output in register // tensor view of threadwise output in register
constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_default_rank_packed( constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed(
Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{}); Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{});
// blockwise copy // blockwise copy
......
...@@ -88,7 +88,7 @@ struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw ...@@ -88,7 +88,7 @@ struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw
constexpr index_t BBlockWork = B / BPerBlock; constexpr index_t BBlockWork = B / BPerBlock;
constexpr auto block_work_desc = constexpr auto block_work_desc =
make_ConstantTensorDescriptor_default_rank_packed(Sequence<KBlockWork, BBlockWork>{}); make_ConstantTensorDescriptor_packed(Sequence<KBlockWork, BBlockWork>{});
const auto block_work_multi_id = const auto block_work_multi_id =
block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id()); block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id());
...@@ -111,9 +111,8 @@ struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw ...@@ -111,9 +111,8 @@ struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw
// memory layout descriptor in LDS [C, N1, B, N2], dst of blockwise copy // memory layout descriptor in LDS [C, N1, B, N2], dst of blockwise copy
// be careful of LDS alignment // be careful of LDS alignment
constexpr auto in_c_n1_b_n2_block_mem_desc = constexpr auto in_c_n1_b_n2_block_mem_desc = make_ConstantTensorDescriptor_aligned(
make_ConstantTensorDescriptor_default_rank_aligned( Sequence<CPerBlock, N1, BPerBlock, N2>{}, Number<InBlockCopyDstDataPerWrite_N2>{});
Sequence<CPerBlock, N1, BPerBlock, N2>{}, Number<InBlockCopyDstDataPerWrite_N2>{});
// this check is ad-hoc // this check is ad-hoc
// TODO: need to properly implement tensor descriptor with alignment // TODO: need to properly implement tensor descriptor with alignment
...@@ -143,7 +142,7 @@ struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw ...@@ -143,7 +142,7 @@ struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw
// tensor descriptor in LDS, dst of blockwise copy // tensor descriptor in LDS, dst of blockwise copy
// be careful of LDS alignment // be careful of LDS alignment
constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<CPerBlock, KPerBlock>{}, Sequence<CPerBlock, KPerBlock>{},
Number<mod_conv::max(WeiBlockCopyDataPerAccess_K, GemmDataPerReadA)>{}); Number<mod_conv::max(WeiBlockCopyDataPerAccess_K, GemmDataPerReadA)>{});
...@@ -367,7 +366,7 @@ struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw ...@@ -367,7 +366,7 @@ struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw
// define tensor descriptor for threadwise copy // define tensor descriptor for threadwise copy
// output memory layout descriptor in register // output memory layout descriptor in register
constexpr auto out_k0_k1_k2_n1_n0_h_w_n2_thread_mem_desc = constexpr auto out_k0_k1_k2_n1_n0_h_w_n2_thread_mem_desc =
make_ConstantTensorDescriptor_default_rank_packed( make_ConstantTensorDescriptor_packed(
Sequence<KPerBlock / (K1 * K2), 1, K2, N1, 1, 1, 1, N2>{}); Sequence<KPerBlock / (K1 * K2), 1, K2, N1, 1, 1, 1, N2>{});
// output tensor descriptor in register, src of threadwise copy // output tensor descriptor in register, src of threadwise copy
......
...@@ -91,7 +91,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw ...@@ -91,7 +91,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
constexpr index_t BBlockWork = B / BPerBlock; constexpr index_t BBlockWork = B / BPerBlock;
constexpr auto block_work_desc = constexpr auto block_work_desc =
make_ConstantTensorDescriptor_default_rank_packed(Sequence<KBlockWork, BBlockWork>{}); make_ConstantTensorDescriptor_packed(Sequence<KBlockWork, BBlockWork>{});
const auto block_work_multi_id = const auto block_work_multi_id =
block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id()); block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id());
...@@ -114,9 +114,8 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw ...@@ -114,9 +114,8 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
// memory layout descriptor in LDS [C, N1, B, N2], dst of blockwise copy // memory layout descriptor in LDS [C, N1, B, N2], dst of blockwise copy
// be careful of LDS alignment // be careful of LDS alignment
constexpr auto in_c_n1_b_n2_block_mem_desc = constexpr auto in_c_n1_b_n2_block_mem_desc = make_ConstantTensorDescriptor_aligned(
make_ConstantTensorDescriptor_default_rank_aligned( Sequence<CPerBlock, N1, BPerBlock, N2>{}, Number<InBlockCopyDstDataPerWrite_N2>{});
Sequence<CPerBlock, N1, BPerBlock, N2>{}, Number<InBlockCopyDstDataPerWrite_N2>{});
// this check is ad-hoc // this check is ad-hoc
// TODO: need to properly implement tensor descriptor with alignment // TODO: need to properly implement tensor descriptor with alignment
...@@ -146,7 +145,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw ...@@ -146,7 +145,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
// tensor descriptor in LDS, dst of blockwise copy // tensor descriptor in LDS, dst of blockwise copy
// be careful of LDS alignment // be careful of LDS alignment
constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<CPerBlock, KPerBlock>{}, Sequence<CPerBlock, KPerBlock>{},
Number<mod_conv::max(WeiBlockCopyDataPerAccess_K, GemmDataPerReadA)>{}); Number<mod_conv::max(WeiBlockCopyDataPerAccess_K, GemmDataPerReadA)>{});
...@@ -320,7 +319,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw ...@@ -320,7 +319,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
// define tensor descriptor for threadwise copy // define tensor descriptor for threadwise copy
// output memory layout descriptor in register // output memory layout descriptor in register
constexpr auto out_k0_k1_k2_n1_n0_h_w_n2_thread_mem_desc = constexpr auto out_k0_k1_k2_n1_n0_h_w_n2_thread_mem_desc =
make_ConstantTensorDescriptor_default_rank_packed( make_ConstantTensorDescriptor_packed(
Sequence<KPerBlock / (K1 * K2), 1, K2, N1, 1, 1, 1, N2>{}); Sequence<KPerBlock / (K1 * K2), 1, K2, N1, 1, 1, 1, N2>{});
// output tensor descriptor in register, src of threadwise copy // output tensor descriptor in register, src of threadwise copy
......
...@@ -99,7 +99,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw ...@@ -99,7 +99,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
constexpr index_t BBlockWork = B / BPerBlock; constexpr index_t BBlockWork = B / BPerBlock;
constexpr auto block_work_desc = constexpr auto block_work_desc =
make_ConstantTensorDescriptor_default_rank_packed(Sequence<KBlockWork, BBlockWork>{}); make_ConstantTensorDescriptor_packed(Sequence<KBlockWork, BBlockWork>{});
const auto block_work_multi_id = const auto block_work_multi_id =
block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id()); block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id());
...@@ -127,20 +127,9 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw ...@@ -127,20 +127,9 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
Sequence<3, 6, 7>{}, Sequence<3, 6, 7>{},
Sequence<5>{}); Sequence<5>{});
#if 0
if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0)
{
print_ConstantTensorDescriptor(in_n0_n1_n2_h_w_global_desc,
"in_n0_n1_n2_h_w_global_desc: ");
print_ConstantTensorDescriptor(in_c_y_x_global_desc, "in_c_y_x_global_desc: ");
print_ConstantMergedTensorDescriptor(in_e_n1_b_n2_global_merged_desc,
"in_e_n1_b_n2_global_merged_desc: ");
}
#endif
// memory layout descriptor in LDS [E, N1, B, N2], dst of blockwise copy // memory layout descriptor in LDS [E, N1, B, N2], dst of blockwise copy
// be careful of LDS alignment // be careful of LDS alignment
constexpr auto in_e_n1_b_n2_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( constexpr auto in_e_n1_b_n2_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<EPerBlock, N1, BPerBlock, N2>{}, Number<InBlockCopyDstDataPerWrite_N2>{}); Sequence<EPerBlock, N1, BPerBlock, N2>{}, Number<InBlockCopyDstDataPerWrite_N2>{});
// this check is ad-hoc // this check is ad-hoc
...@@ -174,7 +163,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw ...@@ -174,7 +163,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
// tensor descriptor in LDS, dst of blockwise copy // tensor descriptor in LDS, dst of blockwise copy
// be careful of LDS alignment // be careful of LDS alignment
constexpr auto wei_e_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( constexpr auto wei_e_k_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<EPerBlock, KPerBlock>{}, Sequence<EPerBlock, KPerBlock>{},
Number<mod_conv::max(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{}); Number<mod_conv::max(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{});
...@@ -406,7 +395,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw ...@@ -406,7 +395,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
// define tensor descriptor for threadwise copy // define tensor descriptor for threadwise copy
// output memory layout descriptor in register // output memory layout descriptor in register
constexpr auto out_k0_k1_k2_n1_n0_h_w_n2_thread_mem_desc = constexpr auto out_k0_k1_k2_n1_n0_h_w_n2_thread_mem_desc =
make_ConstantTensorDescriptor_default_rank_packed( make_ConstantTensorDescriptor_packed(
Sequence<KPerBlock / (K1 * K2), 1, K2, N1, 1, 1, 1, N2>{}); Sequence<KPerBlock / (K1 * K2), 1, K2, N1, 1, 1, 1, N2>{});
// output tensor descriptor in register, src of threadwise copy // output tensor descriptor in register, src of threadwise copy
......
...@@ -93,7 +93,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw ...@@ -93,7 +93,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
constexpr index_t BBlockWork = B / BPerBlock; constexpr index_t BBlockWork = B / BPerBlock;
constexpr auto block_work_desc = constexpr auto block_work_desc =
make_ConstantTensorDescriptor_default_rank_packed(Sequence<KBlockWork, BBlockWork>{}); make_ConstantTensorDescriptor_packed(Sequence<KBlockWork, BBlockWork>{});
const auto block_work_multi_id = const auto block_work_multi_id =
block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id()); block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id());
...@@ -134,7 +134,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw ...@@ -134,7 +134,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
// memory layout descriptor in LDS [E, N1, B, N2], dst of blockwise copy // memory layout descriptor in LDS [E, N1, B, N2], dst of blockwise copy
// be careful of LDS alignment // be careful of LDS alignment
constexpr auto in_e_n1_b_n2_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( constexpr auto in_e_n1_b_n2_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<EPerBlock, N1, BPerBlock, N2>{}, Number<InBlockCopyDstDataPerWrite_N2>{}); Sequence<EPerBlock, N1, BPerBlock, N2>{}, Number<InBlockCopyDstDataPerWrite_N2>{});
// this check is ad-hoc // this check is ad-hoc
...@@ -167,7 +167,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw ...@@ -167,7 +167,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
// tensor descriptor in LDS, dst of blockwise copy // tensor descriptor in LDS, dst of blockwise copy
// be careful of LDS alignment // be careful of LDS alignment
constexpr auto wei_e_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( constexpr auto wei_e_k_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<EPerBlock, KPerBlock>{}, Sequence<EPerBlock, KPerBlock>{},
Number<mod_conv::max(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{}); Number<mod_conv::max(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{});
...@@ -288,7 +288,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw ...@@ -288,7 +288,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
// define tensor descriptor for threadwise copy // define tensor descriptor for threadwise copy
// output memory layout descriptor in register // output memory layout descriptor in register
constexpr auto out_k0_k1_k2_n1_n0_h_w_n2_thread_mem_desc = constexpr auto out_k0_k1_k2_n1_n0_h_w_n2_thread_mem_desc =
make_ConstantTensorDescriptor_default_rank_packed( make_ConstantTensorDescriptor_packed(
Sequence<KPerBlock / (K1 * K2), 1, K2, N1, 1, 1, 1, N2>{}); Sequence<KPerBlock / (K1 * K2), 1, K2, N1, 1, 1, 1, N2>{});
// output tensor descriptor in register, src of threadwise copy // output tensor descriptor in register, src of threadwise copy
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment