Commit acd7082f authored by Chao Liu

adding ConstantMergedTensorDescriptor, refactoring ConstantTensorDescriptor, Sequence

parent cd29b09a
...@@ -38,7 +38,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, ...@@ -38,7 +38,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
constexpr index_t X = wei_kcyx_desc.GetLength(I3); constexpr index_t X = wei_kcyx_desc.GetLength(I3);
// reorder weight // reorder weight
auto wei_cyxk_desc = make_ConstantTensorDescriptor(Sequence<C, Y, X, K>{}); auto wei_cyxk_desc = make_packed_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: "); ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc)); Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
...@@ -51,7 +51,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, ...@@ -51,7 +51,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
std::thread::hardware_concurrency()); std::thread::hardware_concurrency());
// reorder input // reorder input
auto in_chwn_desc = make_ConstantTensorDescriptor(Sequence<C, Hi, Wi, N>{}); auto in_chwn_desc = make_packed_ConstantTensorDescriptor(Sequence<C, Hi, Wi, N>{});
ostream_ConstantTensorDescriptor(in_chwn_desc, std::cout << "in_chwn_desc: "); ostream_ConstantTensorDescriptor(in_chwn_desc, std::cout << "in_chwn_desc: ");
Tensor<T> in_chwn(make_TensorDescriptor(in_chwn_desc)); Tensor<T> in_chwn(make_TensorDescriptor(in_chwn_desc));
...@@ -64,7 +64,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, ...@@ -64,7 +64,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
std::thread::hardware_concurrency()); std::thread::hardware_concurrency());
// output // output
auto out_khwn_desc = make_ConstantTensorDescriptor(Sequence<K, Ho, Wo, N>{}); auto out_khwn_desc = make_packed_ConstantTensorDescriptor(Sequence<K, Ho, Wo, N>{});
ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: "); ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: ");
Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc)); Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));
......
...@@ -37,7 +37,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc, ...@@ -37,7 +37,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
constexpr index_t X = wei_kcyx_desc.GetLength(I3); constexpr index_t X = wei_kcyx_desc.GetLength(I3);
// reorder weight // reorder weight
auto wei_cyxk_desc = make_ConstantTensorDescriptor(Sequence<C, Y, X, K>{}); auto wei_cyxk_desc = make_packed_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: "); ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc)); Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
...@@ -50,7 +50,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc, ...@@ -50,7 +50,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
std::thread::hardware_concurrency()); std::thread::hardware_concurrency());
// output // output
auto out_khwn_desc = make_ConstantTensorDescriptor(Sequence<K, Ho, Wo, N>{}); auto out_khwn_desc = make_packed_ConstantTensorDescriptor(Sequence<K, Ho, Wo, N>{});
ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: "); ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: ");
Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc)); Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));
......
...@@ -36,7 +36,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc, ...@@ -36,7 +36,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
constexpr index_t X = wei_kcyx_desc.GetLength(I3); constexpr index_t X = wei_kcyx_desc.GetLength(I3);
// reorder weight // reorder weight
auto wei_cyxk_desc = make_ConstantTensorDescriptor(Sequence<C, Y, X, K>{}); auto wei_cyxk_desc = make_packed_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: "); ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc)); Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
......
...@@ -548,8 +548,8 @@ int main(int argc, char* argv[]) ...@@ -548,8 +548,8 @@ int main(int argc, char* argv[])
auto lower_pads = Sequence<HPad, WPad>{}; auto lower_pads = Sequence<HPad, WPad>{};
auto upper_pads = Sequence<HPad, WPad>{}; auto upper_pads = Sequence<HPad, WPad>{};
auto in_nchw_desc = make_ConstantTensorDescriptor(Sequence<N, C, HI, WI>{}); auto in_nchw_desc = make_packed_ConstantTensorDescriptor(Sequence<N, C, HI, WI>{});
auto wei_kcyx_desc = make_ConstantTensorDescriptor(Sequence<K, C, Y, X>{}); auto wei_kcyx_desc = make_packed_ConstantTensorDescriptor(Sequence<K, C, Y, X>{});
auto out_nkhw_desc = get_convolution_with_padding_output_default_4d_tensor_descriptor( auto out_nkhw_desc = get_convolution_with_padding_output_default_4d_tensor_descriptor(
in_nchw_desc, wei_kcyx_desc, lower_pads, upper_pads); in_nchw_desc, wei_kcyx_desc, lower_pads, upper_pads);
......
...@@ -16,6 +16,8 @@ struct Array ...@@ -16,6 +16,8 @@ struct Array
{ {
} }
__host__ __device__ constexpr index_t GetSize() const { return NSize; }
__host__ __device__ const TData& operator[](index_t i) const { return mData[i]; } __host__ __device__ const TData& operator[](index_t i) const { return mData[i]; }
__host__ __device__ TData& operator[](index_t i) { return mData[i]; } __host__ __device__ TData& operator[](index_t i) { return mData[i]; }
...@@ -67,6 +69,23 @@ __host__ __device__ auto reorder_array_given_old2new(const Array<TData, NSize>& ...@@ -67,6 +69,23 @@ __host__ __device__ auto reorder_array_given_old2new(const Array<TData, NSize>&
return new_array; return new_array;
} }
template <class TData, index_t NSize, class ExtractSeq>
__host__ __device__ auto extract_array(const Array<TData, NSize>& old_array, ExtractSeq)
{
Array<TData, ExtractSeq::GetSize()> new_array;
constexpr index_t new_size = ExtractSeq::GetSize();
static_assert(new_size <= NSize, "wrong! too many elements to extract");
static_for<0, new_size, 1>{}([&](auto I) {
constexpr index_t i = I.Get();
new_array[i] = old_array[ExtractSeq{}.Get(I)];
});
return new_array;
}
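
For illustration only (not part of this commit): the same extract idea written against std::array and std::index_sequence instead of the device-side Array and Sequence types, so it compiles as plain C++14. The names here are stand-ins.

#include <array>
#include <cstddef>
#include <utility>

// pick the elements old_array[Ids...] in the order given by the index_sequence
template <class T, std::size_t N, std::size_t... Ids>
constexpr std::array<T, sizeof...(Ids)> extract_array_sketch(const std::array<T, N>& old_array,
                                                             std::index_sequence<Ids...>)
{
    static_assert(sizeof...(Ids) <= N, "too many indices to extract");
    return {{old_array[Ids]...}};
}

int main()
{
    constexpr std::array<int, 4> a{{10, 11, 12, 13}};
    constexpr auto b = extract_array_sketch(a, std::index_sequence<0, 2>{});
    static_assert(b.size() == 2 && b[0] == 10 && b[1] == 12, "extracted elements 0 and 2");
}
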
template <class TData, index_t NSize> template <class TData, index_t NSize>
__host__ __device__ constexpr auto operator+(const Array<TData, NSize>& a, __host__ __device__ constexpr auto operator+(const Array<TData, NSize>& a,
const Array<TData, NSize>& b) const Array<TData, NSize>& b)
......
...@@ -21,7 +21,7 @@ struct ConstantMatrixDescriptor ...@@ -21,7 +21,7 @@ struct ConstantMatrixDescriptor
__host__ __device__ constexpr index_t GetElementSpace() const { return NRow_ * RowStride_; } __host__ __device__ constexpr index_t GetElementSpace() const { return NRow_ * RowStride_; }
__host__ __device__ index_t Get1dIndex(index_t irow, index_t icol) const __host__ __device__ index_t GetOffsetFromMultiIndex(index_t irow, index_t icol) const
{ {
return irow * RowStride_ + icol; return irow * RowStride_ + icol;
} }
......
...@@ -2,94 +2,118 @@ ...@@ -2,94 +2,118 @@
#include "common.hip.hpp" #include "common.hip.hpp"
#include "ConstantTensorDescriptor.hip.hpp" #include "ConstantTensorDescriptor.hip.hpp"
// TensorDesc: ConstantTensorDescriptor<...> // OriginalTensorDesc : ConstantTensorDescriptor<...>
// MergedDimRanges: Sequence<FirstMergedDim, LastMergedDim> // it's the tensor whose dimensions are to be merged
template <class TensorDesc, class... MergedDimRanges> // OriginalDimMergeSeqs : Sequence<...>...
// each is a sequence of original dimensions (of OriginalTensorDesc) to be merged
template <class OriginalTensorDesc, class... OriginalDimMergeSeqs>
struct ConstantMergedTensorDescriptor struct ConstantMergedTensorDescriptor
{ {
static constexpr index_t nOriginalDim = GetNumOfOriginalDimension(); static constexpr auto mOriginalDimMergeSeqs = std::tuple<OriginalDimMergeSeqs...>{};
static constexpr index_t nDim = GetNumOfDimension();
static constexpr index_t nDim = std::tuple_size<decltype(mOriginalDimMergeSeqs)>::value;
static constexpr index_t nOriginalDim = OriginalTensorDesc::GetNumOfDimension();
template <class... Is>
__host__ __device__ constexpr ConstantMergedTensorDescriptor() __host__ __device__ constexpr ConstantMergedTensorDescriptor()
{ {
constexpr auto merged_dim_ranges = std::make_tuple(MergedDimRanges{}...); static_assert(nDim <= nOriginalDim, "wrong!");
static_for<0, sizeof...(MergedDimRanges), 1>{}([&](auto I) { // TODO: check each of OriginalDimMergeSeqs contains at least 1, and at most
constexpr index_t i = I.Get(); // OriginalTensorDesc::nDim number of dimensions
constexpr auto merged_dim_range = std::get<i>(merged_dim_ranges);
// TODO: check there is no duplication in OriginalDimMergeSeqs
static_assert(merged_dim_range.GetSize() == 2,
"wrong! should specify first and last dimension to be merged"); // TODO: check OriginalDimMergeSeqs contains all original dimensions
static_assert(merged_dim_range.Get(Number<0>{}) < GetNumOfUnmergedDimension(),
"wrong!");
static_assert(merged_dim_range.Get(Number<1>{}) < GetNumOfUnmergedDimension(),
"wrong!");
static_assert(merged_dim_range.Get(Number<0>{}) <= merged_dim_range.Get(Number<1>{}),
"wrong!");
});
} }
__host__ __device__ static constexpr index_t GetNumOfDimension() __host__ __device__ static constexpr index_t GetNumOfDimension() { return nDim; }
{
constexpr auto merged_dim_ranges = std::make_tuple(MergedDimRanges...); __host__ __device__ static constexpr index_t GetNumOfOriginalDimension() { return nOriginalDim; }
struct f_calculate_num_of_lost_dim template <index_t IDim>
__host__ __device__ static constexpr bool ContainMultipleOriginalDimensions(Number<IDim>)
{ {
__host__ __device__ constexpr index_t operator()(auto I) const return (std::get<IDim>(mOriginalDimMergeSeqs).GetSize() > 1);
}
template <index_t IDim>
__host__ __device__ static constexpr index_t GetLength(Number<IDim>)
{ {
constexpr index_t i = I.Get(); constexpr auto original_dims_partial = std::get<IDim>(mOriginalDimMergeSeqs);
constexpr auto merged_dim_range = std::get<i>(merged_dim_ranges);
return merged_dim_range.Get(Number<1>{}) - merged_dim_range.Get(Number<0>{}); return OriginalTensorDesc::Extract(original_dims_partial).GetElementSize();
} }
};
constexpr index_t num_lost_dim = static_const_reduce_n<sizeof...(MergedDimRanges)>{}( template <index_t IDim>
f_calculate_num_of_lost_dim, std::plus<index_t>{}); __host__ __device__ static constexpr index_t GetStride(Number<IDim>)
{
static_assert(!ContainMultipleOriginalDimensions(Number<IDim>{}),
"wrong! stride of a merged dimension is undefined");
constexpr auto idim_original = std::get<IDim>(mOriginalDimMergeSeqs).Front();
return TensorDesc::GetNumOfDimension() - num_lost_dim; return OriginalTensorDesc::GetStride(Number<idim_original>{});
} }
__host__ __device__ static constexpr index_t GetNumOfOriginalDimension() __host__ __device__ static constexpr auto GetLengths()
{ {
return TensorDesc::GetNumOfDimension(); return Sequence<OriginalTensorDesc::Extract(OriginalDimMergeSeqs).GetElementSize()...>{};
} }
template <index_t IDim> __host__ __device__ static constexpr index_t GetElementSize()
__host__ __device__ static constexpr bool IsMergedDimension(Number<IDim>)
{ {
// not implemented return OriginalTensorDesc::GetElementSize();
} }
template <index_t IDim> __host__ __device__ static auto
__host__ __device__ static constexpr bool GetLength(Number<IDim>) GetOriginalMultiIndexFromMultiIndex(Array<index_t, nDim> multi_id)
{ {
// not implemented Array<index_t, nOriginalDim> original_multi_id;
static_for<0, nDim, 1>{}([&](auto IDim) {
constexpr index_t idim = IDim.Get();
constexpr auto original_dims_partial = std::get<idim>(mOriginalDimMergeSeqs);
// get partial original-multi-id corresponding to this merged dimension
constexpr auto original_multi_id_partial =
OriginalTensorDesc::Extract(original_dims_partial)
.GetMultiIndexFrom1dIndex(multi_id[idim]);
// make sure compiler unroll this loop and propagate all the constants
for(index_t i = 0; i < original_dims_partial.GetSize(); ++i)
{
index_t idim_original = original_dims_partial[i];
original_multi_id[idim_original] = original_multi_id_partial[i];
} }
});
template <index_t IDim> return original_multi_id;
__host__ __device__ static constexpr bool GetStride(Number<IDim>) }
__host__ __device__ static index_t GetOffsetFromMultiIndex(Array<index_t, nDim> multi_id)
{ {
static_assert(!IsMergedDimension(Number<IDim>{}, "wrong! stride of a merged dimension is undefined") const auto original_multi_id = GetOriginalMultiIndexFromMultiIndex(multi_id);
// not implemented
return OriginalTensorDesc::GetOffsetFromMultiIndex(original_multi_id);
} }
template <class... Is> template <index_t... Is>
__host__ __device__ auto MultiIndex2OriginalMultiIndex(Is... is) const __host__ __device__ static index_t GetOffsetFromMultiIndex(Is... is)
{ {
// not implemented return GetOffsetFromMultiIndex(Array<index_t, nDim>{is...});
} }
template <class... Is> __host__ __device__ static Array<index_t, nDim> GetMultiIndexFrom1dIndex(index_t id)
__host__ __device__ auto OriginalMultiIndex2MultiIndex(Is... is) const
{ {
// not implemented constexpr auto dummy_desc = make_packed_ConstantTensorDescriptor(GetLengths());
return dummy_desc.GetMultiIndexFrom1dIndex(id);
} }
}; };
template <class TensorDesc, class... MergedDimRanges> template <class OriginalTensorDesc, class... OriginalDimMergeSeqs>
constexpr auto make_ConstantMergedTensorDescriptor(TensorDesc, MergedDimRanges...) constexpr auto make_ConstantMergedTensorDescriptor(OriginalTensorDesc, OriginalDimMergeSeqs...)
{ {
return ConstantMergedTensorDescriptor<TensorDesc, MergedDimRanges...>{}; return ConstantMergedTensorDescriptor<OriginalTensorDesc, OriginalDimMergeSeqs...>{};
} }
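
For illustration only (not part of this commit): a runtime sketch of the index arithmetic that ConstantMergedTensorDescriptor performs at compile time, i.e. GetOriginalMultiIndexFromMultiIndex followed by GetOffsetFromMultiIndex. The lengths, dimension grouping and indices below are made up for the example.

#include <cstdio>
#include <vector>

int main()
{
    // original 4-d tensor with packed strides (last dimension fastest)
    const int lengths[4] = {2, 3, 4, 5};
    const int strides[4] = {3 * 4 * 5, 4 * 5, 5, 1};

    // merged view: merged dim 0 = {0}, merged dim 1 = {1, 2}, merged dim 2 = {3}
    const std::vector<std::vector<int>> merge = {{0}, {1, 2}, {3}};

    // a multi-index in the merged view
    const int merged_id[3] = {1, 7, 4};

    // expand each merged index into the original dimensions it covers
    int original_id[4] = {};
    for(int d = 0; d < 3; ++d)
    {
        int rem = merged_id[d];
        for(int k = int(merge[d].size()) - 1; k >= 0; --k)
        {
            const int dim = merge[d][k];
            original_id[dim] = rem % lengths[dim];
            rem /= lengths[dim];
        }
    }

    // the offset is then the usual dot product with the original strides
    int offset = 0;
    for(int dim = 0; dim < 4; ++dim)
        offset += original_id[dim] * strides[dim];

    // merged index 7 over dims {1, 2} (lengths 3 and 4) decomposes to (1, 3)
    std::printf("original multi-index = {%d, %d, %d, %d}, offset = %d\n",
                original_id[0], original_id[1], original_id[2], original_id[3], offset);
}
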
...@@ -2,40 +2,25 @@ ...@@ -2,40 +2,25 @@
#include "common.hip.hpp" #include "common.hip.hpp"
template <class Lengths> template <class Lengths>
__host__ __device__ constexpr auto calculate_default_strides(Lengths) __host__ __device__ constexpr auto calculate_packed_tensor_strides(Lengths)
{ {
return reverse_inclusive_scan_sequence(Lengths{}.PopFront().PushBack(Number<1>{}), return reverse_inclusive_scan_sequence(Lengths{}.PopFront(), std::multiplies<index_t>{})
std::multiplies<index_t>{}); .PushBack(Number<1>{});
}
// this is ugly, only for 2d
template <index_t L0, index_t L1, index_t Align>
__host__ __device__ constexpr auto calculate_default_strides_aligned(Sequence<L0, L1>,
Number<Align>)
{
constexpr index_t L1_align = Align * ((L1 + Align - 1) / Align);
return Sequence<L1_align, 1>{};
} }
// this is ugly, only for 3d template <class Lengths, index_t Align>
template <index_t L0, index_t L1, index_t L2, index_t Align> __host__ __device__ constexpr auto
__host__ __device__ constexpr auto calculate_default_strides_aligned(Sequence<L0, L1, L2>, calculate_rank_tensor_default_strides_with_alignment(Lengths, Number<Align>)
Number<Align>)
{ {
constexpr index_t L2_align = Align * ((L2 + Align - 1) / Align); constexpr index_t L_back_align =
return Sequence<L1 * L2_align, L2_align, 1>{}; Align * mod_conv::integer_divide_ceiler<index_t>{}(Lengths{}.Back(), Align);
}
// this is ugly, only for 4d return calculate_packed_tensor_strides(
template <index_t L0, index_t L1, index_t L2, index_t L3, index_t Align> Lengths{}.Modify(Number<Lengths{}.GetSize() - 1>{}, Number<L_back_align>{}));
__host__ __device__ constexpr auto calculate_default_strides_aligned(Sequence<L0, L1, L2, L3>,
Number<Align>)
{
constexpr index_t L3_align = Align * ((L3 + Align - 1) / Align);
return Sequence<L1 * L2 * L3_align, L2 * L3_align, L3_align, 1>{};
} }
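
For illustration only (not part of this commit): the two stride rules above in plain runtime C++, assuming a row-major 4-d tensor with made-up lengths. Packed strides are the running product of the lengths from the right; the aligned variant first rounds the innermost length up to a multiple of the alignment.

#include <cstdio>

int main()
{
    const int lengths[4] = {2, 3, 4, 5};
    const int align     = 8;

    // packed: stride[i] = product of lengths[i+1 .. end]
    int packed[4];
    packed[3] = 1;
    for(int i = 2; i >= 0; --i)
        packed[i] = packed[i + 1] * lengths[i + 1];

    // aligned: round the last (fastest) length up to a multiple of align before the scan
    const int last_aligned = align * ((lengths[3] + align - 1) / align);
    int aligned[4];
    aligned[3] = 1;
    aligned[2] = last_aligned;
    for(int i = 1; i >= 0; --i)
        aligned[i] = aligned[i + 1] * lengths[i + 1];

    std::printf("packed  strides: %d %d %d %d\n", packed[0], packed[1], packed[2], packed[3]);
    std::printf("aligned strides: %d %d %d %d\n", aligned[0], aligned[1], aligned[2], aligned[3]);
}
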
template <class Lengths, class Strides> // MemoryRanks of dimensions is for conversion from offset to multi-index
template <class Lengths, class Strides, class MemoryRanks>
struct ConstantTensorDescriptor struct ConstantTensorDescriptor
{ {
using Type = ConstantTensorDescriptor; using Type = ConstantTensorDescriptor;
...@@ -44,14 +29,24 @@ struct ConstantTensorDescriptor ...@@ -44,14 +29,24 @@ struct ConstantTensorDescriptor
__host__ __device__ constexpr ConstantTensorDescriptor() __host__ __device__ constexpr ConstantTensorDescriptor()
{ {
static_assert(Lengths::GetSize() == Strides::GetSize(), "nDim not consistent"); static_assert(Lengths::GetSize() == Strides::GetSize() &&
Lengths::GetSize() == MemoryRanks::GetSize(),
"nDim not consistent");
#if 0 // require sequence_sort, but it's not implemented yet
static_assert(is_same<typename sequence_sort<MemoryRanks>::SortedSeqType,
typename arithmetic_sequence_gen<0, nDim, 1>::SeqType>::value,
"wrong! invalid MemoryRanks");
#endif
} }
__host__ __device__ static constexpr index_t GetNumOfDimension() { return nDim; } __host__ __device__ static constexpr index_t GetNumOfDimension() { return nDim; }
__host__ __device__ static constexpr Lengths GetLengths() { return Lengths{}; } __host__ __device__ static constexpr auto GetLengths() { return Lengths{}; }
__host__ __device__ static constexpr auto GetStrides() { return Strides{}; }
__host__ __device__ static constexpr Strides GetStrides() { return Strides{}; } __host__ __device__ static constexpr auto GetMemoryRanks() { return MemoryRanks{}; }
template <index_t I> template <index_t I>
__host__ __device__ static constexpr index_t GetLength(Number<I>) __host__ __device__ static constexpr index_t GetLength(Number<I>)
...@@ -65,47 +60,58 @@ struct ConstantTensorDescriptor ...@@ -65,47 +60,58 @@ struct ConstantTensorDescriptor
return Strides{}.Get(Number<I>{}); return Strides{}.Get(Number<I>{});
} }
template <index_t I>
__host__ __device__ static constexpr index_t GetMemoryRank(Number<I>)
{
return MemoryRanks{}.Get(Number<I>{});
}
__host__ __device__ static constexpr index_t GetElementSize() __host__ __device__ static constexpr index_t GetElementSize()
{ {
return accumulate_on_sequence(Lengths{}, std::multiplies<index_t>{}, Number<1>{}); return accumulate_on_sequence(Lengths{}, std::multiplies<index_t>{}, Number<1>{});
} }
// WRONG! ReorderGivenOld2New is broken
template <class Align = Number<1>> template <class Align = Number<1>>
__host__ __device__ static constexpr index_t GetElementSpace(Align align = Align{}) __host__ __device__ static constexpr index_t GetElementSpace(Align align = Align{})
{ {
#if 0
constexpr auto lengths_in_rank = GetLengths().ReorderGivenOld2New(MemoryRank{});
constexpr auto strides_in_rank = GetStrides().ReorderGivenOld2new(MemoryRank{});
constexpr index_t element_space_unaligned = accumulate_on_sequence(
(lengths_in_rank - Number<1>{}) * strides_in_rank, std::plus<index_t>{}, Number<1>{});
#else // WRONG! align should be applied to the last memory rank, not the last tensor dimension
constexpr index_t element_space_unaligned = accumulate_on_sequence( constexpr index_t element_space_unaligned = accumulate_on_sequence(
(GetLengths() - Number<1>{}) * GetStrides(), std::plus<index_t>{}, Number<1>{}); (GetLengths() - Number<1>{}) * GetStrides(), std::plus<index_t>{}, Number<1>{});
#endif
return align.Get() * ((element_space_unaligned + align.Get() - 1) / align.Get()); return align.Get() * ((element_space_unaligned + align.Get() - 1) / align.Get());
} }
template <index_t NSize> template <index_t NSize>
__host__ __device__ static index_t Get1dIndex(Array<index_t, NSize> multi_id) __host__ __device__ static index_t GetOffsetFromMultiIndex(Array<index_t, NSize> multi_id)
{ {
static_assert(NSize == nDim, "wrong! Dimension not consistent"); static_assert(NSize == nDim, "wrong! Dimension not consistent");
index_t id = 0; index_t offset = 0;
static_for<0, nDim, 1>{}([&](auto IDim) { static_for<0, nDim, 1>{}([&](auto IDim) {
constexpr index_t idim = IDim.Get(); constexpr index_t idim = IDim.Get();
id += multi_id[idim] * GetStride(IDim); offset += multi_id[idim] * GetStride(IDim);
}); });
return id; return offset;
} }
template <class... Is> template <class... Is>
__host__ __device__ static index_t Get1dIndex(Is... is) __host__ __device__ static index_t GetOffsetFromMultiIndex(Is... is)
{ {
static_assert(sizeof...(Is) == nDim, "number of multi-index is wrong"); return GetOffsetFromMultiIndex(Array<index_t, sizeof...(Is)>{is...});
const auto multi_id = Array<index_t, nDim>(is...);
return Get1dIndex(multi_id);
} }
template <index_t... Is> template <index_t... Is>
__host__ __device__ static constexpr index_t Get1dIndex(Sequence<Is...> /*multi_id*/) __host__ __device__ static constexpr index_t GetOffsetFromMultiIndex(Sequence<Is...>)
{ {
static_assert(sizeof...(Is) == nDim, "wrong! Dimension not consistent"); static_assert(sizeof...(Is) == nDim, "wrong! Dimension not consistent");
...@@ -114,44 +120,84 @@ struct ConstantTensorDescriptor ...@@ -114,44 +120,84 @@ struct ConstantTensorDescriptor
return accumulate_on_sequence(multi_id * GetStrides(), std::plus<index_t>{}, Number<0>{}); return accumulate_on_sequence(multi_id * GetStrides(), std::plus<index_t>{}, Number<0>{});
} }
__host__ __device__ static Array<index_t, nDim> GetMultiIndex(index_t id) #if 0 // ReorderGivenOld2new is broken
__host__ __device__ static Array<index_t, nDim> GetMultiIndexFromOffset(index_t offset)
{ {
Array<index_t, nDim> multi_id; Array<index_t, nDim> ranked_multi_id;
constexpr auto ranked_strides =
GetStrides().ReorderGivenOld2New(MemoryRanks{}); // check this
// calculate index in each of the dimensions in the order of their rank (not dimension)
static_for<0, nDim - 1, 1>{}([&](auto IDim) { static_for<0, nDim - 1, 1>{}([&](auto IDim) {
constexpr index_t idim = IDim.Get(); constexpr index_t idim = IDim.Get();
multi_id[idim] = id / GetStride(IDim); constexpr index_t stride = ranked_strides.Get(Number<idim>{});
id -= multi_id[idim] * GetStride(IDim); ranked_multi_id[idim] = offset / stride;
offset -= ranked_multi_id[idim] * stride;
}); });
multi_id[nDim - 1] = id / GetStride(Number<nDim - 1>{}); ranked_multi_id[nDim - 1] = offset / ranked_strides.Get(Number<nDim - 1>{});
return multi_id; return reorder_array_given_new2old(ranked_multi_id, MemoryRanks{}); // check this
} }
#endif
__host__ __device__ static constexpr auto Pack() __host__ __device__ static Array<index_t, nDim> GetMultiIndexFrom1dIndex(index_t id)
{ {
constexpr auto default_strides = calculate_default_strides(Lengths{}); Array<index_t, nDim> multi_id;
return ConstantTensorDescriptor<Lengths, decltype(default_strides)>{};
constexpr auto dummy_strides = calculate_packed_tensor_strides(GetLengths());
// calculate index in each of the dimensions in the order of their dimension (not rank)
static_for<0, nDim - 1, 1>{}([&](auto IDim) {
constexpr index_t idim = IDim.Get();
constexpr index_t stride = dummy_strides.Get(Number<idim>{});
multi_id[idim] = id / stride;
id -= multi_id[idim] * stride;
});
multi_id[nDim - 1] = id / dummy_strides.Get(Number<nDim - 1>{});
return multi_id;
} }
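
For illustration only (not part of this commit): the two conversions above (multi-index to offset via the strides, and offset back to multi-index via packed strides) as plain runtime C++ with made-up lengths.

#include <cassert>

int main()
{
    const int lengths[3] = {3, 4, 5};
    const int strides[3] = {4 * 5, 5, 1}; // packed

    // multi-index -> offset: dot product with the strides
    const int id[3] = {2, 1, 3};
    int offset = 0;
    for(int d = 0; d < 3; ++d)
        offset += id[d] * strides[d];
    assert(offset == 2 * 20 + 1 * 5 + 3);

    // offset -> multi-index: repeated division by the packed strides
    int back[3];
    int rest = offset;
    for(int d = 0; d < 3; ++d)
    {
        back[d] = rest / strides[d];
        rest -= back[d] * strides[d];
    }
    assert(back[0] == 2 && back[1] == 1 && back[2] == 3);
    return 0;
}
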
// WRONG! Ranks is broken
template <index_t... IDims> template <index_t... IDims>
__host__ __device__ static constexpr auto Extract(Number<IDims>... extract_dims) __host__ __device__ static constexpr auto Extract(Number<IDims>... extract_dims)
{ {
static_assert(sizeof...(IDims) <= GetNumOfDimension(), static_assert(sizeof...(IDims) <= GetNumOfDimension(),
"wrong! too many number of dimensions to be extracted"); "wrong! too many number of dimensions to be extracted");
return make_ConstantTensorDescriptor(Lengths{}.Extract(extract_dims...), using extract_lengths = decltype(Lengths{}.Extract(extract_dims...));
Strides{}.Extract(extract_dims...)); using extract_strides = decltype(Strides{}.Extract(extract_dims...));
using extract_ranks = decltype(MemoryRanks{}.Extract(extract_dims...));
#if 0
using new_ranks = typename sequence_sort<extract_ranks>::Original2SortedType;
#else // WRONG! TODO:: implement sequence_sort
using new_ranks = typename arithmetic_sequence_gen<0, sizeof...(IDims), 1>::SeqType;
#endif
return ConstantTensorDescriptor<extract_lengths, extract_strides, new_ranks>{};
} }
template <index_t IDim, index_t SliceLen> template <index_t IDim, index_t SliceLen>
__host__ __device__ static constexpr auto Slice(Number<IDim>, Number<SliceLen>) __host__ __device__ static constexpr auto Slice(Number<IDim>, Number<SliceLen>)
{ {
return make_ConstantTensorDescriptor(Lengths{}.Modify(Number<IDim>{}, Number<SliceLen>{}), using slice_lengths = decltype(Lengths{}.Modify(Number<IDim>{}, Number<SliceLen>{}));
Strides{});
return ConstantTensorDescriptor<slice_lengths, Strides, MemoryRanks>{};
} }
template <index_t Threshold, index_t Delta>
struct f_fold_impl
{
__host__ __device__ constexpr index_t operator()(index_t x) const
{
return x > Threshold ? x + Delta : x;
}
};
template <index_t IDim, index_t... FoldIntervals> template <index_t IDim, index_t... FoldIntervals>
__host__ __device__ static constexpr auto Fold(Number<IDim>, Number<FoldIntervals>...) __host__ __device__ static constexpr auto Fold(Number<IDim>, Number<FoldIntervals>...)
{ {
...@@ -162,6 +208,7 @@ struct ConstantTensorDescriptor ...@@ -162,6 +208,7 @@ struct ConstantTensorDescriptor
constexpr auto unfold_length = GetLength(Number<IDim>{}); constexpr auto unfold_length = GetLength(Number<IDim>{});
constexpr auto unfold_stride = GetStride(Number<IDim>{}); constexpr auto unfold_stride = GetStride(Number<IDim>{});
constexpr auto unfold_rank = GetMemoryRank(Number<IDim>{});
// length of the dimension to be folded needs to be divisible by fold_interval_product, // length of the dimension to be folded needs to be divisible by fold_interval_product,
// otherwise, folding is invalid // otherwise, folding is invalid
...@@ -178,15 +225,44 @@ struct ConstantTensorDescriptor ...@@ -178,15 +225,44 @@ struct ConstantTensorDescriptor
reverse_inclusive_scan_sequence(fold_intervals.PushBack(Number<1>{}), reverse_inclusive_scan_sequence(fold_intervals.PushBack(Number<1>{}),
std::multiplies<index_t>{}); std::multiplies<index_t>{});
// folded_ranks
constexpr auto fold_ranks =
typename arithmetic_sequence_gen<unfold_rank,
unfold_rank + fold_intervals.GetSize() + 1,
1>::SeqType{};
// increase the ranks that are larger than unfold_rank
constexpr auto tmp_ranks = transform_sequences(
f_fold_impl<unfold_rank, fold_intervals.GetSize()>{}, GetMemoryRanks());
// left and right // left and right
constexpr auto left = make_increasing_sequence(Number<0>{}, Number<IDim>{}, Number<1>{}); constexpr auto left = typename arithmetic_sequence_gen<0, IDim, 1>::SeqType{};
constexpr auto right = make_increasing_sequence( constexpr auto right =
Number<IDim + 1>{}, Number<GetNumOfDimension()>{}, Number<1>{}); typename arithmetic_sequence_gen<IDim + 1, GetNumOfDimension(), 1>::SeqType{};
constexpr auto new_lengths =
GetLengths().Extract(left).Append(fold_lengths).Append(GetLengths().Extract(right));
constexpr auto new_strides =
GetStrides().Extract(left).Append(fold_strides).Append(GetStrides().Extract(right));
constexpr auto new_ranks =
tmp_ranks.Extract(left).Append(fold_ranks).Append(tmp_ranks.Extract(right));
static_assert(new_ranks.GetSize() == new_lengths.GetSize(), "wrong!");
static_assert(fold_ranks.GetSize() == fold_lengths.GetSize(), "wrong!");
return ConstantTensorDescriptor<decltype(new_lengths),
decltype(new_strides),
decltype(new_ranks)>{};
}
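
For illustration only (not part of this commit): a runtime sketch of what Fold does to a single dimension, with made-up numbers. A dimension of length 12 and stride 7 folded with intervals {3, 2} becomes three dimensions whose lengths follow length / (3*2), 3, 2 and whose strides follow the running-product rule used above; every offset stays the same.

#include <cstdio>

int main()
{
    const int length = 12, stride = 7; // dimension to be folded
    const int intervals[2] = {3, 2};   // fold intervals

    // folded lengths: {length / (3*2), 3, 2}
    const int fold_lengths[3] = {length / (intervals[0] * intervals[1]), intervals[0], intervals[1]};

    // folded strides: reverse running product of {3, 2, 1} times the old stride
    const int fold_strides[3] = {stride * intervals[0] * intervals[1], stride * intervals[1], stride};

    // the folded dims address exactly the same elements: i == i0*(3*2) + i1*2 + i2
    for(int i0 = 0; i0 < fold_lengths[0]; ++i0)
        for(int i1 = 0; i1 < fold_lengths[1]; ++i1)
            for(int i2 = 0; i2 < fold_lengths[2]; ++i2)
            {
                const int folded_off   = i0 * fold_strides[0] + i1 * fold_strides[1] + i2 * fold_strides[2];
                const int original_i   = i0 * intervals[0] * intervals[1] + i1 * intervals[1] + i2;
                const int original_off = original_i * stride;
                if(folded_off != original_off)
                    std::printf("mismatch\n");
            }
    std::printf("fold of length %d with intervals {3, 2} preserves all offsets\n", length);
}
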
return make_ConstantTensorDescriptor( template <index_t Threshold, index_t Delta>
GetLengths().Extract(left).Append(fold_lengths).Append(GetLengths().Extract(right)), struct f_unfold_impl
GetStrides().Extract(left).Append(fold_strides).Append(GetStrides().Extract(right))); {
__host__ __device__ constexpr index_t operator()(index_t x) const
{
return x > Threshold ? x - Delta : x;
} }
};
template <index_t FirstUnfoldDim, index_t LastUnfoldDim> template <index_t FirstUnfoldDim, index_t LastUnfoldDim>
__host__ __device__ static constexpr auto Unfold(Number<FirstUnfoldDim>, Number<LastUnfoldDim>) __host__ __device__ static constexpr auto Unfold(Number<FirstUnfoldDim>, Number<LastUnfoldDim>)
...@@ -198,66 +274,109 @@ struct ConstantTensorDescriptor ...@@ -198,66 +274,109 @@ struct ConstantTensorDescriptor
// dimensions to be unfolded need to be in descending order (w.r.t. strides), and need to be // dimensions to be unfolded need to be in descending order (w.r.t. strides), and need to be
// packed in memory, otherwise, unfolding is invalid // packed in memory, otherwise, unfolding is invalid
static_for<FirstUnfoldDim, LastUnfoldDim, 1>{}([&](auto IDim) { static_for<FirstUnfoldDim, LastUnfoldDim, 1>{}([&](auto IDim) {
constexpr auto IDim_p1 = IDim + Number<1>{};
// check stride
static_assert( static_assert(
GetStride(IDim) >= GetStride(Number<IDim.Get() + 1>{}), GetStride(IDim) >= GetStride(IDim_p1),
"wrong! dimensions to be unfolded need to be in descending order w.r.t strides"); "wrong! dimensions to be unfolded need to be in descending order w.r.t strides");
static_assert(GetStride(IDim + 1) * GetLength(IDim + 1) == GetStride(IDim), // check if packed
static_assert(GetStride(IDim_p1) * GetLength(IDim_p1) == GetStride(IDim),
"wrong! dimensions to be unfolded need to be packed"); "wrong! dimensions to be unfolded need to be packed");
// check ranks
static_assert(GetMemoryRank(IDim_p1) == GetMemoryRank(IDim) + 1,
"wrong! ranks of dimensions to be "
"unfolded need to be increasing "
"and contiguous");
}); });
// left and right // left and right
constexpr auto left = constexpr auto left = typename arithmetic_sequence_gen<0, FirstUnfoldDim, 1>::SeqType{};
make_increasing_sequence(Number<0>{}, Number<FirstUnfoldDim>{}, Number<1>{}); constexpr auto middle =
constexpr auto middle = make_increasing_sequence( typename arithmetic_sequence_gen<FirstUnfoldDim, LastUnfoldDim + 1, 1>::SeqType{};
Number<FirstUnfoldDim>{}, Number<LastUnfoldDim + 1>{}, Number<1>{}); constexpr auto right =
constexpr auto right = make_increasing_sequence( typename arithmetic_sequence_gen<LastUnfoldDim + 1, GetNumOfDimension(), 1>::SeqType{};
Number<LastUnfoldDim + 1>{}, Number<GetNumOfDimension()>{}, Number<1>{});
// unfolded length, stride and rank
// length and stride
constexpr index_t unfold_length = accumulate_on_sequence( constexpr index_t unfold_length = accumulate_on_sequence(
GetLengths().Extract(middle), std::multiplies<index_t>{}, Number<1>{}); GetLengths().Extract(middle), std::multiplies<index_t>{}, Number<1>{});
constexpr index_t unfold_stride = GetStride(Number<LastUnfoldDim>{}); constexpr index_t unfold_stride = GetStride(Number<LastUnfoldDim>{});
return make_ConstantTensorDescriptor(GetLengths() constexpr index_t unfold_rank = GetMemoryRank(Number<FirstUnfoldDim>{});
// decrease the ranks that are larger than the rank of LastUnfoldDim
constexpr auto tmp_ranks =
transform_sequences(GetMemoryRanks(),
f_unfold_impl<GetMemoryRank(Number<LastUnfoldDim>{}),
LastUnfoldDim - FirstUnfoldDim + 1>{});
// new lengths, strides and ranks
constexpr auto new_lengths = GetLengths()
.Extract(left) .Extract(left)
.PushBack(Number<unfold_length>{}) .PushBack(Number<unfold_length>{})
.Append(GetLengths().Extract(right)), .Append(GetLengths().Extract(right));
GetStrides()
constexpr auto new_strides = GetStrides()
.Extract(left) .Extract(left)
.PushBack(Number<unfold_stride>{}) .PushBack(Number<unfold_stride>{})
.Append(GetStrides().Extract(right))); .Append(GetStrides().Extract(right));
constexpr auto new_ranks = tmp_ranks.Extract(left)
.PushBack(Number<unfold_rank>{})
.Append(tmp_ranks.Extract(right));
return ConstantTensorDescriptor<decltype(new_lengths),
decltype(new_strides),
decltype(new_ranks)>{};
}
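
For illustration only (not part of this commit): a runtime sketch of Unfold on a packed 4-d tensor with made-up lengths. Merging dimensions 1 and 2 yields one dimension whose length is the product of the merged lengths and whose stride is the stride of the fastest merged dimension, exactly the conditions the static_asserts above check.

#include <cstdio>

int main()
{
    const int lengths[4] = {2, 3, 4, 5};
    const int strides[4] = {60, 20, 5, 1}; // packed, descending

    // merge dims 1 and 2: they are packed, since strides[1] == strides[2] * lengths[2]
    const int unfold_length = lengths[1] * lengths[2]; // 12
    const int unfold_stride = strides[2];              // 5

    // resulting 3-d view
    const int new_lengths[3] = {lengths[0], unfold_length, lengths[3]};
    const int new_strides[3] = {strides[0], unfold_stride, strides[3]};

    // offsets agree: (i1, i2) maps to i12 = i1 * lengths[2] + i2
    const int i0 = 1, i1 = 2, i2 = 3, i3 = 4;
    const int off_4d = i0 * strides[0] + i1 * strides[1] + i2 * strides[2] + i3 * strides[3];
    const int off_3d = i0 * new_strides[0] + (i1 * lengths[2] + i2) * new_strides[1] + i3 * new_strides[2];
    std::printf("4d offset = %d, unfolded 3d offset = %d (lengths %d %d %d)\n",
                off_4d, off_3d, new_lengths[0], new_lengths[1], new_lengths[2]);
}
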
template <class MapNew2Old>
__host__ __device__ static constexpr auto ReorderGivenNew2Old(MapNew2Old)
{
return ConstantTensorDescriptor<decltype(Lengths{}.ReorderGivenNew2Old(MapNew2Old{})),
decltype(Strides{}.ReorderGivenNew2Old(MapNew2Old{})),
decltype(
MemoryRanks{}.ReorderGivenNew2Old(MapNew2Old{}))>{};
} }
template <index_t... IRs> #if 0 // require sequence_sort, which is not implemented yet
__host__ __device__ static constexpr auto ReorderGivenNew2Old(Sequence<IRs...> /*new2old*/) template <class MapOld2New>
__host__ __device__ static constexpr auto ReorderGivenOld2New(MapOld2New)
{ {
static_assert(sizeof...(IRs) == GetNumOfDimension(), "wrong! dimension is wrong"); return ConstantTensorDescriptor<decltype(Lengths{}.ReorderGivenOld2New(MapOld2New{})),
constexpr auto map_new2old = Sequence<IRs...>{}; decltype(Strides{}.ReorderGivenOld2New(MapOld2New{})),
return make_ConstantTensorDescriptor(Lengths{}.ReorderGivenNew2Old(map_new2old), decltype(
Strides{}.ReorderGivenNew2Old(map_new2old)); MemoryRanks{}.ReorderGivenOld2New(MapOld2New{}))>{};
} }
#endif
}; };
template <class Lengths> template <class Lengths>
__host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths) __host__ __device__ constexpr auto make_packed_ConstantTensorDescriptor(Lengths)
{ {
using Strides = decltype(calculate_default_strides(Lengths{})); using Strides = decltype(calculate_packed_tensor_strides(Lengths{}));
return ConstantTensorDescriptor<Lengths, Strides>{}; using MemoryRanks = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::SeqType;
return ConstantTensorDescriptor<Lengths, Strides, MemoryRanks>{};
} }
template <class Lengths, class Strides> template <class Lengths, class Strides>
__host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Strides) __host__ __device__ constexpr auto make_ranked_ConstantTensorDescriptor(Lengths, Strides)
{ {
return ConstantTensorDescriptor<Lengths, Strides>{}; using MemoryRanks = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::SeqType;
return ConstantTensorDescriptor<Lengths, Strides, MemoryRanks>{};
} }
template <class Lengths, index_t Align> template <class Lengths, index_t Align>
__host__ __device__ constexpr auto make_ConstantTensorDescriptor_aligned(Lengths, Number<Align>) __host__ __device__ constexpr auto
make_ranked_ConstantTensorDescriptor_with_alignment(Lengths, Number<Align>)
{ {
using Strides = decltype(calculate_default_strides_aligned(Lengths{}, Number<Align>{})); using Strides =
return ConstantTensorDescriptor<Lengths, Strides>{}; decltype(calculate_rank_tensor_default_strides_with_alignment(Lengths{}, Number<Align>{}));
using MemoryRanks = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::SeqType;
return ConstantTensorDescriptor<Lengths, Strides, MemoryRanks>{};
} }
template <class TDesc> template <class TDesc>
......
...@@ -9,76 +9,100 @@ struct Sequence ...@@ -9,76 +9,100 @@ struct Sequence
static constexpr index_t mSize = sizeof...(Is); static constexpr index_t mSize = sizeof...(Is);
const index_t mData[mSize + 1] = {
Is..., 0}; // the last element is dummy, to prevent compiler complain on empty Sequence
__host__ __device__ static constexpr index_t GetSize() { return mSize; } __host__ __device__ static constexpr index_t GetSize() { return mSize; }
template <index_t I> template <index_t I>
__host__ __device__ constexpr index_t Get(Number<I>) const __host__ __device__ static constexpr index_t Get(Number<I>)
{ {
static_assert(I < mSize, "wrong! I too large");
// the last dummy element is to prevent the compiler complaining about an empty Sequence
const index_t mData[mSize + 1] = {Is..., 0};
return mData[I]; return mData[I];
} }
__host__ __device__ index_t operator[](index_t i) const { return mData[i]; } __host__ __device__ index_t operator[](index_t i) const
{
const index_t mData[mSize + 1] = {Is..., 0};
return mData[i];
}
template <index_t... IRs> template <index_t... IRs>
__host__ __device__ constexpr auto ReorderGivenNew2Old(Sequence<IRs...> /*new2old*/) const __host__ __device__ static constexpr auto ReorderGivenNew2Old(Sequence<IRs...> /*new2old*/)
{ {
static_assert(mSize == sizeof...(IRs), "mSize not consistent"); #if 0 // require sequence_sort, which is not implemented yet
static_assert(is_same<sequence_sort<Sequence<IRs...>>::SortedSeqType,
constexpr auto old = Type{}; arithmetic_sequence_gen<0, mSize, 1>::SeqType>::value,
"wrong! invalid new2old map");
#endif
return Sequence<old.Get(Number<IRs>{})...>{}; return Sequence<Type{}.Get(Number<IRs>{})...>{};
} }
template <index_t... IRs> #if 0 // require sequence_sort, which is not implemented yet
__host__ __device__ constexpr auto ReorderGivenOld2New(Sequence<IRs...> /*old2new*/) const template <class MapOld2New>
__host__ __device__ static constexpr auto ReorderGivenOld2New(MapOld2New /*old2new*/)
{ {
// TODO: don't know how to implement this static_assert(is_same<sequence_sort<MapOld2New>::SortedSeqType,
printf("Sequence::ReorderGivenOld2New not implemented"); arithmetic_sequence_gen<0, mSize, 1>::SeqType>::value,
assert(false); "wrong! invalid old2new map");
constexpr auto map_new2old = typename sequence_map_inverse<MapOld2New>::SeqMapType{};
return ReorderGivenNew2Old(map_new2old);
} }
#endif
__host__ __device__ constexpr auto Reverse() const; __host__ __device__ static constexpr auto Reverse();
__host__ __device__ constexpr index_t Front() const { return mData[0]; } __host__ __device__ static constexpr index_t Front()
{
const index_t mData[mSize + 1] = {Is..., 0};
return mData[0];
}
__host__ __device__ constexpr index_t Back() const { return mData[mSize - 1]; } __host__ __device__ static constexpr index_t Back()
{
const index_t mData[mSize + 1] = {Is..., 0};
return mData[mSize - 1];
}
template <index_t I> template <index_t I>
__host__ __device__ constexpr auto PushFront(Number<I>) const __host__ __device__ static constexpr auto PushFront(Number<I>)
{ {
return Sequence<I, Is...>{}; return Sequence<I, Is...>{};
} }
template <index_t I> template <index_t I>
__host__ __device__ constexpr auto PushBack(Number<I>) const __host__ __device__ static constexpr auto PushBack(Number<I>)
{ {
return Sequence<Is..., I>{}; return Sequence<Is..., I>{};
} }
__host__ __device__ constexpr auto PopFront() const; __host__ __device__ static constexpr auto PopFront();
__host__ __device__ constexpr auto PopBack() const; __host__ __device__ static constexpr auto PopBack();
template <index_t... Xs> template <index_t... Xs>
__host__ __device__ constexpr auto Append(Sequence<Xs...>) const __host__ __device__ static constexpr auto Append(Sequence<Xs...>)
{ {
return Sequence<Is..., Xs...>{}; return Sequence<Is..., Xs...>{};
} }
template <index_t... Ns> template <index_t... Ns>
__host__ __device__ constexpr auto Extract(Number<Ns>...) const __host__ __device__ static constexpr auto Extract(Number<Ns>...)
{ {
return Sequence<Type{}.Get(Number<Ns>{})...>{}; return Sequence<Type{}.Get(Number<Ns>{})...>{};
} }
template <index_t... Ns> template <index_t... Ns>
__host__ __device__ constexpr auto Extract(Sequence<Ns...>) const __host__ __device__ static constexpr auto Extract(Sequence<Ns...>)
{ {
return Sequence<Type{}.Get(Number<Ns>{})...>{}; return Sequence<Type{}.Get(Number<Ns>{})...>{};
} }
template <index_t I, index_t X>
__host__ __device__ static constexpr auto Modify(Number<I>, Number<X>);
}; };
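
For illustration only (not part of this commit): the kind of compile-time integer-list manipulation Sequence provides (Get, Append, and so on) can be sketched with std::integer_sequence in plain C++17. seq_get and seq_append below are stand-ins, not the repository's API.

#include <utility>

// read element I of an integer_sequence
template <std::size_t I, class T, T Head, T... Tail>
constexpr T seq_get(std::integer_sequence<T, Head, Tail...>)
{
    if constexpr (I == 0)
        return Head;
    else
        return seq_get<I - 1>(std::integer_sequence<T, Tail...>{});
}

// concatenate two integer_sequences
template <class T, T... Xs, T... Ys>
constexpr auto seq_append(std::integer_sequence<T, Xs...>, std::integer_sequence<T, Ys...>)
{
    return std::integer_sequence<T, Xs..., Ys...>{};
}

int main()
{
    using S = std::integer_sequence<int, 2, 3, 4>;
    static_assert(seq_get<1>(S{}) == 3, "element 1 of {2, 3, 4}");
    static_assert(seq_get<3>(seq_append(S{}, std::integer_sequence<int, 7>{})) == 7, "appended element");
}
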
template <class, class> template <class, class>
...@@ -91,43 +115,36 @@ struct sequence_merge<Sequence<Xs...>, Sequence<Ys...>> ...@@ -91,43 +115,36 @@ struct sequence_merge<Sequence<Xs...>, Sequence<Ys...>>
}; };
template <index_t IBegin, index_t NSize, index_t Increment> template <index_t IBegin, index_t NSize, index_t Increment>
struct increasing_sequence_gen_impl struct arithmetic_sequence_gen_impl
{ {
static constexpr index_t NSizeLeft = NSize / 2; static constexpr index_t NSizeLeft = NSize / 2;
using SeqType = typename sequence_merge< using SeqType = typename sequence_merge<
typename increasing_sequence_gen_impl<IBegin, NSizeLeft, Increment>::SeqType, typename arithmetic_sequence_gen_impl<IBegin, NSizeLeft, Increment>::SeqType,
typename increasing_sequence_gen_impl<IBegin + NSizeLeft * Increment, typename arithmetic_sequence_gen_impl<IBegin + NSizeLeft * Increment,
NSize - NSizeLeft, NSize - NSizeLeft,
Increment>::SeqType>::SeqType; Increment>::SeqType>::SeqType;
}; };
template <index_t IBegin, index_t Increment> template <index_t IBegin, index_t Increment>
struct increasing_sequence_gen_impl<IBegin, 1, Increment> struct arithmetic_sequence_gen_impl<IBegin, 1, Increment>
{ {
using SeqType = Sequence<IBegin>; using SeqType = Sequence<IBegin>;
}; };
template <index_t IBegin, index_t Increment> template <index_t IBegin, index_t Increment>
struct increasing_sequence_gen_impl<IBegin, 0, Increment> struct arithmetic_sequence_gen_impl<IBegin, 0, Increment>
{ {
using SeqType = Sequence<>; using SeqType = Sequence<>;
}; };
template <index_t IBegin, index_t IEnd, index_t Increment> template <index_t IBegin, index_t IEnd, index_t Increment>
struct increasing_sequence_gen struct arithmetic_sequence_gen
{ {
using SeqType = using SeqType =
typename increasing_sequence_gen_impl<IBegin, IEnd - IBegin, Increment>::SeqType; typename arithmetic_sequence_gen_impl<IBegin, IEnd - IBegin, Increment>::SeqType;
}; };
template <index_t IBegin, index_t IEnd, index_t Increment>
__host__ __device__ constexpr auto
make_increasing_sequence(Number<IBegin>, Number<IEnd>, Number<Increment>)
{
return typename increasing_sequence_gen<IBegin, IEnd, Increment>::SeqType{};
}
template <class, class> template <class, class>
struct sequence_reverse_inclusive_scan; struct sequence_reverse_inclusive_scan;
...@@ -161,8 +178,8 @@ struct sequence_split ...@@ -161,8 +178,8 @@ struct sequence_split
{ {
static constexpr index_t NSize = Seq{}.GetSize(); static constexpr index_t NSize = Seq{}.GetSize();
using range0 = typename increasing_sequence_gen<0, I, 1>::SeqType; using range0 = typename arithmetic_sequence_gen<0, I, 1>::SeqType;
using range1 = typename increasing_sequence_gen<I, NSize, 1>::SeqType; using range1 = typename arithmetic_sequence_gen<I, NSize, 1>::SeqType;
using SeqType0 = typename sequence_extract<Seq, range0>::SeqType; using SeqType0 = typename sequence_extract<Seq, range0>::SeqType;
using SeqType1 = typename sequence_extract<Seq, range1>::SeqType; using SeqType1 = typename sequence_extract<Seq, range1>::SeqType;
...@@ -191,6 +208,63 @@ struct sequence_reverse<Sequence<I0, I1>> ...@@ -191,6 +208,63 @@ struct sequence_reverse<Sequence<I0, I1>>
using SeqType = Sequence<I1, I0>; using SeqType = Sequence<I1, I0>;
}; };
#if 0 // not fully implemented
template <class KeySeq0, class ValSeq0, class KeySeq1, class ValSeq1>
struct sequence_sort_merge_impl;
template <index_t Key0,
index_t... Keys0,
index_t Val0,
index_t... Vals0,
index_t Key1,
index_t... Keys1,
index_t Val1,
index_t... Vals1>
struct sequence_sort_merge_impl<Sequence<Key0, Keys0...>,
Sequence<Val0, Vals0...>,
Sequence<Key1, Keys1...>,
Sequence<Val1, Vals1...>>
{
};
template <class>
struct sequence_sort;
template <index_t... Is>
struct sequence_sort<Sequence<Is...>>
{
using OriginalSeqType = Sequence<Is...>;
using SortedSeqType = xxxxx;
using MapSorted2OriginalType = xxx;
};
template <class Seq, class IsValidSeqMap>
struct sequence_map_inverse_impl;
// impl for valid map, no impl for invalid map
template <index_t... Is>
struct sequence_map_inverse_impl<Sequence<Is...>, true>
{
using SeqMapType = sequence_sort<Sequence<Is...>>::MapSorted2OriginalType;
};
template <class>
struct sequence_map_inverse;
template <index_t... Is>
struct sequence_map_inverse<Sequence<Is...>>
{
// TODO: make sure the map to be inversed is valid: [0, sizeof...(Is))
static constexpr bool is_valid_sequence_map =
is_same<typename sequence_sort<Sequence<Is...>>::SortedSeqType,
typename arithmetic_sequence_gen<0, sizeof...(Is), 1>::SeqType>::value;
// make the compiler fail if is_valid_sequence_map != true
using SeqMapType =
typename sequence_map_inverse_impl<Sequence<Is...>, is_valid_sequence_map>::SeqMapType;
};
#endif
template <index_t... Xs, index_t... Ys> template <index_t... Xs, index_t... Ys>
__host__ __device__ constexpr auto operator+(Sequence<Xs...>, Sequence<Ys...>) __host__ __device__ constexpr auto operator+(Sequence<Xs...>, Sequence<Ys...>)
{ {
...@@ -243,7 +317,7 @@ __host__ __device__ constexpr auto operator+(Sequence<Xs...>, Number<Y>) ...@@ -243,7 +317,7 @@ __host__ __device__ constexpr auto operator+(Sequence<Xs...>, Number<Y>)
template <index_t... Xs, index_t Y> template <index_t... Xs, index_t Y>
__host__ __device__ constexpr auto operator-(Sequence<Xs...>, Number<Y>) __host__ __device__ constexpr auto operator-(Sequence<Xs...>, Number<Y>)
{ {
#if 0 // doesn't compile #if 0 // TODO: turn it on. Doesn't compile
constexpr auto seq_x = Sequence<Xs...>{}; constexpr auto seq_x = Sequence<Xs...>{};
static_for<0, sizeof...(Xs), 1>{}([&](auto Iter) { static_for<0, sizeof...(Xs), 1>{}([&](auto Iter) {
...@@ -313,14 +387,13 @@ __host__ __device__ constexpr auto operator%(Number<Y>, Sequence<Xs...>) ...@@ -313,14 +387,13 @@ __host__ __device__ constexpr auto operator%(Number<Y>, Sequence<Xs...>)
template <index_t I, index_t... Is> template <index_t I, index_t... Is>
__host__ __device__ constexpr auto sequence_pop_front(Sequence<I, Is...>) __host__ __device__ constexpr auto sequence_pop_front(Sequence<I, Is...>)
{ {
static_assert(sizeof...(Is) > 0, "empty Sequence!");
return Sequence<Is...>{}; return Sequence<Is...>{};
} }
template <class Seq> template <class Seq>
__host__ __device__ constexpr auto sequence_pop_back(Seq) __host__ __device__ constexpr auto sequence_pop_back(Seq)
{ {
static_assert(Seq{}.GetSize() > 0, "empty Sequence!"); static_assert(Seq{}.GetSize() > 0, "wrong! cannot pop an empty Sequence!");
return sequence_pop_front(Seq{}.Reverse()).Reverse(); return sequence_pop_front(Seq{}.Reverse()).Reverse();
} }
...@@ -349,16 +422,16 @@ transform_sequences(F f, Sequence<Xs...>, Sequence<Ys...>, Sequence<Zs...>) ...@@ -349,16 +422,16 @@ transform_sequences(F f, Sequence<Xs...>, Sequence<Ys...>, Sequence<Zs...>)
return Sequence<f(Xs, Ys, Zs)...>{}; return Sequence<f(Xs, Ys, Zs)...>{};
} }
template <index_t... Is> template <class Seq, class Reduce>
__host__ __device__ constexpr auto Sequence<Is...>::PopFront() const __host__ __device__ constexpr auto reverse_inclusive_scan_sequence(Seq, Reduce)
{ {
return sequence_pop_front(Type{}); return typename sequence_reverse_inclusive_scan<Seq, Reduce>::SeqType{};
} }
template <index_t... Is> template <class Seq, class Reduce>
__host__ __device__ constexpr auto Sequence<Is...>::PopBack() const __host__ __device__ constexpr auto inclusive_scan_sequence(Seq, Reduce)
{ {
return sequence_pop_back(Type{}); return reverse_inclusive_scan_sequence(Seq{}.Reverse(), Reduce{}).Reverse();
} }
template <class Seq> template <class Seq>
...@@ -381,19 +454,32 @@ __host__ __device__ constexpr index_t ...@@ -381,19 +454,32 @@ __host__ __device__ constexpr index_t
} }
template <index_t... Is> template <index_t... Is>
__host__ __device__ constexpr auto Sequence<Is...>::Reverse() const __host__ __device__ constexpr auto Sequence<Is...>::PopFront()
{ {
return typename sequence_reverse<Sequence<Is...>>::SeqType{}; return sequence_pop_front(Type{});
} }
template <class Seq, class Reduce> template <index_t... Is>
__host__ __device__ constexpr auto reverse_inclusive_scan_sequence(Seq, Reduce) __host__ __device__ constexpr auto Sequence<Is...>::PopBack()
{ {
return typename sequence_reverse_inclusive_scan<Seq, Reduce>::SeqType{}; return sequence_pop_back(Type{});
} }
template <class Seq, class Reduce> template <index_t... Is>
__host__ __device__ constexpr auto inclusive_scan_sequence(Seq, Reduce) __host__ __device__ constexpr auto Sequence<Is...>::Reverse()
{ {
return reverse_inclusive_scan_sequence(Seq{}.Reverse(), Reduce{}).Reverse(); return typename sequence_reverse<Sequence<Is...>>::SeqType{};
}
template <index_t... Is>
template <index_t I, index_t X>
__host__ __device__ constexpr auto Sequence<Is...>::Modify(Number<I>, Number<X>)
{
static_assert(I < GetSize(), "wrong!");
using seq_split = sequence_split<Type, I>;
constexpr auto seq_left = typename seq_split::SeqType0{};
constexpr auto seq_right = typename seq_split::SeqType1{}.PopFront();
return seq_left.PushBack(Number<X>{}).Append(seq_right);
} }
...@@ -33,7 +33,7 @@ blockwise_2d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst ...@@ -33,7 +33,7 @@ blockwise_2d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
const index_t did1 = is / desc.GetStride(I1); const index_t did1 = is / desc.GetStride(I1);
const index_t dindex = dst_desc.Get1dIndex(did0, did1); const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1);
f(p_dst[dindex]); f(p_dst[dindex]);
} }
...@@ -52,7 +52,7 @@ blockwise_2d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst ...@@ -52,7 +52,7 @@ blockwise_2d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
const index_t did1 = is / desc.GetStride(I1); const index_t did1 = is / desc.GetStride(I1);
const index_t dindex = dst_desc.Get1dIndex(did0, did1); const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1);
f(p_dst[dindex]); f(p_dst[dindex]);
} }
...@@ -102,9 +102,9 @@ __device__ void blockwise_2d_tensor_pointwise_operation_binary_reorder_by_get_ds ...@@ -102,9 +102,9 @@ __device__ void blockwise_2d_tensor_pointwise_operation_binary_reorder_by_get_ds
did[1] = is / ref_desc.GetStride(I1); did[1] = is / ref_desc.GetStride(I1);
const index_t aindex = src_desc.Get1dIndex(did[0], did[1]); const index_t aindex = src_desc.GetOffsetFromMultiIndex(did[0], did[1]);
const index_t bindex = dst_desc.Get1dIndex(did[IR0], did[IR1]); const index_t bindex = dst_desc.GetOffsetFromMultiIndex(did[IR0], did[IR1]);
f(p_src[aindex], p_dst[bindex]); f(p_src[aindex], p_dst[bindex]);
} }
...@@ -125,9 +125,9 @@ __device__ void blockwise_2d_tensor_pointwise_operation_binary_reorder_by_get_ds ...@@ -125,9 +125,9 @@ __device__ void blockwise_2d_tensor_pointwise_operation_binary_reorder_by_get_ds
did[1] = is / ref_desc.GetStride(I1); did[1] = is / ref_desc.GetStride(I1);
const index_t aindex = src_desc.Get1dIndex(did[0], did[1]); const index_t aindex = src_desc.GetOffsetFromMultiIndex(did[0], did[1]);
const index_t bindex = dst_desc.Get1dIndex(did[IR0], did[IR1]); const index_t bindex = dst_desc.GetOffsetFromMultiIndex(did[IR0], did[IR1]);
f(p_src[aindex], p_dst[bindex]); f(p_src[aindex], p_dst[bindex]);
} }
...@@ -224,8 +224,10 @@ struct Blockwise2dTensorCopy1 ...@@ -224,8 +224,10 @@ struct Blockwise2dTensorCopy1
did[1] = is / ref_desc.GetStride(I1); did[1] = is / ref_desc.GetStride(I1);
const index_t src_index = src_desc.Get1dIndex(did[0], did[1] * DataPerRead); const index_t src_index =
const index_t dst_index = dst_desc.Get1dIndex(did[0], did[1] * DataPerRead); src_desc.GetOffsetFromMultiIndex(did[0], did[1] * DataPerRead);
const index_t dst_index =
dst_desc.GetOffsetFromMultiIndex(did[0], did[1] * DataPerRead);
*(reinterpret_cast<vector_t*>(p_dst + dst_index)) = *(reinterpret_cast<vector_t*>(p_dst + dst_index)) =
*(reinterpret_cast<const vector_t*>(p_src + src_index)); *(reinterpret_cast<const vector_t*>(p_src + src_index));
...@@ -328,8 +330,8 @@ struct Blockwise2dTensorCopy2 ...@@ -328,8 +330,8 @@ struct Blockwise2dTensorCopy2
{ {
index_t did1 = d1v4loop * 4 * ThreadPerDim1 + 4 * mThreadId1; index_t did1 = d1v4loop * 4 * ThreadPerDim1 + 4 * mThreadId1;
const index_t sindex = src_desc.Get1dIndex(did0, did1); const index_t sindex = src_desc.GetOffsetFromMultiIndex(did0, did1);
const index_t dindex = dst_desc.Get1dIndex(did0, did1); const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1);
*(reinterpret_cast<Float4*>(p_dst + dindex)) = *(reinterpret_cast<Float4*>(p_dst + dindex)) =
*(reinterpret_cast<const Float4*>(p_src + sindex)); *(reinterpret_cast<const Float4*>(p_src + sindex));
...@@ -341,8 +343,8 @@ struct Blockwise2dTensorCopy2 ...@@ -341,8 +343,8 @@ struct Blockwise2dTensorCopy2
index_t did1 = index_t did1 =
Dim1V4Loop * 4 * ThreadPerDim1 + d1v2loop * 2 * ThreadPerDim1 + 2 * mThreadId1; Dim1V4Loop * 4 * ThreadPerDim1 + d1v2loop * 2 * ThreadPerDim1 + 2 * mThreadId1;
const index_t sindex = src_desc.Get1dIndex(did0, did1); const index_t sindex = src_desc.GetOffsetFromMultiIndex(did0, did1);
const index_t dindex = dst_desc.Get1dIndex(did0, did1); const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1);
*(reinterpret_cast<Float2*>(p_dst + dindex)) = *(reinterpret_cast<Float2*>(p_dst + dindex)) =
*(reinterpret_cast<const Float2*>(p_src + sindex)); *(reinterpret_cast<const Float2*>(p_src + sindex));
...@@ -354,8 +356,8 @@ struct Blockwise2dTensorCopy2 ...@@ -354,8 +356,8 @@ struct Blockwise2dTensorCopy2
index_t did1 = Dim1V4Loop * 4 * ThreadPerDim1 + Dim1V2Loop * 2 * ThreadPerDim1 + index_t did1 = Dim1V4Loop * 4 * ThreadPerDim1 + Dim1V2Loop * 2 * ThreadPerDim1 +
d1v1loop * ThreadPerDim1 + mThreadId1; d1v1loop * ThreadPerDim1 + mThreadId1;
const index_t sindex = src_desc.Get1dIndex(did0, did1); const index_t sindex = src_desc.GetOffsetFromMultiIndex(did0, did1);
const index_t dindex = dst_desc.Get1dIndex(did0, did1); const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1);
p_dst[dindex] = p_src[sindex]; p_dst[dindex] = p_src[sindex];
} }
...@@ -368,8 +370,8 @@ struct Blockwise2dTensorCopy2 ...@@ -368,8 +370,8 @@ struct Blockwise2dTensorCopy2
if(did1 < L1) if(did1 < L1)
{ {
const index_t sindex = src_desc.Get1dIndex(did0, did1); const index_t sindex = src_desc.GetOffsetFromMultiIndex(did0, did1);
const index_t dindex = dst_desc.Get1dIndex(did0, did1); const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1);
p_dst[dindex] = p_src[sindex]; p_dst[dindex] = p_src[sindex];
} }
...@@ -389,8 +391,8 @@ struct Blockwise2dTensorCopy2 ...@@ -389,8 +391,8 @@ struct Blockwise2dTensorCopy2
{ {
index_t did1 = d1v4loop * 4 * ThreadPerDim1 + 4 * mThreadId1; index_t did1 = d1v4loop * 4 * ThreadPerDim1 + 4 * mThreadId1;
const index_t sindex = src_desc.Get1dIndex(did0, did1); const index_t sindex = src_desc.GetOffsetFromMultiIndex(did0, did1);
const index_t dindex = dst_desc.Get1dIndex(did0, did1); const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1);
*(reinterpret_cast<Float4*>(p_dst + dindex)) = *(reinterpret_cast<Float4*>(p_dst + dindex)) =
*(reinterpret_cast<const Float4*>(p_src + sindex)); *(reinterpret_cast<const Float4*>(p_src + sindex));
...@@ -402,8 +404,8 @@ struct Blockwise2dTensorCopy2 ...@@ -402,8 +404,8 @@ struct Blockwise2dTensorCopy2
index_t did1 = Dim1V4Loop * 4 * ThreadPerDim1 + d1v2loop * 2 * ThreadPerDim1 + index_t did1 = Dim1V4Loop * 4 * ThreadPerDim1 + d1v2loop * 2 * ThreadPerDim1 +
2 * mThreadId1; 2 * mThreadId1;
const index_t sindex = src_desc.Get1dIndex(did0, did1); const index_t sindex = src_desc.GetOffsetFromMultiIndex(did0, did1);
const index_t dindex = dst_desc.Get1dIndex(did0, did1); const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1);
*(reinterpret_cast<Float2*>(p_dst + dindex)) = *(reinterpret_cast<Float2*>(p_dst + dindex)) =
*(reinterpret_cast<const Float2*>(p_src + sindex)); *(reinterpret_cast<const Float2*>(p_src + sindex));
...@@ -415,8 +417,8 @@ struct Blockwise2dTensorCopy2 ...@@ -415,8 +417,8 @@ struct Blockwise2dTensorCopy2
index_t did1 = Dim1V4Loop * 4 * ThreadPerDim1 + Dim1V2Loop * 2 * ThreadPerDim1 + index_t did1 = Dim1V4Loop * 4 * ThreadPerDim1 + Dim1V2Loop * 2 * ThreadPerDim1 +
d1v1loop * ThreadPerDim1 + mThreadId1; d1v1loop * ThreadPerDim1 + mThreadId1;
const index_t sindex = src_desc.Get1dIndex(did0, did1); const index_t sindex = src_desc.GetOffsetFromMultiIndex(did0, did1);
const index_t dindex = dst_desc.Get1dIndex(did0, did1); const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1);
p_dst[dindex] = p_src[sindex]; p_dst[dindex] = p_src[sindex];
} }
...@@ -429,8 +431,8 @@ struct Blockwise2dTensorCopy2 ...@@ -429,8 +431,8 @@ struct Blockwise2dTensorCopy2
if(did1 < L1) if(did1 < L1)
{ {
const index_t sindex = src_desc.Get1dIndex(did0, did1); const index_t sindex = src_desc.GetOffsetFromMultiIndex(did0, did1);
const index_t dindex = dst_desc.Get1dIndex(did0, did1); const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1);
p_dst[dindex] = p_src[sindex]; p_dst[dindex] = p_src[sindex];
} }
...@@ -497,8 +499,10 @@ struct Blockwise2dTensorCopy3 ...@@ -497,8 +499,10 @@ struct Blockwise2dTensorCopy3
const index_t thread_id_d0 = get_thread_local_1d_id() / thread_per_d1; const index_t thread_id_d0 = get_thread_local_1d_id() / thread_per_d1;
const index_t thread_id_d1 = get_thread_local_1d_id() - thread_id_d0 * thread_per_d1; const index_t thread_id_d1 = get_thread_local_1d_id() - thread_id_d0 * thread_per_d1;
mSrcMyThreadOffset = SrcDesc{}.Get1dIndex(thread_id_d0, thread_id_d1 * DataPerRead); mSrcMyThreadOffset =
mDstMyThreadOffset = DstDesc{}.Get1dIndex(thread_id_d0, thread_id_d1 * DataPerRead); SrcDesc{}.GetOffsetFromMultiIndex(thread_id_d0, thread_id_d1 * DataPerRead);
mDstMyThreadOffset =
DstDesc{}.GetOffsetFromMultiIndex(thread_id_d0, thread_id_d1 * DataPerRead);
} }
__device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
......
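Note on the renaming running through these hunks: Get1dIndex becomes GetOffsetFromMultiIndex. As a reference, a minimal stand-in (not the real ConstantTensorDescriptor, which carries lengths and strides as compile-time Sequences) showing the computation that name describes in the 2d case, offset = did0 * stride0 + did1 * stride1:

// Hypothetical simplified 2d descriptor, for illustration only.
struct SimpleDesc2d
{
    int stride0;
    int stride1;

    constexpr int GetOffsetFromMultiIndex(int did0, int did1) const
    {
        return did0 * stride0 + did1 * stride1;
    }
};

// e.g. a packed 4x8 tensor has strides {8, 1}, so element (2, 3) sits at offset 19
static_assert(SimpleDesc2d{8, 1}.GetOffsetFromMultiIndex(2, 3) == 19, "offset example");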
...@@ -71,8 +71,10 @@ struct Blockwise3dTensorCopy1 ...@@ -71,8 +71,10 @@ struct Blockwise3dTensorCopy1
did[2] = is / ref_desc.GetStride(I2); did[2] = is / ref_desc.GetStride(I2);
const index_t src_index = src_desc.Get1dIndex(did[0], did[1], did[2] * DataPerRead); const index_t src_index =
const index_t dst_index = dst_desc.Get1dIndex(did[0], did[1], did[2] * DataPerRead); src_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2] * DataPerRead);
const index_t dst_index =
dst_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2] * DataPerRead);
*(reinterpret_cast<vector_t*>(p_dst + dst_index)) = *(reinterpret_cast<vector_t*>(p_dst + dst_index)) =
*(reinterpret_cast<const vector_t*>(p_src + src_index)); *(reinterpret_cast<const vector_t*>(p_src + src_index));
...@@ -167,12 +169,13 @@ struct Blockwise3dTensorCopy3 ...@@ -167,12 +169,13 @@ struct Blockwise3dTensorCopy3
} }
constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor(ThreadPerDims{}); constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor(ThreadPerDims{});
const auto thread_multi_id = thread_cluster_desc.GetMultiIndex(get_thread_local_1d_id()); const auto thread_multi_id =
thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
mSrcMyThreadOffset = SrcDesc{}.Get1dIndex( mSrcMyThreadOffset = SrcDesc{}.GetOffsetFromMultiIndex(
thread_multi_id[0], thread_multi_id[1], thread_multi_id[2] * DataPerRead); thread_multi_id[0], thread_multi_id[1], thread_multi_id[2] * DataPerRead);
mDstMyThreadOffset = DstDesc{}.Get1dIndex( mDstMyThreadOffset = DstDesc{}.GetOffsetFromMultiIndex(
thread_multi_id[0], thread_multi_id[1], thread_multi_id[2] * DataPerRead); thread_multi_id[0], thread_multi_id[1], thread_multi_id[2] * DataPerRead);
} }
...@@ -214,12 +217,12 @@ struct Blockwise3dTensorCopy3 ...@@ -214,12 +217,12 @@ struct Blockwise3dTensorCopy3
for(index_t iloop_d2 = 0; iloop_d2 < nloop_d2; ++iloop_d2) for(index_t iloop_d2 = 0; iloop_d2 < nloop_d2; ++iloop_d2)
{ {
const index_t src_offset = const index_t src_offset =
SrcDesc{}.Get1dIndex(iloop_d0 * thread_per_d0, SrcDesc{}.GetOffsetFromMultiIndex(iloop_d0 * thread_per_d0,
iloop_d1 * thread_per_d1, iloop_d1 * thread_per_d1,
iloop_d2 * thread_per_d2 * DataPerRead); iloop_d2 * thread_per_d2 * DataPerRead);
const index_t dst_offset = const index_t dst_offset =
DstDesc{}.Get1dIndex(iloop_d0 * thread_per_d0, DstDesc{}.GetOffsetFromMultiIndex(iloop_d0 * thread_per_d0,
iloop_d1 * thread_per_d1, iloop_d1 * thread_per_d1,
iloop_d2 * thread_per_d2 * DataPerRead); iloop_d2 * thread_per_d2 * DataPerRead);
...@@ -295,12 +298,12 @@ struct Blockwise3dTensorCopy3 ...@@ -295,12 +298,12 @@ struct Blockwise3dTensorCopy3
for(index_t iloop_d2 = 0; iloop_d2 < nloop_d2; ++iloop_d2) for(index_t iloop_d2 = 0; iloop_d2 < nloop_d2; ++iloop_d2)
{ {
const index_t src_offset = const index_t src_offset =
SrcDesc{}.Get1dIndex(iloop_d0 * thread_per_d0, SrcDesc{}.GetOffsetFromMultiIndex(iloop_d0 * thread_per_d0,
iloop_d1 * thread_per_d1, iloop_d1 * thread_per_d1,
iloop_d2 * thread_per_d2 * DataPerRead); iloop_d2 * thread_per_d2 * DataPerRead);
const index_t clipboard_offset = const index_t clipboard_offset = clipboard_desc.GetOffsetFromMultiIndex(
clipboard_desc.Get1dIndex(iloop_d0, iloop_d1, iloop_d2 * DataPerRead); iloop_d0, iloop_d1, iloop_d2 * DataPerRead);
*(reinterpret_cast<vector_t*>(&p_clipboard[clipboard_offset])) = *( *(reinterpret_cast<vector_t*>(&p_clipboard[clipboard_offset])) = *(
reinterpret_cast<const vector_t*>(&p_src[src_offset + mSrcMyThreadOffset])); reinterpret_cast<const vector_t*>(&p_src[src_offset + mSrcMyThreadOffset]));
...@@ -350,11 +353,11 @@ struct Blockwise3dTensorCopy3 ...@@ -350,11 +353,11 @@ struct Blockwise3dTensorCopy3
#pragma unroll #pragma unroll
for(index_t iloop_d2 = 0; iloop_d2 < nloop_d2; ++iloop_d2) for(index_t iloop_d2 = 0; iloop_d2 < nloop_d2; ++iloop_d2)
{ {
const index_t clipboard_offset = const index_t clipboard_offset = clipboard_desc.GetOffsetFromMultiIndex(
clipboard_desc.Get1dIndex(iloop_d0, iloop_d1, iloop_d2 * DataPerRead); iloop_d0, iloop_d1, iloop_d2 * DataPerRead);
const index_t dst_offset = const index_t dst_offset =
DstDesc{}.Get1dIndex(iloop_d0 * thread_per_d0, DstDesc{}.GetOffsetFromMultiIndex(iloop_d0 * thread_per_d0,
iloop_d1 * thread_per_d1, iloop_d1 * thread_per_d1,
iloop_d2 * thread_per_d2 * DataPerRead); iloop_d2 * thread_per_d2 * DataPerRead);
......
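The Blockwise3dTensorCopy3 constructor above also switches to GetMultiIndexFrom1dIndex to turn the flat thread id into a 3d position inside the thread cluster. A rough sketch of that decomposition for a packed cluster of lengths {L0, L1, L2} (illustrative only, not the Sequence-based implementation):

#include <array>

// Peel the flat id off one packed stride at a time: stride of dim0 is L1*L2,
// stride of dim1 is L2, stride of dim2 is 1.
constexpr std::array<int, 3> multi_index_from_1d(int id, int L1, int L2)
{
    const int stride0 = L1 * L2;
    const int stride1 = L2;

    const int i0  = id / stride0;
    const int rem = id - i0 * stride0;
    const int i1  = rem / stride1;
    const int i2  = rem - i1 * stride1;

    return {i0, i1, i2};
}

// e.g. id 37 in a {4, 4, 8} cluster maps to {1, 0, 5}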
...@@ -13,7 +13,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst ...@@ -13,7 +13,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
constexpr auto dst_desc = DstDesc{}; constexpr auto dst_desc = DstDesc{};
constexpr auto desc = make_ConstantTensorDescriptor(dst_desc.GetLengths()); constexpr auto desc = make_packed_ConstantTensorDescriptor(dst_desc.GetLengths());
#if 0 #if 0
if(get_thread_local_1d_id() == 0) if(get_thread_local_1d_id() == 0)
...@@ -43,7 +43,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst ...@@ -43,7 +43,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
const index_t did3 = is / desc.GetStride(I3); const index_t did3 = is / desc.GetStride(I3);
const index_t dindex = dst_desc.Get1dIndex(did0, did1, did2, did3); const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1, did2, did3);
f(p_dst[dindex]); f(p_dst[dindex]);
} }
...@@ -70,7 +70,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst ...@@ -70,7 +70,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
const index_t did3 = is / desc.GetStride(I3); const index_t did3 = is / desc.GetStride(I3);
const index_t dindex = dst_desc.Get1dIndex(did0, did1, did2, did3); const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1, did2, did3);
f(p_dst[dindex]); f(p_dst[dindex]);
} }
...@@ -108,7 +108,7 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds ...@@ -108,7 +108,7 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds
constexpr auto src_desc = SrcDesc{}; constexpr auto src_desc = SrcDesc{};
constexpr auto dst_desc = DstDesc{}; constexpr auto dst_desc = DstDesc{};
constexpr auto ref_desc = make_ConstantTensorDescriptor(SrcOpLengths{}); constexpr auto ref_desc = make_packed_ConstantTensorDescriptor(SrcOpLengths{});
constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize; constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize;
...@@ -132,9 +132,10 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds ...@@ -132,9 +132,10 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds
did[3] = is / ref_desc.GetStride(I3); did[3] = is / ref_desc.GetStride(I3);
const index_t src_index = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]); const index_t src_index = src_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2], did[3]);
const index_t dst_index = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]); const index_t dst_index =
dst_desc.GetOffsetFromMultiIndex(did[IR0], did[IR1], did[IR2], did[IR3]);
f(p_src[src_index], p_dst[dst_index]); f(p_src[src_index], p_dst[dst_index]);
} }
...@@ -163,9 +164,11 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds ...@@ -163,9 +164,11 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds
did[3] = is / ref_desc.GetStride(I3); did[3] = is / ref_desc.GetStride(I3);
const index_t src_index = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]); const index_t src_index =
src_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2], did[3]);
const index_t dst_index = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]); const index_t dst_index =
dst_desc.GetOffsetFromMultiIndex(did[IR0], did[IR1], did[IR2], did[IR3]);
f(p_src[src_index], p_dst[dst_index]); f(p_src[src_index], p_dst[dst_index]);
} }
...@@ -256,7 +259,7 @@ struct Blockwise4dTensorCopy1 ...@@ -256,7 +259,7 @@ struct Blockwise4dTensorCopy1
constexpr index_t read_per_d3 = mod_conv::integer_divide_ceil(L3, DataPerRead); constexpr index_t read_per_d3 = mod_conv::integer_divide_ceil(L3, DataPerRead);
constexpr auto ref_desc = constexpr auto ref_desc =
make_ConstantTensorDescriptor(Sequence<L0, L1, L2, read_per_d3>{}); make_packed_ConstantTensorDescriptor(Sequence<L0, L1, L2, read_per_d3>{});
constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize; constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize;
...@@ -278,9 +281,9 @@ struct Blockwise4dTensorCopy1 ...@@ -278,9 +281,9 @@ struct Blockwise4dTensorCopy1
did[3] = is / ref_desc.GetStride(I3); did[3] = is / ref_desc.GetStride(I3);
const index_t src_index = const index_t src_index =
src_desc.Get1dIndex(did[0], did[1], did[2], did[3] * DataPerRead); src_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2], did[3] * DataPerRead);
const index_t dst_index = const index_t dst_index =
dst_desc.Get1dIndex(did[0], did[1], did[2], did[3] * DataPerRead); dst_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2], did[3] * DataPerRead);
*(reinterpret_cast<vector_t*>(p_dst + dst_index)) = *(reinterpret_cast<vector_t*>(p_dst + dst_index)) =
*(reinterpret_cast<const vector_t*>(p_src + src_index)); *(reinterpret_cast<const vector_t*>(p_src + src_index));
...@@ -333,16 +336,16 @@ struct BlockwiseChwnTensorCopyPadded ...@@ -333,16 +336,16 @@ struct BlockwiseChwnTensorCopyPadded
constexpr auto src_desc = SrcDesc{}; constexpr auto src_desc = SrcDesc{};
constexpr auto dst_desc = DstDesc{}; constexpr auto dst_desc = DstDesc{};
constexpr auto ref_desc = make_ConstantTensorDescriptor(DstOpLengths{}); constexpr auto ref_desc = make_packed_ConstantTensorDescriptor(DstOpLengths{});
constexpr auto h_global_pad_low = GlobalLowerPads{}.Get(I0); constexpr auto h_global_pad_low = GlobalLowerPads{}.Get(I0);
constexpr auto w_global_pad_low = GlobalLowerPads{}.Get(I1); constexpr auto w_global_pad_low = GlobalLowerPads{}.Get(I1);
constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize; constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize;
const Float* p_src_tmp = const Float* p_src_tmp = p_src +
p_src + src_desc.GetOffsetFromMultiIndex(
src_desc.Get1dIndex(c_block_data_begin, c_block_data_begin,
(ho_block_data_begin + h_block_pad_low) - h_global_pad_low, (ho_block_data_begin + h_block_pad_low) - h_global_pad_low,
(wo_block_data_begin + w_block_pad_low) - w_global_pad_low, (wo_block_data_begin + w_block_pad_low) - w_global_pad_low,
n_block_data_begin); n_block_data_begin);
...@@ -389,13 +392,13 @@ struct BlockwiseChwnTensorCopyPadded ...@@ -389,13 +392,13 @@ struct BlockwiseChwnTensorCopyPadded
did[3] = is / ref_desc.GetStride(I3); did[3] = is / ref_desc.GetStride(I3);
const index_t bindex = dst_desc.Get1dIndex(did[0], did[1], did[2], did[3]); const index_t bindex = dst_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2], did[3]);
p_dst[bindex] = p_dst[bindex] =
(did[1] < h_block_pad_low || did[1] + h_block_pad_up >= ref_desc.GetLength(I1) || (did[1] < h_block_pad_low || did[1] + h_block_pad_up >= ref_desc.GetLength(I1) ||
did[2] < w_block_pad_low || did[2] + w_block_pad_up >= ref_desc.GetLength(I2)) did[2] < w_block_pad_low || did[2] + w_block_pad_up >= ref_desc.GetLength(I2))
? Float(0) ? Float(0)
: p_src_tmp[src_desc.Get1dIndex(did[0], did[1], did[2], did[3])]; : p_src_tmp[src_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2], did[3])];
} }
constexpr bool has_tail = (ref_desc.GetElementSize() > NLoop * BlockSize); constexpr bool has_tail = (ref_desc.GetElementSize() > NLoop * BlockSize);
...@@ -422,14 +425,16 @@ struct BlockwiseChwnTensorCopyPadded ...@@ -422,14 +425,16 @@ struct BlockwiseChwnTensorCopyPadded
did[3] = is / ref_desc.GetStride(I3); did[3] = is / ref_desc.GetStride(I3);
const index_t bindex = dst_desc.Get1dIndex(did[0], did[1], did[2], did[3]); const index_t bindex =
dst_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2], did[3]);
p_dst[bindex] = p_dst[bindex] =
(did[1] < h_block_pad_low || (did[1] < h_block_pad_low ||
did[1] + h_block_pad_up >= ref_desc.GetLength(I1) || did[1] + h_block_pad_up >= ref_desc.GetLength(I1) ||
did[2] < w_block_pad_low || did[2] + w_block_pad_up >= ref_desc.GetLength(I2)) did[2] < w_block_pad_low || did[2] + w_block_pad_up >= ref_desc.GetLength(I2))
? Float(0) ? Float(0)
: p_src_tmp[src_desc.Get1dIndex(did[0], did[1], did[2], did[3])]; : p_src_tmp[src_desc.GetOffsetFromMultiIndex(
did[0], did[1], did[2], did[3])];
} }
} }
} }
...@@ -505,15 +510,16 @@ struct Blockwise4dTensorCopy3 ...@@ -505,15 +510,16 @@ struct Blockwise4dTensorCopy3
} }
} }
constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor(ThreadPerDims{}); constexpr auto thread_cluster_desc = make_packed_ConstantTensorDescriptor(ThreadPerDims{});
const auto thread_multi_id = thread_cluster_desc.GetMultiIndex(get_thread_local_1d_id()); const auto thread_multi_id =
thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
mSrcMyThreadOffset = SrcDesc{}.Get1dIndex(thread_multi_id[0], mSrcMyThreadOffset = SrcDesc{}.GetOffsetFromMultiIndex(thread_multi_id[0],
thread_multi_id[1], thread_multi_id[1],
thread_multi_id[2], thread_multi_id[2],
thread_multi_id[3] * DataPerRead); thread_multi_id[3] * DataPerRead);
mDstMyThreadOffset = DstDesc{}.Get1dIndex(thread_multi_id[0], mDstMyThreadOffset = DstDesc{}.GetOffsetFromMultiIndex(thread_multi_id[0],
thread_multi_id[1], thread_multi_id[1],
thread_multi_id[2], thread_multi_id[2],
thread_multi_id[3] * DataPerRead); thread_multi_id[3] * DataPerRead);
...@@ -564,14 +570,14 @@ struct Blockwise4dTensorCopy3 ...@@ -564,14 +570,14 @@ struct Blockwise4dTensorCopy3
#pragma unroll #pragma unroll
for(index_t iloop_d3 = 0; iloop_d3 < nloop_d3; ++iloop_d3) for(index_t iloop_d3 = 0; iloop_d3 < nloop_d3; ++iloop_d3)
{ {
const index_t src_offset = const index_t src_offset = SrcDesc{}.GetOffsetFromMultiIndex(
SrcDesc{}.Get1dIndex(iloop_d0 * thread_per_d0, iloop_d0 * thread_per_d0,
iloop_d1 * thread_per_d1, iloop_d1 * thread_per_d1,
iloop_d2 * thread_per_d2, iloop_d2 * thread_per_d2,
iloop_d3 * thread_per_d3 * DataPerRead); iloop_d3 * thread_per_d3 * DataPerRead);
const index_t dst_offset = const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(
DstDesc{}.Get1dIndex(iloop_d0 * thread_per_d0, iloop_d0 * thread_per_d0,
iloop_d1 * thread_per_d1, iloop_d1 * thread_per_d1,
iloop_d2 * thread_per_d2, iloop_d2 * thread_per_d2,
iloop_d3 * thread_per_d3 * DataPerRead); iloop_d3 * thread_per_d3 * DataPerRead);
...@@ -646,7 +652,7 @@ struct Blockwise4dTensorCopy3 ...@@ -646,7 +652,7 @@ struct Blockwise4dTensorCopy3
constexpr index_t nloop_d2 = L2 / thread_per_d2; constexpr index_t nloop_d2 = L2 / thread_per_d2;
constexpr index_t nloop_d3 = mod_conv::integer_divide_ceil(L3, thread_per_d3 * DataPerRead); constexpr index_t nloop_d3 = mod_conv::integer_divide_ceil(L3, thread_per_d3 * DataPerRead);
constexpr auto clipboard_desc = make_ConstantTensorDescriptor( constexpr auto clipboard_desc = make_packed_ConstantTensorDescriptor(
Sequence<nloop_d0, nloop_d1, nloop_d2, nloop_d3 * DataPerRead>{}); Sequence<nloop_d0, nloop_d1, nloop_d2, nloop_d3 * DataPerRead>{});
#pragma unroll #pragma unroll
...@@ -661,13 +667,13 @@ struct Blockwise4dTensorCopy3 ...@@ -661,13 +667,13 @@ struct Blockwise4dTensorCopy3
#pragma unroll #pragma unroll
for(index_t iloop_d3 = 0; iloop_d3 < nloop_d3; ++iloop_d3) for(index_t iloop_d3 = 0; iloop_d3 < nloop_d3; ++iloop_d3)
{ {
const index_t src_offset = const index_t src_offset = SrcDesc{}.GetOffsetFromMultiIndex(
SrcDesc{}.Get1dIndex(iloop_d0 * thread_per_d0, iloop_d0 * thread_per_d0,
iloop_d1 * thread_per_d1, iloop_d1 * thread_per_d1,
iloop_d2 * thread_per_d2, iloop_d2 * thread_per_d2,
iloop_d3 * thread_per_d3 * DataPerRead); iloop_d3 * thread_per_d3 * DataPerRead);
const index_t clipboard_offset = clipboard_desc.Get1dIndex( const index_t clipboard_offset = clipboard_desc.GetOffsetFromMultiIndex(
iloop_d0, iloop_d1, iloop_d2, iloop_d3 * DataPerRead); iloop_d0, iloop_d1, iloop_d2, iloop_d3 * DataPerRead);
*(reinterpret_cast<vector_t*>(&p_clipboard[clipboard_offset])) = *(reinterpret_cast<vector_t*>(&p_clipboard[clipboard_offset])) =
...@@ -713,7 +719,7 @@ struct Blockwise4dTensorCopy3 ...@@ -713,7 +719,7 @@ struct Blockwise4dTensorCopy3
constexpr index_t nloop_d2 = L2 / thread_per_d2; constexpr index_t nloop_d2 = L2 / thread_per_d2;
constexpr index_t nloop_d3 = mod_conv::integer_divide_ceil(L3, thread_per_d3 * DataPerRead); constexpr index_t nloop_d3 = mod_conv::integer_divide_ceil(L3, thread_per_d3 * DataPerRead);
constexpr auto clipboard_desc = make_ConstantTensorDescriptor( constexpr auto clipboard_desc = make_packed_ConstantTensorDescriptor(
Sequence<nloop_d0, nloop_d1, nloop_d2, nloop_d3 * DataPerRead>{}); Sequence<nloop_d0, nloop_d1, nloop_d2, nloop_d3 * DataPerRead>{});
#pragma unroll #pragma unroll
...@@ -728,11 +734,11 @@ struct Blockwise4dTensorCopy3 ...@@ -728,11 +734,11 @@ struct Blockwise4dTensorCopy3
#pragma unroll #pragma unroll
for(index_t iloop_d3 = 0; iloop_d3 < nloop_d3; ++iloop_d3) for(index_t iloop_d3 = 0; iloop_d3 < nloop_d3; ++iloop_d3)
{ {
const index_t clipboard_offset = clipboard_desc.Get1dIndex( const index_t clipboard_offset = clipboard_desc.GetOffsetFromMultiIndex(
iloop_d0, iloop_d1, iloop_d2, iloop_d3 * DataPerRead); iloop_d0, iloop_d1, iloop_d2, iloop_d3 * DataPerRead);
const index_t dst_offset = const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(
DstDesc{}.Get1dIndex(iloop_d0 * thread_per_d0, iloop_d0 * thread_per_d0,
iloop_d1 * thread_per_d1, iloop_d1 * thread_per_d1,
iloop_d2 * thread_per_d2, iloop_d2 * thread_per_d2,
iloop_d3 * thread_per_d3 * DataPerRead); iloop_d3 * thread_per_d3 * DataPerRead);
......
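Several descriptors above move from make_ConstantTensorDescriptor to make_packed_ConstantTensorDescriptor, i.e. strides are derived from the lengths rather than supplied. A plain runtime sketch of the packed (dense, row-major) stride rule those call sites rely on; it mirrors the intent only, not the compile-time Sequence version:

#include <array>
#include <cstddef>

// Last dimension gets stride 1; each earlier stride is the product of all
// lengths to its right.
template <std::size_t N>
std::array<int, N> packed_strides(const std::array<int, N>& lengths)
{
    std::array<int, N> strides{};
    int running = 1;
    for(std::size_t i = N; i-- > 0;)
    {
        strides[i] = running;
        running *= lengths[i];
    }
    return strides;
}

// e.g. lengths {K, Ho, Wo, N} = {8, 4, 4, 16} give strides {256, 64, 16, 1}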
...@@ -87,10 +87,10 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 ...@@ -87,10 +87,10 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
const auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id()); const auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id());
mMyThreadOffsetA = c_thread_mtx_index.batch * BlockMatrixStrideA + mMyThreadOffsetA = c_thread_mtx_index.batch * BlockMatrixStrideA +
a_block_mtx.Get1dIndex(0, c_thread_mtx_index.row); a_block_mtx.GetOffsetFromMultiIndex(0, c_thread_mtx_index.row);
mMyThreadOffsetB = c_thread_mtx_index.batch * BlockMatrixStrideB + mMyThreadOffsetB = c_thread_mtx_index.batch * BlockMatrixStrideB +
b_block_mtx.Get1dIndex(0, c_thread_mtx_index.col); b_block_mtx.GetOffsetFromMultiIndex(0, c_thread_mtx_index.col);
#if 0 #if 0
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
...@@ -221,10 +221,12 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 ...@@ -221,10 +221,12 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
threadwise_matrix_copy( threadwise_matrix_copy(
a_block_mtx, a_block_mtx,
p_a_block + p_a_block +
a_block_mtx.Get1dIndex(k_begin, m_repeat * MPerLevel1Cluster) + a_block_mtx.GetOffsetFromMultiIndex(k_begin,
m_repeat * MPerLevel1Cluster) +
ib * BlockMatrixStrideA + mMyThreadOffsetA, ib * BlockMatrixStrideA + mMyThreadOffsetA,
a_thread_mtx, a_thread_mtx,
p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC), p_a_thread +
a_thread_mtx.GetOffsetFromMultiIndex(0, m_repeat * MPerThreadSubC),
a_thread_sub_mtx.GetLengths(), a_thread_sub_mtx.GetLengths(),
Number<DataPerReadA>{}); Number<DataPerReadA>{});
} }
...@@ -238,10 +240,12 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 ...@@ -238,10 +240,12 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
threadwise_matrix_copy( threadwise_matrix_copy(
b_block_mtx, b_block_mtx,
p_b_block + p_b_block +
b_block_mtx.Get1dIndex(k_begin, n_repeat * NPerLevel1Cluster) + b_block_mtx.GetOffsetFromMultiIndex(k_begin,
n_repeat * NPerLevel1Cluster) +
ib * BlockMatrixStrideB + mMyThreadOffsetB, ib * BlockMatrixStrideB + mMyThreadOffsetB,
b_thread_mtx, b_thread_mtx,
p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC), p_b_thread +
b_thread_mtx.GetOffsetFromMultiIndex(0, n_repeat * NPerThreadSubC),
b_thread_sub_mtx.GetLengths(), b_thread_sub_mtx.GetLengths(),
Number<DataPerReadB>{}); Number<DataPerReadB>{});
} }
...@@ -343,9 +347,11 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 ...@@ -343,9 +347,11 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
reg_a[0] = *reinterpret_cast<const Float4*>(&p_a_block[mMyThreadOffsetA]); reg_a[0] = *reinterpret_cast<const Float4*>(&p_a_block[mMyThreadOffsetA]);
reg_b[0] = *reinterpret_cast<const Float4*>(&p_b_block[mMyThreadOffsetB]); reg_b[0] = *reinterpret_cast<const Float4*>(&p_b_block[mMyThreadOffsetB]);
reg_b[1] = *reinterpret_cast<const Float4*>( reg_b[1] = *reinterpret_cast<const Float4*>(
&p_b_block[b_block_mtx.Get1dIndex(0, NPerLevel1Cluster) + mMyThreadOffsetB]); &p_b_block[b_block_mtx.GetOffsetFromMultiIndex(0, NPerLevel1Cluster) +
mMyThreadOffsetB]);
reg_a[1] = *reinterpret_cast<const Float4*>( reg_a[1] = *reinterpret_cast<const Float4*>(
&p_a_block[a_block_mtx.Get1dIndex(0, MPerLevel1Cluster) + mMyThreadOffsetA]); &p_a_block[a_block_mtx.GetOffsetFromMultiIndex(0, MPerLevel1Cluster) +
mMyThreadOffsetA]);
outerProduct4x4(reg_a[0], reg_b[0], reg_c[0], reg_c[2], reg_c[4], reg_c[6]); outerProduct4x4(reg_a[0], reg_b[0], reg_c[0], reg_c[2], reg_c[4], reg_c[6]);
outerProduct4x4(reg_a[0], reg_b[1], reg_c[1], reg_c[3], reg_c[5], reg_c[7]); outerProduct4x4(reg_a[0], reg_b[1], reg_c[1], reg_c[3], reg_c[5], reg_c[7]);
...@@ -353,15 +359,17 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 ...@@ -353,15 +359,17 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
for(index_t k = 1; k < K; ++k) for(index_t k = 1; k < K; ++k)
{ {
reg_a[0] = *reinterpret_cast<const Float4*>( reg_a[0] = *reinterpret_cast<const Float4*>(
&p_a_block[a_block_mtx.Get1dIndex(k, 0) + mMyThreadOffsetA]); &p_a_block[a_block_mtx.GetOffsetFromMultiIndex(k, 0) + mMyThreadOffsetA]);
outerProduct4x4(reg_a[1], reg_b[0], reg_c[8], reg_c[10], reg_c[12], reg_c[14]); outerProduct4x4(reg_a[1], reg_b[0], reg_c[8], reg_c[10], reg_c[12], reg_c[14]);
reg_b[0] = *reinterpret_cast<const Float4*>( reg_b[0] = *reinterpret_cast<const Float4*>(
&p_b_block[b_block_mtx.Get1dIndex(k, 0) + mMyThreadOffsetB]); &p_b_block[b_block_mtx.GetOffsetFromMultiIndex(k, 0) + mMyThreadOffsetB]);
outerProduct4x4(reg_a[1], reg_b[1], reg_c[9], reg_c[11], reg_c[13], reg_c[15]); outerProduct4x4(reg_a[1], reg_b[1], reg_c[9], reg_c[11], reg_c[13], reg_c[15]);
reg_b[1] = *reinterpret_cast<const Float4*>( reg_b[1] = *reinterpret_cast<const Float4*>(
&p_b_block[b_block_mtx.Get1dIndex(k, NPerLevel1Cluster) + mMyThreadOffsetB]); &p_b_block[b_block_mtx.GetOffsetFromMultiIndex(k, NPerLevel1Cluster) +
mMyThreadOffsetB]);
reg_a[1] = *reinterpret_cast<const Float4*>( reg_a[1] = *reinterpret_cast<const Float4*>(
&p_a_block[a_block_mtx.Get1dIndex(k, MPerLevel1Cluster) + mMyThreadOffsetA]); &p_a_block[a_block_mtx.GetOffsetFromMultiIndex(k, MPerLevel1Cluster) +
mMyThreadOffsetA]);
outerProduct4x4(reg_a[0], reg_b[0], reg_c[0], reg_c[2], reg_c[4], reg_c[6]); outerProduct4x4(reg_a[0], reg_b[0], reg_c[0], reg_c[2], reg_c[4], reg_c[6]);
outerProduct4x4(reg_a[0], reg_b[1], reg_c[1], reg_c[3], reg_c[5], reg_c[7]); outerProduct4x4(reg_a[0], reg_b[1], reg_c[1], reg_c[3], reg_c[5], reg_c[7]);
} }
...@@ -489,7 +497,7 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 ...@@ -489,7 +497,7 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
const index_t c_thread_offset = const index_t c_thread_offset =
c_thread_mtx_begin.batch * BlockMatrixStrideC + c_thread_mtx_begin.batch * BlockMatrixStrideC +
c_block_mtx.Get1dIndex(c_thread_mtx_begin.row, c_thread_mtx_begin.col); c_block_mtx.GetOffsetFromMultiIndex(c_thread_mtx_begin.row, c_thread_mtx_begin.col);
for(index_t m_repeat = 0; m_repeat < MRepeat; ++m_repeat) for(index_t m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
{ {
...@@ -498,11 +506,11 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 ...@@ -498,11 +506,11 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
threadwise_matrix_copy( threadwise_matrix_copy(
c_thread_sub_mtx, c_thread_sub_mtx,
p_c_thread + p_c_thread +
c_thread_sub_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster, c_thread_sub_mtx.GetOffsetFromMultiIndex(m_repeat * MPerLevel1Cluster,
n_repeat * NPerLevel1Cluster), n_repeat * NPerLevel1Cluster),
c_block_mtx, c_block_mtx,
p_c_block + p_c_block +
c_block_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster, c_block_mtx.GetOffsetFromMultiIndex(m_repeat * MPerLevel1Cluster,
n_repeat * NPerLevel1Cluster) + n_repeat * NPerLevel1Cluster) +
c_thread_offset, c_thread_offset,
c_thread_sub_mtx.GetLengths()); c_thread_sub_mtx.GetLengths());
......
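For orientation, the register-blocked inner loop above issues outerProduct4x4 on Float4 operands. In scalar terms (assumed semantics, written without the packed registers used in the kernel) each call accumulates a 4x4 tile of C from one sub-column of A and one sub-row of B:

// Scalar analogue of outerProduct4x4, for illustration only: c[i][j] += a[i] * b[j].
inline void outer_product_4x4_scalar(const float (&a)[4], const float (&b)[4], float (&c)[4][4])
{
    for(int i = 0; i < 4; ++i)
    {
        for(int j = 0; j < 4; ++j)
        {
            c[i][j] += a[i] * b[j];
        }
    }
}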
...@@ -51,8 +51,8 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 ...@@ -51,8 +51,8 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id()); auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id());
mMyThreadOffsetA = BlockMatrixA::Get1dIndex(0, c_thread_mtx_index.row); mMyThreadOffsetA = BlockMatrixA::GetOffsetFromMultiIndex(0, c_thread_mtx_index.row);
mMyThreadOffsetB = BlockMatrixB::Get1dIndex(0, c_thread_mtx_index.col); mMyThreadOffsetB = BlockMatrixB::GetOffsetFromMultiIndex(0, c_thread_mtx_index.col);
} }
__device__ static auto GetThreadMatrixCLengths() __device__ static auto GetThreadMatrixCLengths()
...@@ -248,10 +248,11 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 ...@@ -248,10 +248,11 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
{ {
threadwise_matrix_copy( threadwise_matrix_copy(
a_block_mtx, a_block_mtx,
p_a_block + a_block_mtx.Get1dIndex(k_begin, m_repeat * MPerLevel1Cluster) + p_a_block +
a_block_mtx.GetOffsetFromMultiIndex(k_begin, m_repeat * MPerLevel1Cluster) +
mMyThreadOffsetA, mMyThreadOffsetA,
a_thread_mtx, a_thread_mtx,
p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC), p_a_thread + a_thread_mtx.GetOffsetFromMultiIndex(0, m_repeat * MPerThreadSubC),
a_thread_sub_mtx.GetLengths(), a_thread_sub_mtx.GetLengths(),
Number<DataPerReadA>{}); Number<DataPerReadA>{});
} }
...@@ -262,10 +263,11 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 ...@@ -262,10 +263,11 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
{ {
threadwise_matrix_copy( threadwise_matrix_copy(
b_block_mtx, b_block_mtx,
p_b_block + b_block_mtx.Get1dIndex(k_begin, n_repeat * NPerLevel1Cluster) + p_b_block +
b_block_mtx.GetOffsetFromMultiIndex(k_begin, n_repeat * NPerLevel1Cluster) +
mMyThreadOffsetB, mMyThreadOffsetB,
b_thread_mtx, b_thread_mtx,
p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC), p_b_thread + b_thread_mtx.GetOffsetFromMultiIndex(0, n_repeat * NPerThreadSubC),
b_thread_sub_mtx.GetLengths(), b_thread_sub_mtx.GetLengths(),
Number<DataPerReadB>{}); Number<DataPerReadB>{});
} }
......
...@@ -11,7 +11,7 @@ template <index_t BlockSize, ...@@ -11,7 +11,7 @@ template <index_t BlockSize,
class SliceLengths, class SliceLengths,
class SubLengths, class SubLengths,
class ClusterLengths, class ClusterLengths,
class ThreadArrangeOrder, class ThreadClusterArrangeOrder,
class SrcAccessOrder, class SrcAccessOrder,
class DstAccessOrder> class DstAccessOrder>
struct BlockwiseTensorSliceCopy_generic_v1 struct BlockwiseTensorSliceCopy_generic_v1
...@@ -21,28 +21,135 @@ struct BlockwiseTensorSliceCopy_generic_v1 ...@@ -21,28 +21,135 @@ struct BlockwiseTensorSliceCopy_generic_v1
index_t mSrcMyThreadOffset; index_t mSrcMyThreadOffset;
index_t mDstMyThreadOffset; index_t mDstMyThreadOffset;
__device__ BlockwiseTensorSliceCopy_generic_v1(Array<index_t, nDim> src_block_multi_id_offset, __device__ BlockwiseTensorSliceCopy_generic_v1(Array<index_t, nDim> src_block_multi_offset,
Array<index_t, nDim> dst_block_multi_id_offset) Array<index_t, nDim> dst_block_multi_offset)
{ {
// only support SrcSubLengths.GetLength() == 1 on merged dimension, for now
// check SrcDataPerRead should be 1, if last dimension is a merged dimension
// check NDim consistent // check NDim consistent
static_assert(SrcDesc::GetNumOfDimension() == DstDesc::GetNumOfDimension(), "wrong");
constexpr auto thread_cluster_desc = make_packed_ConstantTensorDescriptor(
ClusterLengths{}.ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
// BlockSize
static_assert(BlockSize == thread_cluster_desc.GetElementSize(), "wrong! BlockSize");
// divide work
static_for<0, nDim, 1>{}([&](auto IDim) {
            static_assert(SliceLengths{}.Get(IDim) % SubLengths{}.Get(IDim) == 0,
"wrong! cannot evenly divide sliced tensor into sub-tensor");
});
constexpr auto thread_work_desc =
            make_packed_ConstantTensorDescriptor(SliceLengths{} / SubLengths{});
static_for<0, nDim, 1>{}([&](auto IDim) {
            static_assert(thread_work_desc.GetLength(IDim) % thread_cluster_desc.GetLength(IDim) == 0,
"wrong! cannot evenly divide work to cluster");
});
// only support SubLengths.Get() == 1 on merged dimension, for now
static_for<0, nDim, 1>{}([&](auto IDim) {
static_if<(SrcDesc::ContainMultipleOriginalDimensions(IDim) ||
DstDesc::ContainMultipleOriginalDimensions(IDim))>{}([&](auto fwd) {
static_assert(fwd(SubLengths{}).Get(IDim) == 1,
"wrong! Sub-Lengths on merged dimension should be 1");
});
});
// calculate mSrcMyThreadOffset, mDstMyThreadOffset
const auto thread_cluster_multi_id =
thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
// calculate mSrcMyThreadOffset const auto data_cluster_multi_id =
// calculate mDstMyThreadOffset reorder_array_given_old2new(thread_cluster_multi_id, ThreadClusterArrangeOrder{});
const auto thread_data_multi_offset = data_cluster_multi_id * SubLengths{};
        mSrcMyThreadOffset =
            SrcDesc::GetOffsetFromMultiIndex(src_block_multi_offset + thread_data_multi_offset);
        mDstMyThreadOffset =
            DstDesc::GetOffsetFromMultiIndex(dst_block_multi_offset + thread_data_multi_offset);
} }
__device__ static constexpr index_t GetRegisterClipboardSize() {} __device__ static constexpr index_t GetRegisterClipboardSize()
{
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ClusterLengths{});
constexpr auto thread_tensor_desc =
make_packed_ConstantTensorDescriptor(SubLengths{} * repeat_lengths);
        return thread_tensor_desc.GetElementSpace();
}
__device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src, __device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
Float* __restrict__ p_clipboard) const Float* __restrict__ p_clipboard) const
{ {
constexpr auto thread_sub_tensor_lengths = SubLengths{};
constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * ClusterLengths{};
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ClusterLengths{});
constexpr auto thread_tensor_desc =
make_packed_ConstantTensorDescriptor(thread_sub_tensor_lengths * repeat_lengths);
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
constexpr auto src_data_multi_offset = repeat_multi_id * data_per_cluster_per_dims;
constexpr auto clipboard_data_multi_offset =
repeat_multi_id * thread_sub_tensor_lengths;
            constexpr index_t src_offset =
                SrcDesc{}.GetOffsetFromMultiIndex(src_data_multi_offset);
            constexpr index_t clipboard_offset =
                thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_offset);
threadwise_tensor_slice_copy_generic(SrcDesc{},
p_src + src_offset + mSrcMyThreadOffset,
thread_tensor_desc,
zero_array<index_t, nDim>{},
thread_tensor_desc,
p_clipboard + clipboard_offset,
zero_array<index_t, nDim>{},
thread_sub_tensor_lengths,
SrcAccessOrder{});
});
} }
__device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard, __device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const Float* __restrict__ p_dst) const
{ {
constexpr auto thread_sub_tensor_lengths = SubLengths{};
constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * ClusterLengths{};
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ClusterLengths{});
constexpr auto thread_tensor_desc =
make_packed_ConstantTensorDescriptor(thread_sub_tensor_lengths * repeat_lengths);
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
constexpr auto clipboard_data_multi_offset =
repeat_multi_id * thread_sub_tensor_lengths;
constexpr auto dst_data_multi_offset = repeat_multi_id * data_per_cluster_per_dims;
constexpr index_t clipboard_offset =
thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_offset);
constexpr index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_offset);
threadwise_tensor_slice_copy_generic(thread_tensor_desc,
p_clipboard + clipboard_offset,
zero_array<index_t, nDim>{},
DstDesc{},
p_dst + dst_offset + mDstMyThreadOffset,
zero_array<index_t, nDim>{},
thread_sub_tensor_lengths,
                                                 DstAccessOrder{});
        });
} }
__device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
...@@ -52,4 +159,4 @@ struct BlockwiseTensorSliceCopy_generic_v1 ...@@ -52,4 +159,4 @@ struct BlockwiseTensorSliceCopy_generic_v1
RunLoadRegisterClipboard(p_src, p_clipboard); RunLoadRegisterClipboard(p_src, p_clipboard);
RunStoreRegisterClipboard(p_clipboard, p_dst); RunStoreRegisterClipboard(p_clipboard, p_dst);
} }
}; };
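A small worked example of the clipboard sizing introduced above (the numbers are illustrative only, not taken from any kernel configuration): with SliceLengths {8, 32}, SubLengths {1, 4} and ClusterLengths {8, 8}, repeat_lengths is {1, 1} and each of the 64 threads stages SubLengths * repeat_lengths = 4 elements.

constexpr int slice_lengths[2]   = {8, 32};
constexpr int sub_lengths[2]     = {1, 4};
constexpr int cluster_lengths[2] = {8, 8};

// repeat_lengths = SliceLengths / (SubLengths * ClusterLengths)
constexpr int repeat0 = slice_lengths[0] / (sub_lengths[0] * cluster_lengths[0]); // 1
constexpr int repeat1 = slice_lengths[1] / (sub_lengths[1] * cluster_lengths[1]); // 1

// per-thread clipboard: SubLengths * repeat_lengths, flattened
constexpr int clipboard_elements = (sub_lengths[0] * repeat0) * (sub_lengths[1] * repeat1);
static_assert(clipboard_elements == 4, "each thread stages 4 elements");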
...@@ -39,7 +39,7 @@ struct BlockwiseTensorSliceReorderCopy_v3 ...@@ -39,7 +39,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
constexpr auto thread_cluster_lengths = constexpr auto thread_cluster_lengths =
src_cluster_lengths.ReorderGivenNew2Old(map_thread_cluster_2_src_cluster); src_cluster_lengths.ReorderGivenNew2Old(map_thread_cluster_2_src_cluster);
constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor(thread_cluster_lengths); constexpr auto thread_cluster_desc = make_packed_ConstantTensorDescriptor(thread_cluster_lengths);
// sanity check: data type // sanity check: data type
static_assert(is_same<Float, float>::value, "wrong! only support float for now!\n"); static_assert(is_same<Float, float>::value, "wrong! only support float for now!\n");
...@@ -105,7 +105,8 @@ struct BlockwiseTensorSliceReorderCopy_v3 ...@@ -105,7 +105,8 @@ struct BlockwiseTensorSliceReorderCopy_v3
} }
} }
const auto thread_multi_id = thread_cluster_desc.GetMultiIndex(get_thread_local_1d_id()); const auto thread_multi_id =
thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
// compiler: thread_multi_id, src_data_multi_id, dst_data_multi_id, will use separate // compiler: thread_multi_id, src_data_multi_id, dst_data_multi_id, will use separate
// registers, or only one copy??? // registers, or only one copy???
...@@ -115,17 +116,21 @@ struct BlockwiseTensorSliceReorderCopy_v3 ...@@ -115,17 +116,21 @@ struct BlockwiseTensorSliceReorderCopy_v3
static_for<0, nDim, 1>{}([&](auto IDim) { static_for<0, nDim, 1>{}([&](auto IDim) {
constexpr auto I = decltype(IDim){}; constexpr auto I = decltype(IDim){};
constexpr index_t i = I.Get(); constexpr index_t i = I.Get();
// compiler: will it really compute index here, or be merged with Get1dIndex and // compiler: will it really compute index here, or be merged with
// GetOffsetFromMultiIndex and
// optimized away??? // optimized away???
src_data_multi_id[i] *= src_sub_lengths.Get(I); src_data_multi_id[i] *= src_sub_lengths.Get(I);
}); });
// compiler: will it really compute index here, or be merged with Get1dIndex and // compiler: will it really compute index here, or be merged with GetOffsetFromMultiIndex
// and
// optimized away??? // optimized away???
const auto dst_data_multi_id = reorder_array_given_new2old(src_data_multi_id, map_dst2src); const auto dst_data_multi_id = reorder_array_given_new2old(src_data_multi_id, map_dst2src);
mSrcMyThreadOffset = src_desc.Get1dIndex(src_data_multi_id + src_block_data_multi_id_begin); mSrcMyThreadOffset =
mDstMyThreadOffset = dst_desc.Get1dIndex(dst_data_multi_id + dst_block_data_multi_id_begin); src_desc.GetOffsetFromMultiIndex(src_data_multi_id + src_block_data_multi_id_begin);
mDstMyThreadOffset =
dst_desc.GetOffsetFromMultiIndex(dst_data_multi_id + dst_block_data_multi_id_begin);
} }
__device__ static constexpr index_t GetRegisterClipboardSize() __device__ static constexpr index_t GetRegisterClipboardSize()
...@@ -142,7 +147,7 @@ struct BlockwiseTensorSliceReorderCopy_v3 ...@@ -142,7 +147,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths; constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;
constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths); constexpr auto thread_tensor_desc = make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
return thread_tensor_desc.GetElementSpace(); return thread_tensor_desc.GetElementSpace();
} }
...@@ -162,7 +167,7 @@ struct BlockwiseTensorSliceReorderCopy_v3 ...@@ -162,7 +167,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths; constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;
constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths); constexpr auto thread_tensor_desc = make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) { static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
constexpr auto repeat_multi_id = decltype(repeat_multi_id_){}; constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
...@@ -171,9 +176,9 @@ struct BlockwiseTensorSliceReorderCopy_v3 ...@@ -171,9 +176,9 @@ struct BlockwiseTensorSliceReorderCopy_v3
constexpr auto clipboard_data_multi_id = repeat_multi_id * thread_sub_tensor_lengths; constexpr auto clipboard_data_multi_id = repeat_multi_id * thread_sub_tensor_lengths;
constexpr index_t src_offset = SrcDesc{}.Get1dIndex(src_data_multi_id); constexpr index_t src_offset = SrcDesc{}.GetOffsetFromMultiIndex(src_data_multi_id);
constexpr index_t clipboard_offset = constexpr index_t clipboard_offset =
thread_tensor_desc.Get1dIndex(clipboard_data_multi_id); thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id);
threadwise_tensor_slice_copy(SrcDesc{}, threadwise_tensor_slice_copy(SrcDesc{},
p_src + src_offset + mSrcMyThreadOffset, p_src + src_offset + mSrcMyThreadOffset,
...@@ -199,7 +204,7 @@ struct BlockwiseTensorSliceReorderCopy_v3 ...@@ -199,7 +204,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths; constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;
constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths); constexpr auto thread_tensor_desc = make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) { static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
constexpr auto repeat_multi_id = decltype(repeat_multi_id_){}; constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
...@@ -212,9 +217,9 @@ struct BlockwiseTensorSliceReorderCopy_v3 ...@@ -212,9 +217,9 @@ struct BlockwiseTensorSliceReorderCopy_v3
constexpr auto dst_data_multi_id = src_data_multi_id.ReorderGivenNew2Old(MapDst2Src{}); constexpr auto dst_data_multi_id = src_data_multi_id.ReorderGivenNew2Old(MapDst2Src{});
constexpr index_t clipboard_offset = constexpr index_t clipboard_offset =
thread_tensor_desc.Get1dIndex(clipboard_data_multi_id); thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id);
constexpr index_t dst_offset = DstDesc{}.Get1dIndex(dst_data_multi_id); constexpr index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_id);
// write in the order of dst // write in the order of dst
#if 1 #if 1
......
...@@ -30,7 +30,7 @@ __host__ __device__ constexpr auto get_convolution_output_default_4d_tensor_desc ...@@ -30,7 +30,7 @@ __host__ __device__ constexpr auto get_convolution_output_default_4d_tensor_desc
constexpr auto HO = HI + 1 - Y; constexpr auto HO = HI + 1 - Y;
constexpr auto WO = WI + 1 - X; constexpr auto WO = WI + 1 - X;
return make_ConstantTensorDescriptor(Sequence<N, K, HO, WO>{}); return make_packed_ConstantTensorDescriptor(Sequence<N, K, HO, WO>{});
} }
template <class InDesc, class WeiDesc, class LowerPads, class UpperPads> template <class InDesc, class WeiDesc, class LowerPads, class UpperPads>
...@@ -67,7 +67,7 @@ __host__ __device__ constexpr auto get_convolution_with_padding_output_default_4 ...@@ -67,7 +67,7 @@ __host__ __device__ constexpr auto get_convolution_with_padding_output_default_4
constexpr auto HO = HI + HPadLow + HPadUp + 1 - Y; constexpr auto HO = HI + HPadLow + HPadUp + 1 - Y;
constexpr auto WO = WI + WPadLow + WPadUp + 1 - X; constexpr auto WO = WI + WPadLow + WPadUp + 1 - X;
return make_ConstantTensorDescriptor(Sequence<N, K, HO, WO>{}); return make_packed_ConstantTensorDescriptor(Sequence<N, K, HO, WO>{});
} }
template <class InDesc, class WeiDesc, class OutDesc> template <class InDesc, class WeiDesc, class OutDesc>
......
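The helpers above encode the default output-size rule for unit stride: HO = HI + 1 - Y without padding, and HO = HI + HPadLow + HPadUp + 1 - Y with padding (likewise for W). A quick numeric check of the unpadded case:

// e.g. a 3x3 filter over a 28x28 input yields a 26x26 output
constexpr int hi = 28, wi = 28, y = 3, x = 3;
constexpr int ho = hi + 1 - y;
constexpr int wo = wi + 1 - x;
static_assert(ho == 26 && wo == 26, "valid-convolution output size");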
...@@ -180,17 +180,18 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw ...@@ -180,17 +180,18 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw
c_block_data_begin += CPerBlock, __syncthreads()) c_block_data_begin += CPerBlock, __syncthreads())
{ {
// copy input tensor to LDS // copy input tensor to LDS
blockwise_in_copy.Run(p_in_global + blockwise_in_copy.Run(
in_nchw_global_desc.Get1dIndex(n_block_data_begin, p_in_global +
in_nchw_global_desc.GetOffsetFromMultiIndex(n_block_data_begin,
c_block_data_begin, c_block_data_begin,
hi_block_data_begin, hi_block_data_begin,
wi_block_data_begin), wi_block_data_begin),
p_in_block); p_in_block);
// copy weight tensor to LDS // copy weight tensor to LDS
blockwise_wei_copy.Run( blockwise_wei_copy.Run(p_wei_global +
p_wei_global + wei_kcyx_global_desc.GetOffsetFromMultiIndex(
wei_kcyx_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0), k_block_data_begin, c_block_data_begin, 0, 0),
p_wei_block); p_wei_block);
__syncthreads(); __syncthreads();
...@@ -202,26 +203,28 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw ...@@ -202,26 +203,28 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw
threadwise_direct_convolution_2( threadwise_direct_convolution_2(
in_nchw_thread_block_desc, in_nchw_thread_block_desc,
p_in_block + p_in_block +
in_nchw_block_desc.Get1dIndex(n_thread_data_begin, in_nchw_block_desc.GetOffsetFromMultiIndex(n_thread_data_begin,
c_thread_data, c_thread_data,
hi_thread_data_begin, hi_thread_data_begin,
wi_thread_data_begin), wi_thread_data_begin),
wei_kcyx_thread_block_desc, wei_kcyx_thread_block_desc,
p_wei_block + p_wei_block +
wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), wei_kcyx_block_desc.GetOffsetFromMultiIndex(
k_thread_data_begin, c_thread_data, 0, 0),
out_nkhw_thread_desc, out_nkhw_thread_desc,
p_out_thread); p_out_thread);
#elif 0 #elif 0
threadwise_direct_convolution_3( threadwise_direct_convolution_3(
in_nchw_thread_block_desc, in_nchw_thread_block_desc,
p_in_block + p_in_block +
in_nchw_block_desc.Get1dIndex(n_thread_data_begin, in_nchw_block_desc.GetOffsetFromMultiIndex(n_thread_data_begin,
c_thread_data, c_thread_data,
hi_thread_data_begin, hi_thread_data_begin,
wi_thread_data_begin), wi_thread_data_begin),
wei_kcyx_thread_block_desc, wei_kcyx_thread_block_desc,
p_wei_block + p_wei_block +
wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), wei_kcyx_block_desc.GetOffsetFromMultiIndex(
k_thread_data_begin, c_thread_data, 0, 0),
out_nkhw_thread_desc, out_nkhw_thread_desc,
p_out_thread); p_out_thread);
#endif #endif
...@@ -229,12 +232,12 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw ...@@ -229,12 +232,12 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw
} }
// copy output tensor from register to global mem // copy output tensor from register to global mem
threadwise_tensor_slice_copy( threadwise_tensor_slice_copy(out_nkhw_thread_desc,
out_nkhw_thread_desc,
p_out_thread, p_out_thread,
out_nkhw_global_desc, out_nkhw_global_desc,
p_out_global + p_out_global +
out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, out_nkhw_global_desc.GetOffsetFromMultiIndex(
n_block_data_begin + n_thread_data_begin,
k_block_data_begin + k_thread_data_begin, k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin, ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin), wo_block_data_begin + wo_thread_data_begin),
......
...@@ -221,11 +221,12 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn ...@@ -221,11 +221,12 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
const Float* p_in_global_block_offset = const Float* p_in_global_block_offset =
p_in_global + p_in_global +
in_c_h_w_n_global_desc.Get1dIndex( in_c_h_w_n_global_desc.GetOffsetFromMultiIndex(
0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin); 0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin);
const Float* p_wei_global_block_offset = const Float* p_wei_global_block_offset =
p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin); p_wei_global +
wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, 0, 0, k_block_data_begin);
for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock, for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
p_in_global_block_offset += CPerBlock * in_c_h_w_n_global_desc.GetStride(I0), p_in_global_block_offset += CPerBlock * in_c_h_w_n_global_desc.GetStride(I0),
...@@ -261,8 +262,8 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn ...@@ -261,8 +262,8 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
#else #else
blockwise_batch_gemm.Run_asm blockwise_batch_gemm.Run_asm
#endif #endif
(p_wei_block + wei_c_y_x_k_block_desc.Get1dIndex(0, y, x, 0), (p_wei_block + wei_c_y_x_k_block_desc.GetOffsetFromMultiIndex(0, y, x, 0),
p_in_block + in_c_h_w_n_block_desc.Get1dIndex(0, y, x, 0), p_in_block + in_c_h_w_n_block_desc.GetOffsetFromMultiIndex(0, y, x, 0),
p_out_thread); p_out_thread);
} }
} }
...@@ -325,12 +326,12 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn ...@@ -325,12 +326,12 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
} }
#endif #endif
threadwise_tensor_slice_copy( threadwise_tensor_slice_copy(out_10d_thread_desc,
out_10d_thread_desc,
p_out_thread, p_out_thread,
out_10d_global_desc, out_10d_global_desc,
p_out_global + p_out_global +
out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, out_k_h_w_n_global_desc.GetOffsetFromMultiIndex(
k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin, ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin, wo_block_data_begin + wo_thread_data_begin,
n_block_data_begin + n_thread_data_begin), n_block_data_begin + n_thread_data_begin),
...@@ -375,12 +376,12 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn ...@@ -375,12 +376,12 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
} }
#endif #endif
threadwise_tensor_slice_copy( threadwise_tensor_slice_copy(out_10d_thread_desc,
out_10d_thread_desc,
p_out_thread, p_out_thread,
out_10d_global_desc, out_10d_global_desc,
p_out_global + p_out_global +
out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, out_k_h_w_n_global_desc.GetOffsetFromMultiIndex(
k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin, ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin, wo_block_data_begin + wo_thread_data_begin,
n_block_data_begin + n_thread_data_begin), n_block_data_begin + n_thread_data_begin),
......
...@@ -230,11 +230,12 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn ...@@ -230,11 +230,12 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
#if 1 #if 1
const Float* p_in_global_block_offset = const Float* p_in_global_block_offset =
p_in_global + p_in_global +
in_c_h_w_n_global_desc.Get1dIndex( in_c_h_w_n_global_desc.GetOffsetFromMultiIndex(
0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin); 0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin);
const Float* p_wei_global_block_offset = const Float* p_wei_global_block_offset =
p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin); p_wei_global +
wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, 0, 0, k_block_data_begin);
for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock, for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
p_in_global_block_offset += CPerBlock * in_c_h_w_n_global_desc.GetStride(I0), p_in_global_block_offset += CPerBlock * in_c_h_w_n_global_desc.GetStride(I0),
...@@ -242,21 +243,23 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn ...@@ -242,21 +243,23 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
{ {
for(index_t y = 0; y < Y; ++y) for(index_t y = 0; y < Y; ++y)
{ {
blockwise_in_copy.Run(p_in_global_block_offset + blockwise_in_copy.Run(
in_c_h_w_n_global_desc.Get1dIndex(0, y, 0, 0), p_in_global_block_offset +
in_c_h_w_n_global_desc.GetOffsetFromMultiIndex(0, y, 0, 0),
p_in_block); p_in_block);
blockwise_wei_copy.Run(p_wei_global_block_offset + blockwise_wei_copy.Run(
wei_c_y_x_k_global_desc.Get1dIndex(0, y, 0, 0), p_wei_global_block_offset +
wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, 0, 0),
p_wei_block); p_wei_block);
__syncthreads(); __syncthreads();
for(index_t x = 0; x < X; ++x) for(index_t x = 0; x < X; ++x)
{ {
blockwise_batch_gemm.Run(p_wei_block + wei_c_x_k_block_desc.Get1dIndex(0, x, 0), blockwise_batch_gemm.Run(
p_in_block + p_wei_block + wei_c_x_k_block_desc.GetOffsetFromMultiIndex(0, x, 0),
in_c_h_w_n_block_desc.Get1dIndex(0, 0, x, 0), p_in_block + in_c_h_w_n_block_desc.GetOffsetFromMultiIndex(0, 0, x, 0),
p_out_thread); p_out_thread);
} }
...@@ -269,11 +272,12 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn ...@@ -269,11 +272,12 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
{ {
const Float* p_in_global_block_offset = const Float* p_in_global_block_offset =
p_in_global + p_in_global +
in_c_h_w_n_global_desc.Get1dIndex( in_c_h_w_n_global_desc.GetOffsetFromMultiIndex(
0, hi_block_data_begin + y, wi_block_data_begin, n_block_data_begin); 0, hi_block_data_begin + y, wi_block_data_begin, n_block_data_begin);
const Float* p_wei_global_block_offset = const Float* p_wei_global_block_offset =
p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, y, 0, k_block_data_begin); p_wei_global +
wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, 0, k_block_data_begin);
for(index_t for(index_t
c_block_data_begin = 0; c_block_data_begin = 0;
...@@ -290,9 +294,9 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn ...@@ -290,9 +294,9 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
for(index_t x = 0; x < X; ++x) for(index_t x = 0; x < X; ++x)
{ {
blockwise_batch_gemm.Run(p_wei_block + wei_c_x_k_block_desc.Get1dIndex(0, x, 0), blockwise_batch_gemm.Run(
p_in_block + p_wei_block + wei_c_x_k_block_desc.GetOffsetFromMultiIndex(0, x, 0),
in_c_h_w_n_block_desc.Get1dIndex(0, 0, x, 0), p_in_block + in_c_h_w_n_block_desc.GetOffsetFromMultiIndex(0, 0, x, 0),
p_out_thread); p_out_thread);
} }
...@@ -358,12 +362,12 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn ...@@ -358,12 +362,12 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
} }
#endif #endif
threadwise_tensor_slice_copy( threadwise_tensor_slice_copy(out_10d_thread_desc,
out_10d_thread_desc,
p_out_thread, p_out_thread,
out_10d_global_desc, out_10d_global_desc,
p_out_global + p_out_global +
out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, out_k_h_w_n_global_desc.GetOffsetFromMultiIndex(
k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin, ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin, wo_block_data_begin + wo_thread_data_begin,
n_block_data_begin + n_thread_data_begin), n_block_data_begin + n_thread_data_begin),
...@@ -408,12 +412,12 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn ...@@ -408,12 +412,12 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
} }
#endif #endif
threadwise_tensor_slice_copy( threadwise_tensor_slice_copy(out_10d_thread_desc,
out_10d_thread_desc,
p_out_thread, p_out_thread,
out_10d_global_desc, out_10d_global_desc,
p_out_global + p_out_global +
out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, out_k_h_w_n_global_desc.GetOffsetFromMultiIndex(
k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin, ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin, wo_block_data_begin + wo_thread_data_begin,
n_block_data_begin + n_thread_data_begin), n_block_data_begin + n_thread_data_begin),
......