Commit 8a4b5978 authored by Chao Liu

adding implicit gemm v3

parent 2a48812e
......@@ -38,7 +38,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
constexpr index_t X = wei_kcyx_desc.GetLength(I3);
// reorder weight
auto wei_cyxk_desc = make_packed_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
auto wei_cyxk_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<C, Y, X, K>{});
ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
......@@ -51,7 +51,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
std::thread::hardware_concurrency());
// reorder input
auto in_chwn_desc = make_packed_ConstantTensorDescriptor(Sequence<C, Hi, Wi, N>{});
auto in_chwn_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<C, Hi, Wi, N>{});
ostream_ConstantTensorDescriptor(in_chwn_desc, std::cout << "in_chwn_desc: ");
Tensor<T> in_chwn(make_TensorDescriptor(in_chwn_desc));
......@@ -64,7 +64,8 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
std::thread::hardware_concurrency());
// output
auto out_khwn_desc = make_packed_ConstantTensorDescriptor(Sequence<K, Ho, Wo, N>{});
auto out_khwn_desc =
make_ConstantTensorDescriptor_default_rank_packed(Sequence<K, Ho, Wo, N>{});
ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: ");
Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));
......
......@@ -37,7 +37,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
constexpr index_t X = wei_kcyx_desc.GetLength(I3);
// reorder weight
auto wei_cyxk_desc = make_packed_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
auto wei_cyxk_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<C, Y, X, K>{});
ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
......@@ -50,7 +50,8 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
std::thread::hardware_concurrency());
// output
auto out_khwn_desc = make_packed_ConstantTensorDescriptor(Sequence<K, Ho, Wo, N>{});
auto out_khwn_desc =
make_ConstantTensorDescriptor_default_rank_packed(Sequence<K, Ho, Wo, N>{});
ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: ");
Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));
......
......@@ -36,7 +36,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
constexpr index_t X = wei_kcyx_desc.GetLength(I3);
// reorder weight
auto wei_cyxk_desc = make_packed_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
auto wei_cyxk_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<C, Y, X, K>{});
ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
......@@ -57,7 +57,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
#if 0
#if 1
// for 3x3, 34x34, v1r3, Pascal
constexpr index_t BlockSize = 128;
......@@ -92,7 +92,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t OutThreadCopyDataPerWrite_W = 2;
#elif 0
#elif 1
// for 3x3, 34x34, v1r3, Vega 20, WoPerBlock = 32
constexpr index_t BlockSize = 256;
......@@ -162,7 +162,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t OutThreadCopyDataPerWrite_W = 2;
#elif 1
#elif 0
// for 3x3, 34x34, v1r3, Vega 20, WoPerBlock = 8
constexpr index_t BlockSize = 256;
......
......@@ -35,7 +35,7 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
constexpr index_t X = wei_kcyx_desc.GetLength(I3);
// reorder weight
auto wei_cyxk_desc = make_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
auto wei_cyxk_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<C, Y, X, K>{});
ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
......@@ -56,37 +56,40 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
constexpr index_t N1 = 2;
constexpr index_t N2 = 4;
constexpr index_t B = (N * Ho * Wo) / (N1 * N2);
#if 1
// for 3x3, 28x28, v3, Pascal
constexpr index_t BlockSize = 128;
// for 3x3, 28x28, v3
constexpr index_t BlockSize = 256;
constexpr index_t BPerBlock = 16;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t BPerThread = 1;
constexpr index_t KPerThread = 8;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmNLevel0Cluster = 4;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockReorderSrcSubLengths_NCHW = Sequence<4, 1, 1, 1>;
using InBlockReorderSrcClusterLengths_NCHW = Sequence<4, 8, 2, 2>;
using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
using InBlockCopySubLengths_N1_N2_C_B = Sequence<1, 4, 1, 1>;
using InBlockCopyClusterLengths_N1_N2_C_B = Sequence<2, 1, 8, 16>;
constexpr index_t InBlockCopySrcDataPerRead_B = 1;
constexpr index_t InBlockCopyDstDataPerWrite_N2 = 4;
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t WeiBlockCopyDataPerAccess_K = 4;
#endif
constexpr index_t GridSize =
((N + NPerBlock - 1) / NPerBlock) * ((K + KPerBlock - 1) / KPerBlock) *
((Ho + HoPerBlock - 1) / HoPerBlock) * ((Wo + WoPerBlock - 1) / WoPerBlock);
((B + BPerBlock - 1) / BPerBlock) * ((K + KPerBlock - 1) / KPerBlock);
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
......@@ -102,15 +105,11 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
decltype(in_nchw_desc),
decltype(wei_cyxk_desc),
decltype(out_nkhw_desc),
NPerBlock,
BPerBlock,
KPerBlock,
CPerBlock,
HoPerBlock,
WoPerBlock,
NPerThread,
KPerThread,
HoPerThread,
WoPerThread,
N1,
N2,
GemmMPerThreadSubC,
GemmNPerThreadSubC,
GemmMLevel0Cluster,
......@@ -120,14 +119,11 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
GemmKPerThreadLoop,
GemmDataPerReadA,
GemmDataPerReadB,
InBlockReorderSrcSubLengths_NCHW,
InBlockReorderSrcClusterLengths_NCHW,
InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
InBlockReorderDataPerRead_W,
InBlockReorderDataPerWrite_N,
WeiBlockCopyClusterLengths,
WeiBlockCopyDataPerRead_K,
OutThreadCopyDataPerWrite_W>{};
InBlockCopySubLengths_N1_N2_C_B,
InBlockCopyClusterLengths_N1_N2_C_B,
InBlockCopySrcDataPerRead_B,
InBlockCopyDstDataPerWrite_N2,
WeiBlockCopyDataPerAccess_K>{};
float time = launch_kernel(run_gridwise_convolution<decltype(gridwise_conv), T>,
dim3(GridSize),
......
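Not part of the commit — a standalone sketch (plain C++) of the grid-size arithmetic introduced by the v3 path above: N, Ho and Wo are folded into a single B dimension, pre-split by N1 * N2, and one workgroup is launched per (BPerBlock, KPerBlock) tile. The problem size in the sketch is an assumed example; N1, N2, BPerBlock and KPerBlock are taken from the tuning block in this hunk.

```cpp
// Standalone sketch of the v3 grid-size arithmetic shown above.
// The tensor extents below are illustrative assumptions, not values from the commit.
#include <cstdio>

int main()
{
    // assumed problem size: NCHW input 64x256x28x28, 3x3 filter, no padding
    const unsigned N = 64, K = 256, Ho = 28, Wo = 28;

    // values taken from the tuning block in this hunk
    const unsigned N1 = 2, N2 = 4;
    const unsigned BPerBlock = 16, KPerBlock = 128;

    // v3 folds N, Ho, Wo into one "B" dimension, pre-split by N1 * N2
    const unsigned B = (N * Ho * Wo) / (N1 * N2);

    // one workgroup per (BPerBlock, KPerBlock) tile
    const unsigned GridSize =
        ((B + BPerBlock - 1) / BPerBlock) * ((K + KPerBlock - 1) / KPerBlock);

    std::printf("B = %u, GridSize = %u\n", B, GridSize);
    return 0;
}
```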
......@@ -13,7 +13,7 @@
#include "device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp"
#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
//#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
struct GeneratorTensor_1
{
......@@ -548,8 +548,8 @@ int main(int argc, char* argv[])
auto lower_pads = Sequence<HPad, WPad>{};
auto upper_pads = Sequence<HPad, WPad>{};
auto in_nchw_desc = make_packed_ConstantTensorDescriptor(Sequence<N, C, HI, WI>{});
auto wei_kcyx_desc = make_packed_ConstantTensorDescriptor(Sequence<K, C, Y, X>{});
auto in_nchw_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<N, C, HI, WI>{});
auto wei_kcyx_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<K, C, Y, X>{});
auto out_nkhw_desc = get_convolution_with_padding_output_default_4d_tensor_descriptor(
in_nchw_desc, wei_kcyx_desc, lower_pads, upper_pads);
......@@ -612,11 +612,11 @@ int main(int argc, char* argv[])
device_convolution_implicit_gemm_v1_chwn_cyxk_khwn
#elif 0
device_convolution_implicit_gemm_v1_nchw_cyxk_khwn
#elif 1
#elif 0
device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw
#elif 0
device_convolution_implicit_gemm_v2_chwn_cyxk_khwn
#elif 0
#elif 1
device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw
#endif
(in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
......
......@@ -12,7 +12,7 @@ struct Array
index_t mData[nSize];
template <class... Xs>
__host__ __device__ Array(Xs... xs) : mData{static_cast<TData>(xs)...}
__host__ __device__ constexpr Array(Xs... xs) : mData{static_cast<TData>(xs)...}
{
}
......@@ -37,6 +37,25 @@ struct Array
}
};
template <index_t... Is>
__host__ __device__ constexpr auto sequence2array(Sequence<Is...>)
{
return Array<index_t, sizeof...(Is)>{Is...};
}
template <class TData, index_t NSize>
__host__ __device__ constexpr auto make_zero_array()
{
Array<TData, NSize> a;
static_for<0, NSize, 1>{}([&](auto I) {
constexpr index_t i = I.Get();
a[i] = static_cast<TData>(0);
});
return a;
}
template <class TData, index_t NSize, index_t... IRs>
__host__ __device__ auto reorder_array_given_new2old(const Array<TData, NSize>& old_array,
Sequence<IRs...> new2old)
......@@ -80,15 +99,14 @@ __host__ __device__ auto extract_array(const Array<TData, NSize>& old_array, Ext
static_for<0, new_size, 1>{}([&](auto I) {
constexpr index_t i = I.Get();
new_array[i] = old_array[ExtractSeq{}.Get(I)];
new_array[i] = old_array[ExtractSeq::Get(I)];
});
return new_array;
}
template <class TData, index_t NSize>
__host__ __device__ constexpr auto operator+(const Array<TData, NSize>& a,
const Array<TData, NSize>& b)
__host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Array<TData, NSize> b)
{
Array<TData, NSize> result;
......@@ -99,3 +117,20 @@ __host__ __device__ constexpr auto operator+(const Array<TData, NSize>& a,
return result;
}
// Array = Array * Sequence
template <class TData, index_t NSize, index_t... Is>
__host__ __device__ constexpr auto operator*(Array<TData, NSize> a, Sequence<Is...> b)
{
static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
Array<TData, NSize> result;
static_for<0, NSize, 1>{}([&](auto I) {
constexpr index_t i = I.Get();
result[i] = a[i] * b.Get(I);
});
return result;
}
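For reference only (not in the commit): a standalone C++17 sketch, using std::array in place of the repo's Array, of what the new helpers above provide — building an array from an index sequence, a zero-initialized array, and an element-wise product mirroring the `Array * Sequence` operator.

```cpp
// Standalone C++17 illustration of the helpers added above, with std::array
// standing in for the repo's Array type.
#include <array>
#include <cstddef>
#include <utility>

template <std::size_t... Is>
constexpr std::array<std::size_t, sizeof...(Is)> sequence_to_array(std::index_sequence<Is...>)
{
    return {{Is...}};
}

template <class T, std::size_t N>
constexpr std::array<T, N> make_zero_array()
{
    return std::array<T, N>{}; // value-initialization zeroes every element
}

// element-wise product, mirroring the "Array = Array * Sequence" operator
template <class T, std::size_t N>
constexpr std::array<T, N> hadamard(std::array<T, N> a, std::array<T, N> b)
{
    std::array<T, N> r{};
    for(std::size_t i = 0; i < N; ++i)
        r[i] = a[i] * b[i];
    return r;
}

int main()
{
    constexpr auto idx  = sequence_to_array(std::make_index_sequence<3>{}); // {0, 1, 2}
    constexpr auto zero = make_zero_array<int, 4>();
    constexpr auto prod = hadamard(std::array<int, 3>{2, 3, 4}, std::array<int, 3>{1, 4, 1});
    static_assert(idx[2] == 2 && zero[0] == 0 && prod[1] == 12, "all computed at compile time");
    return 0;
}
```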
......@@ -9,26 +9,26 @@ struct ConstantMatrixDescriptor
static_assert(NCol_ <= RowStride_, "wrong! NCol > RowStride!");
}
__host__ __device__ constexpr index_t NRow() const { return NRow_; }
__host__ __device__ static constexpr index_t NRow() { return NRow_; }
__host__ __device__ constexpr index_t NCol() const { return NCol_; }
__host__ __device__ static constexpr index_t NCol() { return NCol_; }
__host__ __device__ constexpr index_t RowStride() const { return RowStride_; }
__host__ __device__ static constexpr index_t RowStride() { return RowStride_; }
__host__ __device__ constexpr auto GetLengths() const { return Sequence<NRow_, NCol_>{}; }
__host__ __device__ static constexpr auto GetLengths() { return Sequence<NRow_, NCol_>{}; }
__host__ __device__ constexpr index_t GetElementSize() const { return NRow_ * NCol_; }
__host__ __device__ static constexpr index_t GetElementSize() { return NRow_ * NCol_; }
__host__ __device__ constexpr index_t GetElementSpace() const { return NRow_ * RowStride_; }
__host__ __device__ static constexpr index_t GetElementSpace() { return NRow_ * RowStride_; }
__host__ __device__ index_t GetOffsetFromMultiIndex(index_t irow, index_t icol) const
__host__ __device__ static index_t GetOffsetFromMultiIndex(index_t irow, index_t icol)
{
return irow * RowStride_ + icol;
}
template <index_t SubNRow, index_t SubNCol>
__host__ __device__ constexpr auto MakeSubMatrixDescriptor(Number<SubNRow>,
Number<SubNCol>) const
__host__ __device__ static constexpr auto MakeSubMatrixDescriptor(Number<SubNRow>,
Number<SubNCol>)
{
return ConstantMatrixDescriptor<SubNRow, SubNCol, RowStride_>{};
}
......
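Side note, not part of the diff: the change above turns every accessor of ConstantMatrixDescriptor into a `static constexpr` member, so it can be called on the type alone and used inside constant expressions. A minimal standalone sketch of the pattern (CUDA/HIP qualifiers omitted):

```cpp
// Minimal standalone sketch of the "static constexpr accessor" pattern applied
// above (__host__ __device__ qualifiers omitted for brevity).
#include <cstdio>

template <unsigned NRow_, unsigned NCol_, unsigned RowStride_>
struct MatrixDesc
{
    static_assert(NCol_ <= RowStride_, "NCol must not exceed RowStride");

    static constexpr unsigned NRow() { return NRow_; }
    static constexpr unsigned NCol() { return NCol_; }
    static constexpr unsigned RowStride() { return RowStride_; }

    // offset of element (irow, icol) in row-major storage with a padded row stride
    static constexpr unsigned GetOffsetFromMultiIndex(unsigned irow, unsigned icol)
    {
        return irow * RowStride_ + icol;
    }
};

int main()
{
    using Desc = MatrixDesc<8, 8, 12>;
    // no instance needed, and the results are usable in constant expressions
    static_assert(Desc::NRow() == 8, "");
    static_assert(Desc::GetOffsetFromMultiIndex(1, 2) == 14, "");
    std::printf("element space = %u\n", Desc::NRow() * Desc::RowStride());
    return 0;
}
```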
......@@ -11,8 +11,8 @@ struct ConstantMergedTensorDescriptor
{
static constexpr auto mOriginalDimMergeSeqs = std::tuple<OriginalDimMergeSeqs...>{};
static constexpr index_t nDim = std::tuple_size<mOriginalDimMergeSeqs>::value;
static constexpr index_t nOriginalDim = OriginalDesc::GetNumOfDimension();
static constexpr index_t nDim = sizeof...(OriginalDimMergeSeqs);
static constexpr index_t nOriginalDim = OriginalTensorDesc::GetNumOfDimension();
__host__ __device__ constexpr ConstantMergedTensorDescriptor()
{
......@@ -21,25 +21,28 @@ struct ConstantMergedTensorDescriptor
// TODO: check each of OriginalDimMergeSeqs contains at least 1, and at most
// OriginalTensorDesc::nDim number of dimensions
// TODO: check there is no duplication in OriginalDimMergeSeqs
// TODO: check OriginalDimMergeSeqs contains all original dimensions
// TODO: check there is no duplication in OriginalDimMergeSeqs
}
__host__ __device__ static constexpr index_t GetNumOfDimension() { return nDim; }
__host__ __device__ static constexpr index_t GetNumOfOriginalDimension() { return nOriginalDim }
__host__ __device__ static constexpr index_t GetNumOfOriginalDimension()
{
return nOriginalDim;
}
template <index_t IDim>
__host__ __device__ static constexpr bool ContainMultipleOriginalDimensions(Number<IDim>)
{
return (std::Get<IDIM>(mOriginalDimMergeSeqs).GetSize() > 1);
return (std::get<IDim>(mOriginalDimMergeSeqs).GetSize() > 1);
}
template <index_t IDim>
__host__ __device__ static constexpr index_t GetLength(Number<IDim>)
{
constexpr auto original_dims_partial = std::Get<IDim>(mOriginalDimMergeSeqs);
constexpr auto original_dims_partial = std::get<IDim>(mOriginalDimMergeSeqs);
return OriginalTensorDesc::Extract(original_dims_partial).GetElementSize();
}
......@@ -50,14 +53,14 @@ struct ConstantMergedTensorDescriptor
static_assert(!ContainMultipleOriginalDimensions(Number<IDim>{}),
"wrong! stride of a merged dimension is undefined");
constexpr auto idim_original = std::Get<IDim>(mOriginalDimMergeSeqs).Front();
constexpr auto idim_original = std::get<IDim>(mOriginalDimMergeSeqs).Front();
return OriginalTensorDesc::GetStride(Number<idim_original>{});
}
__host__ __device__ static constexpr auto GetLengths()
{
return Sequence<OriginalTensorDesc::Extract(OriginalDimMergeSeqs).GetElementSize()...>{};
return Sequence<OriginalTensorDesc::Extract(OriginalDimMergeSeqs{}).GetElementSize()...>{};
}
__host__ __device__ static constexpr index_t GetElementSize()
......@@ -75,17 +78,16 @@ struct ConstantMergedTensorDescriptor
constexpr auto original_dims_partial = std::get<idim>(mOriginalDimMergeSeqs);
// get partial original-multi-id corresponding to this merged dimension
constexpr auto original_multi_id_partial =
const auto original_multi_id_partial =
OriginalTensorDesc::Extract(original_dims_partial)
.GetMultiIndexFrom1dIndex(multi_id[idim]);
// make sure compiler unroll this loop and propagate all the constants
for(index_t i = 0; i < original_dims_partial.GetSize(); ++i)
{
index_t idim_original = original_dims_partial[i];
static_for<0, original_dims_partial.GetSize(), 1>{}([&](auto I_) {
constexpr auto I = decltype(I_){};
constexpr index_t idim_original = original_dims_partial.Get(I);
original_multi_id[idim_original] = original_multi_id_partial[i]
}
original_multi_id[idim_original] = original_multi_id_partial[I.Get()];
});
});
return original_multi_id;
......@@ -95,10 +97,10 @@ struct ConstantMergedTensorDescriptor
{
const auto original_multi_id = GetOriginalMultiIndexFromMultiIndex(multi_id);
return OriginalTensorDesc::GetOffsetFromMultiIndex(orginal_multi_id);
return OriginalTensorDesc::GetOffsetFromMultiIndex(original_multi_id);
}
template <index_t... Is>
template <class... Is>
__host__ __device__ static index_t GetOffsetFromMultiIndex(Is... is)
{
return GetOffsetFromMultiIndex(Array<index_t, nDim>{is...});
......@@ -106,14 +108,15 @@ struct ConstantMergedTensorDescriptor
__host__ __device__ static Array<index_t, nDim> GetMultiIndexFrom1dIndex(index_t id)
{
constexpr auto dummy_desc = make_packed_ConstantTensorDescriptor(GetLengths());
constexpr auto dummy_desc = make_ConstantTensorDescriptor_default_rank_packed(GetLengths());
return dummy_desc.GetMultiIndexFrom1dIndex(id);
}
};
template <class OriginalTensorDesc, class... OriginalDimMergeSeqs>
constexpr auto make_ConstantMergedTensorDescriptor(OriginalTensorDesc, OriginalDimMergeSeqs...)
__host__ __device__ constexpr auto make_ConstantMergedTensorDescriptor(OriginalTensorDesc,
OriginalDimMergeSeqs...)
{
return ConstantMergedTensorDescriptor<OriginalTensorDesc, OriginalDimMergeSeqs...>{};
}
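For context (not in the commit): a merged dimension's length is the product of the original lengths it covers, and a 1-d index along it unpacks back into the original multi-index, which is what GetOriginalMultiIndexFromMultiIndex above computes per merged dimension. A standalone sketch with assumed lengths:

```cpp
// Standalone sketch of the "merged dimension" bookkeeping: one index b over a
// merged (N, Ho, Wo) dimension maps back to the original (n, ho, wo) indices.
// The lengths are illustrative assumptions.
#include <cstdio>

int main()
{
    const unsigned N = 2, Ho = 4, Wo = 3; // original dimensions being merged
    const unsigned B = N * Ho * Wo;       // length of the merged dimension

    for(unsigned b = 0; b < B; ++b)
    {
        // unpack in packed (row-major) order, mirroring GetMultiIndexFrom1dIndex
        const unsigned n  = b / (Ho * Wo);
        const unsigned ho = (b / Wo) % Ho;
        const unsigned wo = b % Wo;
        if(b < 3)
            std::printf("b=%u -> (n=%u, ho=%u, wo=%u)\n", b, n, ho, wo);
    }
    return 0;
}
```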
......@@ -2,20 +2,20 @@
#include "common.hip.hpp"
template <class Lengths>
__host__ __device__ constexpr auto calculate_packed_tensor_strides(Lengths)
__host__ __device__ constexpr auto calculate_tensor_strides_default_rank_packed(Lengths)
{
return reverse_inclusive_scan_sequence(Lengths{}.PopFront(), std::multiplies<index_t>{})
.PushBack(Number<1>{});
}
template <class Lengths, index_t Align>
__host__ __device__ constexpr auto
calculate_rank_tensor_default_strides_with_alignment(Lengths, Number<Align>)
__host__ __device__ constexpr auto calculate_tensor_strides_default_rank_aligned(Lengths,
Number<Align>)
{
constexpr index_t L_back_align =
Align * mod_conv::integer_divide_ceiler<index_t>{}(Lengths{}.Back(), Align);
return calculate_packed_tensor_strides(
return calculate_tensor_strides_default_rank_packed(
Lengths{}.Modify(Number<Lengths{}.GetSize() - 1>{}, Number<L_back_align>{}));
}
......@@ -66,6 +66,12 @@ struct ConstantTensorDescriptor
return MemoryRanks{}.Get(Number<I>{});
}
template <class T>
__host__ __device__ static constexpr bool ContainMultipleOriginalDimensions(T)
{
return false;
}
__host__ __device__ static constexpr index_t GetElementSize()
{
return accumulate_on_sequence(Lengths{}, std::multiplies<index_t>{}, Number<1>{});
......@@ -146,7 +152,7 @@ struct ConstantTensorDescriptor
{
Array<index_t, nDim> multi_id;
constexpr auto dummy_strides = calculate_packed_tensor_strides(GetLengths());
constexpr auto dummy_strides = calculate_tensor_strides_default_rank_packed(GetLengths());
// calculate index in each of the dimensions in the order of their dimension (not rank)
static_for<0, nDim - 1, 1>{}([&](auto IDim) {
......@@ -181,6 +187,12 @@ struct ConstantTensorDescriptor
return ConstantTensorDescriptor<extract_lengths, extract_strides, new_ranks>{};
}
template <index_t... IDims>
__host__ __device__ static constexpr auto Extract(Sequence<IDims...>)
{
return Extract(Number<IDims>{}...);
}
template <index_t IDim, index_t SliceLen>
__host__ __device__ static constexpr auto Slice(Number<IDim>, Number<SliceLen>)
{
......@@ -271,9 +283,11 @@ struct ConstantTensorDescriptor
FirstUnfoldDim <= LastUnfoldDim,
"wrong! should have FirstUnfoldDim <= LastUnfoldDim!");
#if 0 // cannot compile: compiler complains about constexpr
// dimensions to be unfolded need to be in descending order (w.r.t. strides), and need to be
// packed in memory, otherwise, unfolding is invalid
static_for<FirstUnfoldDim, LastUnfoldDim, 1>{}([&](auto IDim) {
static_for<FirstUnfoldDim, LastUnfoldDim, 1>{}([&](auto IDim_) {
constexpr auto IDim = decltype(IDim_){};
constexpr auto IDim_p1 = IDim + Number<1>{};
// check stride
......@@ -285,11 +299,12 @@ struct ConstantTensorDescriptor
static_assert(GetStride(IDim_p1) * GetLength(IDim_p1) == GetStride(IDim),
"wrong! dimensions to be unfolded need to be packed");
// checkt ranks
// check ranks
static_assert(GetMemoryRank(IDim_p1) == GetMemoryRank(IDim) + 1,
"wrong! ranks of dimensions to be unfolded need to be in increasing and "
"continuous ranks");
});
#endif
// left and right
constexpr auto left = typename arithmetic_sequence_gen<0, FirstUnfoldDim, 1>::SeqType{};
......@@ -308,9 +323,9 @@ struct ConstantTensorDescriptor
// decrease the ranks that are larger than the rank of LastUnfoldDim
constexpr auto tmp_ranks =
transform_sequences(GetMemoryRanks(),
f_unfold_impl<GetMemoryRank(Number<LastUnfoldDim>{}),
LastUnfoldDim - FirstUnfoldDim + 1>{});
transform_sequences(f_unfold_impl<GetMemoryRank(Number<LastUnfoldDim>{}),
LastUnfoldDim - FirstUnfoldDim + 1>{},
GetMemoryRanks());
// new lengths, strides and ranks
constexpr auto new_lengths = GetLengths()
......@@ -354,26 +369,26 @@ struct ConstantTensorDescriptor
};
template <class Lengths>
__host__ __device__ constexpr auto make_packed_ConstantTensorDescriptor(Lengths)
__host__ __device__ constexpr auto make_ConstantTensorDescriptor_default_rank_packed(Lengths)
{
using Strides = decltype(calculate_packed_tensor_strides(Lengths{}));
using Strides = decltype(calculate_tensor_strides_default_rank_packed(Lengths{}));
using MemoryRanks = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::SeqType;
return ConstantTensorDescriptor<Lengths, Strides, MemoryRanks>{};
}
template <class Lengths, class Strides>
__host__ __device__ constexpr auto make_ranked_ConstantTensorDescriptor(Lengths, Strides)
__host__ __device__ constexpr auto make_ConstantTensorDescriptor_default_rank(Lengths, Strides)
{
using MemoryRanks = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::SeqType;
return ConstantTensorDescriptor<Lengths, Strides, MemoryRanks>{};
}
template <class Lengths, index_t Align>
__host__ __device__ constexpr auto
make_ranked_ConstantTensorDescriptor_with_alignment(Lengths, Number<Align>)
__host__ __device__ constexpr auto make_ConstantTensorDescriptor_default_rank_aligned(Lengths,
Number<Align>)
{
using Strides =
decltype(calculate_rank_tensor_default_strides_with_alignment(Lengths{}, Number<Align>{}));
decltype(calculate_tensor_strides_default_rank_aligned(Lengths{}, Number<Align>{}));
using MemoryRanks = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::SeqType;
return ConstantTensorDescriptor<Lengths, Strides, MemoryRanks>{};
}
......
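Not part of the diff — a standalone sketch of the two stride rules behind the renamed factories: `_default_rank_packed` derives strides as a reverse inclusive product scan of the lengths, and `_default_rank_aligned` first rounds the innermost length up to a multiple of Align. The lengths in the example are illustrative.

```cpp
// Standalone sketch (plain C++) of the stride rules behind
// make_ConstantTensorDescriptor_default_rank_packed / _aligned.
#include <array>
#include <cstdio>

template <std::size_t N>
std::array<unsigned, N> packed_strides(const std::array<unsigned, N>& lengths)
{
    std::array<unsigned, N> strides{};
    strides[N - 1] = 1;
    for(std::size_t i = N - 1; i > 0; --i)
        strides[i - 1] = strides[i] * lengths[i];
    return strides;
}

template <std::size_t N>
std::array<unsigned, N> aligned_strides(std::array<unsigned, N> lengths, unsigned align)
{
    // round the last (fastest-varying) length up to a multiple of `align`
    lengths[N - 1] = align * ((lengths[N - 1] + align - 1) / align);
    return packed_strides(lengths);
}

int main()
{
    const std::array<unsigned, 4> chwn = {8, 16, 16, 30}; // illustrative lengths
    const auto p = packed_strides(chwn);
    const auto a = aligned_strides(chwn, 4); // innermost length padded from 30 to 32
    std::printf("packed:  %u %u %u %u\n", p[0], p[1], p[2], p[3]);
    std::printf("aligned: %u %u %u %u\n", a[0], a[1], a[2], a[3]);
    return 0;
}
```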
#pragma once
#include "constant_integral.hip.hpp"
#include "integral_constant.hip.hpp"
#include "functional.hip.hpp"
template <index_t... Is>
......@@ -21,12 +21,6 @@ struct Sequence
return mData[I];
}
__host__ __device__ index_t operator[](index_t i) const
{
const index_t mData[mSize + 1] = {Is..., 0};
return mData[i];
}
template <index_t... IRs>
__host__ __device__ static constexpr auto ReorderGivenNew2Old(Sequence<IRs...> /*new2old*/)
{
......@@ -164,6 +158,12 @@ struct sequence_reverse_inclusive_scan<Sequence<I>, Reduce>
using SeqType = Sequence<I>;
};
template <class Reduce>
struct sequence_reverse_inclusive_scan<Sequence<>, Reduce>
{
using SeqType = Sequence<>;
};
template <class, class>
struct sequence_extract;
......
......@@ -457,7 +457,8 @@ struct Blockwise2dTensorCopy3
index_t mSrcMyThreadOffset;
index_t mDstMyThreadOffset;
__device__ Blockwise2dTensorCopy3()
__device__ Blockwise2dTensorCopy3(Array<index_t, 2> src_block_data_multi_id_begin,
Array<index_t, 2> dst_block_data_multi_id_begin)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
......@@ -499,10 +500,13 @@ struct Blockwise2dTensorCopy3
const index_t thread_id_d0 = get_thread_local_1d_id() / thread_per_d1;
const index_t thread_id_d1 = get_thread_local_1d_id() - thread_id_d0 * thread_per_d1;
mSrcMyThreadOffset =
SrcDesc{}.GetOffsetFromMultiIndex(thread_id_d0, thread_id_d1 * DataPerRead);
mDstMyThreadOffset =
DstDesc{}.GetOffsetFromMultiIndex(thread_id_d0, thread_id_d1 * DataPerRead);
mSrcMyThreadOffset = SrcDesc{}.GetOffsetFromMultiIndex(
src_block_data_multi_id_begin +
Array<index_t, 2>{thread_id_d0, thread_id_d1 * DataPerRead});
mDstMyThreadOffset = DstDesc{}.GetOffsetFromMultiIndex(
dst_block_data_multi_id_begin +
Array<index_t, 2>{thread_id_d0, thread_id_d1 * DataPerRead});
}
__device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
......
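For reference (not in the commit): the constructor above now folds the block's starting multi-index into the per-thread offset. A standalone sketch of that arithmetic, with an assumed row stride and thread layout:

```cpp
// Standalone sketch of the per-thread offset arithmetic in the new constructor:
// the 1-d thread id is split into a (d0, d1) pair, scaled by DataPerRead, and
// added to the block's starting coordinates. Values are illustrative.
#include <cstdio>

int main()
{
    const unsigned DataPerRead   = 4;
    const unsigned thread_per_d1 = 8;   // threads along the fast dimension
    const unsigned row_stride    = 128; // assumed stride of the 2-d descriptor

    const unsigned block_begin_d0 = 0, block_begin_d1 = 32; // block's start in the tensor

    for(unsigned tid = 0; tid < 4; ++tid)
    {
        const unsigned d0 = tid / thread_per_d1;
        const unsigned d1 = tid - d0 * thread_per_d1;

        const unsigned offset =
            (block_begin_d0 + d0) * row_stride + (block_begin_d1 + d1 * DataPerRead);
        std::printf("tid=%u -> offset %u\n", tid, offset);
    }
    return 0;
}
```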
......@@ -13,7 +13,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
constexpr auto dst_desc = DstDesc{};
constexpr auto desc = make_packed_ConstantTensorDescriptor(dst_desc.GetLengths());
constexpr auto desc = make_ConstantTensorDescriptor_default_rank_packed(dst_desc.GetLengths());
#if 0
if(get_thread_local_1d_id() == 0)
......@@ -108,7 +108,7 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds
constexpr auto src_desc = SrcDesc{};
constexpr auto dst_desc = DstDesc{};
constexpr auto ref_desc = make_packed_ConstantTensorDescriptor(SrcOpLengths{});
constexpr auto ref_desc = make_ConstantTensorDescriptor_default_rank_packed(SrcOpLengths{});
constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize;
......@@ -259,7 +259,7 @@ struct Blockwise4dTensorCopy1
constexpr index_t read_per_d3 = mod_conv::integer_divide_ceil(L3, DataPerRead);
constexpr auto ref_desc =
make_packed_ConstantTensorDescriptor(Sequence<L0, L1, L2, read_per_d3>{});
make_ConstantTensorDescriptor_default_rank_packed(Sequence<L0, L1, L2, read_per_d3>{});
constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize;
......@@ -336,7 +336,7 @@ struct BlockwiseChwnTensorCopyPadded
constexpr auto src_desc = SrcDesc{};
constexpr auto dst_desc = DstDesc{};
constexpr auto ref_desc = make_packed_ConstantTensorDescriptor(DstOpLengths{});
constexpr auto ref_desc = make_ConstantTensorDescriptor_default_rank_packed(DstOpLengths{});
constexpr auto h_global_pad_low = GlobalLowerPads{}.Get(I0);
constexpr auto w_global_pad_low = GlobalLowerPads{}.Get(I1);
......@@ -510,7 +510,8 @@ struct Blockwise4dTensorCopy3
}
}
constexpr auto thread_cluster_desc = make_packed_ConstantTensorDescriptor(ThreadPerDims{});
constexpr auto thread_cluster_desc =
make_ConstantTensorDescriptor_default_rank_packed(ThreadPerDims{});
const auto thread_multi_id =
thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
......@@ -652,7 +653,7 @@ struct Blockwise4dTensorCopy3
constexpr index_t nloop_d2 = L2 / thread_per_d2;
constexpr index_t nloop_d3 = mod_conv::integer_divide_ceil(L3, thread_per_d3 * DataPerRead);
constexpr auto clipboard_desc = make_packed_ConstantTensorDescriptor(
constexpr auto clipboard_desc = make_ConstantTensorDescriptor_default_rank_packed(
Sequence<nloop_d0, nloop_d1, nloop_d2, nloop_d3 * DataPerRead>{});
#pragma unroll
......@@ -719,7 +720,7 @@ struct Blockwise4dTensorCopy3
constexpr index_t nloop_d2 = L2 / thread_per_d2;
constexpr index_t nloop_d3 = mod_conv::integer_divide_ceil(L3, thread_per_d3 * DataPerRead);
constexpr auto clipboard_desc = make_packed_ConstantTensorDescriptor(
constexpr auto clipboard_desc = make_ConstantTensorDescriptor_default_rank_packed(
Sequence<nloop_d0, nloop_d1, nloop_d2, nloop_d3 * DataPerRead>{});
#pragma unroll
......
......@@ -46,7 +46,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
N % (NPerThreadSubC * NLevel0Cluster * NLevel1Cluster) == 0,
"wrong! Cannot evenly divide work among\n");
static_assert(ThreadMatrixC::GetLengths() == GetThreadMatrixCLengths(),
static_assert(is_same_type(ThreadMatrixC::GetLengths(), GetThreadMatrixCLengths()),
"wrong! ThreadMatrixC lengths is wrong");
auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id());
......@@ -55,7 +55,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
mMyThreadOffsetB = BlockMatrixB::GetOffsetFromMultiIndex(0, c_thread_mtx_index.col);
}
__device__ static auto GetThreadMatrixCLengths()
__device__ static constexpr auto GetThreadMatrixCLengths()
{
constexpr index_t M = BlockMatrixA::NCol(); // A is transposed
constexpr index_t N = BlockMatrixB::NCol();
......
#pragma once
#include "threadwise_tensor_slice_op.hip.hpp"
// slice a merged tensor, reorder and copy it into a normal tensor
// src: a merged tensor,
// dst: a normal tensor
// slice a (normal or merged) tensor, reorder and copy it into another (normal or merged) tensor
template <index_t BlockSize,
class Float,
class SrcDesc,
class DstDesc,
class SliceLengths,
class SubLengths,
class ClusterLengths,
class DataClusterLengths,
class ThreadClusterArrangeOrder,
class SrcAccessOrder,
class DstAccessOrder>
class DstAccessOrder,
index_t SrcDataPerRead,
index_t DstDataPerRead>
struct BlockwiseTensorSliceCopy_generic_v1
{
static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
......@@ -21,39 +21,44 @@ struct BlockwiseTensorSliceCopy_generic_v1
index_t mSrcMyThreadOffset;
index_t mDstMyThreadOffset;
__device__ BlockwiseTensorSliceCopy_generic_v1(Array<index_t, nDim> src_block_multi_offset,
Array<index_t, nDim> dst_block_multi_offset)
__device__
BlockwiseTensorSliceCopy_generic_v1(Array<index_t, nDim> src_block_data_multi_id_begin,
Array<index_t, nDim> dst_block_data_multi_id_begin)
{
// check NDim consistent
static_assert(SrcDesc::GetNumOfDimension() == DstDesc::GetNumOfDimension(), "wrong");
constexpr auto thread_cluster_desc = make_packed_ConstantTensorDescriptor(
ClusterLengths{}.ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
// thread cluster
constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_default_rank_packed(
DataClusterLengths{}.ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
// BlockSize
static_assert(BlockSize == thread_cluster_desc.GetElementSize(), "wrong! BlockSize");
// divide work
static_for<0, nDim, 1>{}([&](auto IDim) {
static_assert(SliceLengths{}.Get(IDim) % SubLenghs{}.Get(IDim) == 0,
constexpr auto data_per_cluster_per_dims = SubLengths{} * DataClusterLengths{};
static_for<0, nDim, 1>{}([&](auto IDim_) {
constexpr auto IDim = decltype(IDim_){};
static_assert(SliceLengths::Get(IDim) % SubLengths::Get(IDim) == 0,
"wrong! cannot evenly divide sliced tensor into sub-tensor");
static_assert(SliceLengths::Get(IDim) % data_per_cluster_per_dims.Get(IDim) == 0,
"wrong! cannot evenly divide sliced tensor into cluster");
});
constexpr auto thread_work_desc =
make_packed_ConstantTensorDescriptor(SliceLengths{} / SliceSubLengths{});
constexpr auto repeat_lengths = SliceLengths{} / data_per_cluster_per_dims;
static_for<0, nDim, 1>{}([&](auto IDim) {
static_assert(thread_work_desc.GetLength(IDim) % thread_cluster_desc.Get(IDim) == 0,
"wrong! cannot evenly divide work to cluster");
});
// for now, only support SubLengths.Get() == 1 on a merged dimension that is merged from
// multiple dimensions
static_for<0, nDim, 1>{}([&](auto IDim_) {
constexpr auto IDim = decltype(IDim_){};
// only support SubLengths.Get() == 1 on merged dimension, for now
static_for<0, nDim, 1>{}([&](auto IDim) {
static_if<(SrcDesc::ContainMultipleOriginalDimensions(IDim) ||
DstDesc::ContainMultipleOriginalDimensions(IDim))>{}([&](auto fwd) {
static_assert(fwd(SubLengths{}).Get(IDim) == 1,
"wrong! Sub-Lengths on merged dimension should be 1");
});
static_assert(SubLengths::Get(IDim) == 1 ||
(!SrcDesc::ContainMultipleOriginalDimensions(IDim) &&
!DstDesc::ContainMultipleOriginalDimensions(IDim)),
"wrong! only surpport Sub-Length == 1 on a merged dimension");
});
// calculate mSrcMyThreadOffset, mDstMyThreadOffset
......@@ -63,22 +68,23 @@ struct BlockwiseTensorSliceCopy_generic_v1
const auto data_cluster_multi_id =
reorder_array_given_old2new(thread_cluster_multi_id, ThreadClusterArrangeOrder{});
const auto thread_data_multi_offset = data_cluster_multi_id * SubLengths{};
const auto thread_data_multi_id_begin = data_cluster_multi_id * SubLengths{};
mSrcMyThreadOffset = SrcDesc::GetOffsetFromMultiIndex(src_block_data_multi_id_begin +
thread_data_multi_id_begin);
mSrcMythreadOffset =
SrcDesc::GetOffsetFromMultiIndex(src_block_multi_offset + thread_data_multi_offset);
mSrcMythreadOffset =
DstDesc::GetOffsetFromMultiIndex(dst_block_multi_offset + thread_data_multi_offset);
mDstMyThreadOffset = DstDesc::GetOffsetFromMultiIndex(dst_block_data_multi_id_begin +
thread_data_multi_id_begin);
}
__device__ static constexpr index_t GetRegisterClipboardSize()
{
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ClusterLengths{});
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});
constexpr auto thread_tensor_desc =
make_packed_ConstantTensorDescriptor(SubLengths{} * repeat_lengths);
make_ConstantTensorDescriptor_default_rank_packed(SubLengths{} * repeat_lengths);
return thread_tensor_desc.GetElementSpaceSize();
return thread_tensor_desc.GetElementSpace();
}
__device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
......@@ -86,32 +92,34 @@ struct BlockwiseTensorSliceCopy_generic_v1
{
constexpr auto thread_sub_tensor_lengths = SubLengths{};
constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * ClusterLengths{};
constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * DataClusterLengths{};
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ClusterLengths{});
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});
constexpr auto thread_tensor_desc =
make_packed_ConstantTensorDescriptor(thread_sub_tensor_lengths * repeat_lengths);
constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor_default_rank_packed(
thread_sub_tensor_lengths * repeat_lengths);
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});
const auto src_thread_data_multi_id_begin =
repeat_multi_id * data_per_cluster_per_dims; // cannot be constexpr, why?
constexpr auto src_data_multi_offset = repeat_multi_id * data_per_cluster_per_dims;
const auto clipboard_data_multi_id_begin =
repeat_multi_id * thread_sub_tensor_lengths; // cannot be constexpr, why?
constexpr auto clipboard_data_multi_offset =
repeat_multi_id * thread_sub_tensor_lengths;
const index_t src_offset = SrcDesc{}.GetOffsetFromMultiIndex(
src_thread_data_multi_id_begin); // cannot be constexpr, why?
constexpr index_t src_offset = SrcDesc{}.GetOffsetFromMultiIndex(src_data_multi_id);
constexpr index_t clipboard_offset =
thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id);
const index_t clipboard_offset = thread_tensor_desc.GetOffsetFromMultiIndex(
clipboard_data_multi_id_begin); // cannot be constexpr, why?
threadwise_tensor_slice_copy_generic(SrcDesc{},
p_src + src_offset + mSrcMyThreadOffset,
thread_tensor_desc,
zero_array<index_t, nDim>{},
make_zero_array<index_t, nDim>(),
thread_tensor_desc,
p_clipboard + clipboard_offset,
zero_array<index_t, nDim>{},
make_zero_array<index_t, nDim>(),
thread_sub_tensor_lengths,
SrcAccessOrder{});
});
......@@ -122,34 +130,37 @@ struct BlockwiseTensorSliceCopy_generic_v1
{
constexpr auto thread_sub_tensor_lengths = SubLengths{};
constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * ClusterLengths{};
constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * DataClusterLengths{};
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ClusterLengths{});
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});
constexpr auto thread_tensor_desc =
make_packed_ConstantTensorDescriptor(thread_sub_tensor_lengths * repeat_lengths);
constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor_default_rank_packed(
thread_sub_tensor_lengths * repeat_lengths);
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});
constexpr auto clipboard_data_multi_offset =
repeat_multi_id * thread_sub_tensor_lengths;
const auto clipboard_data_multi_id_begin =
repeat_multi_id * thread_sub_tensor_lengths; // cannot be constexpr, why?
constexpr auto dst_data_multi_offset = repeat_multi_id * data_per_cluster_per_dims;
const auto dst_data_multi_id_begin =
repeat_multi_id * data_per_cluster_per_dims; // cannot be constexpr, why?
constexpr index_t clipboard_offset =
thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_offset);
const index_t clipboard_offset = thread_tensor_desc.GetOffsetFromMultiIndex(
clipboard_data_multi_id_begin); // cannot be constexpr, why?
constexpr index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_offset);
const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(
dst_data_multi_id_begin); // cannot be constexpr, why?
threadwise_tensor_slice_copy_generic(thread_tensor_desc,
p_clipboard + clipboard_offset,
zero_array<index_t, nDim>{},
make_zero_array<index_t, nDim>(),
DstDesc{},
p_dst + dst_offset + mDstMyThreadOffset,
zero_array<index_t, nDim>{},
make_zero_array<index_t, nDim>(),
thread_sub_tensor_lengths,
DstAccessOrder{});
});
}
__device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
......@@ -159,4 +170,4 @@ struct BlockwiseTensorSliceCopy_generic_v1
RunLoadRegisterClipboard(p_src, p_clipboard);
RunStoreRegisterClipboard(p_clipboard, p_dst);
}
};
};
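Not part of the commit — a standalone sketch of the work decomposition this copy assumes: per dimension, SliceLength = SubLength * DataClusterLength * RepeatLength, the product of the cluster lengths must equal BlockSize, and each thread's register clipboard holds the per-thread sub-tensor times the repeats. The numbers below match the v3 tuning block earlier in this commit but are otherwise illustrative.

```cpp
// Standalone sketch of the blockwise slice-copy work decomposition:
// per dimension, SliceLength = SubLength * DataClusterLength * RepeatLength,
// and each thread's register clipboard holds prod(SubLength * RepeatLength).
#include <cstdio>

int main()
{
    const unsigned nDim = 4;
    const unsigned slice_lengths[4]   = {2, 4, 8, 16}; // N1, N2, CPerBlock, BPerBlock (assumed)
    const unsigned sub_lengths[4]     = {1, 4, 1, 1};  // per-thread sub-tensor
    const unsigned cluster_lengths[4] = {2, 1, 8, 16}; // thread clusters per dimension

    unsigned clipboard = 1, threads = 1;
    for(unsigned d = 0; d < nDim; ++d)
    {
        const unsigned repeat = slice_lengths[d] / (sub_lengths[d] * cluster_lengths[d]);
        clipboard *= sub_lengths[d] * repeat;
        threads *= cluster_lengths[d];
    }
    std::printf("threads per block = %u, registers per thread = %u\n", threads, clipboard);
    return 0;
}
```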
......@@ -40,7 +40,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
src_cluster_lengths.ReorderGivenNew2Old(map_thread_cluster_2_src_cluster);
constexpr auto thread_cluster_desc =
make_packed_ConstantTensorDescriptor(thread_cluster_lengths);
make_ConstantTensorDescriptor_default_rank_packed(thread_cluster_lengths);
// sanity check: data type
static_assert(is_same<Float, float>::value, "wrong! only support float for now!\n");
......@@ -149,7 +149,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;
constexpr auto thread_tensor_desc =
make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
make_ConstantTensorDescriptor_default_rank_packed(thread_tensor_lengths);
return thread_tensor_desc.GetElementSpace();
}
......@@ -170,7 +170,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;
constexpr auto thread_tensor_desc =
make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
make_ConstantTensorDescriptor_default_rank_packed(thread_tensor_lengths);
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
......@@ -208,7 +208,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;
constexpr auto thread_tensor_desc =
make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
make_ConstantTensorDescriptor_default_rank_packed(thread_tensor_lengths);
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
......
#pragma once
#include "vector_type.hip.hpp"
#include "constant_integral.hip.hpp"
#include "integral_constant.hip.hpp"
#include "Sequence.hip.hpp"
#include "Array.hip.hpp"
#include "functional.hip.hpp"
......@@ -17,15 +17,21 @@ __device__ index_t get_block_1d_id() { return blockIdx.x; }
template <class T1, class T2>
struct is_same
{
static const bool value = false;
static constexpr bool value = false;
};
template <class T>
struct is_same<T, T>
{
static const bool value = true;
static constexpr bool value = true;
};
template <class X, class Y>
__host__ __device__ constexpr bool is_same_type(X, Y)
{
return is_same<X, Y>::value;
}
namespace mod_conv { // namespace mod_conv
template <class T, T s>
struct scales
......
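Side note (not in the diff): `is_same_type` compares two deduced values by type, which is how the blockwise GEMM hunk above asserts that ThreadMatrixC::GetLengths() matches GetThreadMatrixCLengths(). A minimal standalone sketch:

```cpp
// Minimal sketch of the is_same / is_same_type helpers added above
// (standalone, without the repo's Sequence type).
template <class T1, class T2>
struct is_same
{
    static constexpr bool value = false;
};

template <class T>
struct is_same<T, T>
{
    static constexpr bool value = true;
};

// compare two values by their (deduced) types, usable inside static_assert
template <class X, class Y>
constexpr bool is_same_type(X, Y)
{
    return is_same<X, Y>::value;
}

int main()
{
    static_assert(is_same_type(1, 2), "same type: both int");
    static_assert(!is_same_type(1, 2.0), "different types: int vs double");
    return 0;
}
```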
......@@ -30,7 +30,7 @@ __host__ __device__ constexpr auto get_convolution_output_default_4d_tensor_desc
constexpr auto HO = HI + 1 - Y;
constexpr auto WO = WI + 1 - X;
return make_packed_ConstantTensorDescriptor(Sequence<N, K, HO, WO>{});
return make_ConstantTensorDescriptor_default_rank_packed(Sequence<N, K, HO, WO>{});
}
template <class InDesc, class WeiDesc, class LowerPads, class UpperPads>
......@@ -67,7 +67,7 @@ __host__ __device__ constexpr auto get_convolution_with_padding_output_default_4
constexpr auto HO = HI + HPadLow + HPadUp + 1 - Y;
constexpr auto WO = WI + WPadLow + WPadUp + 1 - X;
return make_packed_ConstantTensorDescriptor(Sequence<N, K, HO, WO>{});
return make_ConstantTensorDescriptor_default_rank_packed(Sequence<N, K, HO, WO>{});
}
template <class InDesc, class WeiDesc, class OutDesc>
......
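For context (not in the commit): the output descriptor above uses the stride-1 size formula HO = HI + HPadLow + HPadUp + 1 - Y (and likewise for the width). A worked example with assumed sizes:

```cpp
// Worked example (plain C++) of the stride-1 output-size formula used above.
#include <cstdio>

int main()
{
    const unsigned HI = 34, WI = 34;        // input spatial size (illustrative)
    const unsigned Y = 3, X = 3;            // filter size
    const unsigned HPadLow = 0, HPadUp = 0; // padding (none here)
    const unsigned WPadLow = 0, WPadUp = 0;

    const unsigned HO = HI + HPadLow + HPadUp + 1 - Y; // 32
    const unsigned WO = WI + WPadLow + WPadUp + 1 - X; // 32

    std::printf("output: %u x %u\n", HO, WO);
    return 0;
}
```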
#pragma once
#include "constant_integral.hip.hpp"
#include "integral_constant.hip.hpp"
struct forwarder
{
......
......@@ -85,7 +85,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock);
constexpr index_t NBlockWork = mod_conv::integer_divide_ceil(N, NPerBlock);
constexpr auto block_work_desc = make_packed_ConstantTensorDescriptor(
constexpr auto block_work_desc = make_ConstantTensorDescriptor_default_rank_packed(
Sequence<KBlockWork, HBlockWork, WBlockWork, NBlockWork>{});
const auto block_work_multi_id =
......@@ -109,7 +109,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
GemmDataPerReadA,
GemmDataPerReadB);
constexpr auto in_c_h_w_n_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{},
Number<InBlockCopyDataPerRead_N>{});
......@@ -118,12 +118,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
"GemmDataPerReadB alignment requirement is not meet");
constexpr auto wei_c_k_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
Sequence<CPerBlock, KPerBlock>{},
Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
// tensor view of threadwise output in register
constexpr auto out_k_h_w_n_thread_desc = make_packed_ConstantTensorDescriptor(
constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_default_rank_packed(
Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{});
// blockwise copy
......
......@@ -86,7 +86,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
constexpr index_t HBlockWork = mod_conv::integer_divide_ceil(Ho, HoPerBlock);
constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock);
constexpr auto block_work_desc = make_packed_ConstantTensorDescriptor(
constexpr auto block_work_desc = make_ConstantTensorDescriptor_default_rank_packed(
Sequence<NBlockWork, KBlockWork, HBlockWork, WBlockWork>{});
const auto block_work_multi_id =
......@@ -102,7 +102,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
// global tensor view
constexpr auto wei_c_k_global_desc =
make_ranked_ConstantTensorDescriptor(Sequence<C, K>{}, Sequence<Y * X * K, 1>{});
make_ConstantTensorDescriptor_default_rank(Sequence<C, K>{}, Sequence<Y * X * K, 1>{});
// LDS tensor view
// be careful of alignment
......@@ -111,7 +111,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
GemmDataPerReadA,
GemmDataPerReadB);
constexpr auto in_c_h_w_n_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{},
Number<InBlockReorderDataPerWrite_N>{});
......@@ -120,12 +120,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
"GemmDataPerReadB alignment requirement is not meet");
constexpr auto wei_c_k_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
Sequence<CPerBlock, KPerBlock>{},
Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
// tensor view of threadwise output in register
constexpr auto out_k_h_w_n_thread_desc = make_packed_ConstantTensorDescriptor(
constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_default_rank_packed(
Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{});
// blockwise copy
......@@ -448,10 +448,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
constexpr index_t K1 = KPerBlock / KPerThread;
#if 0
constexpr auto out_10d_global_desc = make_packed_ConstantTensorDescriptor(
constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor_default_rank_packed(
Sequence<K / (K1 * K2), K1, K2, Ho, Wo / (W1 * W2 * W3), W1, W2, W3, N / N1, N1>{});
constexpr auto out_10d_thread_desc = make_packed_ConstantTensorDescriptor(
constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor_default_rank_packed(
Sequence<KPerThread / K2, 1, K2, HoPerThread, 1, W1, 1, W3, 1, N1>{});
#else
constexpr auto out_10d_global_desc =
......
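Not part of the diff — a standalone sketch of how the v1r3 gridwise kernels decode the 1-d block id into (n, k, ho, wo) block coordinates through the packed block_work_desc above. The problem and tile sizes are assumed for illustration.

```cpp
// Standalone sketch of the block-work decoding used by the v1r3 gridwise kernels:
// the 1-d block id is unpacked into (n, k, ho, wo) block coordinates.
#include <cstdio>

int main()
{
    const unsigned N = 64, K = 256, Ho = 32, Wo = 32; // problem size (assumed)
    const unsigned NPerBlock = 16, KPerBlock = 128, HoPerBlock = 8, WoPerBlock = 32;

    const unsigned NBlockWork = (N + NPerBlock - 1) / NPerBlock;     // 4
    const unsigned KBlockWork = (K + KPerBlock - 1) / KPerBlock;     // 2
    const unsigned HBlockWork = (Ho + HoPerBlock - 1) / HoPerBlock;  // 4
    const unsigned WBlockWork = (Wo + WoPerBlock - 1) / WoPerBlock;  // 1

    const unsigned block_id = 13; // example block

    // unpack in N, K, H, W order, mirroring the packed block_work_desc above
    unsigned rest = block_id;
    const unsigned n_block = rest / (KBlockWork * HBlockWork * WBlockWork);
    rest %= KBlockWork * HBlockWork * WBlockWork;
    const unsigned k_block = rest / (HBlockWork * WBlockWork);
    rest %= HBlockWork * WBlockWork;
    const unsigned h_block = rest / WBlockWork;
    const unsigned w_block = rest % WBlockWork;

    std::printf("block %u -> (n=%u, k=%u, ho=%u, wo=%u)\n",
                block_id, n_block, k_block, h_block, w_block);
    return 0;
}
```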