"git@developer.sourcefind.cn:modelzoo/resnet50_tensorflow.git" did not exist on "9a833e2c455b7cb19dbed7fd2b527270da82e3c2"
Commit 8a4b5978 authored by Chao Liu

adding implicit gemm v3

parent 2a48812e
@@ -38,7 +38,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
     constexpr index_t X = wei_kcyx_desc.GetLength(I3);

     // reorder weight
-    auto wei_cyxk_desc = make_packed_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
+    auto wei_cyxk_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<C, Y, X, K>{});
     ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
     Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
@@ -51,7 +51,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
                        std::thread::hardware_concurrency());

     // reorder input
-    auto in_chwn_desc = make_packed_ConstantTensorDescriptor(Sequence<C, Hi, Wi, N>{});
+    auto in_chwn_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<C, Hi, Wi, N>{});
     ostream_ConstantTensorDescriptor(in_chwn_desc, std::cout << "in_chwn_desc: ");
     Tensor<T> in_chwn(make_TensorDescriptor(in_chwn_desc));
@@ -64,7 +64,8 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
                        std::thread::hardware_concurrency());

     // output
-    auto out_khwn_desc = make_packed_ConstantTensorDescriptor(Sequence<K, Ho, Wo, N>{});
+    auto out_khwn_desc =
+        make_ConstantTensorDescriptor_default_rank_packed(Sequence<K, Ho, Wo, N>{});
     ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: ");
     Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));
...
@@ -37,7 +37,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
     constexpr index_t X = wei_kcyx_desc.GetLength(I3);

     // reorder weight
-    auto wei_cyxk_desc = make_packed_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
+    auto wei_cyxk_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<C, Y, X, K>{});
     ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
     Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
@@ -50,7 +50,8 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
                        std::thread::hardware_concurrency());

     // output
-    auto out_khwn_desc = make_packed_ConstantTensorDescriptor(Sequence<K, Ho, Wo, N>{});
+    auto out_khwn_desc =
+        make_ConstantTensorDescriptor_default_rank_packed(Sequence<K, Ho, Wo, N>{});
     ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: ");
     Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));
...
@@ -36,7 +36,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
     constexpr index_t X = wei_kcyx_desc.GetLength(I3);

     // reorder weight
-    auto wei_cyxk_desc = make_packed_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
+    auto wei_cyxk_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<C, Y, X, K>{});
     ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
     Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
@@ -57,7 +57,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
     wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
     out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

-#if 0
+#if 1
     // for 3x3, 34x34, v1r3, Pascal
     constexpr index_t BlockSize = 128;
@@ -92,7 +92,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
     constexpr index_t WeiBlockCopyDataPerRead_K = 4;
     constexpr index_t OutThreadCopyDataPerWrite_W = 2;
-#elif 0
+#elif 1
     // for 3x3, 34x34, v1r3, Vega 20, WoPerBlock = 32
     constexpr index_t BlockSize = 256;
@@ -162,7 +162,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
     constexpr index_t WeiBlockCopyDataPerRead_K = 4;
     constexpr index_t OutThreadCopyDataPerWrite_W = 2;
-#elif 1
+#elif 0
     // for 3x3, 34x34, v1r3, Vega 20, WoPerBlock = 8
     constexpr index_t BlockSize = 256;
...
@@ -35,7 +35,7 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
     constexpr index_t X = wei_kcyx_desc.GetLength(I3);

     // reorder weight
-    auto wei_cyxk_desc = make_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
+    auto wei_cyxk_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<C, Y, X, K>{});
     ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
     Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
@@ -56,37 +56,40 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
     wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
     out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

+    constexpr index_t N1 = 2;
+    constexpr index_t N2 = 4;
+    constexpr index_t B = (N * Ho * Wo) / (N1 * N2);
+
 #if 1
-    // for 3x3, 28x28, v3, Pascal
-    constexpr index_t BlockSize = 128;
+    // for 3x3, 28x28, v3
+    constexpr index_t BlockSize = 256;

     constexpr index_t BPerBlock = 16;
     constexpr index_t KPerBlock = 128;
     constexpr index_t CPerBlock = 8;

-    constexpr index_t BPerThread = 1;
-    constexpr index_t KPerThread = 8;
-
     constexpr index_t GemmMPerThreadSubC = 4;
     constexpr index_t GemmNPerThreadSubC = 4;
     constexpr index_t GemmMLevel0Cluster = 4;
-    constexpr index_t GemmNLevel0Cluster = 2;
+    constexpr index_t GemmNLevel0Cluster = 4;
     constexpr index_t GemmMLevel1Cluster = 4;
-    constexpr index_t GemmNLevel1Cluster = 2;
+    constexpr index_t GemmNLevel1Cluster = 4;
     constexpr index_t GemmKPerThreadLoop = 1;
     constexpr index_t GemmDataPerReadA = 4;
     constexpr index_t GemmDataPerReadB = 4;

-    using InBlockReorderSrcSubLengths_NCHW = Sequence<4, 1, 1, 1>;
-    using InBlockReorderSrcClusterLengths_NCHW = Sequence<4, 8, 2, 2>;
-    using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
+    using InBlockCopySubLengths_N1_N2_C_B = Sequence<1, 4, 1, 1>;
+    using InBlockCopyClusterLengths_N1_N2_C_B = Sequence<2, 1, 8, 16>;
+    constexpr index_t InBlockCopySrcDataPerRead_B = 1;
+    constexpr index_t InBlockCopyDstDataPerWrite_N2 = 4;

-    constexpr index_t WeiBlockCopyDataPerRead_K = 4;
+    constexpr index_t WeiBlockCopyDataPerAccess_K = 4;
 #endif

     constexpr index_t GridSize =
-        ((N + NPerBlock - 1) / NPerBlock) * ((K + KPerBlock - 1) / KPerBlock) *
-        ((Ho + HoPerBlock - 1) / HoPerBlock) * ((Wo + WoPerBlock - 1) / WoPerBlock);
+        ((B + BPerBlock - 1) / BPerBlock) * ((K + KPerBlock - 1) / KPerBlock);

     printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
@@ -102,15 +105,11 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
                                  decltype(in_nchw_desc),
                                  decltype(wei_cyxk_desc),
                                  decltype(out_nkhw_desc),
-                                 NPerBlock,
+                                 BPerBlock,
                                  KPerBlock,
                                  CPerBlock,
-                                 HoPerBlock,
-                                 WoPerBlock,
-                                 NPerThread,
-                                 KPerThread,
-                                 HoPerThread,
-                                 WoPerThread,
+                                 N1,
+                                 N2,
                                  GemmMPerThreadSubC,
                                  GemmNPerThreadSubC,
                                  GemmMLevel0Cluster,
@@ -120,14 +119,11 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
                                  GemmKPerThreadLoop,
                                  GemmDataPerReadA,
                                  GemmDataPerReadB,
-                                 InBlockReorderSrcSubLengths_NCHW,
-                                 InBlockReorderSrcClusterLengths_NCHW,
-                                 InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
-                                 InBlockReorderDataPerRead_W,
-                                 InBlockReorderDataPerWrite_N,
-                                 WeiBlockCopyClusterLengths,
-                                 WeiBlockCopyDataPerRead_K,
-                                 OutThreadCopyDataPerWrite_W>{};
+                                 InBlockCopySubLengths_N1_N2_C_B,
+                                 InBlockCopyClusterLengths_N1_N2_C_B,
+                                 InBlockCopySrcDataPerRead_B,
+                                 InBlockCopyDstDataPerWrite_N2,
+                                 WeiBlockCopyDataPerAccess_K>{};

     float time = launch_kernel(run_gridwise_convolution<decltype(gridwise_conv), T>,
                                dim3(GridSize),
...
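Note: the v3 kernel replaces the per-dimension (N, K, Ho, Wo) block decomposition with a single GEMM dimension B = (N * Ho * Wo) / (N1 * N2), so the grid only tiles B and K. A minimal sketch of that arithmetic; the problem sizes (N = 16, Ho = Wo = 28, K = 256) are assumptions for illustration only, not taken from this commit:

    #include <cstdio>

    using index_t = int;

    int main()
    {
        // assumed problem sizes, for illustration only
        constexpr index_t N = 16, Ho = 28, Wo = 28, K = 256;

        // decomposition and tile sizes from the v3 device file above
        constexpr index_t N1 = 2, N2 = 4;
        constexpr index_t B = (N * Ho * Wo) / (N1 * N2); // 1568
        constexpr index_t BPerBlock = 16, KPerBlock = 128;

        constexpr index_t GridSize =
            ((B + BPerBlock - 1) / BPerBlock) * ((K + KPerBlock - 1) / KPerBlock); // 98 * 2 = 196

        printf("B = %d, GridSize = %d\n", B, GridSize);
        return 0;
    }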
@@ -13,7 +13,7 @@
 #include "device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp"
 #include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
-//#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
+#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"

 struct GeneratorTensor_1
 {
@@ -548,8 +548,8 @@ int main(int argc, char* argv[])
     auto lower_pads = Sequence<HPad, WPad>{};
     auto upper_pads = Sequence<HPad, WPad>{};

-    auto in_nchw_desc = make_packed_ConstantTensorDescriptor(Sequence<N, C, HI, WI>{});
-    auto wei_kcyx_desc = make_packed_ConstantTensorDescriptor(Sequence<K, C, Y, X>{});
+    auto in_nchw_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<N, C, HI, WI>{});
+    auto wei_kcyx_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<K, C, Y, X>{});

     auto out_nkhw_desc = get_convolution_with_padding_output_default_4d_tensor_descriptor(
         in_nchw_desc, wei_kcyx_desc, lower_pads, upper_pads);
@@ -612,11 +612,11 @@ int main(int argc, char* argv[])
     device_convolution_implicit_gemm_v1_chwn_cyxk_khwn
 #elif 0
     device_convolution_implicit_gemm_v1_nchw_cyxk_khwn
-#elif 1
+#elif 0
     device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw
 #elif 0
     device_convolution_implicit_gemm_v2_chwn_cyxk_khwn
-#elif 0
+#elif 1
     device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw
 #endif
         (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
...
@@ -12,7 +12,7 @@ struct Array
     index_t mData[nSize];

     template <class... Xs>
-    __host__ __device__ Array(Xs... xs) : mData{static_cast<TData>(xs)...}
+    __host__ __device__ constexpr Array(Xs... xs) : mData{static_cast<TData>(xs)...}
     {
     }
@@ -37,6 +37,25 @@ struct Array
     }
 };

+template <index_t... Is>
+__host__ __device__ constexpr auto sequence2array(Sequence<Is...>)
+{
+    return Array<index_t, sizeof...(Is)>{Is...};
+}
+
+template <class TData, index_t NSize>
+__host__ __device__ constexpr auto make_zero_array()
+{
+    Array<TData, NSize> a;
+
+    static_for<0, NSize, 1>{}([&](auto I) {
+        constexpr index_t i = I.Get();
+        a[i] = static_cast<TData>(0);
+    });
+
+    return a;
+}
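Note: the two helpers above bridge compile-time Sequences and runtime-indexable Arrays. A minimal usage sketch, assuming the repo's Sequence, Array and static_for are in scope:

    constexpr auto seq = Sequence<2, 0, 1>{};

    auto arr = sequence2array(seq);            // Array<index_t, 3>{2, 0, 1}
    auto zero = make_zero_array<index_t, 3>(); // Array<index_t, 3>{0, 0, 0}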
 template <class TData, index_t NSize, index_t... IRs>
 __host__ __device__ auto reorder_array_given_new2old(const Array<TData, NSize>& old_array,
                                                      Sequence<IRs...> new2old)
@@ -80,15 +99,14 @@ __host__ __device__ auto extract_array(const Array<TData, NSize>& old_array, Ext
     static_for<0, new_size, 1>{}([&](auto I) {
         constexpr index_t i = I.Get();
-        new_array[i] = old_array[ExtractSeq{}.Get(I)];
+        new_array[i] = old_array[ExtractSeq::Get(I)];
     });

     return new_array;
 }

 template <class TData, index_t NSize>
-__host__ __device__ constexpr auto operator+(const Array<TData, NSize>& a,
-                                             const Array<TData, NSize>& b)
+__host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Array<TData, NSize> b)
 {
     Array<TData, NSize> result;
@@ -99,3 +117,20 @@ __host__ __device__ constexpr auto operator+(const Array<TData, NSize>& a,
     return result;
 }
+
+// Array = Array * Sequence
+template <class TData, index_t NSize, index_t... Is>
+__host__ __device__ constexpr auto operator*(Array<TData, NSize> a, Sequence<Is...> b)
+{
+    static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
+
+    Array<TData, NSize> result;
+
+    static_for<0, NSize, 1>{}([&](auto I) {
+        constexpr index_t i = I.Get();
+        result[i] = a[i] * b.Get(I);
+    });
+
+    return result;
+}
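Note: this elementwise product is what turns a cluster multi-index into a per-thread data begin-index in the blockwise copies below. A small sketch; the values are illustrative:

    Array<index_t, 4> cluster_multi_id{1, 0, 3, 2};
    constexpr auto sub_lengths = Sequence<1, 4, 1, 1>{};

    // per-dimension data begin-index of this thread's sub-tensor
    auto data_multi_id_begin = cluster_multi_id * sub_lengths; // {1, 0, 3, 2}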
...
@@ -9,26 +9,26 @@ struct ConstantMatrixDescriptor
         static_assert(NCol_ <= RowStride_, "wrong! NCol > RowStride!");
     }

-    __host__ __device__ constexpr index_t NRow() const { return NRow_; }
+    __host__ __device__ static constexpr index_t NRow() { return NRow_; }

-    __host__ __device__ constexpr index_t NCol() const { return NCol_; }
+    __host__ __device__ static constexpr index_t NCol() { return NCol_; }

-    __host__ __device__ constexpr index_t RowStride() const { return RowStride_; }
+    __host__ __device__ static constexpr index_t RowStride() { return RowStride_; }

-    __host__ __device__ constexpr auto GetLengths() const { return Sequence<NRow_, NCol_>{}; }
+    __host__ __device__ static constexpr auto GetLengths() { return Sequence<NRow_, NCol_>{}; }

-    __host__ __device__ constexpr index_t GetElementSize() const { return NRow_ * NCol_; }
+    __host__ __device__ static constexpr index_t GetElementSize() { return NRow_ * NCol_; }

-    __host__ __device__ constexpr index_t GetElementSpace() const { return NRow_ * RowStride_; }
+    __host__ __device__ static constexpr index_t GetElementSpace() { return NRow_ * RowStride_; }

-    __host__ __device__ index_t GetOffsetFromMultiIndex(index_t irow, index_t icol) const
+    __host__ __device__ static index_t GetOffsetFromMultiIndex(index_t irow, index_t icol)
     {
         return irow * RowStride_ + icol;
     }

     template <index_t SubNRow, index_t SubNCol>
-    __host__ __device__ constexpr auto MakeSubMatrixDescriptor(Number<SubNRow>,
-                                                               Number<SubNCol>) const
+    __host__ __device__ static constexpr auto MakeSubMatrixDescriptor(Number<SubNRow>,
+                                                                      Number<SubNCol>)
     {
         return ConstantMatrixDescriptor<SubNRow, SubNCol, RowStride_>{};
     }
...
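Note: with the accessors above now static, a descriptor type can be queried without constructing an instance, e.g. inside static_asserts or as template arguments. A sketch; the 8x16 shape is an arbitrary example:

    using MtxDesc = ConstantMatrixDescriptor<8, 16, 16>; // NRow, NCol, RowStride

    static_assert(MtxDesc::GetElementSize() == 8 * 16, "wrong!");
    static_assert(MtxDesc::GetElementSpace() == 8 * 16, "wrong!");

    // inside a function: offset arithmetic with no descriptor instance
    // index_t offset = MtxDesc::GetOffsetFromMultiIndex(2, 3); // 2 * 16 + 3 = 35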
@@ -11,8 +11,8 @@ struct ConstantMergedTensorDescriptor
 {
     static constexpr auto mOriginalDimMergeSeqs = std::tuple<OriginalDimMergeSeqs...>{};

-    static constexpr index_t nDim = std::tuple_size<mOriginalDimMergeSeqs>::value;
-    static constexpr index_t nOriginalDim = OriginalDesc::GetNumOfDimension();
+    static constexpr index_t nDim = sizeof...(OriginalDimMergeSeqs);
+    static constexpr index_t nOriginalDim = OriginalTensorDesc::GetNumOfDimension();

     __host__ __device__ constexpr ConstantMergedTensorDescriptor()
     {
@@ -21,25 +21,28 @@ struct ConstantMergedTensorDescriptor
         // TODO: check each of OriginalDimMergeSeqs contains at least 1, and at most
         // OriginalTensorDesc::nDim number of dimensions
-        // TODO: check there is no duplication in OriginalDimMergeSeqs
         // TODO: check OriginalDimMergeSeqs contains all original dimensions
+        // TODO: check there is no duplication in OriginalDimMergeSeqs
     }
     __host__ __device__ static constexpr index_t GetNumOfDimension() { return nDim; }

-    __host__ __device__ static constexpr index_t GetNumOfOriginalDimension() { return nOriginalDim }
+    __host__ __device__ static constexpr index_t GetNumOfOriginalDimension()
+    {
+        return nOriginalDim;
+    }

     template <index_t IDim>
     __host__ __device__ static constexpr bool ContainMultipleOriginalDimensions(Number<IDim>)
     {
-        return (std::Get<IDIM>(mOriginalDimMergeSeqs).GetSize() > 1);
+        return (std::get<IDim>(mOriginalDimMergeSeqs).GetSize() > 1);
     }

     template <index_t IDim>
     __host__ __device__ static constexpr index_t GetLength(Number<IDim>)
     {
-        constexpr auto original_dims_partial = std::Get<IDim>(mOriginalDimMergeSeqs);
+        constexpr auto original_dims_partial = std::get<IDim>(mOriginalDimMergeSeqs);

         return OriginalTensorDesc::Extract(original_dims_partial).GetElementSize();
     }
@@ -50,14 +53,14 @@ struct ConstantMergedTensorDescriptor
         static_assert(!ContainMultipleOriginalDimensions(Number<IDim>{}),
                       "wrong! stride of a merged dimension is undefined");

-        constexpr auto idim_original = std::Get<IDim>(mOriginalDimMergeSeqs).Front();
+        constexpr auto idim_original = std::get<IDim>(mOriginalDimMergeSeqs).Front();

         return OriginalTensorDesc::GetStride(Number<idim_original>{});
     }

     __host__ __device__ static constexpr auto GetLengths()
     {
-        return Sequence<OriginalTensorDesc::Extract(OriginalDimMergeSeqs).GetElementSize()...>{};
+        return Sequence<OriginalTensorDesc::Extract(OriginalDimMergeSeqs{}).GetElementSize()...>{};
     }

     __host__ __device__ static constexpr index_t GetElementSize()
@@ -75,17 +78,16 @@ struct ConstantMergedTensorDescriptor
         constexpr auto original_dims_partial = std::get<idim>(mOriginalDimMergeSeqs);

         // get partial original-multi-id corresponding to this merged dimension
-        constexpr auto original_multi_id_partial =
+        const auto original_multi_id_partial =
             OriginalTensorDesc::Extract(original_dims_partial)
                 .GetMultiIndexFrom1dIndex(multi_id[idim]);

-        // make sure compiler unroll this loop and propagate all the constants
-        for(index_t i = 0; i < original_dims_partial.GetSize(); ++i)
-        {
-            index_t idim_original = original_dims_partial[i];
-            original_multi_id[idim_original] = original_multi_id_partial[i];
-        }
+        static_for<0, original_dims_partial.GetSize(), 1>{}([&](auto I_) {
+            constexpr auto I = decltype(I_){};
+            constexpr index_t idim_original = original_dims_partial.Get(I);
+            original_multi_id[idim_original] = original_multi_id_partial[I.Get()];
+        });
     });

     return original_multi_id;
@@ -95,10 +97,10 @@ struct ConstantMergedTensorDescriptor
     {
         const auto original_multi_id = GetOriginalMultiIndexFromMultiIndex(multi_id);

-        return OriginalTensorDesc::GetOffsetFromMultiIndex(orginal_multi_id);
+        return OriginalTensorDesc::GetOffsetFromMultiIndex(original_multi_id);
     }

-    template <index_t... Is>
+    template <class... Is>
     __host__ __device__ static index_t GetOffsetFromMultiIndex(Is... is)
     {
         return GetOffsetFromMultiIndex(Array<index_t, nDim>{is...});
@@ -106,14 +108,15 @@ struct ConstantMergedTensorDescriptor
     __host__ __device__ static Array<index_t, nDim> GetMultiIndexFrom1dIndex(index_t id)
     {
-        constexpr auto dummy_desc = make_packed_ConstantTensorDescriptor(GetLengths());
+        constexpr auto dummy_desc = make_ConstantTensorDescriptor_default_rank_packed(GetLengths());

         return dummy_desc.GetMultiIndexFrom1dIndex(id);
     }
 };

 template <class OriginalTensorDesc, class... OriginalDimMergeSeqs>
-constexpr auto make_ConstantMergedTensorDescriptor(OriginalTensorDesc, OriginalDimMergeSeqs...)
+__host__ __device__ constexpr auto make_ConstantMergedTensorDescriptor(OriginalTensorDesc,
+                                                                       OriginalDimMergeSeqs...)
 {
     return ConstantMergedTensorDescriptor<OriginalTensorDesc, OriginalDimMergeSeqs...>{};
 }
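Note: a merged descriptor presents several original dimensions as one; this is how v3 can treat N * Ho * Wo as the single GEMM dimension B. A minimal sketch merging a packed NCHW descriptor into a 2d [C, N*H*W] view; the concrete lengths are assumptions for illustration:

    constexpr auto desc_nchw =
        make_ConstantTensorDescriptor_default_rank_packed(Sequence<8, 4, 16, 16>{});

    // dim 0 <- original dim 1 (C); dim 1 <- original dims 0, 2, 3 (N, H, W)
    constexpr auto desc_c_nhw = make_ConstantMergedTensorDescriptor(
        decltype(desc_nchw){}, Sequence<1>{}, Sequence<0, 2, 3>{});

    static_assert(decltype(desc_c_nhw)::GetLength(Number<0>{}) == 4, "wrong!");
    static_assert(decltype(desc_c_nhw)::GetLength(Number<1>{}) == 8 * 16 * 16, "wrong!");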
@@ -2,20 +2,20 @@
 #include "common.hip.hpp"

 template <class Lengths>
-__host__ __device__ constexpr auto calculate_packed_tensor_strides(Lengths)
+__host__ __device__ constexpr auto calculate_tensor_strides_default_rank_packed(Lengths)
 {
     return reverse_inclusive_scan_sequence(Lengths{}.PopFront(), std::multiplies<index_t>{})
         .PushBack(Number<1>{});
 }
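Note: a worked example of the packed-stride computation: PopFront drops the outermost length, the reverse inclusive scan multiplies suffixes, and PushBack appends the innermost stride of 1. Assuming the repo headers are in scope:

    // Lengths<2, 3, 4>: PopFront -> <3, 4>; reverse scan (multiply) -> <12, 4>;
    // PushBack(1) -> Strides<12, 4, 1>
    using PackedStrides =
        decltype(calculate_tensor_strides_default_rank_packed(Sequence<2, 3, 4>{}));

    static_assert(is_same<PackedStrides, Sequence<12, 4, 1>>::value, "wrong!");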
 template <class Lengths, index_t Align>
-__host__ __device__ constexpr auto
-calculate_rank_tensor_default_strides_with_alignment(Lengths, Number<Align>)
+__host__ __device__ constexpr auto calculate_tensor_strides_default_rank_aligned(Lengths,
+                                                                                 Number<Align>)
 {
     constexpr index_t L_back_align =
         Align * mod_conv::integer_divide_ceiler<index_t>{}(Lengths{}.Back(), Align);

-    return calculate_packed_tensor_strides(
+    return calculate_tensor_strides_default_rank_packed(
         Lengths{}.Modify(Number<Lengths{}.GetSize() - 1>{}, Number<L_back_align>{}));
 }
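Note: here only the innermost length is rounded up to the alignment before the packed strides are derived, so vectorized accesses of Align elements stay in-bounds per row. A worked example under the same assumptions as above:

    // Lengths<8, 30> with Align = 4: 30 rounds up to 32, so Strides = <32, 1>
    using AlignedStrides = decltype(
        calculate_tensor_strides_default_rank_aligned(Sequence<8, 30>{}, Number<4>{}));

    static_assert(is_same<AlignedStrides, Sequence<32, 1>>::value, "wrong!");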
@@ -66,6 +66,12 @@ struct ConstantTensorDescriptor
         return MemoryRanks{}.Get(Number<I>{});
     }

+    template <class T>
+    __host__ __device__ static constexpr bool ContainMultipleOriginalDimensions(T)
+    {
+        return false;
+    }
+
     __host__ __device__ static constexpr index_t GetElementSize()
     {
         return accumulate_on_sequence(Lengths{}, std::multiplies<index_t>{}, Number<1>{});
@@ -146,7 +152,7 @@ struct ConstantTensorDescriptor
     {
         Array<index_t, nDim> multi_id;

-        constexpr auto dummy_strides = calculate_packed_tensor_strides(GetLengths());
+        constexpr auto dummy_strides = calculate_tensor_strides_default_rank_packed(GetLengths());

         // calculate index in each of the dimensions in the order of their dimension (not rank)
         static_for<0, nDim - 1, 1>{}([&](auto IDim) {
@@ -181,6 +187,12 @@ struct ConstantTensorDescriptor
         return ConstantTensorDescriptor<extract_lengths, extract_strides, new_ranks>{};
     }

+    template <index_t... IDims>
+    __host__ __device__ static constexpr auto Extract(Sequence<IDims...>)
+    {
+        return Extract(Number<IDims>{}...);
+    }
+
     template <index_t IDim, index_t SliceLen>
     __host__ __device__ static constexpr auto Slice(Number<IDim>, Number<SliceLen>)
     {
@@ -271,9 +283,11 @@ struct ConstantTensorDescriptor
             FirstUnfoldDim <= LastUnfoldDim,
             "wrong! should have FirstUnfoldDim <= LastUnfoldDim!");

+#if 0 // cannot compile: compiler complains about constexpr
         // dimensions to be unfold need to be in descending order (w.r.t. strides), and need to be
         // packed in memory, otherwise, unfolding is invalid
-        static_for<FirstUnfoldDim, LastUnfoldDim, 1>{}([&](auto IDim) {
+        static_for<FirstUnfoldDim, LastUnfoldDim, 1>{}([&](auto IDim_) {
+            constexpr auto IDim = decltype(IDim_){};
             constexpr auto IDim_p1 = IDim + Number<1>{};

             // check stride
@@ -285,11 +299,12 @@ struct ConstantTensorDescriptor
             static_assert(GetStride(IDim_p1) * GetLength(IDim_p1) == GetStride(IDim),
                           "wrong! dimensions to be unfolded need to be packed");

-            // checkt ranks
+            // check ranks
             static_assert(GetMemoryRank(IDim_p1) == GetMemoryRank(IDim) + 1,
                           "wrong! ranks of dimensions to be unfolded need to be in increasing and "
                           "continuous ranks");
         });
+#endif

         // left and right
         constexpr auto left = typename arithmetic_sequence_gen<0, FirstUnfoldDim, 1>::SeqType{};
@@ -308,9 +323,9 @@ struct ConstantTensorDescriptor
         // decrease the ranks that are larger than the rank of LastUnfoldDim
         constexpr auto tmp_ranks =
-            transform_sequences(GetMemoryRanks(),
-                                f_unfold_impl<GetMemoryRank(Number<LastUnfoldDim>{}),
-                                              LastUnfoldDim - FirstUnfoldDim + 1>{});
+            transform_sequences(f_unfold_impl<GetMemoryRank(Number<LastUnfoldDim>{}),
+                                              LastUnfoldDim - FirstUnfoldDim + 1>{},
+                                GetMemoryRanks());

         // new lengths, strides and ranks
         constexpr auto new_lengths = GetLengths()
@@ -354,26 +369,26 @@ struct ConstantTensorDescriptor
 };

 template <class Lengths>
-__host__ __device__ constexpr auto make_packed_ConstantTensorDescriptor(Lengths)
+__host__ __device__ constexpr auto make_ConstantTensorDescriptor_default_rank_packed(Lengths)
 {
-    using Strides = decltype(calculate_packed_tensor_strides(Lengths{}));
+    using Strides = decltype(calculate_tensor_strides_default_rank_packed(Lengths{}));
     using MemoryRanks = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::SeqType;
     return ConstantTensorDescriptor<Lengths, Strides, MemoryRanks>{};
 }

 template <class Lengths, class Strides>
-__host__ __device__ constexpr auto make_ranked_ConstantTensorDescriptor(Lengths, Strides)
+__host__ __device__ constexpr auto make_ConstantTensorDescriptor_default_rank(Lengths, Strides)
 {
     using MemoryRanks = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::SeqType;
     return ConstantTensorDescriptor<Lengths, Strides, MemoryRanks>{};
 }

 template <class Lengths, index_t Align>
-__host__ __device__ constexpr auto
-make_ranked_ConstantTensorDescriptor_with_alignment(Lengths, Number<Align>)
+__host__ __device__ constexpr auto make_ConstantTensorDescriptor_default_rank_aligned(Lengths,
+                                                                                      Number<Align>)
 {
     using Strides =
-        decltype(calculate_rank_tensor_default_strides_with_alignment(Lengths{}, Number<Align>{}));
+        decltype(calculate_tensor_strides_default_rank_aligned(Lengths{}, Number<Align>{}));
     using MemoryRanks = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::SeqType;
     return ConstantTensorDescriptor<Lengths, Strides, MemoryRanks>{};
 }
...
 #pragma once
-#include "constant_integral.hip.hpp"
+#include "integral_constant.hip.hpp"
 #include "functional.hip.hpp"

 template <index_t... Is>
@@ -21,12 +21,6 @@ struct Sequence
         return mData[I];
     }

-    __host__ __device__ index_t operator[](index_t i) const
-    {
-        const index_t mData[mSize + 1] = {Is..., 0};
-        return mData[i];
-    }
-
     template <index_t... IRs>
     __host__ __device__ static constexpr auto ReorderGivenNew2Old(Sequence<IRs...> /*new2old*/)
     {
@@ -164,6 +158,12 @@ struct sequence_reverse_inclusive_scan<Sequence<I>, Reduce>
     using SeqType = Sequence<I>;
 };

+template <class Reduce>
+struct sequence_reverse_inclusive_scan<Sequence<>, Reduce>
+{
+    using SeqType = Sequence<>;
+};
+
 template <class, class>
 struct sequence_extract;
...
@@ -457,7 +457,8 @@ struct Blockwise2dTensorCopy3
     index_t mSrcMyThreadOffset;
     index_t mDstMyThreadOffset;

-    __device__ Blockwise2dTensorCopy3()
+    __device__ Blockwise2dTensorCopy3(Array<index_t, 2> src_block_data_multi_id_begin,
+                                      Array<index_t, 2> dst_block_data_multi_id_begin)
     {
         constexpr auto I0 = Number<0>{};
         constexpr auto I1 = Number<1>{};
@@ -499,10 +500,13 @@ struct Blockwise2dTensorCopy3
         const index_t thread_id_d0 = get_thread_local_1d_id() / thread_per_d1;
         const index_t thread_id_d1 = get_thread_local_1d_id() - thread_id_d0 * thread_per_d1;

-        mSrcMyThreadOffset =
-            SrcDesc{}.GetOffsetFromMultiIndex(thread_id_d0, thread_id_d1 * DataPerRead);
-        mDstMyThreadOffset =
-            DstDesc{}.GetOffsetFromMultiIndex(thread_id_d0, thread_id_d1 * DataPerRead);
+        mSrcMyThreadOffset = SrcDesc{}.GetOffsetFromMultiIndex(
+            src_block_data_multi_id_begin +
+            Array<index_t, 2>{thread_id_d0, thread_id_d1 * DataPerRead});
+
+        mDstMyThreadOffset = DstDesc{}.GetOffsetFromMultiIndex(
+            dst_block_data_multi_id_begin +
+            Array<index_t, 2>{thread_id_d0, thread_id_d1 * DataPerRead});
     }

     __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
...
@@ -13,7 +13,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
     constexpr auto dst_desc = DstDesc{};

-    constexpr auto desc = make_packed_ConstantTensorDescriptor(dst_desc.GetLengths());
+    constexpr auto desc = make_ConstantTensorDescriptor_default_rank_packed(dst_desc.GetLengths());

 #if 0
     if(get_thread_local_1d_id() == 0)
@@ -108,7 +108,7 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds
     constexpr auto src_desc = SrcDesc{};
     constexpr auto dst_desc = DstDesc{};

-    constexpr auto ref_desc = make_packed_ConstantTensorDescriptor(SrcOpLengths{});
+    constexpr auto ref_desc = make_ConstantTensorDescriptor_default_rank_packed(SrcOpLengths{});

     constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize;
@@ -259,7 +259,7 @@ struct Blockwise4dTensorCopy1
     constexpr index_t read_per_d3 = mod_conv::integer_divide_ceil(L3, DataPerRead);

     constexpr auto ref_desc =
-        make_packed_ConstantTensorDescriptor(Sequence<L0, L1, L2, read_per_d3>{});
+        make_ConstantTensorDescriptor_default_rank_packed(Sequence<L0, L1, L2, read_per_d3>{});

     constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize;
@@ -336,7 +336,7 @@ struct BlockwiseChwnTensorCopyPadded
     constexpr auto src_desc = SrcDesc{};
     constexpr auto dst_desc = DstDesc{};

-    constexpr auto ref_desc = make_packed_ConstantTensorDescriptor(DstOpLengths{});
+    constexpr auto ref_desc = make_ConstantTensorDescriptor_default_rank_packed(DstOpLengths{});

     constexpr auto h_global_pad_low = GlobalLowerPads{}.Get(I0);
     constexpr auto w_global_pad_low = GlobalLowerPads{}.Get(I1);
@@ -510,7 +510,8 @@ struct Blockwise4dTensorCopy3
         }
     }

-    constexpr auto thread_cluster_desc = make_packed_ConstantTensorDescriptor(ThreadPerDims{});
+    constexpr auto thread_cluster_desc =
+        make_ConstantTensorDescriptor_default_rank_packed(ThreadPerDims{});

     const auto thread_multi_id =
         thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
@@ -652,7 +653,7 @@ struct Blockwise4dTensorCopy3
     constexpr index_t nloop_d2 = L2 / thread_per_d2;
     constexpr index_t nloop_d3 = mod_conv::integer_divide_ceil(L3, thread_per_d3 * DataPerRead);

-    constexpr auto clipboard_desc = make_packed_ConstantTensorDescriptor(
+    constexpr auto clipboard_desc = make_ConstantTensorDescriptor_default_rank_packed(
         Sequence<nloop_d0, nloop_d1, nloop_d2, nloop_d3 * DataPerRead>{});

 #pragma unroll
@@ -719,7 +720,7 @@ struct Blockwise4dTensorCopy3
     constexpr index_t nloop_d2 = L2 / thread_per_d2;
     constexpr index_t nloop_d3 = mod_conv::integer_divide_ceil(L3, thread_per_d3 * DataPerRead);

-    constexpr auto clipboard_desc = make_packed_ConstantTensorDescriptor(
+    constexpr auto clipboard_desc = make_ConstantTensorDescriptor_default_rank_packed(
         Sequence<nloop_d0, nloop_d1, nloop_d2, nloop_d3 * DataPerRead>{});

 #pragma unroll
...
@@ -46,7 +46,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
             N % (NPerThreadSubC * NLevel0Cluster * NLevel1Cluster) == 0,
             "wrong! Cannot evenly divide work among\n");

-        static_assert(ThreadMatrixC::GetLengths() == GetThreadMatrixCLengths(),
+        static_assert(is_same_type(ThreadMatrixC::GetLengths(), GetThreadMatrixCLengths()),
                       "wrong! ThreadMatrixC lengths is wrong");

         auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id());
@@ -55,7 +55,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
         mMyThreadOffsetB = BlockMatrixB::GetOffsetFromMultiIndex(0, c_thread_mtx_index.col);
     }

-    __device__ static auto GetThreadMatrixCLengths()
+    __device__ static constexpr auto GetThreadMatrixCLengths()
     {
         constexpr index_t M = BlockMatrixA::NCol(); // A is transposed
         constexpr index_t N = BlockMatrixB::NCol();
...
 #pragma once
 #include "threadwise_tensor_slice_op.hip.hpp"

-// slice a merged tensor, reorder and copy it into a normal tensor
-// src: a merged tensor,
-// dst: a normal tensor
+// slice a (normal or merged) tensor, reorder and copy it into another (normal or merged) tensor
 template <index_t BlockSize,
           class Float,
           class SrcDesc,
           class DstDesc,
           class SliceLengths,
           class SubLengths,
-          class ClusterLengths,
+          class DataClusterLengths,
           class ThreadClusterArrangeOrder,
           class SrcAccessOrder,
-          class DstAccessOrder>
+          class DstAccessOrder,
+          index_t SrcDataPerRead,
+          index_t DstDataPerRead>
 struct BlockwiseTensorSliceCopy_generic_v1
 {
     static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
@@ -21,39 +21,44 @@ struct BlockwiseTensorSliceCopy_generic_v1
     index_t mSrcMyThreadOffset;
     index_t mDstMyThreadOffset;

-    __device__ BlockwiseTensorSliceCopy_generic_v1(Array<index_t, nDim> src_block_multi_offset,
-                                                   Array<index_t, nDim> dst_block_multi_offset)
+    __device__
+    BlockwiseTensorSliceCopy_generic_v1(Array<index_t, nDim> src_block_data_multi_id_begin,
+                                        Array<index_t, nDim> dst_block_data_multi_id_begin)
     {
         // check NDim consistent
         static_assert(SrcDesc::GetNumOfDimension() == DstDesc::GetNumOfDimension(), "wrong");

-        constexpr auto thread_cluster_desc = make_packed_ConstantTensorDescriptor(
-            ClusterLengths{}.ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
+        // thread cluster
+        constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_default_rank_packed(
+            DataClusterLengths{}.ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));

         // BlockSize
         static_assert(BlockSize == thread_cluster_desc.GetElementSize(), "wrong! BlockSize");

         // divide work
-        static_for<0, nDim, 1>{}([&](auto IDim) {
-            static_assert(SliceLengths{}.Get(IDim) % SubLenghs{}.Get(IDim) == 0,
-                          "wrong! cannot evenly divide sliced tensor into sub-tensor");
-        });
+        constexpr auto data_per_cluster_per_dims = SubLengths{} * DataClusterLengths{};
+
+        static_for<0, nDim, 1>{}([&](auto IDim_) {
+            constexpr auto IDim = decltype(IDim_){};
+
+            static_assert(SliceLengths::Get(IDim) % SubLengths::Get(IDim) == 0,
+                          "wrong! cannot evenly divide sliced tensor into sub-tensor");
+
+            static_assert(SliceLengths::Get(IDim) % data_per_cluster_per_dims.Get(IDim) == 0,
+                          "wrong! cannot evenly divide sliced tensor into cluster");
+        });
-        constexpr auto thread_work_desc =
-            make_packed_ConstantTensorDescriptor(SliceLengths{} / SliceSubLengths{});
-
-        static_for<0, nDim, 1>{}([&](auto IDim) {
-            static_assert(thread_work_desc.GetLength(IDim) % thread_cluster_desc.Get(IDim) == 0,
-                          "wrong! cannot evenly divide work to cluster");
-        });
-
-        // only support SubLengths.Get() == 1 on merged dimension, for now
-        static_for<0, nDim, 1>{}([&](auto IDim) {
-            static_if<(SrcDesc::ContainMultipleOriginalDimensions(IDim) ||
-                       DstDesc::ContainMultipleOriginalDimensions(IDim))>{}([&](auto fwd) {
-                static_assert(fwd(SubLengths{}).Get(IDim) == 1,
-                              "wrong! Sub-Lengths on merged dimension should be 1");
-            });
-        });
+        constexpr auto repeat_lengths = SliceLengths{} / data_per_cluster_per_dims;
+
+        // for now, only support SubLengths.Get() == 1 on a merged dimension that is merged from
+        // multiple dimensions
+        static_for<0, nDim, 1>{}([&](auto IDim_) {
+            constexpr auto IDim = decltype(IDim_){};
+
+            static_assert(SubLengths::Get(IDim) == 1 ||
+                              (!SrcDesc::ContainMultipleOriginalDimensions(IDim) &&
+                               !DstDesc::ContainMultipleOriginalDimensions(IDim)),
+                          "wrong! only support Sub-Length == 1 on a merged dimension");
+        });

         // calculate mSrcMyThreadOffset, mDstMyThreadOffset
@@ -63,22 +68,23 @@ struct BlockwiseTensorSliceCopy_generic_v1
         const auto data_cluster_multi_id =
             reorder_array_given_old2new(thread_cluster_multi_id, ThreadClusterArrangeOrder{});

-        const auto thread_data_multi_offset = data_cluster_multi_id * SubLengths{};
-
-        mSrcMythreadOffset =
-            SrcDesc::GetOffsetFromMultiIndex(src_block_multi_offset + thread_data_multi_offset);
-        mSrcMythreadOffset =
-            DstDesc::GetOffsetFromMultiIndex(dst_block_multi_offset + thread_data_multi_offset);
+        const auto thread_data_multi_id_begin = data_cluster_multi_id * SubLengths{};
+
+        mSrcMyThreadOffset = SrcDesc::GetOffsetFromMultiIndex(src_block_data_multi_id_begin +
+                                                              thread_data_multi_id_begin);
+
+        mDstMyThreadOffset = DstDesc::GetOffsetFromMultiIndex(dst_block_data_multi_id_begin +
+                                                              thread_data_multi_id_begin);
     }
     __device__ static constexpr index_t GetRegisterClipboardSize()
     {
-        constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ClusterLengths{});
+        constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});

         constexpr auto thread_tensor_desc =
-            make_packed_ConstantTensorDescriptor(SubLengths{} * repeat_lengths);
+            make_ConstantTensorDescriptor_default_rank_packed(SubLengths{} * repeat_lengths);

-        return thread_tensor_desc.GetElementSpaceSize();
+        return thread_tensor_desc.GetElementSpace();
     }

     __device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
@@ -86,32 +92,34 @@ struct BlockwiseTensorSliceCopy_generic_v1
     {
         constexpr auto thread_sub_tensor_lengths = SubLengths{};

-        constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * ClusterLengths{};
+        constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * DataClusterLengths{};

-        constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ClusterLengths{});
+        constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});

-        constexpr auto thread_tensor_desc =
-            make_packed_ConstantTensorDescriptor(thread_sub_tensor_lengths * repeat_lengths);
+        constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor_default_rank_packed(
+            thread_sub_tensor_lengths * repeat_lengths);

         static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
-            constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
+            constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});

-            constexpr auto src_data_multi_offset = repeat_multi_id * data_per_cluster_per_dims;
-
-            constexpr auto clipboard_data_multi_offset =
-                repeat_multi_id * thread_sub_tensor_lengths;
-
-            constexpr index_t src_offset = SrcDesc{}.GetOffsetFromMultiIndex(src_data_multi_id);
-
-            constexpr index_t clipboard_offset =
-                thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id);
+            const auto src_thread_data_multi_id_begin =
+                repeat_multi_id * data_per_cluster_per_dims; // cannot be constexpr, why?
+
+            const auto clipboard_data_multi_id_begin =
+                repeat_multi_id * thread_sub_tensor_lengths; // cannot be constexpr, why?
+
+            const index_t src_offset = SrcDesc{}.GetOffsetFromMultiIndex(
+                src_thread_data_multi_id_begin); // cannot be constexpr, why?
+
+            const index_t clipboard_offset = thread_tensor_desc.GetOffsetFromMultiIndex(
+                clipboard_data_multi_id_begin); // cannot be constexpr, why?

             threadwise_tensor_slice_copy_generic(SrcDesc{},
                                                  p_src + src_offset + mSrcMyThreadOffset,
-                                                 thread_tensor_desc,
-                                                 zero_array<index_t, nDim>{},
+                                                 make_zero_array<index_t, nDim>(),
                                                  thread_tensor_desc,
                                                  p_clipboard + clipboard_offset,
-                                                 zero_array<index_t, nDim>{},
+                                                 make_zero_array<index_t, nDim>(),
                                                  thread_sub_tensor_lengths,
                                                  SrcAccessOrder{});
         });
@@ -122,41 +130,44 @@ struct BlockwiseTensorSliceCopy_generic_v1
     {
         constexpr auto thread_sub_tensor_lengths = SubLengths{};

-        constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * ClusterLengths{};
+        constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * DataClusterLengths{};

-        constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ClusterLengths{});
+        constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});

-        constexpr auto thread_tensor_desc =
-            make_packed_ConstantTensorDescriptor(thread_sub_tensor_lengths * repeat_lengths);
+        constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor_default_rank_packed(
+            thread_sub_tensor_lengths * repeat_lengths);

         static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
-            constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
+            constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});

-            constexpr auto clipboard_data_multi_offset =
-                repeat_multi_id * thread_sub_tensor_lengths;
-
-            constexpr auto dst_data_multi_offset = repeat_multi_id * data_per_cluster_per_dims;
-
-            constexpr index_t clipboard_offset =
-                thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_offset);
-
-            constexpr index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_offset);
+            const auto clipboard_data_multi_id_begin =
+                repeat_multi_id * thread_sub_tensor_lengths; // cannot be constexpr, why?
+
+            const auto dst_data_multi_id_begin =
+                repeat_multi_id * data_per_cluster_per_dims; // cannot be constexpr, why?
+
+            const index_t clipboard_offset = thread_tensor_desc.GetOffsetFromMultiIndex(
+                clipboard_data_multi_id_begin); // cannot be constexpr, why?
+
+            const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(
+                dst_data_multi_id_begin); // cannot be constexpr, why?

             threadwise_tensor_slice_copy_generic(thread_tensor_desc,
                                                  p_clipboard + clipboard_offset,
-                                                 zero_array<index_t, nDim>{},
+                                                 make_zero_array<index_t, nDim>(),
                                                  DstDesc{},
                                                  p_dst + dst_offset + mDstMyThreadOffset,
-                                                 zero_array<index_t, nDim>{},
+                                                 make_zero_array<index_t, nDim>(),
                                                  thread_sub_tensor_lengths,
                                                  DstAccessOrder{});
         });
     }

     __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
     {
         Float p_clipboard[GetRegisterClipboardSize()];

         RunLoadRegisterClipboard(p_src, p_clipboard);
         RunStoreRegisterClipboard(p_clipboard, p_dst);
     }
 };
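Note: for reference, a hypothetical instantiation of this copy with the v3 N1/N2/C/B block configuration from the device file above. The descriptors and the order Sequences here are placeholders (the real kernel would pass a merged global source descriptor), so treat this as a shape sketch only:

    using BlockCopyDesc = decltype(
        make_ConstantTensorDescriptor_default_rank_packed(Sequence<2, 4, 8, 16>{}));

    using BlockwiseInCopy =
        BlockwiseTensorSliceCopy_generic_v1<256,                   // BlockSize
                                            float,
                                            BlockCopyDesc,         // SrcDesc (placeholder)
                                            BlockCopyDesc,         // DstDesc (placeholder)
                                            Sequence<2, 4, 8, 16>, // SliceLengths: N1, N2, CPerBlock, BPerBlock
                                            Sequence<1, 4, 1, 1>,  // SubLengths
                                            Sequence<2, 1, 8, 16>, // DataClusterLengths: 2*1*8*16 = 256 threads
                                            Sequence<0, 1, 2, 3>,  // ThreadClusterArrangeOrder (assumed)
                                            Sequence<0, 1, 2, 3>,  // SrcAccessOrder (assumed)
                                            Sequence<0, 1, 2, 3>,  // DstAccessOrder (assumed)
                                            1,                     // SrcDataPerRead
                                            4>;                    // DstDataPerRead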
@@ -40,7 +40,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
             src_cluster_lengths.ReorderGivenNew2Old(map_thread_cluster_2_src_cluster);

         constexpr auto thread_cluster_desc =
-            make_packed_ConstantTensorDescriptor(thread_cluster_lengths);
+            make_ConstantTensorDescriptor_default_rank_packed(thread_cluster_lengths);

         // sanity check: data type
         static_assert(is_same<Float, float>::value, "wrong! only support float for now!\n");
@@ -149,7 +149,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
         constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;

         constexpr auto thread_tensor_desc =
-            make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
+            make_ConstantTensorDescriptor_default_rank_packed(thread_tensor_lengths);

         return thread_tensor_desc.GetElementSpace();
     }
@@ -170,7 +170,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
         constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;

         constexpr auto thread_tensor_desc =
-            make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
+            make_ConstantTensorDescriptor_default_rank_packed(thread_tensor_lengths);

         static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
             constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
@@ -208,7 +208,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
         constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;

         constexpr auto thread_tensor_desc =
-            make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
+            make_ConstantTensorDescriptor_default_rank_packed(thread_tensor_lengths);

         static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
             constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
...
 #pragma once
 #include "vector_type.hip.hpp"
-#include "constant_integral.hip.hpp"
+#include "integral_constant.hip.hpp"
 #include "Sequence.hip.hpp"
 #include "Array.hip.hpp"
 #include "functional.hip.hpp"
@@ -17,15 +17,21 @@ __device__ index_t get_block_1d_id() { return blockIdx.x; }
 template <class T1, class T2>
 struct is_same
 {
-    static const bool value = false;
+    static constexpr bool value = false;
 };

 template <class T>
 struct is_same<T, T>
 {
-    static const bool value = true;
+    static constexpr bool value = true;
 };

+template <class X, class Y>
+__host__ __device__ constexpr bool is_same_type(X, Y)
+{
+    return is_same<X, Y>::value;
+}
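Note: is_same_type compares the types of two values, which is what lets the blockwise GEMM assert that two Sequence objects carry the same lengths (the static_assert change above). A sketch:

    static_assert(is_same_type(Sequence<8, 8>{}, Sequence<8, 8>{}), "same lengths");
    static_assert(!is_same_type(Sequence<8, 8>{}, Sequence<8, 4>{}), "different lengths");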
 namespace mod_conv { // namespace mod_conv

 template <class T, T s>
 struct scales
...
...@@ -30,7 +30,7 @@ __host__ __device__ constexpr auto get_convolution_output_default_4d_tensor_desc ...@@ -30,7 +30,7 @@ __host__ __device__ constexpr auto get_convolution_output_default_4d_tensor_desc
constexpr auto HO = HI + 1 - Y; constexpr auto HO = HI + 1 - Y;
constexpr auto WO = WI + 1 - X; constexpr auto WO = WI + 1 - X;
return make_packed_ConstantTensorDescriptor(Sequence<N, K, HO, WO>{}); return make_ConstantTensorDescriptor_default_rank_packed(Sequence<N, K, HO, WO>{});
} }
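These are the usual valid-convolution output sizes: for example, HI = 28 with a Y = 3 filter gives HO = 28 + 1 - 3 = 26, and WO follows the same arithmetic in W.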
template <class InDesc, class WeiDesc, class LowerPads, class UpperPads>
@@ -67,7 +67,7 @@ __host__ __device__ constexpr auto get_convolution_with_padding_output_default_4
    constexpr auto HO = HI + HPadLow + HPadUp + 1 - Y;
    constexpr auto WO = WI + WPadLow + WPadUp + 1 - X;
-    return make_packed_ConstantTensorDescriptor(Sequence<N, K, HO, WO>{});
+    return make_ConstantTensorDescriptor_default_rank_packed(Sequence<N, K, HO, WO>{});
}
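The padded variant applies the same count to the padded extent: with HI = 28, one pixel of padding on each side, and Y = 3, HO = (28 + 1 + 1) + 1 - 3 = 28, i.e. the "same-size" convolution case.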
template <class InDesc, class WeiDesc, class OutDesc>
...
#pragma once
-#include "constant_integral.hip.hpp"
+#include "integral_constant.hip.hpp"
struct forwarder
{
...
@@ -85,7 +85,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
        constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock);
        constexpr index_t NBlockWork = mod_conv::integer_divide_ceil(N, NPerBlock);
-        constexpr auto block_work_desc = make_packed_ConstantTensorDescriptor(
+        constexpr auto block_work_desc = make_ConstantTensorDescriptor_default_rank_packed(
            Sequence<KBlockWork, HBlockWork, WBlockWork, NBlockWork>{});
        const auto block_work_multi_id =
@@ -109,7 +109,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
            GemmDataPerReadA,
            GemmDataPerReadB);
-        constexpr auto in_c_h_w_n_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
+        constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
            Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{},
            Number<InBlockCopyDataPerRead_N>{});
@@ -118,12 +118,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
        static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
                      "GemmDataPerReadB alignment requirement is not meet");
-        constexpr auto wei_c_k_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
+        constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
            Sequence<CPerBlock, KPerBlock>{},
            Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
        // tensor view of threadwise output in register
-        constexpr auto out_k_h_w_n_thread_desc = make_packed_ConstantTensorDescriptor(
+        constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_default_rank_packed(
            Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{});
        // blockwise copy
...
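integer_divide_ceil above is round-up division: how many fixed-size blocks are needed to cover a dimension. A standalone sketch of the arithmetic (the repo's own definition may differ in detail; the _sketch name is illustrative):

// Round-up integer division: blocks of size per_block needed to cover total.
constexpr int integer_divide_ceil_sketch(int total, int per_block)
{
    return (total + per_block - 1) / per_block;
}
static_assert(integer_divide_ceil_sketch(30, 8) == 4, "30 elements -> 4 blocks of 8");
static_assert(integer_divide_ceil_sketch(32, 8) == 4, "exact fit");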
@@ -86,7 +86,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
        constexpr index_t HBlockWork = mod_conv::integer_divide_ceil(Ho, HoPerBlock);
        constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock);
-        constexpr auto block_work_desc = make_packed_ConstantTensorDescriptor(
+        constexpr auto block_work_desc = make_ConstantTensorDescriptor_default_rank_packed(
            Sequence<NBlockWork, KBlockWork, HBlockWork, WBlockWork>{});
        const auto block_work_multi_id =
@@ -102,7 +102,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
        // global tensor view
        constexpr auto wei_c_k_global_desc =
-            make_ranked_ConstantTensorDescriptor(Sequence<C, K>{}, Sequence<Y * X * K, 1>{});
+            make_ConstantTensorDescriptor_default_rank(Sequence<C, K>{}, Sequence<Y * X * K, 1>{});
        // LDS tensor view
        // be careful of alignment
@@ -111,7 +111,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
            GemmDataPerReadA,
            GemmDataPerReadB);
-        constexpr auto in_c_h_w_n_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
+        constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
            Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{},
            Number<InBlockReorderDataPerWrite_N>{});
@@ -120,12 +120,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
        static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
                      "GemmDataPerReadB alignment requirement is not meet");
-        constexpr auto wei_c_k_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
+        constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
            Sequence<CPerBlock, KPerBlock>{},
            Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
        // tensor view of threadwise output in register
-        constexpr auto out_k_h_w_n_thread_desc = make_packed_ConstantTensorDescriptor(
+        constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_default_rank_packed(
            Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{});
        // blockwise copy
@@ -448,10 +448,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
        constexpr index_t K1 = KPerBlock / KPerThread;
#if 0
-        constexpr auto out_10d_global_desc = make_packed_ConstantTensorDescriptor(
+        constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor_default_rank_packed(
            Sequence<K / (K1 * K2), K1, K2, Ho, Wo / (W1 * W2 * W3), W1, W2, W3, N / N1, N1>{});
-        constexpr auto out_10d_thread_desc = make_packed_ConstantTensorDescriptor(
+        constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor_default_rank_packed(
            Sequence<KPerThread / K2, 1, K2, HoPerThread, 1, W1, 1, W3, 1, N1>{});
#else
        constexpr auto out_10d_global_desc =
...
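The 10-d descriptors above re-view the 4-d K x Ho x Wo x N output by splitting K, Wo, and N into block- and thread-level factors; the reshape only regroups dimensions, so the element count is unchanged. A compile-time check of that invariant under made-up sizes (all numbers here are illustrative, not the kernel's tuned values):

constexpr int K = 128, K1 = 8, K2 = 4;
constexpr int Wo = 64, W1 = 2, W2 = 4, W3 = 2;
constexpr int N = 32, N1 = 8;
static_assert((K / (K1 * K2)) * K1 * K2 == K, "K split is exact");
static_assert((Wo / (W1 * W2 * W3)) * W1 * W2 * W3 == Wo, "Wo split is exact");
static_assert((N / N1) * N1 == N, "N split is exact");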