Commit 5fd40ad7 authored by Chao Liu

clean up

parent 4543d17a
@@ -34,25 +34,24 @@ void device_direct_convolution_1(InDesc,
 #if 1
     // 3x3, 34x34
-    constexpr unsigned OutTileSizeH = 2;
-    constexpr unsigned OutTileSizeW = 2;
     constexpr unsigned NPerBlock = 2;
     constexpr unsigned KPerBlock = 16;
     constexpr unsigned CPerBlock = 2;
-    constexpr unsigned YPerBlock = 2;
-    constexpr unsigned XPerBlock = 16;
+    constexpr unsigned HoPerBlock = 4;
+    constexpr unsigned WoPerBlock = 32;
     constexpr unsigned NPerThread = 2;
     constexpr unsigned KPerThread = 4;
     constexpr unsigned CPerThread = 2;
+    constexpr unsigned HoPerThread = 2;
+    constexpr unsigned WoPerThread = 2;
     constexpr unsigned BlockSize = 128;
 #endif

-    constexpr unsigned GridSize = (out_desc.GetLength(I0) / NPerBlock) *
-                                  (out_desc.GetLength(I1) / KPerBlock) *
-                                  (out_desc.GetLength(I2) / (OutTileSizeH * YPerBlock)) *
-                                  (out_desc.GetLength(I3) / (OutTileSizeW * XPerBlock));
+    constexpr unsigned GridSize =
+        (out_desc.GetLength(I0) / NPerBlock) * (out_desc.GetLength(I1) / KPerBlock) *
+        (out_desc.GetLength(I2) / HoPerBlock) * (out_desc.GetLength(I3) / WoPerBlock);

     printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
@@ -62,16 +61,16 @@ void device_direct_convolution_1(InDesc,
         InDesc,
         WeiDesc,
         OutDesc,
-        OutTileSizeH,
-        OutTileSizeW,
         NPerBlock,
         KPerBlock,
         CPerBlock,
-        YPerBlock,
-        XPerBlock,
+        HoPerBlock,
+        WoPerBlock,
         NPerThread,
         KPerThread,
         CPerThread,
+        HoPerThread,
+        WoPerThread,
         BlockSize,
         GridSize>,
     dim3(GridSize),
...
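For reference, the GridSize expression above just counts output tiles: one workgroup per (n, k, ho, wo) tile, using plain truncating division, so every output length is assumed to divide its block tile size evenly. A minimal stand-alone sketch of the arithmetic for the "3x3, 34x34" case; the output lengths N = 2 and K = 16 are hypothetical, while Ho = Wo = 32 follows from a valid 3x3 convolution on a 34x34 input:

#include <cstdio>

int main()
{
    // hypothetical output lengths for the "3x3, 34x34" configuration above
    constexpr unsigned N = 2, K = 16, Ho = 32, Wo = 32;
    constexpr unsigned NPerBlock = 2, KPerBlock = 16, HoPerBlock = 4, WoPerBlock = 32;

    // one workgroup per output tile; truncating division, so each output
    // dimension is assumed to be a multiple of its block tile size
    constexpr unsigned GridSize =
        (N / NPerBlock) * (K / KPerBlock) * (Ho / HoPerBlock) * (Wo / WoPerBlock);

    printf("GridSize = %u\n", GridSize); // 1 * 1 * 8 * 1 = 8
    return 0;
}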
@@ -34,40 +34,24 @@ void device_direct_convolution_2(InDesc,
 #if 1
     // 3x3, 34x34, 128 thread
-    constexpr unsigned OutTileSizeH = 2;
-    constexpr unsigned OutTileSizeW = 2;
     constexpr unsigned NPerBlock = 2;
     constexpr unsigned KPerBlock = 32;
     constexpr unsigned CPerBlock = 4;
-    constexpr unsigned YPerBlock = 1;
-    constexpr unsigned XPerBlock = 16;
+    constexpr unsigned HoPerBlock = 2;
+    constexpr unsigned WoPerBlock = 32;
     constexpr unsigned NPerThread = 2;
     constexpr unsigned KPerThread = 4;
     constexpr unsigned CPerThread = 2;
+    constexpr unsigned HoPerThread = 2;
+    constexpr unsigned WoPerThread = 2;
     constexpr unsigned BlockSize = 128;
-#elif 0
-    // 3x3, 34x34, 256 thread
-    constexpr unsigned OutTileSizeH = 2;
-    constexpr unsigned OutTileSizeW = 2;
-    constexpr unsigned NPerBlock = 2;
-    constexpr unsigned KPerBlock = 32;
-    constexpr unsigned CPerBlock = 4;
-    constexpr unsigned YPerBlock = 1;
-    constexpr unsigned XPerBlock = 32;
-    constexpr unsigned NPerThread = 2;
-    constexpr unsigned KPerThread = 4;
-    constexpr unsigned CPerThread = 2;
-    constexpr unsigned BlockSize = 256;
 #endif

-    constexpr unsigned GridSize = (out_desc.GetLength(I0) / NPerBlock) *
-                                  (out_desc.GetLength(I1) / KPerBlock) *
-                                  (out_desc.GetLength(I2) / (OutTileSizeH * YPerBlock)) *
-                                  (out_desc.GetLength(I3) / (OutTileSizeW * XPerBlock));
+    constexpr unsigned GridSize =
+        (out_desc.GetLength(I0) / NPerBlock) * (out_desc.GetLength(I1) / KPerBlock) *
+        (out_desc.GetLength(I2) / HoPerBlock) * (out_desc.GetLength(I3) / WoPerBlock);

     printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
@@ -77,16 +61,16 @@ void device_direct_convolution_2(InDesc,
         InDesc,
         WeiDesc,
         OutDesc,
-        OutTileSizeH,
-        OutTileSizeW,
         NPerBlock,
         KPerBlock,
         CPerBlock,
-        YPerBlock,
-        XPerBlock,
+        HoPerBlock,
+        WoPerBlock,
         NPerThread,
         KPerThread,
         CPerThread,
+        HoPerThread,
+        WoPerThread,
         BlockSize,
         GridSize>,
     dim3(GridSize),
...
@@ -30,11 +30,11 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
     constexpr unsigned K = wei_kcsr_desc.GetLength(I0);
     constexpr unsigned C = wei_kcsr_desc.GetLength(I1);
-    constexpr unsigned S = wei_kcsr_desc.GetLength(I2);
-    constexpr unsigned R = wei_kcsr_desc.GetLength(I3);
+    constexpr unsigned Y = wei_kcsr_desc.GetLength(I2);
+    constexpr unsigned X = wei_kcsr_desc.GetLength(I3);

     // reorder weight
-    auto wei_csrk_desc = make_ConstantTensorDescriptor(Sequence<C, S, R, K>{});
+    auto wei_csrk_desc = make_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
     ostream_ConstantTensorDescriptor(wei_csrk_desc, std::cout << "wei_csrk_desc: ");

     Tensor<T> wei_csrk(make_TensorDescriptor(wei_csrk_desc));
@@ -43,7 +43,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
         wei_csrk(c, s, r, k) = wei_kcsr(k, c, s, r);
     };

-    make_ParallelTensorFunctor(f_reorder_kcsr2csrk, K, C, S, R)(
+    make_ParallelTensorFunctor(f_reorder_kcsr2csrk, K, C, Y, X)(
         std::thread::hardware_concurrency());

     // reorder input
...
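The reorder above only permutes the layout: element (k, c, y, x) of the KCSR weight lands at (c, y, x, k) in the CSRK copy, making K the fastest-varying dimension. A minimal stand-alone sketch of the same permutation, with plain nested loops and hypothetical raw row-major arrays standing in for the repo's Tensor<T> and ParallelTensorFunctor machinery:

#include <vector>

// Reorder a KCSR (KCYX) weight into CSRK (CYXK) layout so that K becomes the
// fastest-varying dimension, mirroring the f_reorder_kcsr2csrk lambda above.
std::vector<float> reorder_kcsr_to_csrk(
    const std::vector<float>& wei_kcsr, unsigned K, unsigned C, unsigned Y, unsigned X)
{
    std::vector<float> wei_csrk(wei_kcsr.size());
    for(unsigned k = 0; k < K; ++k)
        for(unsigned c = 0; c < C; ++c)
            for(unsigned y = 0; y < Y; ++y)
                for(unsigned x = 0; x < X; ++x)
                    wei_csrk[((c * Y + y) * X + x) * K + k] =
                        wei_kcsr[((k * C + c) * Y + y) * X + x];
    return wei_csrk;
}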
@@ -2,7 +2,6 @@
 #include <unistd.h>
 #include "device.hpp"
 #include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.hip.hpp"
-#include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.hip.hpp"

 template <class T, class InDesc, class WeiDesc, class OutDesc, class LowerPads, class UpperPads>
 void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc,
@@ -33,11 +32,11 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc,
     constexpr unsigned K = wei_kcsr_desc.GetLength(I0);
     constexpr unsigned C = wei_kcsr_desc.GetLength(I1);
-    constexpr unsigned S = wei_kcsr_desc.GetLength(I2);
-    constexpr unsigned R = wei_kcsr_desc.GetLength(I3);
+    constexpr unsigned Y = wei_kcsr_desc.GetLength(I2);
+    constexpr unsigned X = wei_kcsr_desc.GetLength(I3);

     // reorder weight
-    auto wei_csrk_desc = make_ConstantTensorDescriptor(Sequence<C, S, R, K>{});
+    auto wei_csrk_desc = make_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
     ostream_ConstantTensorDescriptor(wei_csrk_desc, std::cout << "wei_csrk_desc: ");

     Tensor<T> wei_csrk(make_TensorDescriptor(wei_csrk_desc));
@@ -46,7 +45,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc,
         wei_csrk(c, s, r, k) = wei_kcsr(k, c, s, r);
     };

-    make_ParallelTensorFunctor(f_reorder_kcsr2csrk, K, C, S, R)(
+    make_ParallelTensorFunctor(f_reorder_kcsr2csrk, K, C, Y, X)(
         std::thread::hardware_concurrency());

     // reorder input
@@ -251,31 +250,26 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc,
     for(unsigned i = 0; i < nrepeat; ++i)
     {
         float time = launch_kernel(
-#if 0
-            gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded
-#elif 1
-            gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline
-#endif
-            <GridSize,
-             BlockSize,
-             T,
-             decltype(in_chwn_desc),
-             decltype(wei_csrk_desc),
-             decltype(out_khwn_desc),
-             LowerPads,
-             UpperPads,
-             NPerBlock,
-             KPerBlock,
-             CPerBlock,
-             HoPerBlock,
-             WoPerBlock,
-             NPerThread,
-             KPerThread,
-             CPerThread,
-             HoPerThread,
-             WoPerThread,
-             WeiBlockCopyThreadPerDim0,
-             WeiBlockCopyThreadPerDim1>,
+            gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded<GridSize,
+                                                                       BlockSize,
+                                                                       T,
+                                                                       decltype(in_chwn_desc),
+                                                                       decltype(wei_csrk_desc),
+                                                                       decltype(out_khwn_desc),
+                                                                       LowerPads,
+                                                                       UpperPads,
+                                                                       NPerBlock,
+                                                                       KPerBlock,
+                                                                       CPerBlock,
+                                                                       HoPerBlock,
+                                                                       WoPerBlock,
+                                                                       NPerThread,
+                                                                       KPerThread,
+                                                                       CPerThread,
+                                                                       HoPerThread,
+                                                                       WoPerThread,
+                                                                       WeiBlockCopyThreadPerDim0,
+                                                                       WeiBlockCopyThreadPerDim1>,
             dim3(GridSize),
             dim3(BlockSize),
...
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw.hip.hpp"
template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_implicit_gemm_convolution_1_nchw_kcsr_nkhw(InDesc,
const Tensor<T>& in,
WeiDesc,
const Tensor<T>& wei,
OutDesc,
Tensor<T>& out,
unsigned nrepeat)
{
std::size_t data_sz = sizeof(T);
DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace());
DeviceMem wei_device_buf(data_sz * wei.mDesc.GetElementSpace());
DeviceMem out_device_buf(data_sz * out.mDesc.GetElementSpace());
int num_thread = std::thread::hardware_concurrency();
in_device_buf.ToDevice(in.mData.data());
wei_device_buf.ToDevice(wei.mData.data());
out_device_buf.ToDevice(out.mData.data());
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_desc = InDesc{};
constexpr auto wei_desc = WeiDesc{};
constexpr auto out_desc = OutDesc{};
#if 1
// 3x3, 34x34
constexpr unsigned NPerBlock = 1;
constexpr unsigned KPerBlock = 64;
constexpr unsigned CPerBlock = 2;
constexpr unsigned HoPerBlock = 4;
constexpr unsigned WoPerBlock = 32;
constexpr unsigned KPerThread = 16;
constexpr unsigned CPerThread = 1;
constexpr unsigned HoPerThread = 2;
constexpr unsigned WoPerThread = 2;
constexpr unsigned BlockSize = 128;
#endif
constexpr unsigned GridSize =
(out_desc.GetLength(I0) / NPerBlock) * (out_desc.GetLength(I1) / KPerBlock) *
(out_desc.GetLength(I2) / HoPerBlock) * (out_desc.GetLength(I3) / WoPerBlock);
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
for(unsigned i = 0; i < nrepeat; ++i)
{
float time = launch_kernel(gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw<GridSize,
BlockSize,
T,
InDesc,
WeiDesc,
OutDesc,
NPerBlock,
KPerBlock,
CPerBlock,
HoPerBlock,
WoPerBlock,
KPerThread,
CPerThread,
HoPerThread,
WoPerThread>,
dim3(GridSize),
dim3(BlockSize),
static_cast<T*>(in_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_device_buf.GetDeviceBuffer()));
printf("Elapsed time : %f ms\n", time);
usleep(std::min(time * 1000, float(10000)));
}
out_device_buf.FromDevice(out.mData.data());
}
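A hypothetical call site for the wrapper above, using the descriptor helpers that appear elsewhere in this commit; the concrete sizes (N = 1, C = 2, 34x34 input, K = 64, 3x3 filter) are illustrative only, chosen so that every output dimension divides its block tile size evenly, as the truncating GridSize division requires:

// Illustrative sizes, not from the diff: NCHW input 1x2x34x34, KCSR weight
// 64x2x3x3, hence NKHW output 1x64x32x32 for a valid stride-1 convolution.
constexpr auto in_desc  = make_ConstantTensorDescriptor(Sequence<1, 2, 34, 34>{});
constexpr auto wei_desc = make_ConstantTensorDescriptor(Sequence<64, 2, 3, 3>{});
constexpr auto out_desc = make_ConstantTensorDescriptor(Sequence<1, 64, 32, 32>{});

Tensor<float> in(make_TensorDescriptor(in_desc));
Tensor<float> wei(make_TensorDescriptor(wei_desc));
Tensor<float> out(make_TensorDescriptor(out_desc));
// ... fill in and wei on the host ...

device_implicit_gemm_convolution_1_nchw_kcsr_nkhw(
    in_desc, in, wei_desc, wei, out_desc, out, /*nrepeat=*/10);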
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.hip.hpp"
template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc,
const Tensor<T>& in_nchw,
WeiDesc,
const Tensor<T>& wei_kcsr,
OutDesc,
Tensor<T>& out_nkhw,
unsigned nrepeat)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_nchw_desc = InDesc{};
constexpr auto wei_kcsr_desc = WeiDesc{};
constexpr auto out_nkhw_desc = OutDesc{};
constexpr unsigned N = out_nkhw_desc.GetLength(I0);
constexpr unsigned Ho = out_nkhw_desc.GetLength(I2);
constexpr unsigned Wo = out_nkhw_desc.GetLength(I3);
constexpr unsigned K = wei_kcsr_desc.GetLength(I0);
constexpr unsigned C = wei_kcsr_desc.GetLength(I1);
constexpr unsigned S = wei_kcsr_desc.GetLength(I2);
constexpr unsigned R = wei_kcsr_desc.GetLength(I3);
auto wei_srck_desc = make_ConstantTensorDescriptor(Sequence<S, R, C, K>{});
ostream_ConstantTensorDescriptor(wei_srck_desc, std::cout << "wei_srck_desc: ");
Tensor<T> wei_srck(make_TensorDescriptor(wei_srck_desc));
auto f_reorder_kcsr2srck = [&](auto k, auto c, auto s, auto r) {
wei_srck(s, r, c, k) = wei_kcsr(k, c, s, r);
};
make_ParallelTensorFunctor(f_reorder_kcsr2srck, K, C, S, R)(
std::thread::hardware_concurrency());
std::size_t data_sz = sizeof(T);
DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
DeviceMem wei_srck_device_buf(data_sz * wei_srck.mDesc.GetElementSpace());
DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());
int num_thread = std::thread::hardware_concurrency();
in_nchw_device_buf.ToDevice(in_nchw.mData.data());
wei_srck_device_buf.ToDevice(wei_srck.mData.data());
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
#if 1
// for 3x3, 34x34
constexpr unsigned NPerBlock = 1;
constexpr unsigned KPerBlock = 64;
constexpr unsigned CPerBlock = 2;
constexpr unsigned HoPerBlock = 4;
constexpr unsigned WoPerBlock = 32;
constexpr unsigned NPerThread = 1;
constexpr unsigned KPerThread = 16;
constexpr unsigned CPerThread = 1;
constexpr unsigned HoPerThread = 2;
constexpr unsigned WoPerThread = 2;
constexpr unsigned BlockSize = 128;
#elif 0
// for 3x3, 58x58
constexpr unsigned NPerBlock = 4;
constexpr unsigned KPerBlock = 64;
constexpr unsigned CPerBlock = 2;
constexpr unsigned HoPerBlock = 4;
constexpr unsigned WoPerBlock = 8;
constexpr unsigned NPerThread = 4;
constexpr unsigned KPerThread = 16;
constexpr unsigned CPerThread = 1;
constexpr unsigned HoPerThread = 1;
constexpr unsigned WoPerThread = 1;
constexpr unsigned BlockSize = 128;
#elif 1
// for 3x3, 56x56
constexpr unsigned NPerBlock = 32;
constexpr unsigned KPerBlock = 64;
constexpr unsigned CPerBlock = 4;
constexpr unsigned HoPerBlock = 2;
constexpr unsigned WoPerBlock = 2;
constexpr unsigned NPerThread = 4;
constexpr unsigned KPerThread = 16;
constexpr unsigned CPerThread = 1;
constexpr unsigned HoPerThread = 1;
constexpr unsigned WoPerThread = 1;
constexpr unsigned BlockSize = 128;
#endif
constexpr unsigned GridSize =
((N + NPerBlock - 1) / NPerBlock) * ((K + KPerBlock - 1) / KPerBlock) *
((Ho + HoPerBlock - 1) / HoPerBlock) * ((Wo + WoPerBlock - 1) / WoPerBlock);
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
for(unsigned i = 0; i < nrepeat; ++i)
{
float time = launch_kernel(
gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw<GridSize,
BlockSize,
T,
decltype(in_nchw_desc),
decltype(wei_srck_desc),
decltype(out_nkhw_desc),
NPerBlock,
KPerBlock,
CPerBlock,
HoPerBlock,
WoPerBlock,
NPerThread,
KPerThread,
CPerThread,
HoPerThread,
WoPerThread>,
dim3(GridSize),
dim3(BlockSize),
static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_srck_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
printf("Elapsed time : %f ms\n", time);
usleep(std::min(time * 1000, float(10000)));
}
out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
}
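Unlike the plain divisions in the earlier wrappers, the GridSize above uses ceiling division, so the output lengths need not divide the block tile sizes evenly: a ragged edge tile still gets its own block. A worked check with hypothetical sizes:

// Ceiling vs. truncating division for grid sizing, with hypothetical
// Ho = 34 and HoPerBlock = 4: truncation would drop the last partial tile.
static_assert((34 + 4 - 1) / 4 == 9, "ceiling division covers the ragged tile");
static_assert(34 / 4 == 8, "truncating division would under-count");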
@@ -30,10 +30,10 @@ void device_implicit_gemm_convolution_2_chwn_csrk_khwn(InDesc,
     constexpr unsigned K = wei_kcsr_desc.GetLength(I0);
     constexpr unsigned C = wei_kcsr_desc.GetLength(I1);
-    constexpr unsigned S = wei_kcsr_desc.GetLength(I2);
-    constexpr unsigned R = wei_kcsr_desc.GetLength(I3);
+    constexpr unsigned Y = wei_kcsr_desc.GetLength(I2);
+    constexpr unsigned X = wei_kcsr_desc.GetLength(I3);

-    constexpr unsigned BGhostRead = (S - 1) * Wi + (R - 1);
+    constexpr unsigned BGhostRead = (Y - 1) * Wi + (X - 1);

     // convert in_nchw to in_cnhw
     auto in_chwn_desc = make_ConstantTensorDescriptor(Sequence<C, Hi, Wi, N>{});
@@ -49,7 +49,7 @@ void device_implicit_gemm_convolution_2_chwn_csrk_khwn(InDesc,
         Wi)(std::thread::hardware_concurrency());

     // convert wei_kcsr to wei_csrk
-    auto wei_csrk_desc = make_ConstantTensorDescriptor(Sequence<C, S, R, K>{});
+    auto wei_csrk_desc = make_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
     ostream_ConstantTensorDescriptor(wei_csrk_desc, std::cout << "wei_csrk_desc: ");

     Tensor<T> wei_csrk(make_TensorDescriptor(wei_csrk_desc));
@@ -58,8 +58,8 @@ void device_implicit_gemm_convolution_2_chwn_csrk_khwn(InDesc,
         [&](auto k, auto c, auto s, auto r) { wei_csrk(c, s, r, k) = wei_kcsr(k, c, s, r); },
         K,
         C,
-        S,
-        R)(std::thread::hardware_concurrency());
+        Y,
+        X)(std::thread::hardware_concurrency());

     // convert out_nkhw to out_khwn
     auto out_khwn_desc = make_ConstantTensorDescriptor(Sequence<K, Ho, Wo, N>{});
@@ -209,43 +209,39 @@ void device_implicit_gemm_convolution_2_chwn_csrk_khwn(InDesc,
     for(unsigned i = 0; i < nrepeat; ++i)
     {
-        float time = launch_kernel(
-#if 0
-            gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn
-#else
-            gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_buffer
-#endif
-            <GridSize,
-             BlockSize,
-             T,
-             decltype(in_chwn_desc),
-             decltype(wei_csrk_desc),
-             decltype(out_khwn_desc),
-             BPerBlock,
-             KPerBlock,
-             CPerBlock,
-             BPerThread,
-             KPerThread,
-             GemmThreadPerColumnPerCluster,
-             GemmThreadPerRowPerCluster,
-             GemmMPerThreadSubC,
-             GemmNPerThreadSubC,
-             GemmMLevel0Cluster,
-             GemmNLevel0Cluster,
-             GemmMLevel1Cluster,
-             GemmNLevel1Cluster,
-             GemmKPerThreadLoop,
-             InBlockCopyThreadPerDim0,
-             InBlockCopyThreadPerDim1,
-             WeiBlockCopyThreadPerDim0,
-             WeiBlockCopyThreadPerDim1,
-             InBlockCopyDataPerRead,
-             WeiBlockCopyDataPerRead>,
-            dim3(GridSize),
-            dim3(BlockSize),
-            static_cast<T*>(in_chwn_device_buf.GetDeviceBuffer()),
-            static_cast<T*>(wei_csrk_device_buf.GetDeviceBuffer()),
-            static_cast<T*>(out_khwn_device_buf.GetDeviceBuffer()));
+        float time =
+            launch_kernel(gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_buffer<
+                              GridSize,
+                              BlockSize,
+                              T,
+                              decltype(in_chwn_desc),
+                              decltype(wei_csrk_desc),
+                              decltype(out_khwn_desc),
+                              BPerBlock,
+                              KPerBlock,
+                              CPerBlock,
+                              BPerThread,
+                              KPerThread,
+                              GemmThreadPerColumnPerCluster,
+                              GemmThreadPerRowPerCluster,
+                              GemmMPerThreadSubC,
+                              GemmNPerThreadSubC,
+                              GemmMLevel0Cluster,
+                              GemmNLevel0Cluster,
+                              GemmMLevel1Cluster,
+                              GemmNLevel1Cluster,
+                              GemmKPerThreadLoop,
+                              InBlockCopyThreadPerDim0,
+                              InBlockCopyThreadPerDim1,
+                              WeiBlockCopyThreadPerDim0,
+                              WeiBlockCopyThreadPerDim1,
+                              InBlockCopyDataPerRead,
+                              WeiBlockCopyDataPerRead>,
+                          dim3(GridSize),
+                          dim3(BlockSize),
+                          static_cast<T*>(in_chwn_device_buf.GetDeviceBuffer()),
+                          static_cast<T*>(wei_csrk_device_buf.GetDeviceBuffer()),
+                          static_cast<T*>(out_khwn_device_buf.GetDeviceBuffer()));

         printf("Elapsed time : %f ms\n", time);
         usleep(std::min(time * 1000, float(10000)));
...
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.hip.hpp"
#include "gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.hip.hpp"
template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
const Tensor<T>& in_nchw,
WeiDesc,
const Tensor<T>& wei_kcsr,
OutDesc,
Tensor<T>& out_nkhw,
unsigned nrepeat)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_nchw_desc = InDesc{};
constexpr auto wei_kcsr_desc = WeiDesc{};
constexpr auto out_nkhw_desc = OutDesc{};
constexpr unsigned N = in_nchw_desc.GetLength(I0);
constexpr unsigned Hi = in_nchw_desc.GetLength(I2);
constexpr unsigned Wi = in_nchw_desc.GetLength(I3);
constexpr unsigned Ho = out_nkhw_desc.GetLength(I2);
constexpr unsigned Wo = out_nkhw_desc.GetLength(I3);
constexpr unsigned K = wei_kcsr_desc.GetLength(I0);
constexpr unsigned C = wei_kcsr_desc.GetLength(I1);
constexpr unsigned S = wei_kcsr_desc.GetLength(I2);
constexpr unsigned R = wei_kcsr_desc.GetLength(I3);
constexpr unsigned BGhostRead = (S - 1) * Wi + (R - 1);
// convert in_nchw to in_cnhw
auto in_cnhw_desc = make_ConstantTensorDescriptor(Sequence<C, N, Hi, Wi>{});
ostream_ConstantTensorDescriptor(in_cnhw_desc, std::cout << "in_cnhw_desc: ");
Tensor<T> in_cnhw(make_TensorDescriptor(in_cnhw_desc));
auto f_reorder_nchw2cnhw = [&](auto n, auto c, auto hi, auto wi) {
in_cnhw(c, n, hi, wi) = in_nchw(n, c, hi, wi);
};
make_ParallelTensorFunctor(f_reorder_nchw2cnhw, N, C, Hi, Wi)(
std::thread::hardware_concurrency());
// convert wei_kcsr to wei_csrk
auto wei_csrk_desc = make_ConstantTensorDescriptor(Sequence<C, S, R, K>{});
ostream_ConstantTensorDescriptor(wei_csrk_desc, std::cout << "wei_csrk_desc: ");
Tensor<T> wei_csrk(make_TensorDescriptor(wei_csrk_desc));
auto f_reorder_kcsr2csrk = [&](auto k, auto c, auto s, auto r) {
wei_csrk(c, s, r, k) = wei_kcsr(k, c, s, r);
};
make_ParallelTensorFunctor(f_reorder_kcsr2csrk, K, C, S, R)(
std::thread::hardware_concurrency());
// convert out_nkhw to out_knhw
auto out_knhw_desc = make_ConstantTensorDescriptor(Sequence<K, N, Ho, Wo>{});
ostream_ConstantTensorDescriptor(out_knhw_desc, std::cout << "out_knhw_desc: ");
Tensor<T> out_knhw(make_TensorDescriptor(out_knhw_desc));
#if 0
// 3x3, 34x34
// need to use register double buffer for GEMM
constexpr unsigned BPerBlock = 128;
constexpr unsigned KPerBlock = 64;
constexpr unsigned CPerBlock = 4;
constexpr unsigned BPerThread = 8;
constexpr unsigned KPerThread = 8;
constexpr unsigned GemmMPerThreadSubC = 4;
constexpr unsigned GemmNPerThreadSubC = 4;
constexpr unsigned GemmMLevel0Cluster = 4;
constexpr unsigned GemmNLevel0Cluster = 2;
constexpr unsigned GemmMLevel1Cluster = 2;
constexpr unsigned GemmNLevel1Cluster = 8;
constexpr unsigned GemmKPerThreadLoop = 1;
constexpr unsigned GemmThreadPerColumnPerCluster = 8;
constexpr unsigned GemmThreadPerRowPerCluster = 8;
constexpr unsigned InBlockCopyThreadPerDim0 = 4;
constexpr unsigned InBlockCopyThreadPerDim1 = 16;
constexpr unsigned WeiBlockCopyThreadPerDim0 = 4;
constexpr unsigned WeiBlockCopyThreadPerDim1 = 16;
constexpr unsigned InBlockCopyDataPerRead = 4;
constexpr unsigned WeiBlockCopyDataPerRead = 4;
constexpr unsigned BlockSize = 128;
#elif 0
// 1x1, 28x28, 64 threads
constexpr unsigned BPerBlock = 64;
constexpr unsigned KPerBlock = 64;
constexpr unsigned CPerBlock = 8;
constexpr unsigned BPerThread = 8;
constexpr unsigned KPerThread = 8;
constexpr unsigned GemmMPerThreadSubC = 4;
constexpr unsigned GemmNPerThreadSubC = 4;
constexpr unsigned GemmMLevel0Cluster = 4;
constexpr unsigned GemmNLevel0Cluster = 2;
constexpr unsigned GemmMLevel1Cluster = 2;
constexpr unsigned GemmNLevel1Cluster = 4;
constexpr unsigned GemmKPerThreadLoop = 1;
constexpr unsigned GemmThreadPerColumnPerCluster = 8;
constexpr unsigned GemmThreadPerRowPerCluster = 8;
constexpr unsigned InBlockCopyThreadPerDim0 = 4;
constexpr unsigned InBlockCopyThreadPerDim1 = 16;
constexpr unsigned WeiBlockCopyThreadPerDim0 = 4;
constexpr unsigned WeiBlockCopyThreadPerDim1 = 16;
constexpr unsigned InBlockCopyDataPerRead = 4;
constexpr unsigned WeiBlockCopyDataPerRead = 4;
constexpr unsigned BlockSize = 64;
#elif 1
// 1x1, 28x28, 128 threads, no lds-double-buffer
// 1x1, 28x28, 128 threads, with lds-double-buffer, max_register = 128
constexpr unsigned BPerBlock = 64;
constexpr unsigned KPerBlock = 128;
constexpr unsigned CPerBlock = 8;
constexpr unsigned BPerThread = 8;
constexpr unsigned KPerThread = 8;
constexpr unsigned GemmMPerThreadSubC = 4;
constexpr unsigned GemmNPerThreadSubC = 4;
constexpr unsigned GemmMLevel0Cluster = 4;
constexpr unsigned GemmNLevel0Cluster = 2;
constexpr unsigned GemmMLevel1Cluster = 4;
constexpr unsigned GemmNLevel1Cluster = 4;
constexpr unsigned GemmKPerThreadLoop = 1;
constexpr unsigned GemmThreadPerColumnPerCluster = 8;
constexpr unsigned GemmThreadPerRowPerCluster = 8;
constexpr unsigned InBlockCopyThreadPerDim0 = 4;
constexpr unsigned InBlockCopyThreadPerDim1 = 16;
constexpr unsigned WeiBlockCopyThreadPerDim0 = 4;
constexpr unsigned WeiBlockCopyThreadPerDim1 = 16;
constexpr unsigned InBlockCopyDataPerRead = 4;
constexpr unsigned WeiBlockCopyDataPerRead = 4;
constexpr unsigned BlockSize = 128;
#elif 1
// 1x1, 28x28, 256 thread
constexpr unsigned BPerBlock = 128;
constexpr unsigned KPerBlock = 128;
constexpr unsigned CPerBlock = 8;
constexpr unsigned BPerThread = 8;
constexpr unsigned KPerThread = 8;
constexpr unsigned GemmMPerThreadSubC = 4;
constexpr unsigned GemmNPerThreadSubC = 4;
constexpr unsigned GemmMLevel0Cluster = 4;
constexpr unsigned GemmNLevel0Cluster = 4;
constexpr unsigned GemmMLevel1Cluster = 4;
constexpr unsigned GemmNLevel1Cluster = 4;
constexpr unsigned GemmKPerThreadLoop = 1;
constexpr unsigned GemmThreadPerColumnPerCluster = 8;
constexpr unsigned GemmThreadPerRowPerCluster = 8;
constexpr unsigned InBlockCopyThreadPerDim0 = 4;
constexpr unsigned InBlockCopyThreadPerDim1 = 16;
constexpr unsigned WeiBlockCopyThreadPerDim0 = 4;
constexpr unsigned WeiBlockCopyThreadPerDim1 = 16;
constexpr unsigned InBlockCopyDataPerRead = 4;
constexpr unsigned WeiBlockCopyDataPerRead = 4;
constexpr unsigned BlockSize = 256;
#endif
constexpr unsigned GridSize =
((N * Hi * Wi + BPerBlock - 1) / BPerBlock) * ((K + KPerBlock - 1) / KPerBlock);
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
// mem
std::size_t data_sz = sizeof(T);
DeviceMem in_cnhw_device_buf(data_sz * (in_cnhw.mDesc.GetElementSpace() + BGhostRead +
BPerBlock)); // reserve extra space for BGhostRead
DeviceMem wei_csrk_device_buf(data_sz * wei_csrk.mDesc.GetElementSpace());
DeviceMem out_knhw_device_buf(data_sz * out_knhw.mDesc.GetElementSpace());
in_cnhw_device_buf.ToDevice(in_cnhw.mData.data());
wei_csrk_device_buf.ToDevice(wei_csrk.mData.data());
out_knhw_device_buf.ToDevice(out_knhw.mData.data());
for(unsigned i = 0; i < nrepeat; ++i)
{
float time = launch_kernel(
#if 0
gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw
#else
gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer
#endif
<GridSize,
BlockSize,
T,
decltype(in_cnhw_desc),
decltype(wei_csrk_desc),
decltype(out_knhw_desc),
BPerBlock,
KPerBlock,
CPerBlock,
BPerThread,
KPerThread,
GemmThreadPerColumnPerCluster,
GemmThreadPerRowPerCluster,
GemmMPerThreadSubC,
GemmNPerThreadSubC,
GemmMLevel0Cluster,
GemmNLevel0Cluster,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmKPerThreadLoop,
InBlockCopyThreadPerDim0,
InBlockCopyThreadPerDim1,
WeiBlockCopyThreadPerDim0,
WeiBlockCopyThreadPerDim1,
InBlockCopyDataPerRead,
WeiBlockCopyDataPerRead>,
dim3(GridSize),
dim3(BlockSize),
static_cast<T*>(in_cnhw_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_csrk_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_knhw_device_buf.GetDeviceBuffer()));
printf("Elapsed time : %f ms\n", time);
usleep(std::min(time * 1000, float(10000)));
}
out_knhw_device_buf.FromDevice(out_knhw.mData.data());
// convert out_knhw to out_nkhw
auto f_reorder_knhw2nkhw = [&](auto n, auto k, auto ho, auto wo) {
out_nkhw(n, k, ho, wo) = out_knhw(k, n, ho, wo);
};
make_ParallelTensorFunctor(f_reorder_knhw2nkhw, N, K, Ho, Wo)(
std::thread::hardware_concurrency());
}
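The BGhostRead term above bounds how far past its base element the flattened-B indexing can read: for each output pixel the implicit-GEMM loop touches input offsets base + s * Wi + r for s in [0, S) and r in [0, R), so the worst overshoot beyond the last output's base is (S - 1) * Wi + (R - 1). That is why in_cnhw_device_buf reserves BGhostRead extra elements (plus one BPerBlock of slack) beyond the tensor's element space. A worked check with the hypothetical 3x3-on-Wi-34 case:

// Ghost-read allowance: a 3x3 filter (S = R = 3) on a Wi = 34 input row reads
// up to (S - 1) * Wi + (R - 1) = 2 * 34 + 2 = 70 elements past the base, which
// the over-allocated device buffer must absorb.
static_assert((3 - 1) * 34 + (3 - 1) == 70, "ghost read for 3x3 on Wi = 34");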
@@ -8,11 +8,11 @@ template <unsigned BlockSize,
           class InBlockDesc,
           class WeiBlockDesc,
           class OutBlockDesc,
-          unsigned OutTileSizeH,
-          unsigned OutTileSizeW,
           unsigned NPerThread,
           unsigned KPerThread,
-          unsigned CPerThread>
+          unsigned CPerThread,
+          unsigned HoPerThread,
+          unsigned WoPerThread>
 __device__ void blockwise_direct_convolution(InBlockDesc,
                                              Float* const __restrict__ p_in_block,
                                              WeiBlockDesc,
@@ -29,19 +29,17 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
     constexpr auto wei_block_desc = WeiBlockDesc{};
     constexpr auto out_block_desc = OutBlockDesc{};

-    constexpr unsigned S = wei_block_desc.GetLength(I2);
-    constexpr unsigned R = wei_block_desc.GetLength(I3);
+    constexpr unsigned Y = wei_block_desc.GetLength(I2);
+    constexpr unsigned X = wei_block_desc.GetLength(I3);

-    constexpr unsigned InTileSizeH = OutTileSizeH + S - 1;
-    constexpr unsigned InTileSizeW = OutTileSizeW + R - 1;
+    constexpr unsigned InTileSizeH = HoPerThread + Y - 1;
+    constexpr unsigned InTileSizeW = WoPerThread + X - 1;

     // divide thread work
     constexpr unsigned NThreadWork = (out_block_desc.GetLength(I0) + NPerThread - 1) / NPerThread;
     constexpr unsigned KThreadWork = (out_block_desc.GetLength(I1) + KPerThread - 1) / KPerThread;
-    constexpr unsigned YThreadWork =
-        (out_block_desc.GetLength(I2) + OutTileSizeH - 1) / OutTileSizeH;
-    constexpr unsigned XThreadWork =
-        (out_block_desc.GetLength(I3) + OutTileSizeW - 1) / OutTileSizeW;
+    constexpr unsigned YThreadWork = (out_block_desc.GetLength(I2) + HoPerThread - 1) / HoPerThread;
+    constexpr unsigned XThreadWork = (out_block_desc.GetLength(I3) + WoPerThread - 1) / WoPerThread;

 #if 0
     if(threadIdx.x == 0)
@@ -56,7 +54,7 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
         make_ConstantTensorDescriptor(Sequence<NPerThread, CPerThread, InTileSizeH, InTileSizeW>{});

     constexpr auto wei_thread_desc =
-        make_ConstantTensorDescriptor(Sequence<KPerThread, CPerThread, S, R>{});
+        make_ConstantTensorDescriptor(Sequence<KPerThread, CPerThread, Y, X>{});

     constexpr auto out_thread_desc =
         get_convolution_output_default_4d_tensor_descriptor(in_thread_desc, wei_thread_desc);
@@ -86,8 +84,8 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
     unsigned n_thread_data_begin = n_thread_work_id * NPerThread;
     unsigned k_thread_data_begin = k_thread_work_id * KPerThread;
-    unsigned ho_thread_data_begin = y_thread_work_id * OutTileSizeH;
-    unsigned wo_thread_data_begin = x_thread_work_id * OutTileSizeW;
+    unsigned ho_thread_data_begin = y_thread_work_id * HoPerThread;
+    unsigned wo_thread_data_begin = x_thread_work_id * WoPerThread;

     unsigned hi_thread_data_begin = ho_thread_data_begin; // minus padding
     unsigned wi_thread_data_begin = wo_thread_data_begin; // minus padding
...
@@ -24,11 +24,11 @@ __host__ __device__ constexpr auto get_convolution_output_default_4d_tensor_desc
     constexpr auto WI = in_desc.GetLength(I3);

     constexpr auto K = wei_desc.GetLength(I0);
-    constexpr auto S = wei_desc.GetLength(I2);
-    constexpr auto R = wei_desc.GetLength(I3);
+    constexpr auto Y = wei_desc.GetLength(I2);
+    constexpr auto X = wei_desc.GetLength(I3);

-    constexpr auto HO = HI + 1 - S;
-    constexpr auto WO = WI + 1 - R;
+    constexpr auto HO = HI + 1 - Y;
+    constexpr auto WO = WI + 1 - X;

     return make_ConstantTensorDescriptor(Sequence<N, K, HO, WO>{});
 }
@@ -55,8 +55,8 @@ __host__ __device__ constexpr auto get_convolution_with_padding_output_default_4
     constexpr auto WI = in_desc.GetLength(I3);

     constexpr auto K = wei_desc.GetLength(I0);
-    constexpr auto S = wei_desc.GetLength(I2);
-    constexpr auto R = wei_desc.GetLength(I3);
+    constexpr auto Y = wei_desc.GetLength(I2);
+    constexpr auto X = wei_desc.GetLength(I3);

     constexpr auto HPadLow = LowerPads{}.Get(I0);
     constexpr auto WPadLow = LowerPads{}.Get(I1);
@@ -64,8 +64,8 @@ __host__ __device__ constexpr auto get_convolution_with_padding_output_default_4
     constexpr auto HPadUp = UpperPads{}.Get(I0);
     constexpr auto WPadUp = UpperPads{}.Get(I1);

-    constexpr auto HO = HI + HPadLow + HPadUp + 1 - S;
-    constexpr auto WO = WI + WPadLow + WPadUp + 1 - R;
+    constexpr auto HO = HI + HPadLow + HPadUp + 1 - Y;
+    constexpr auto WO = WI + WPadLow + WPadUp + 1 - X;

     return make_ConstantTensorDescriptor(Sequence<N, K, HO, WO>{});
 }
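These helpers encode the standard output-size formula for a stride-1, dilation-1 convolution: HO = HI + HPadLow + HPadUp + 1 - Y (and likewise for width). A worked check against the sizes used throughout this commit:

// Unpadded: HO = HI + 1 - Y, e.g. 34 + 1 - 3 = 32 (the "3x3, 34x34" case).
// Padded: HO = HI + HPadLow + HPadUp + 1 - Y, e.g. pad-1 on a 32x32 input
// preserves the spatial size for a 3x3 filter.
static_assert(34 + 1 - 3 == 32, "valid 3x3 conv on 34x34 gives 32x32");
static_assert(32 + 1 + 1 + 1 - 3 == 32, "pad-1 3x3 conv keeps 32x32");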
@@ -8,16 +8,16 @@ template <class Float,
           class InGlobalDesc,
           class WeiGlobalDesc,
           class OutGlobalDesc,
-          unsigned OutTileSizeH,
-          unsigned OutTileSizeW,
           unsigned NPerBlock,
           unsigned KPerBlock,
           unsigned CPerBlock,
-          unsigned YPerBlock,
-          unsigned XPerBlock,
+          unsigned HoPerBlock,
+          unsigned WoPerBlock,
           unsigned NPerThread,
           unsigned KPerThread,
           unsigned CPerThread,
+          unsigned HoPerThread,
+          unsigned WoPerThread,
           unsigned BlockSize,
           unsigned GridSize>
 __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_in_global,
@@ -33,25 +33,22 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_
     constexpr auto wei_global_desc = WeiGlobalDesc{};
     constexpr auto out_global_desc = OutGlobalDesc{};

-    constexpr unsigned S = wei_global_desc.GetLength(I2);
-    constexpr unsigned R = wei_global_desc.GetLength(I3);
+    constexpr unsigned Y = wei_global_desc.GetLength(I2);
+    constexpr unsigned X = wei_global_desc.GetLength(I3);

-    constexpr unsigned HoPerBlock = OutTileSizeH * YPerBlock;
-    constexpr unsigned WoPerBlock = OutTileSizeW * XPerBlock;
-
-    constexpr unsigned HiPerBlock = YPerBlock * OutTileSizeH + S - 1;
-    constexpr unsigned WiPerBlock = XPerBlock * OutTileSizeW + R - 1;
+    constexpr unsigned HiPerBlock = HoPerBlock + Y - 1;
+    constexpr unsigned WiPerBlock = WoPerBlock + X - 1;

     constexpr unsigned NBlockWork = (out_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
     constexpr unsigned KBlockWork = (out_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
-    constexpr unsigned YBlockWork = (out_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
-    constexpr unsigned XBlockWork = (out_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;
+    constexpr unsigned HBlockWork = (out_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
+    constexpr unsigned WBlockWork = (out_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;

     constexpr auto in_block_global_desc = make_ConstantTensorDescriptor(
         Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{}, in_global_desc.GetStrides());

     constexpr auto wei_block_global_desc = make_ConstantTensorDescriptor(
-        Sequence<KPerBlock, CPerBlock, S, R>{}, wei_global_desc.GetStrides());
+        Sequence<KPerBlock, CPerBlock, Y, X>{}, wei_global_desc.GetStrides());

     constexpr auto out_block_global_desc = make_ConstantTensorDescriptor(
         Sequence<NPerBlock, KPerBlock, HoPerBlock, WoPerBlock>{}, out_global_desc.GetStrides());
@@ -73,52 +70,21 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_
     const unsigned block_id = blockIdx.x;

     unsigned itmp = block_id;
-    unsigned n_block_work_id = itmp / (KBlockWork * YBlockWork * XBlockWork);
-    itmp -= n_block_work_id * (KBlockWork * YBlockWork * XBlockWork);
-    unsigned k_block_work_id = itmp / (YBlockWork * XBlockWork);
-    itmp -= k_block_work_id * (YBlockWork * XBlockWork);
-    unsigned y_block_work_id = itmp / XBlockWork;
-    unsigned x_block_work_id = itmp - y_block_work_id * XBlockWork;
+    unsigned n_block_work_id = itmp / (KBlockWork * HBlockWork * WBlockWork);
+    itmp -= n_block_work_id * (KBlockWork * HBlockWork * WBlockWork);
+    unsigned k_block_work_id = itmp / (HBlockWork * WBlockWork);
+    itmp -= k_block_work_id * (HBlockWork * WBlockWork);
+    unsigned h_block_work_id = itmp / WBlockWork;
+    unsigned w_block_work_id = itmp - h_block_work_id * WBlockWork;

     unsigned n_block_work_begin = n_block_work_id * NPerBlock;
     unsigned k_block_work_begin = k_block_work_id * KPerBlock;
-    unsigned y_block_work_begin = y_block_work_id * YPerBlock;
-    unsigned x_block_work_begin = x_block_work_id * XPerBlock;
-
-    unsigned ho_block_work_begin = y_block_work_begin * OutTileSizeH;
-    unsigned wo_block_work_begin = x_block_work_begin * OutTileSizeW;
+    unsigned ho_block_work_begin = h_block_work_id * HoPerBlock;
+    unsigned wo_block_work_begin = w_block_work_id * WoPerBlock;

     unsigned hi_block_work_begin = ho_block_work_begin; // minus padding
     unsigned wi_block_work_begin = wo_block_work_begin; // minus padding

-#if 0
-    if(threadIdx.x == 0)
-    {
-        print_ConstantTensorDescriptor( in_global_desc, "gridwise_convolution: in_global_desc: ");
-        print_ConstantTensorDescriptor(wei_global_desc, "gridwise_convolution: wei_global_desc: ");
-        print_ConstantTensorDescriptor(out_global_desc, "gridwise_convolution: out_global_desc: ");
-
-        print_ConstantTensorDescriptor( in_block_global_desc, "gridwise_convolution: in_block_global_desc: ");
-        print_ConstantTensorDescriptor(wei_block_global_desc, "gridwise_convolution: wei_block_global_desc: ");
-        print_ConstantTensorDescriptor(out_block_global_desc, "gridwise_convolution: out_block_global_desc: ");
-
-        print_ConstantTensorDescriptor( in_block_desc, "gridwise_convolution: in_block_desc: ");
-        print_ConstantTensorDescriptor(wei_block_desc, "gridwise_convolution: wei_block_desc: ");
-        print_ConstantTensorDescriptor(out_block_desc, "gridwise_convolution: out_block_desc: ");
-
-        printf("NBlockWork %u, KBlockWork %u, YBlockWork %u, XBlockWork %u \t"
-               "block_id %u, n_block_work_id %u, k_block_work_id %u, y_block_work_id %u, "
-               "x_block_work_id %u\n",
-               NBlockWork,
-               KBlockWork,
-               YBlockWork,
-               XBlockWork,
-               block_id,
-               n_block_work_id,
-               k_block_work_id,
-               y_block_work_id,
-               x_block_work_id);
-    }
-#endif
-
     constexpr auto blockwise_in_copy =
         Blockwise4dTensorCopy1<BlockSize,
                                Float,
@@ -166,11 +132,11 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_
                                decltype(in_block_desc),
                                decltype(wei_block_desc),
                                decltype(out_block_desc),
-                               OutTileSizeH,
-                               OutTileSizeW,
                                NPerThread,
                                KPerThread,
-                               CPerThread>(
+                               CPerThread,
+                               HoPerThread,
+                               WoPerThread>(
         in_block_desc, p_in_block, wei_block_desc, p_wei_block, out_block_desc, p_out_block);

     __syncthreads();
...
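The index arithmetic above is a mixed-radix decode of blockIdx.x into (n, k, h, w) work ids, with the W dimension varying fastest; GridSize is correspondingly the product of the four work counts. A host-side sketch that round-trips the encoding, using hypothetical work counts:

#include <cassert>

int main()
{
    // hypothetical work counts; any positive values round-trip the same way
    constexpr unsigned KBlockWork = 4, HBlockWork = 8, WBlockWork = 1;
    for(unsigned block_id = 0; block_id < 2 * KBlockWork * HBlockWork * WBlockWork; ++block_id)
    {
        unsigned itmp = block_id;
        unsigned n = itmp / (KBlockWork * HBlockWork * WBlockWork);
        itmp -= n * (KBlockWork * HBlockWork * WBlockWork);
        unsigned k = itmp / (HBlockWork * WBlockWork);
        itmp -= k * (HBlockWork * WBlockWork);
        unsigned h = itmp / WBlockWork;
        unsigned w = itmp - h * WBlockWork;

        // re-encoding must reproduce the original flat block id
        assert(((n * KBlockWork + k) * HBlockWork + h) * WBlockWork + w == block_id);
    }
    return 0;
}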
@@ -10,16 +10,16 @@ template <class Float,
           class InGlobalDesc,
           class WeiGlobalDesc,
           class OutGlobalDesc,
-          unsigned OutTileSizeH,
-          unsigned OutTileSizeW,
           unsigned NPerBlock,
           unsigned KPerBlock,
           unsigned CPerBlock,
-          unsigned YPerBlock,
-          unsigned XPerBlock,
+          unsigned HoPerBlock,
+          unsigned WoPerBlock,
           unsigned NPerThread,
           unsigned KPerThread,
           unsigned CPerThread,
+          unsigned HoPerThread,
+          unsigned WoPerThread,
           unsigned BlockSize,
           unsigned GridSize>
 __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_in_global,
@@ -35,20 +35,17 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
     constexpr auto wei_global_desc = WeiGlobalDesc{};
     constexpr auto out_global_desc = OutGlobalDesc{};

-    constexpr unsigned S = wei_global_desc.GetLength(I2);
-    constexpr unsigned R = wei_global_desc.GetLength(I3);
+    constexpr unsigned Y = wei_global_desc.GetLength(I2);
+    constexpr unsigned X = wei_global_desc.GetLength(I3);

-    constexpr unsigned HoPerBlock = OutTileSizeH * YPerBlock;
-    constexpr unsigned WoPerBlock = OutTileSizeW * XPerBlock;
-
-    constexpr unsigned HiPerBlock = YPerBlock * OutTileSizeH + S - 1;
-    constexpr unsigned WiPerBlock = XPerBlock * OutTileSizeW + R - 1;
+    constexpr unsigned HiPerBlock = HoPerBlock + Y - 1;
+    constexpr unsigned WiPerBlock = WoPerBlock + X - 1;

     constexpr auto in_block_desc =
         make_ConstantTensorDescriptor(Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{});

     constexpr auto wei_block_desc =
-        make_ConstantTensorDescriptor(Sequence<KPerBlock, CPerBlock, S, R>{});
+        make_ConstantTensorDescriptor(Sequence<KPerBlock, CPerBlock, Y, X>{});

     // shared mem
     constexpr unsigned in_block_size = in_block_desc.GetElementSpace();
@@ -58,14 +55,14 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
     __shared__ Float p_wei_block[wei_block_size];

     // threadwise tensors
-    constexpr unsigned InTileSizeH = OutTileSizeH + S - 1;
-    constexpr unsigned InTileSizeW = OutTileSizeW + R - 1;
+    constexpr unsigned HiPerThread = HoPerThread + Y - 1;
+    constexpr unsigned WiPerThread = WoPerThread + X - 1;

     constexpr auto in_thread_block_desc = make_ConstantTensorDescriptor(
-        Sequence<NPerThread, CPerThread, InTileSizeH, InTileSizeW>{}, in_block_desc.GetStrides());
+        Sequence<NPerThread, CPerThread, HiPerThread, WiPerThread>{}, in_block_desc.GetStrides());

     constexpr auto wei_thread_block_desc = make_ConstantTensorDescriptor(
-        Sequence<KPerThread, CPerThread, S, R>{}, wei_block_desc.GetStrides());
+        Sequence<KPerThread, CPerThread, Y, X>{}, wei_block_desc.GetStrides());

     constexpr auto out_thread_desc = get_convolution_output_default_4d_tensor_descriptor(
         in_thread_block_desc, wei_thread_block_desc);
@@ -76,26 +73,23 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
     // divide block work
     constexpr unsigned NBlockWork = (out_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
     constexpr unsigned KBlockWork = (out_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
-    constexpr unsigned YBlockWork = (out_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
-    constexpr unsigned XBlockWork = (out_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;
+    constexpr unsigned HBlockWork = (out_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
+    constexpr unsigned WBlockWork = (out_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;

     const unsigned block_id = blockIdx.x;

     unsigned itmp = block_id;
-    const unsigned n_block_work_id = itmp / (KBlockWork * YBlockWork * XBlockWork);
-    itmp -= n_block_work_id * (KBlockWork * YBlockWork * XBlockWork);
-    const unsigned k_block_work_id = itmp / (YBlockWork * XBlockWork);
-    itmp -= k_block_work_id * (YBlockWork * XBlockWork);
-    const unsigned y_block_work_id = itmp / XBlockWork;
-    const unsigned x_block_work_id = itmp - y_block_work_id * XBlockWork;
+    const unsigned n_block_work_id = itmp / (KBlockWork * HBlockWork * WBlockWork);
+    itmp -= n_block_work_id * (KBlockWork * HBlockWork * WBlockWork);
+    const unsigned k_block_work_id = itmp / (HBlockWork * WBlockWork);
+    itmp -= k_block_work_id * (HBlockWork * WBlockWork);
+    const unsigned h_block_work_id = itmp / WBlockWork;
+    const unsigned w_block_work_id = itmp - h_block_work_id * WBlockWork;

     const unsigned n_block_data_begin = n_block_work_id * NPerBlock;
     const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
-    const unsigned y_block_data_begin = y_block_work_id * YPerBlock;
-    const unsigned x_block_data_begin = x_block_work_id * XPerBlock;
-
-    const unsigned ho_block_data_begin = y_block_data_begin * OutTileSizeH;
-    const unsigned wo_block_data_begin = x_block_data_begin * OutTileSizeW;
+    const unsigned ho_block_data_begin = h_block_work_id * HoPerBlock;
+    const unsigned wo_block_data_begin = w_block_work_id * WoPerBlock;

     const unsigned hi_block_data_begin = ho_block_data_begin; // minus padding
     const unsigned wi_block_data_begin = wo_block_data_begin; // minus padding
@@ -103,45 +97,27 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
     // divide thread work
     constexpr unsigned NThreadWork = (NPerBlock + NPerThread - 1) / NPerThread;
     constexpr unsigned KThreadWork = (KPerBlock + KPerThread - 1) / KPerThread;
-    constexpr unsigned YThreadWork = YPerBlock;
-    constexpr unsigned XThreadWork = XPerBlock;
+    constexpr unsigned HThreadWork = (HoPerBlock + HoPerThread - 1) / HoPerThread;
+    constexpr unsigned WThreadWork = (WoPerBlock + WoPerThread - 1) / WoPerThread;

     const unsigned thread_id = threadIdx.x;

     itmp = thread_id;
-    const unsigned n_thread_work_id = itmp / (KThreadWork * YThreadWork * XThreadWork);
-    itmp -= n_thread_work_id * (KThreadWork * YThreadWork * XThreadWork);
-    const unsigned k_thread_work_id = itmp / (YThreadWork * XThreadWork);
-    itmp -= k_thread_work_id * (YThreadWork * XThreadWork);
-    const unsigned y_thread_work_id = itmp / XThreadWork;
-    const unsigned x_thread_work_id = itmp - y_thread_work_id * XThreadWork;
+    const unsigned n_thread_work_id = itmp / (KThreadWork * HThreadWork * WThreadWork);
+    itmp -= n_thread_work_id * (KThreadWork * HThreadWork * WThreadWork);
+    const unsigned k_thread_work_id = itmp / (HThreadWork * WThreadWork);
+    itmp -= k_thread_work_id * (HThreadWork * WThreadWork);
+    const unsigned h_thread_work_id = itmp / WThreadWork;
+    const unsigned w_thread_work_id = itmp - h_thread_work_id * WThreadWork;

     const unsigned n_thread_data_begin = n_thread_work_id * NPerThread;
     const unsigned k_thread_data_begin = k_thread_work_id * KPerThread;
-    const unsigned ho_thread_data_begin = y_thread_work_id * OutTileSizeH;
-    const unsigned wo_thread_data_begin = x_thread_work_id * OutTileSizeW;
+    const unsigned ho_thread_data_begin = h_thread_work_id * HoPerThread;
+    const unsigned wo_thread_data_begin = w_thread_work_id * WoPerThread;

     const unsigned hi_thread_data_begin = ho_thread_data_begin;
     const unsigned wi_thread_data_begin = wo_thread_data_begin;

-#if 0
-    if(threadIdx.x == 0)
-    {
-        print_ConstantTensorDescriptor(in_global_desc, "gridwise_convolution: in_global_desc: ");
-        print_ConstantTensorDescriptor(wei_global_desc, "gridwise_convolution: wei_global_desc: ");
-        print_ConstantTensorDescriptor(out_global_desc, "gridwise_convolution: out_global_desc: ");
-    }
-
-    printf("threadIdx.x %u \t"
-           "n_thread_data_begin %u, k_thread_data_begin %u, ho_thread_data_begin %u, "
-           "wo_thread_data_begin %u\n",
-           threadIdx.x,
-           n_thread_data_begin,
-           k_thread_data_begin,
-           ho_thread_data_begin,
-           wo_thread_data_begin);
-#endif
-
     constexpr auto blockwise_in_copy =
         Blockwise4dTensorCopy1<BlockSize,
                                Float,
...
@@ -62,11 +62,11 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
     constexpr unsigned Wo = out_khwn_global_desc.GetLength(I2);
     constexpr unsigned N = out_khwn_global_desc.GetLength(I3);

-    constexpr unsigned S = wei_csrk_global_desc.GetLength(I1);
-    constexpr unsigned R = wei_csrk_global_desc.GetLength(I2);
+    constexpr unsigned Y = wei_csrk_global_desc.GetLength(I1);
+    constexpr unsigned X = wei_csrk_global_desc.GetLength(I2);

-    constexpr unsigned HiPerBlock = HoPerBlock + S - 1;
-    constexpr unsigned WiPerBlock = WoPerBlock + R - 1;
+    constexpr unsigned HiPerBlock = HoPerBlock + Y - 1;
+    constexpr unsigned WiPerBlock = WoPerBlock + X - 1;

     // divide block work: [K, Ho, Wo, N]
     constexpr unsigned KBlockWork = (K + KPerBlock - 1) / KPerBlock;
@@ -90,7 +90,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
     const unsigned wi_block_data_begin = wo_block_data_begin;

     // flattened (2d) tensor view of gridwise weight
-    constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence<C * S * R, K>{});
+    constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence<C * Y * X, K>{});

     // tensor view of blockwise input and weight in LDS
     // be careful of alignment
@@ -98,10 +98,10 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
         Sequence<CPerBlock, HiPerBlock, WiPerBlock, NPerBlock>{}, Number<InBlockCopyDataPerRead>{});

     constexpr auto wei_ek_block_desc = make_ConstantTensorDescriptor_aligned(
-        Sequence<CPerBlock * S * R, KPerBlock>{}, Number<WeiBlockCopyDataPerRead>{});
+        Sequence<CPerBlock * Y * X, KPerBlock>{}, Number<WeiBlockCopyDataPerRead>{});

     constexpr auto wei_csrk_block_desc = make_ConstantTensorDescriptor_aligned(
-        Sequence<CPerBlock, S, R, KPerBlock>{}, Number<WeiBlockCopyDataPerRead>{});
+        Sequence<CPerBlock, Y, X, KPerBlock>{}, Number<WeiBlockCopyDataPerRead>{});

     // tensor view of threadwise output in register
     constexpr auto out_khwn_thread_desc =
@@ -118,7 +118,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
         InBlockCopyDataPerRead>{};

     // blockwise wei copy
-    // format is [CPerBlock*S*R,KPerBlock]
+    // format is [CPerBlock*Y*X,KPerBlock]
     const auto blockwise_wei_copy = Blockwise2dTensorCopy3<BlockSize,
                                                            Float,
                                                            decltype(wei_ek_global_desc),
@@ -129,7 +129,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
     // a series of blockwise batched GEMM
     // C_matrix += transpose(A_matrix) * B_matrix
     // A_matrix and B_matrix saved in LDS, C_matrix saved in register
-    // A_matrix[C,K] is a sub-matrix of wei_block[C,S,R,K]
+    // A_matrix[C,K] is a sub-matrix of wei_block[C,Y,X,K]
     // B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
     // C_matrix[K,Wo*N] is a sub-matrix of out_block[K,Ho,Wo,N]
     constexpr auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
@@ -204,9 +204,9 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
     __syncthreads();

     // a series of batched GEMM
-    for(unsigned s = 0; s < S; ++s)
+    for(unsigned s = 0; s < Y; ++s)
     {
-        for(unsigned r = 0; r < R; ++r)
+        for(unsigned r = 0; r < X; ++r)
         {
             blockwise_batch_gemm.Run(p_wei_block + wei_csrk_block_desc.Get1dIndex(0, s, r, 0),
                                      p_in_block + in_chwn_block_desc.Get1dIndex(0, s, r, 0),
...
@@ -55,8 +55,8 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
constexpr unsigned Wo = out_khwn_global_desc.GetLength(I2);
constexpr unsigned N = out_khwn_global_desc.GetLength(I3);

-constexpr unsigned S = wei_csrk_global_desc.GetLength(I1);
-constexpr unsigned R = wei_csrk_global_desc.GetLength(I2);
+constexpr unsigned Y = wei_csrk_global_desc.GetLength(I1);
+constexpr unsigned X = wei_csrk_global_desc.GetLength(I2);

constexpr unsigned HPadLow = LowerPads{}.Get(I0);
constexpr unsigned WPadLow = LowerPads{}.Get(I1);
@@ -64,8 +64,8 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
constexpr unsigned HPadUp = UpperPads{}.Get(I0);
constexpr unsigned WPadUp = UpperPads{}.Get(I1);

-constexpr unsigned HiPerBlock = HoPerBlock + S - 1;
-constexpr unsigned WiPerBlock = WoPerBlock + R - 1;
+constexpr unsigned HiPerBlock = HoPerBlock + Y - 1;
+constexpr unsigned WiPerBlock = WoPerBlock + X - 1;

// divide block work: [K, Ho, Wo, N]
constexpr unsigned KBlockWork = (K + KPerBlock - 1) / KPerBlock;
@@ -86,18 +86,18 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
const unsigned n_block_data_begin = n_block_work_id * NPerBlock;

// flattened (2d) tensor view of wei in global mem
-constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence<C * S * R, K>{});
+constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence<C * Y * X, K>{});

// tensor view of blockwise input and weight in LDS
constexpr auto in_chwn_block_desc =
make_ConstantTensorDescriptor(Sequence<CPerBlock, HiPerBlock, WiPerBlock, NPerBlock>{});

constexpr auto wei_csrk_block_desc =
-make_ConstantTensorDescriptor(Sequence<CPerBlock, S, R, KPerBlock>{});
+make_ConstantTensorDescriptor(Sequence<CPerBlock, Y, X, KPerBlock>{});

// flattened (2d) tensor view of wei in LDS
constexpr auto wei_ek_block_desc =
-make_ConstantTensorDescriptor(Sequence<CPerBlock * S * R, KPerBlock>{});
+make_ConstantTensorDescriptor(Sequence<CPerBlock * Y * X, KPerBlock>{});

// tensor view of threadwise output in register
constexpr auto out_hkwn_thread_desc =
@@ -144,7 +144,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
LowerPads>{};

#if 0
-// weight: format is [C,S,R,K]
+// weight: format is [C,Y,X,K]
constexpr auto blockwise_wei_copy =
Blockwise4dTensorCopy1<BlockSize,
Float,
@@ -152,7 +152,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
decltype(wei_csrk_block_desc),
decltype(wei_csrk_block_desc.GetLengths())>{};
#elif 0
-// weight: format is [C*S*R,K]
+// weight: format is [C*Y*X,K]
constexpr auto blockwise_wei_copy =
Blockwise2dTensorCopy1<BlockSize,
Float,
@@ -160,7 +160,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
decltype(wei_ek_block_desc),
decltype(wei_ek_block_desc.GetLengths())>{};
#elif 1
-// weight: format is [C*S*R,K]
+// weight: format is [C*Y*X,K]
const auto blockwise_wei_copy = Blockwise2dTensorCopy2<BlockSize,
Float,
decltype(wei_ek_global_desc),
@@ -173,7 +173,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
// a series of blockwise batched GEMM
// C_matrix += transpose(A_matrix) * B_matrix
// A_matrix and B_matrix saved in LDS, C_matrix saved in register
-// A_matrix[C,K] is a sub-matrix of wei_block[C,S,R,K]
+// A_matrix[C,K] is a sub-matrix of wei_block[C,Y,X,K]
// B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
// C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N]
constexpr auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
@@ -245,9 +245,9 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
__syncthreads();

// a series of batched GEMM
-for(unsigned s = 0; s < S; ++s)
+for(unsigned s = 0; s < Y; ++s)
{
-for(unsigned r = 0; r < R; ++r)
+for(unsigned r = 0; r < X; ++r)
{
auto f_accum = [](auto& acc, const auto&& v) { acc += v; };
...
#pragma once
#include "common.hip.hpp"
#include "ConstantTensorDescriptor.hip.hpp"
#include "ConstantMatrixDescriptor.hip.hpp"
#include "blockwise_4d_tensor_op.hip.hpp"
#include "blockwise_2d_tensor_op.hip.hpp"
#include "threadwise_4d_tensor_op.hip.hpp"
#include "blockwise_gemm.hip.hpp"
template <unsigned GridSize,
unsigned BlockSize,
class Float,
class InGlobalDesc,
class WeiGlobalDesc,
class OutGlobalDesc,
class LowerPads,
class UpperPads,
unsigned NPerBlock,
unsigned KPerBlock,
unsigned CPerBlock,
unsigned HoPerBlock,
unsigned WoPerBlock,
unsigned NPerThread,
unsigned KPerThread,
unsigned CPerThread,
unsigned HoPerThread,
unsigned WoPerThread,
unsigned WeiBlockCopyThreadPerDim0,
unsigned WeiBlockCopyThreadPerDim1>
__global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline(
const Float* const __restrict__ p_in_global,
const Float* const __restrict__ p_wei_global,
Float* const __restrict__ p_out_global)
{
// NPerThread == NPerBlock, because the format of input in LDS is [C,Hi,Wi,N]:
// for GEMM trans([C,K]) * [C,Wo*N], one thread has to cover all of "N"
// if we used [C,Hi,N,Wi,N] in LDS, then NPerThread could differ from NPerBlock
static_assert(NPerBlock % NPerThread == 0, "wrong! NPerBlock % NPerThread !=0");
static_assert((NPerThread < NPerBlock && WoPerThread == 1) || NPerThread == NPerBlock,
"wrong!");
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_chwn_global_desc = InGlobalDesc{};
constexpr auto wei_csrk_global_desc = WeiGlobalDesc{};
constexpr auto out_khwn_global_desc = OutGlobalDesc{};
constexpr unsigned C = in_chwn_global_desc.GetLength(I0);
constexpr unsigned K = out_khwn_global_desc.GetLength(I0);
constexpr unsigned Ho = out_khwn_global_desc.GetLength(I1);
constexpr unsigned Wo = out_khwn_global_desc.GetLength(I2);
constexpr unsigned N = out_khwn_global_desc.GetLength(I3);
constexpr unsigned S = wei_csrk_global_desc.GetLength(I1);
constexpr unsigned R = wei_csrk_global_desc.GetLength(I2);
constexpr unsigned HPadLow = LowerPads{}.Get(I0);
constexpr unsigned WPadLow = LowerPads{}.Get(I1);
constexpr unsigned HPadUp = UpperPads{}.Get(I0);
constexpr unsigned WPadUp = UpperPads{}.Get(I1);
constexpr unsigned HiPerBlock = HoPerBlock + S - 1;
constexpr unsigned WiPerBlock = WoPerBlock + R - 1;
// divide block work: [K, Ho, Wo, N]
constexpr unsigned KBlockWork = (K + KPerBlock - 1) / KPerBlock;
constexpr unsigned HBlockWork = (Ho + HoPerBlock - 1) / HoPerBlock;
constexpr unsigned WBlockWork = (Wo + WoPerBlock - 1) / WoPerBlock;
constexpr unsigned NBlockWork = (N + NPerBlock - 1) / NPerBlock;
const unsigned k_block_work_id = get_block_1d_id() / (HBlockWork * WBlockWork * NBlockWork);
unsigned itmp = get_block_1d_id() - k_block_work_id * (HBlockWork * WBlockWork * NBlockWork);
const unsigned h_block_work_id = itmp / (WBlockWork * NBlockWork);
itmp -= h_block_work_id * (WBlockWork * NBlockWork);
const unsigned w_block_work_id = itmp / NBlockWork;
const unsigned n_block_work_id = itmp - w_block_work_id * NBlockWork;
const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
const unsigned ho_block_data_begin = h_block_work_id * HoPerBlock;
const unsigned wo_block_data_begin = w_block_work_id * WoPerBlock;
const unsigned n_block_data_begin = n_block_work_id * NPerBlock;
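// note: the 1d block id decoded above is
//   id = ((k_block_work_id * HBlockWork + h_block_work_id) * WBlockWork + w_block_work_id) * NBlockWork + n_block_work_id
// e.g. with (hypothetical) HBlockWork = 4, WBlockWork = 2, NBlockWork = 2,
// block id 13 maps to k = 0, h = 3, w = 0, n = 1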
// flattened (2d) tensor view of wei in global mem
constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence<C * S * R, K>{});
// tensor view of blockwise input and weight in LDS
constexpr auto in_chwn_block_desc =
make_ConstantTensorDescriptor(Sequence<CPerBlock, HiPerBlock, WiPerBlock, NPerBlock>{});
constexpr auto wei_csrk_block_desc =
make_ConstantTensorDescriptor(Sequence<CPerBlock, S, R, KPerBlock>{});
// flattened (2d) tensor view of wei in LDS
constexpr auto wei_ek_block_desc =
make_ConstantTensorDescriptor(Sequence<CPerBlock * S * R, KPerBlock>{});
// tensor view of threadwise output in register
constexpr auto out_hkwn_thread_desc =
make_ConstantTensorDescriptor(Sequence<HoPerThread, KPerThread, WoPerThread, NPerThread>{});
#if 0
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
{
print_ConstantTensorDescriptor(in_chwn_block_desc, "in_chwn_block_desc");
print_ConstantTensorDescriptor(wei_csrk_block_desc, "wei_csrk_block_desc");
print_ConstantTensorDescriptor(out_hkwn_thread_desc, "out_hkwn_thread_desc");
}
#endif
// blockwise copy
// input: format is [C, Hi, Wi, N]
const unsigned h_block_pad_low = h_block_work_id == 0 ? HPadLow : 0;
const unsigned w_block_pad_low = w_block_work_id == 0 ? WPadLow : 0;
const unsigned h_block_pad_up = h_block_work_id == HBlockWork - 1 ? HPadUp : 0;
const unsigned w_block_pad_up = w_block_work_id == WBlockWork - 1 ? WPadUp : 0;
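// only blocks that touch the image border see a non-zero pad;
// interior blocks copy their input tile with no padding logic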
#if 0
if(get_thread_local_1d_id() == 0)
{
printf(
"%u %u, h_block_pad_low %u w_block_pad_low %u h_block_pad_up %u w_block_pad_up %u\n",
get_block_1d_id(),
get_thread_local_1d_id(),
h_block_pad_low,
w_block_pad_low,
h_block_pad_up,
w_block_pad_up);
}
#endif
constexpr auto blockwise_in_copy =
BlockwiseChwnTensorCopyPadded<BlockSize,
Float,
decltype(in_chwn_global_desc),
decltype(in_chwn_block_desc),
decltype(in_chwn_block_desc.GetLengths()),
LowerPads>{};
#if 0
// weight: format is [C,S,R,K]
constexpr auto blockwise_wei_copy =
Blockwise4dTensorCopy1<BlockSize,
Float,
decltype(wei_csrk_global_desc),
decltype(wei_csrk_block_desc),
decltype(wei_csrk_block_desc.GetLengths())>{};
#elif 0
// weight: format is [C*S*R,K]
constexpr auto blockwise_wei_copy =
Blockwise2dTensorCopy1<BlockSize,
Float,
decltype(wei_ek_global_desc),
decltype(wei_ek_block_desc),
decltype(wei_ek_block_desc.GetLengths())>{};
#elif 1
// weight: format is [C*S*R,K]
const auto blockwise_wei_copy = Blockwise2dTensorCopy2<BlockSize,
Float,
decltype(wei_ek_global_desc),
decltype(wei_ek_block_desc),
decltype(wei_ek_block_desc.GetLengths()),
WeiBlockCopyThreadPerDim0,
WeiBlockCopyThreadPerDim1>{};
#endif
// a series of blockwise batched GEMM
// C_matrix += transpose(A_matrix) * B_matrix
// A_matrix and B_matrix saved in LDS, C_matrix saved in register
// A_matrix[C,K] is a sub-matrix of wei_block[C,S,R,K]
// B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
// C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N]
constexpr auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{}, Number<KPerBlock>{}, Number<wei_csrk_block_desc.GetStride(I0)>{});
constexpr auto b_cxwn_block_mtx_desc =
make_ConstantMatrixDescriptor(Number<CPerBlock>{},
Number<WoPerBlock * NPerBlock>{},
Number<in_chwn_block_desc.GetStride(I0)>{});
constexpr auto c_kxwn_thread_mtx_desc =
make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<WoPerThread * NPerThread>{});
const auto blockwise_batch_gemm =
Blockwise1dStridedBatchedGemmBlockABlockBThreadC<BlockSize,
decltype(a_cxk_block_mtx_desc),
decltype(b_cxwn_block_mtx_desc),
decltype(c_kxwn_thread_mtx_desc),
true,
false,
false,
0,
in_chwn_block_desc.GetStride(I1),
out_hkwn_thread_desc.GetStride(I0),
HoPerBlock,
HoPerThread,
CPerThread,
true>{};
// LDS
constexpr unsigned in_block_size = in_chwn_block_desc.GetElementSpace();
constexpr unsigned wei_block_size = wei_csrk_block_desc.GetElementSpace();
// LDS double buffer
__shared__ Float p_in_block_0[in_block_size];
__shared__ Float p_wei_block_0[wei_block_size];
__shared__ Float p_in_block_1[in_block_size];
__shared__ Float p_wei_block_1[wei_block_size];
// register
Float p_out_thread[out_hkwn_thread_desc.GetElementSpace()];
// set threadwise output tensor to 0
threadwise_4d_tensor_set_zero(out_hkwn_thread_desc, p_out_thread);
const Float* p_wei_global_block_begin =
p_wei_global + wei_ek_global_desc.Get1dIndex(0, k_block_data_begin);
// prologue: load data
// input: global mem to LDS,
blockwise_in_copy.Run(p_in_global,
0,
ho_block_data_begin,
wo_block_data_begin,
n_block_data_begin,
p_in_block_0,
h_block_pad_low,
w_block_pad_low,
h_block_pad_up,
w_block_pad_up);
// weight: global mem to LDS,
blockwise_wei_copy.Run(p_wei_global_block_begin, p_wei_block_0);
p_wei_global_block_begin += CPerBlock * wei_ek_global_desc.GetStride(I0);
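// LDS double buffer pipeline: the loop below ping-pongs between the *_0 and
// *_1 buffers; while the batched GEMMs read the "now" buffers, the blockwise
// copies prefetch the next C-chunk into the "next" buffers, overlapping
// global-memory latency with math. The __syncthreads() at the top of each
// iteration both publishes the previously prefetched data and keeps the
// copies from overwriting a buffer the GEMMs may still be reading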
bool even_loop = true;
for(unsigned c_block_data_begin = CPerBlock; c_block_data_begin < C;
c_block_data_begin += CPerBlock,
p_wei_global_block_begin += CPerBlock * wei_ek_global_desc.GetStride(I0),
even_loop = !even_loop)
{
__syncthreads();
Float* p_in_block_now = even_loop ? p_in_block_0 : p_in_block_1;
Float* p_wei_block_now = even_loop ? p_wei_block_0 : p_wei_block_1;
Float* p_in_block_next = even_loop ? p_in_block_1 : p_in_block_0;
Float* p_wei_block_next = even_loop ? p_wei_block_1 : p_wei_block_0;
// preload next data
#if 1
// input: global mem to LDS,
blockwise_in_copy.Run(p_in_global,
c_block_data_begin,
ho_block_data_begin,
wo_block_data_begin,
n_block_data_begin,
p_in_block_next,
h_block_pad_low,
w_block_pad_low,
h_block_pad_up,
w_block_pad_up);
#endif
#if 1
// weight: global mem to LDS,
blockwise_wei_copy.Run(p_wei_global_block_begin, p_wei_block_next);
#endif
// a series of batched GEMM
for(unsigned s = 0; s < S; ++s)
{
for(unsigned r = 0; r < R; ++r)
{
auto f_accum = [](auto& acc, const auto&& v) { acc += v; };
blockwise_batch_gemm.Run(p_wei_block_now +
wei_csrk_block_desc.Get1dIndex(0, s, r, 0),
p_in_block_now + in_chwn_block_desc.Get1dIndex(0, s, r, 0),
p_out_thread,
f_accum);
}
}
}
// last computation
{
__syncthreads();
Float* p_in_block_now = even_loop ? p_in_block_0 : p_in_block_1;
Float* p_wei_block_now = even_loop ? p_wei_block_0 : p_wei_block_1;
// a series of batched GEMM
for(unsigned s = 0; s < S; ++s)
{
for(unsigned r = 0; r < R; ++r)
{
auto f_accum = [](auto& acc, const auto&& v) { acc += v; };
blockwise_batch_gemm.Run(p_wei_block_now +
wei_csrk_block_desc.Get1dIndex(0, s, r, 0),
p_in_block_now + in_chwn_block_desc.Get1dIndex(0, s, r, 0),
p_out_thread,
f_accum);
}
}
}
const auto matrix_c_index =
blockwise_batch_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());
const unsigned ho_thread_data_begin = matrix_c_index.batch;
const unsigned k_thread_data_begin = matrix_c_index.row;
const unsigned wo_thread_data_begin = matrix_c_index.col / NPerBlock;
const unsigned n_thread_data_begin = matrix_c_index.col - wo_thread_data_begin * NPerBlock;
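// matrix C columns enumerate the flattened [Wo,N] pair with N fastest
// (matching in_block[C,Hi,Wi,N]), hence col = wo * NPerBlock + n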
#if 0
printf("block %u %u, %u %u %u %u, %u %u %u %u, %f \n",
get_block_1d_id(), get_thread_local_1d_id(),
ho_block_data_begin, k_block_data_begin, wo_block_data_begin, n_block_data_begin,
ho_thread_data_begin, k_thread_data_begin, wo_thread_data_begin, n_thread_data_begin,
p_out_thread[0]);
#endif
// output: register to global mem,
// convert out_thread[Ho,K,Wo,N] to out_global[K,Ho,Wo,N]
constexpr auto reorder_khwn_from_hkwn = Sequence<1, 0, 2, 3>{};
threadwise_4d_tensor_copy_reorder_by_get_dst_from_src(
out_hkwn_thread_desc,
p_out_thread,
out_khwn_global_desc,
p_out_global + out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin,
n_block_data_begin + n_thread_data_begin),
out_hkwn_thread_desc.GetLengths(),
reorder_khwn_from_hkwn);
}
#pragma once
#include "common.hip.hpp"
#include "ConstantTensorDescriptor.hip.hpp"
#include "ConstantMatrixDescriptor.hip.hpp"
#include "blockwise_4d_tensor_op.hip.hpp"
#include "threadwise_4d_tensor_op.hip.hpp"
#include "blockwise_gemm.hip.hpp"
template <unsigned GridSize,
unsigned BlockSize,
class Float,
class InGlobalDesc,
class WeiGlobalDesc,
class OutGlobalDesc,
unsigned NPerBlock,
unsigned KPerBlock,
unsigned CPerBlock,
unsigned HoPerBlock,
unsigned WoPerBlock,
unsigned KPerThread,
unsigned CPerThread,
unsigned HoPerThread,
unsigned WoPerThread>
__global__ void
gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw(const Float* const __restrict__ p_in_global,
const Float* const __restrict__ p_wei_global,
Float* const __restrict__ p_out_global)
{
// NPerThread == NPerBlock, because the format of input in LDS is [C,Hi,Wi,N]:
// for GEMM trans([C,K]) * [C,Wo*N], one thread has to cover all of "N"
// if we used [C,Hi,N,Wi,N] in LDS, then NPerThread could differ from NPerBlock
constexpr unsigned NPerThread = NPerBlock;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_nchw_global_desc = InGlobalDesc{};
constexpr auto wei_kcsr_global_desc = WeiGlobalDesc{};
constexpr auto out_nkhw_global_desc = OutGlobalDesc{};
constexpr unsigned S = wei_kcsr_global_desc.GetLength(I2);
constexpr unsigned R = wei_kcsr_global_desc.GetLength(I3);
constexpr unsigned HiPerBlock = HoPerBlock + S - 1;
constexpr unsigned WiPerBlock = WoPerBlock + R - 1;
// divide block work: NCHW
constexpr unsigned NBlockWork =
(out_nkhw_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
constexpr unsigned KBlockWork =
(out_nkhw_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
constexpr unsigned HBlockWork =
(out_nkhw_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
constexpr unsigned WBlockWork =
(out_nkhw_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;
// tensor view of un-reordered blockwise input and weight (imaginary)
constexpr auto in_nchw_block_desc =
make_ConstantTensorDescriptor(Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{});
constexpr auto wei_kcsr_block_desc =
make_ConstantTensorDescriptor(Sequence<KPerBlock, CPerBlock, S, R>{});
// tensor view of reordered blockwise input and weight in LDS
constexpr auto reorder_srck_from_kcsr = Sequence<2, 3, 1, 0>{};
constexpr auto wei_srck_block_desc = make_ConstantTensorDescriptor(
wei_kcsr_block_desc.GetLengths().ReorderByGetNewFromOld(reorder_srck_from_kcsr));
constexpr auto reorder_chwn_from_nchw = Sequence<1, 2, 3, 0>{};
constexpr auto in_chwn_block_desc = make_ConstantTensorDescriptor(
in_nchw_block_desc.GetLengths().ReorderByGetNewFromOld(reorder_chwn_from_nchw));
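// ReorderByGetNewFromOld: new dimension i takes its length from old dimension
// Sequence[i]; here Sequence<1, 2, 3, 0> turns [N,C,Hi,Wi] into [C,Hi,Wi,N],
// and Sequence<2, 3, 1, 0> turns [K,C,S,R] into [S,R,C,K]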
// tensor view of threadwise output in register
constexpr auto out_hkwn_thread_desc =
make_ConstantTensorDescriptor(Sequence<HoPerThread, KPerThread, WoPerThread, NPerThread>{});
#if 0
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
{
print_ConstantTensorDescriptor(in_nchw_block_desc, "in_nchw_block_desc");
print_ConstantTensorDescriptor(in_chwn_block_desc, "in_chwn_block_desc");
print_ConstantTensorDescriptor(wei_kcsr_block_desc, "wei_kcsr_block_desc");
print_ConstantTensorDescriptor(wei_srck_block_desc, "wei_srck_block_desc");
print_ConstantTensorDescriptor(out_hkwn_thread_desc, "out_hkwn_thread_desc");
}
#endif
// my block work
unsigned itmp = get_block_1d_id();
const unsigned n_block_work_id = itmp / (KBlockWork * HBlockWork * WBlockWork);
itmp -= n_block_work_id * (KBlockWork * HBlockWork * WBlockWork);
const unsigned k_block_work_id = itmp / (HBlockWork * WBlockWork);
itmp -= k_block_work_id * (HBlockWork * WBlockWork);
const unsigned h_block_work_id = itmp / WBlockWork;
const unsigned w_block_work_id = itmp - h_block_work_id * WBlockWork;
const unsigned n_block_data_begin = n_block_work_id * NPerBlock;
const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
const unsigned ho_block_data_begin = h_block_work_id * HoPerBlock;
const unsigned wo_block_data_begin = w_block_work_id * WoPerBlock;
const unsigned hi_block_data_begin = ho_block_data_begin;
const unsigned wi_block_data_begin = wo_block_data_begin;
// a series of blockwise batched GEMM
// C_matrix += transpose(A_matrix) * B_matrix
// A_matrix and B_matrix saved in LDS, C_matrix saved in register
// A_matrix[C,K] is a sub-matrix of wei_block[S,R,C,K]
// B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
// C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N]
constexpr auto a_cxk_block_mtx_desc =
make_ConstantMatrixDescriptor(Number<CPerBlock>{}, Number<KPerBlock>{});
constexpr auto b_cxwn_block_mtx_desc =
make_ConstantMatrixDescriptor(Number<CPerBlock>{},
Number<WoPerBlock * NPerBlock>{},
Number<in_chwn_block_desc.GetStride(I0)>{});
constexpr auto c_kxwn_thread_mtx_desc =
make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<WoPerThread * NPerThread>{});
const auto blockwise_batch_gemm =
Blockwise1dStridedBatchedGemmBlockABlockBThreadC<BlockSize,
decltype(a_cxk_block_mtx_desc),
decltype(b_cxwn_block_mtx_desc),
decltype(c_kxwn_thread_mtx_desc),
true,
false,
false,
0,
in_chwn_block_desc.GetStride(I1),
out_hkwn_thread_desc.GetStride(I0),
HoPerBlock,
HoPerThread,
CPerThread,
true>{};
// LDS
constexpr unsigned in_block_size = in_chwn_block_desc.GetElementSpace();
constexpr unsigned wei_block_size = wei_srck_block_desc.GetElementSpace();
__shared__ Float p_in_block[in_block_size];
__shared__ Float p_wei_block[wei_block_size];
// register
Float p_out_thread[out_hkwn_thread_desc.GetElementSpace()];
// set threadwise output tensor to 0
threadwise_4d_tensor_set_zero(out_hkwn_thread_desc, p_out_thread);
for(unsigned c_block_data_begin = 0; c_block_data_begin < in_nchw_global_desc.GetLength(I1);
c_block_data_begin += CPerBlock, __syncthreads())
{
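// note: the __syncthreads() in the loop increment runs after each iteration's
// GEMMs and before the next iteration's copies, so LDS is not overwritten
// while it is still being read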
#if 1
// input: global mem to LDS,
// convert [N,C,Hi,Wi] to [C,Hi,Wi,N]
blockwise_4d_tensor_copy_reorder_by_get_dst_from_src<BlockSize>(
in_nchw_global_desc,
p_in_global + in_nchw_global_desc.Get1dIndex(n_block_data_begin,
c_block_data_begin,
hi_block_data_begin,
wi_block_data_begin),
in_chwn_block_desc,
p_in_block,
in_nchw_block_desc.GetLengths(),
reorder_chwn_from_nchw);
#else
// input: global mem to LDS,
// no format conversion, this is wrong, for performance study only!
Blockwise4dTensorCopy<BlockSize>(in_nchw_global_desc,
p_in_global +
in_nchw_global_desc.Get1dIndex(n_block_data_begin,
c_block_data_begin,
hi_block_data_begin,
wi_block_data_begin),
in_nchw_block_desc,
p_in_block,
in_nchw_block_desc.GetLengths());
#endif
#if 1
// weight: global mem to LDS,
// convert [K,C,S,R] to [S,R,C,K]
blockwise_4d_tensor_copy_reorder_by_get_dst_from_src<BlockSize>(
wei_kcsr_global_desc,
p_wei_global +
wei_kcsr_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
wei_srck_block_desc,
p_wei_block,
wei_kcsr_block_desc.GetLengths(),
reorder_srck_from_kcsr);
#else
// weight: global mem to LDS,
// no format conversion, this is wrong, for performance study only!
Blockwise4dTensorCopy<BlockSize>(
wei_kcsr_global_desc,
p_wei_global +
wei_kcsr_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
wei_kcsr_block_desc,
p_wei_block,
wei_kcsr_block_desc.GetLengths());
#endif
__syncthreads();
#if 1
// a series of batched GEMM
for(unsigned s = 0; s < S; ++s)
{
for(unsigned r = 0; r < R; ++r)
{
auto f_accum = [](auto& c, const auto&& ab) { c += ab; };
blockwise_batch_gemm.Run(p_wei_block + wei_srck_block_desc.Get1dIndex(s, r, 0, 0),
p_in_block + in_chwn_block_desc.Get1dIndex(0, s, r, 0),
p_out_thread,
f_accum);
}
}
#endif
}
const auto matrix_c_index =
blockwise_batch_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());
#if 0
printf("%u %u, %u %u %u\n",get_block_1d_id(), get_thread_local_1d_id(), matrix_c_index.batch, matrix_c_index.row, matrix_c_index.col);
#endif
const unsigned ho_thread_data_begin = matrix_c_index.batch;
const unsigned k_thread_data_begin = matrix_c_index.row;
const unsigned wo_thread_data_begin = matrix_c_index.col / NPerThread;
#if 1
// output: register to global mem,
// convert out_thread[Ho,K,Wo,N] to out_global[N,K,Ho,Wo]
constexpr auto reorder_nkhw_from_hkwn = Sequence<3, 1, 0, 2>{};
threadwise_4d_tensor_copy_reorder_by_get_dst_from_src(
out_hkwn_thread_desc,
p_out_thread,
out_nkhw_global_desc,
p_out_global + out_nkhw_global_desc.Get1dIndex(n_block_data_begin,
k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin),
out_hkwn_thread_desc.GetLengths(),
reorder_nkhw_from_hkwn);
#else
// output: register to global mem,
// no format conversion, assume register is in [N,K,Ho,Wo], this is wrong, for performance
// study only!
constexpr auto out_nkhw_thread_desc =
make_ConstantTensorDescriptor(Sequence<NPerThread, KPerThread, HoPerThread, WoPerThread>{});
threadwise_4d_tensor_copy(
out_nkhw_thread_desc,
p_out_thread,
out_nkhw_global_desc,
p_out_global + out_nkhw_global_desc.Get1dIndex(n_block_data_begin,
k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin),
out_nkhw_thread_desc.GetLengths());
#endif
}
#pragma once
#include "common.hip.hpp"
#include "ConstantTensorDescriptor.hip.hpp"
#include "ConstantMatrixDescriptor.hip.hpp"
#include "blockwise_4d_tensor_op.hip.hpp"
#include "threadwise_4d_tensor_op.hip.hpp"
#include "blockwise_gemm.hip.hpp"
template <unsigned GridSize,
unsigned BlockSize,
class Float,
class InGlobalDesc,
class WeiGlobalDesc,
class OutGlobalDesc,
unsigned NPerBlock,
unsigned KPerBlock,
unsigned CPerBlock,
unsigned HoPerBlock,
unsigned WoPerBlock,
unsigned NPerThread,
unsigned KPerThread,
unsigned CPerThread,
unsigned HoPerThread,
unsigned WoPerThread>
__global__ void
gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(const Float* const __restrict__ p_in_global,
const Float* const __restrict__ p_wei_global,
Float* const __restrict__ p_out_global)
{
// NPerThread == NPerBlock, because the format of input in LDS is [C,Hi,Wi,N]:
// for GEMM trans([C,K]) * [C,Wo*N], one thread has to cover all of "N"
// if we used [C,Hi,N,Wi,N] in LDS, then NPerThread could differ from NPerBlock
static_assert(NPerBlock % NPerThread == 0, "wrong! NPerBlock % NPerThread !=0");
static_assert((NPerThread < NPerBlock && WoPerThread == 1) || NPerThread == NPerBlock,
"wrong!");
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_nchw_global_desc = InGlobalDesc{};
constexpr auto wei_srck_global_desc = WeiGlobalDesc{};
constexpr auto out_nkhw_global_desc = OutGlobalDesc{};
constexpr unsigned S = wei_srck_global_desc.GetLength(I0);
constexpr unsigned R = wei_srck_global_desc.GetLength(I1);
constexpr unsigned HiPerBlock = HoPerBlock + S - 1;
constexpr unsigned WiPerBlock = WoPerBlock + R - 1;
// divide block work: NCHW
constexpr unsigned NBlockWork =
(out_nkhw_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
constexpr unsigned KBlockWork =
(out_nkhw_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
constexpr unsigned HBlockWork =
(out_nkhw_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
constexpr unsigned WBlockWork =
(out_nkhw_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;
unsigned itmp = get_block_1d_id();
const unsigned n_block_work_id = itmp / (KBlockWork * HBlockWork * WBlockWork);
itmp -= n_block_work_id * (KBlockWork * HBlockWork * WBlockWork);
const unsigned k_block_work_id = itmp / (HBlockWork * WBlockWork);
itmp -= k_block_work_id * (HBlockWork * WBlockWork);
const unsigned h_block_work_id = itmp / WBlockWork;
const unsigned w_block_work_id = itmp - h_block_work_id * WBlockWork;
const unsigned n_block_data_begin = n_block_work_id * NPerBlock;
const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
const unsigned ho_block_data_begin = h_block_work_id * HoPerBlock;
const unsigned wo_block_data_begin = w_block_work_id * WoPerBlock;
const unsigned hi_block_data_begin = ho_block_data_begin;
const unsigned wi_block_data_begin = wo_block_data_begin;
// tensor view of un-reordered blockwise input and weight (imaginary)
constexpr auto in_nchw_block_desc =
make_ConstantTensorDescriptor(Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{});
constexpr auto wei_srck_block_desc =
make_ConstantTensorDescriptor(Sequence<S, R, CPerBlock, KPerBlock>{});
// tensor view of reordered blockwise input and weight in LDS
constexpr auto reorder_chwn_from_nchw = Sequence<1, 2, 3, 0>{};
constexpr auto in_chwn_block_desc = make_ConstantTensorDescriptor(
in_nchw_block_desc.GetLengths().ReorderByGetNewFromOld(reorder_chwn_from_nchw));
// tensor view of threadwise output in register
constexpr auto out_hkwn_thread_desc =
make_ConstantTensorDescriptor(Sequence<HoPerThread, KPerThread, WoPerThread, NPerThread>{});
#if 0
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
{
print_ConstantTensorDescriptor(in_nchw_block_desc, "in_nchw_block_desc");
print_ConstantTensorDescriptor(in_chwn_block_desc, "in_chwn_block_desc");
print_ConstantTensorDescriptor(wei_srck_block_desc, "wei_srck_block_desc");
print_ConstantTensorDescriptor(out_hkwn_thread_desc, "out_hkwn_thread_desc");
}
#endif
// blockwise copy
// wei: format is [S,R,C,K], no conversion needed
constexpr auto blockwise_wei_copy =
Blockwise4dTensorCopy1<BlockSize,
Float,
decltype(wei_srck_global_desc),
decltype(wei_srck_block_desc),
decltype(wei_srck_block_desc.GetLengths())>{};
// a series of blockwise batched GEMM
// C_matrix += transpose(A_matrix) * B_matrix
// A_matrix and B_matrix saved in LDS, C_matrix saved in register
// A_matrix[C,K] is a sub-matrix of wei_block[S,R,C,K]
// B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
// C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N]
constexpr auto a_cxk_block_mtx_desc =
make_ConstantMatrixDescriptor(Number<CPerBlock>{}, Number<KPerBlock>{});
constexpr auto b_cxwn_block_mtx_desc =
make_ConstantMatrixDescriptor(Number<CPerBlock>{},
Number<WoPerBlock * NPerBlock>{},
Number<in_chwn_block_desc.GetStride(I0)>{});
constexpr auto c_kxwn_thread_mtx_desc =
make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<WoPerThread * NPerThread>{});
const auto blockwise_batch_gemm =
Blockwise1dStridedBatchedGemmBlockABlockBThreadC<BlockSize,
decltype(a_cxk_block_mtx_desc),
decltype(b_cxwn_block_mtx_desc),
decltype(c_kxwn_thread_mtx_desc),
true,
false,
false,
0,
in_chwn_block_desc.GetStride(I1),
out_hkwn_thread_desc.GetStride(I0),
HoPerBlock,
HoPerThread,
CPerThread,
true>{};
// LDS
constexpr unsigned in_block_size = in_chwn_block_desc.GetElementSpace();
constexpr unsigned wei_block_size = wei_srck_block_desc.GetElementSpace();
__shared__ Float p_in_block[in_block_size];
__shared__ Float p_wei_block[wei_block_size];
// register
Float p_out_thread[out_hkwn_thread_desc.GetElementSpace()];
// set threadwise output tensor to 0
threadwise_4d_tensor_set_zero(out_hkwn_thread_desc, p_out_thread);
for(unsigned c_block_data_begin = 0; c_block_data_begin < in_nchw_global_desc.GetLength(I1);
c_block_data_begin += CPerBlock, __syncthreads())
{
#if 1
// input: global mem to LDS,
// convert [N,C,Hi,Wi] to [C,Hi,Wi,N]
blockwise_4d_tensor_copy_reorder_by_get_dst_from_src<BlockSize>(
in_nchw_global_desc,
p_in_global + in_nchw_global_desc.Get1dIndex(n_block_data_begin,
c_block_data_begin,
hi_block_data_begin,
wi_block_data_begin),
in_chwn_block_desc,
p_in_block,
in_nchw_block_desc.GetLengths(),
reorder_chwn_from_nchw);
#endif
#if 1
// weight: global mem to LDS,
// format is [S,R,C,K], no conversion needed
blockwise_wei_copy.Run(p_wei_global + wei_srck_global_desc.Get1dIndex(
0, 0, c_block_data_begin, k_block_data_begin),
p_wei_block);
#endif
__syncthreads();
// a series of batched GEMM
for(unsigned s = 0; s < S; ++s)
{
for(unsigned r = 0; r < R; ++r)
{
auto f_accum = [](auto& c, const auto&& ab) { c += ab; };
blockwise_batch_gemm.Run(p_wei_block + wei_srck_block_desc.Get1dIndex(s, r, 0, 0),
p_in_block + in_chwn_block_desc.Get1dIndex(0, s, r, 0),
p_out_thread,
f_accum);
}
}
}
const auto matrix_c_index =
blockwise_batch_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());
const unsigned ho_thread_data_begin = matrix_c_index.batch;
const unsigned k_thread_data_begin = matrix_c_index.row;
const unsigned wo_thread_data_begin = matrix_c_index.col / NPerBlock;
const unsigned n_thread_data_begin = matrix_c_index.col - wo_thread_data_begin * NPerBlock;
// output: register to global mem,
// convert out_thread[Ho,K,Wo,N] to out_global[N,K,Ho,Wo]
constexpr auto reorder_nkhw_from_hkwn = Sequence<3, 1, 0, 2>{};
threadwise_4d_tensor_copy_reorder_by_get_dst_from_src(
out_hkwn_thread_desc,
p_out_thread,
out_nkhw_global_desc,
p_out_global + out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin),
out_hkwn_thread_desc.GetLengths(),
reorder_nkhw_from_hkwn);
}
@@ -57,11 +57,11 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_b
constexpr unsigned Ho = out_khwn_global_desc.GetLength(I1);
constexpr unsigned Wo = out_khwn_global_desc.GetLength(I2);

-constexpr unsigned S = wei_csrk_global_desc.GetLength(I1);
-constexpr unsigned R = wei_csrk_global_desc.GetLength(I2);
+constexpr unsigned Y = wei_csrk_global_desc.GetLength(I1);
+constexpr unsigned X = wei_csrk_global_desc.GetLength(I2);

constexpr unsigned B = N * Hi * Wi;

-constexpr unsigned BGhostRead = (S - 1) * Wi + (R - 1);
+constexpr unsigned BGhostRead = (Y - 1) * Wi + (X - 1);

// divide block work by 2d: [K, B]
constexpr unsigned KBlockWork = (K + KPerBlock - 1) / KPerBlock;
@@ -75,7 +75,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_b

// flattened (2d) tensor view of gridwise input
constexpr auto in_cb_global_desc = make_ConstantTensorDescriptor(Sequence<C, B>{});
-constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence<C * S * R, K>{});
+constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence<C * Y * X, K>{});

// tensor view of blockwise input and weight
// be careful of alignment
@@ -83,10 +83,10 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_b
Sequence<CPerBlock, BPerBlock + BGhostRead>{}, Number<InBlockCopyDataPerRead>{});

constexpr auto wei_ek_block_desc = make_ConstantTensorDescriptor_aligned(
-Sequence<CPerBlock * S * R, KPerBlock>{}, Number<WeiBlockCopyDataPerRead>{});
+Sequence<CPerBlock * Y * X, KPerBlock>{}, Number<WeiBlockCopyDataPerRead>{});

constexpr auto wei_csrk_block_desc = make_ConstantTensorDescriptor_aligned(
-Sequence<CPerBlock, S, R, KPerBlock>{}, Number<WeiBlockCopyDataPerRead>{});
+Sequence<CPerBlock, Y, X, KPerBlock>{}, Number<WeiBlockCopyDataPerRead>{});

// tensor view of threadwise output in register
constexpr auto out_kb_thread_desc =
@@ -138,7 +138,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_b
#endif

// blockwise wei copy
-// format is [CPerBlock*S*R,KPerBlock]
+// format is [CPerBlock*Y*X,KPerBlock]
#if 0
const auto blockwise_wei_copy =
Blockwise2dTensorCopy1<BlockSize,
@@ -166,7 +166,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_b
// a series of blockwise GEMM
// c_mtx += transpose(a_mtx) * b_mtx
// a_mtx and b_mtx saved in LDS, c_mtx saved in register
-// a_mtx[C,K] is a sub-matrix of wei_block[C,S,R,K]
+// a_mtx[C,K] is a sub-matrix of wei_block[C,Y,X,K]
// b_mtx[C,B] is a subset of in_block[C,B + BGhostRead]
// c_mtx[K,B] is out_block[K,B]
constexpr auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
@@ -275,9 +275,9 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_b
// compute on current data
// a series of GEMM
-for(unsigned s = 0; s < S; ++s)
+for(unsigned s = 0; s < Y; ++s)
{
-for(unsigned r = 0; r < R; ++r)
+for(unsigned r = 0; r < X; ++r)
{
auto f_accum = [](auto& acc, const auto&& v) { acc += v; };
#if 1
@@ -305,9 +305,9 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_csrk_khwn_lds_double_b
__syncthreads();

-for(unsigned s = 0; s < S; ++s)
+for(unsigned s = 0; s < Y; ++s)
{
-for(unsigned r = 0; r < R; ++r)
+for(unsigned r = 0; r < X; ++r)
{
auto f_accum = [](auto& acc, const auto&& v) { acc += v; };
#if 0
...
#pragma once
#include "common.hip.hpp"
#include "ConstantTensorDescriptor.hip.hpp"
#include "ConstantMatrixDescriptor.hip.hpp"
#include "blockwise_4d_tensor_op.hip.hpp"
#include "blockwise_2d_tensor_op.hip.hpp"
#include "threadwise_2d_tensor_op.hip.hpp"
#include "blockwise_gemm.hip.hpp"
// define B = flatten(N, Hi, Wi)
template <unsigned GridSize,
unsigned BlockSize,
class Float,
class InGlobalDesc,
class WeiGlobalDesc,
class OutGlobalDesc,
unsigned BPerBlock,
unsigned KPerBlock,
unsigned CPerBlock,
unsigned BPerThread,
unsigned KPerThread,
unsigned GemmThreadPerColumnPerCluster,
unsigned GemmThreadPerRowPerCluster,
unsigned GemmMPerThreadSubC,
unsigned GemmNPerThreadSubC,
unsigned GemmMLevel0Cluster,
unsigned GemmNLevel0Cluster,
unsigned GemmMLevel1Cluster,
unsigned GemmNLevel1Cluster,
unsigned GemmKPerThreadLoop,
unsigned InBlockCopyThreadPerDim0,
unsigned InBlockCopyThreadPerDim1,
unsigned WeiBlockCopyThreadPerDim0,
unsigned WeiBlockCopyThreadPerDim1,
unsigned InBlockCopyDataPerRead,
unsigned WeiBlockCopyDataPerRead>
__global__ void
gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(const Float* const __restrict__ p_in_global,
const Float* const __restrict__ p_wei_global,
Float* const __restrict__ p_out_global)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_cnhw_global_desc = InGlobalDesc{};
constexpr auto wei_csrk_global_desc = WeiGlobalDesc{};
constexpr auto out_knhw_global_desc = OutGlobalDesc{};
constexpr unsigned C = in_cnhw_global_desc.GetLength(I0);
constexpr unsigned N = in_cnhw_global_desc.GetLength(I1);
constexpr unsigned Hi = in_cnhw_global_desc.GetLength(I2);
constexpr unsigned Wi = in_cnhw_global_desc.GetLength(I3);
constexpr unsigned K = out_knhw_global_desc.GetLength(I0);
constexpr unsigned Ho = out_knhw_global_desc.GetLength(I2);
constexpr unsigned Wo = out_knhw_global_desc.GetLength(I3);
constexpr unsigned S = wei_csrk_global_desc.GetLength(I1);
constexpr unsigned R = wei_csrk_global_desc.GetLength(I2);
constexpr unsigned B = N * Hi * Wi;
constexpr unsigned BGhostRead = (S - 1) * Wi + (R - 1);
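// the GEMM loop below indexes the input block at offset s * Wi + r
// (0 <= s < S, 0 <= r < R), so a block touches up to (S-1)*Wi + (R-1)
// elements past its BPerBlock tile; that overhang is the "ghost read"
// region included in the block copy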
// divide block work by 2d: [K, B]
constexpr unsigned KBlockWork = (K + KPerBlock - 1) / KPerBlock;
constexpr unsigned BBlockWork = (B + BPerBlock - 1) / BPerBlock;
const unsigned k_block_work_id = get_block_1d_id() / BBlockWork;
const unsigned b_block_work_id = get_block_1d_id() - k_block_work_id * BBlockWork;
const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
const unsigned b_block_data_begin = b_block_work_id * BPerBlock;
// flattened (2d) tensor view of gridwise input
constexpr auto in_cb_global_desc = make_ConstantTensorDescriptor(Sequence<C, B>{});
constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence<C * S * R, K>{});
// tensor view of blockwise input and weight
// be careful of alignment
constexpr auto in_cb_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<CPerBlock, BPerBlock + BGhostRead>{}, Number<InBlockCopyDataPerRead>{});
constexpr auto wei_ek_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<CPerBlock * S * R, KPerBlock>{}, Number<WeiBlockCopyDataPerRead>{});
constexpr auto wei_csrk_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<CPerBlock, S, R, KPerBlock>{}, Number<WeiBlockCopyDataPerRead>{});
// tensor view of threadwise output in register
constexpr auto out_kb_thread_desc =
make_ConstantTensorDescriptor(Sequence<KPerThread, BPerThread>{});
#if 0
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
{
print_ConstantTensorDescriptor(in_cnhw_global_desc, "in_cnhw_global_desc");
print_ConstantTensorDescriptor(wei_csrk_global_desc, "wei_csrk_global_desc");
print_ConstantTensorDescriptor(out_knhw_global_desc, "out_knhw_global_desc");
print_ConstantTensorDescriptor(in_cb_global_desc, "in_cb_global_desc");
print_ConstantTensorDescriptor(wei_ek_global_desc, "wei_ek_global_desc");
print_ConstantTensorDescriptor(in_cb_block_desc, "in_cb_block_desc");
print_ConstantTensorDescriptor(wei_csrk_block_desc, "wei_csrk_block_desc");
print_ConstantTensorDescriptor(wei_ek_block_desc, "wei_ek_block_desc");
print_ConstantTensorDescriptor(out_kb_thread_desc, "out_kb_thread_desc");
printf("KPerBlock %u\n", KPerBlock);
}
#endif
// blockwise in copy
// format is [CPerBlock,BPerBlock + BGhostRead]
#if 0
const auto blockwise_in_copy =
Blockwise2dTensorCopy1<BlockSize,
Float,
decltype(in_cb_global_desc),
decltype(in_cb_block_desc),
decltype(in_cb_block_desc.GetLengths())>{};
#elif 0
const auto blockwise_in_copy = Blockwise2dTensorCopy2<BlockSize,
Float,
decltype(in_cb_global_desc),
decltype(in_cb_block_desc),
decltype(in_cb_block_desc.GetLengths()),
InBlockCopyThreadPerDim0,
InBlockCopyThreadPerDim1>{};
#elif 1
const auto blockwise_in_copy = Blockwise2dTensorCopy3<BlockSize,
Float,
decltype(in_cb_global_desc),
decltype(in_cb_block_desc),
decltype(in_cb_block_desc.GetLengths()),
InBlockCopyDataPerRead>{};
#endif
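// of the three input-copy variants above, the #elif 1 branch is active:
// Blockwise2dTensorCopy3 copies InBlockCopyDataPerRead contiguous elements
// per thread read, presumably the reason the block descriptors are aligned
// to that same width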
// blockwise wei copy
// format is [CPerBlock*S*R,KPerBlock]
#if 0
const auto blockwise_wei_copy =
Blockwise2dTensorCopy1<BlockSize,
Float,
decltype(wei_ek_global_desc),
decltype(wei_ek_block_desc),
decltype(wei_ek_block_desc.GetLengths())>{};
#elif 0
const auto blockwise_wei_copy = Blockwise2dTensorCopy2<BlockSize,
Float,
decltype(wei_ek_global_desc),
decltype(wei_ek_block_desc),
decltype(wei_ek_block_desc.GetLengths()),
WeiBlockCopyThreadPerDim0,
WeiBlockCopyThreadPerDim1>{};
#elif 1
const auto blockwise_wei_copy = Blockwise2dTensorCopy3<BlockSize,
Float,
decltype(wei_ek_global_desc),
decltype(wei_ek_block_desc),
decltype(wei_ek_block_desc.GetLengths()),
WeiBlockCopyDataPerRead>{};
#endif
// a series of blockwise GEMM
// c_mtx += transpose(a_mtx) * b_mtx
// a_mtx and b_mtx saved in LDS, c_mtx saved in register
// a_mtx[C,K] is a sub-matrix of wei_block[C,S,R,K]
// b_mtx[C,B] is a subset of in_block[C,B + BGhostRead]
// c_mtx[K,B] is out_block[K,B]
constexpr auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{}, Number<KPerBlock>{}, Number<wei_csrk_block_desc.GetStride(I0)>{});
constexpr auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{}, Number<BPerBlock>{}, Number<in_cb_block_desc.GetStride(I0)>{});
constexpr auto c_kxb_thread_mtx_desc =
make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<BPerThread>{});
#if 0
const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadC<BlockSize,
decltype(a_cxk_block_mtx_desc),
decltype(b_cxb_block_mtx_desc),
decltype(c_kxb_thread_mtx_desc),
true,
false,
false,
GemmKPerThreadLoop,
GemmThreadPerColumnPerCluster,
GemmThreadPerRowPerCluster,
true>{};
#else
const auto blockwise_gemm =
BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2<BlockSize,
decltype(a_cxk_block_mtx_desc),
decltype(b_cxb_block_mtx_desc),
decltype(c_kxb_thread_mtx_desc),
GemmMPerThreadSubC,
GemmNPerThreadSubC,
GemmMLevel0Cluster,
GemmNLevel0Cluster,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmKPerThreadLoop>{};
#endif
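// the _v2 blockwise GEMM appears to tile the [KPerBlock, BPerBlock] C matrix
// across threads in two cluster levels (GemmM/NLevel0Cluster and
// GemmM/NLevel1Cluster), each thread accumulating GemmMPerThreadSubC x
// GemmNPerThreadSubC sub-tiles; see blockwise_gemm.hip.hpp for the exact scheme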
// LDS: be careful of alignment
constexpr unsigned in_block_size =
in_cb_block_desc.GetElementSpace(Number<InBlockCopyDataPerRead>{});
constexpr unsigned wei_block_size =
wei_csrk_block_desc.GetElementSpace(Number<WeiBlockCopyDataPerRead>{});
constexpr unsigned max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
? InBlockCopyDataPerRead
: WeiBlockCopyDataPerRead;
__shared__ Float p_in_block[max_align * ((in_block_size + max_align - 1) / max_align)];
__shared__ Float p_wei_block[max_align * ((wei_block_size + max_align - 1) / max_align)];
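// both buffer sizes are rounded up to a multiple of max_align, presumably so
// that each LDS array starts on a boundary compatible with the widest
// vectorized copy (InBlockCopyDataPerRead / WeiBlockCopyDataPerRead elements)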
// register
Float p_out_thread[out_kb_thread_desc.GetElementSpace()];
// set threadwise output tensor to 0
threadwise_2d_tensor_set_zero(out_kb_thread_desc, p_out_thread);
const Float* p_in_global_block_offset =
p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin);
const Float* p_wei_global_block_offset =
p_wei_global + wei_csrk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
for(unsigned c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0),
p_wei_global_block_offset += CPerBlock * wei_csrk_global_desc.GetStride(I0),
__syncthreads())
{
// input: global mem to LDS,
blockwise_in_copy.Run(p_in_global_block_offset, p_in_block);
// weight: global mem to LDS,
blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block);
__syncthreads();
// a series of GEMM
for(unsigned s = 0; s < S; ++s)
{
for(unsigned r = 0; r < R; ++r)
{
auto f_accum = [](auto& acc, const auto&& v) { acc += v; };
#if 1
blockwise_gemm.Run
#elif 0
blockwise_gemm.Run_v2
#elif 0
blockwise_gemm.Run_RegisterDoubleBuffer
#endif
(p_wei_block + wei_csrk_block_desc.Get1dIndex(0, s, r, 0),
p_in_block + s * Wi + r,
p_out_thread,
f_accum);
}
}
}
// output: register to global mem,
const auto c_thread_mtx_begin =
blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());
const unsigned k_thread_data_begin = k_block_data_begin + c_thread_mtx_begin.row;
const unsigned b_thread_data_begin = b_block_data_begin + c_thread_mtx_begin.col;
#if 0
if(get_block_1d_id() == 0)
{
printf("%u %u, row %u col %u, k_data_begin %u b_data_begin %u, %f %f %f %f\n",
get_block_1d_id(),
get_thread_local_1d_id(),
c_thread_mtx_begin.row,
c_thread_mtx_begin.col,
k_thread_data_begin,
b_thread_data_begin,
p_out_thread[0], p_out_thread[1], p_out_thread[2], p_out_thread[3]);
}
#endif
for(unsigned k = 0; k < out_kb_thread_desc.GetLength(I0); ++k)
{
for(unsigned b = 0; b < out_kb_thread_desc.GetLength(I1); ++b)
{
const auto c_thread_mtx_distance =
blockwise_gemm.GetDistanceFromBeginOfThreadMatrixC(k, b);
unsigned k_data = k_thread_data_begin + c_thread_mtx_distance.row;
unsigned b_data = b_thread_data_begin + c_thread_mtx_distance.col;
unsigned n_data = b_data / (Hi * Wi);
unsigned itmp = b_data - n_data * (Hi * Wi);
unsigned h_data = itmp / Wi;
unsigned w_data = itmp - h_data * Wi;
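// invert b_data = (n * Hi + h) * Wi + w; since B is flattened from the
// *input* sizes (Hi, Wi), some decoded (h, w) fall outside the (Ho, Wo)
// output, hence the bounds check below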
#if 0
if(get_block_1d_id() == 0)
{
printf("%u %u, k %u b %u, k_data %u n_data %u h_data %u w_data %u %f\n",
get_block_1d_id(),
get_thread_local_1d_id(),
k,
b,
k_data,
n_data,
h_data,
w_data,
p_out_thread[out_kb_thread_desc.Get1dIndex(k, b)]);
}
#endif
if(n_data < N && h_data < Ho && w_data < Wo)
{
p_out_global[out_knhw_global_desc.Get1dIndex(k_data, n_data, h_data, w_data)] =
p_out_thread[out_kb_thread_desc.Get1dIndex(k, b)];
}
}
}
}