"git@developer.sourcefind.cn:OpenDAS/d2go.git" did not exist on "5ecbb174015ebbde7bfaf3e129b7f4e7daface61"
Commit fdaaaa50 authored by Chao Liu

Merge branch 'direct_fp16'

parents 2c9b8c24 18a81e35
#pragma once
#include <unistd.h>
#include "device.hpp"
-#include "gridwise_direct_convolution_2.hip.hpp"
+#include "gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp"

template <class T, class InDesc, class WeiDesc, class OutDesc>
-void device_direct_convolution_2(InDesc,
+void device_direct_convolution_2_nchw_kcyx_nkhw(InDesc,
                                                const Tensor<T>& in,
                                                WeiDesc,
                                                const Tensor<T>& wei,
                                                OutDesc,
                                                Tensor<T>& out,
                                                unsigned nrepeat)
{
    std::size_t data_sz = sizeof(T);
    DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace());

@@ -46,6 +46,27 @@ void device_direct_convolution_2(InDesc,
    constexpr unsigned HoPerThread = 2;
    constexpr unsigned WoPerThread = 2;
+    constexpr unsigned InBlockCopyDataPerRead  = 2;
+    constexpr unsigned WeiBlockCopyDataPerRead = 4;
+
+    constexpr unsigned BlockSize = 128;
+#elif 1
+    // 3x3, 34x34, 128 thread, fp16
+    constexpr unsigned NPerBlock  = 2;
+    constexpr unsigned KPerBlock  = 32;
+    constexpr unsigned CPerBlock  = 4;
+    constexpr unsigned HoPerBlock = 2;
+    constexpr unsigned WoPerBlock = 32;
+
+    constexpr unsigned NPerThread  = 2;
+    constexpr unsigned KPerThread  = 4;
+    constexpr unsigned CPerThread  = 2;
+    constexpr unsigned HoPerThread = 2;
+    constexpr unsigned WoPerThread = 2;
+
+    constexpr unsigned InBlockCopyDataPerRead  = 2;
+    constexpr unsigned WeiBlockCopyDataPerRead = 4;
+
    constexpr unsigned BlockSize = 128;
#endif
@@ -57,27 +78,30 @@ void device_direct_convolution_2(InDesc,
    for(unsigned i = 0; i < nrepeat; ++i)
    {
-        float time = launch_kernel(gridwise_direct_convolution_2<T,
-                                                                 InDesc,
-                                                                 WeiDesc,
-                                                                 OutDesc,
-                                                                 NPerBlock,
-                                                                 KPerBlock,
-                                                                 CPerBlock,
-                                                                 HoPerBlock,
-                                                                 WoPerBlock,
-                                                                 NPerThread,
-                                                                 KPerThread,
-                                                                 CPerThread,
-                                                                 HoPerThread,
-                                                                 WoPerThread,
-                                                                 BlockSize,
-                                                                 GridSize>,
-                                   dim3(GridSize),
-                                   dim3(BlockSize),
-                                   static_cast<T*>(in_device_buf.GetDeviceBuffer()),
-                                   static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
-                                   static_cast<T*>(out_device_buf.GetDeviceBuffer()));
+        float time =
+            launch_kernel(gridwise_direct_convolution_2_nchw_kcyx_nkhw<T,
+                                                                       InDesc,
+                                                                       WeiDesc,
+                                                                       OutDesc,
+                                                                       NPerBlock,
+                                                                       KPerBlock,
+                                                                       CPerBlock,
+                                                                       HoPerBlock,
+                                                                       WoPerBlock,
+                                                                       NPerThread,
+                                                                       KPerThread,
+                                                                       CPerThread,
+                                                                       HoPerThread,
+                                                                       WoPerThread,
+                                                                       InBlockCopyDataPerRead,
+                                                                       WeiBlockCopyDataPerRead,
+                                                                       BlockSize,
+                                                                       GridSize>,
+                          dim3(GridSize),
+                          dim3(BlockSize),
+                          static_cast<T*>(in_device_buf.GetDeviceBuffer()),
+                          static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
+                          static_cast<T*>(out_device_buf.GetDeviceBuffer()));
printf("Elapsed time : %f ms\n", time); printf("Elapsed time : %f ms\n", time);
usleep(std::min(time * 1000, float(10000))); usleep(std::min(time * 1000, float(10000)));
......
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp"
template <class TInWei, class TOut, class InDesc, class WeiDesc, class OutDesc>
void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc,
const Tensor<TInWei>& in_nchw,
WeiDesc,
const Tensor<TInWei>& wei_kcyx,
OutDesc,
Tensor<TOut>& out_nkhw,
unsigned nrepeat)
{
// this assumes the in / wei data type is int8, packed as int8x4
constexpr unsigned NVector = 4;
using accum_t = int32_t;
using vector_t = vector_type<TInWei, NVector>;
using vector_mem_t = typename vector_t::MemoryType;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_nchw_desc = InDesc{};
constexpr auto wei_kcyx_desc = WeiDesc{};
constexpr auto out_nkhw_desc = OutDesc{};
constexpr unsigned Hi = in_nchw_desc.GetLength(I2);
constexpr unsigned Wi = in_nchw_desc.GetLength(I3);
constexpr unsigned N = out_nkhw_desc.GetLength(I0);
constexpr unsigned Ho = out_nkhw_desc.GetLength(I2);
constexpr unsigned Wo = out_nkhw_desc.GetLength(I3);
constexpr unsigned K = wei_kcyx_desc.GetLength(I0);
constexpr unsigned C = wei_kcyx_desc.GetLength(I1);
constexpr unsigned Y = wei_kcyx_desc.GetLength(I2);
constexpr unsigned X = wei_kcyx_desc.GetLength(I3);
// vectorized input
auto in_nchw_vec_desc = make_ConstantTensorDescriptor(Sequence<N, C / NVector, Hi, Wi>{});
ostream_ConstantTensorDescriptor(in_nchw_vec_desc, std::cout << "in_nchw_vec_desc: ");
Tensor<vector_mem_t> in_nchw_vec(make_TensorDescriptor(in_nchw_vec_desc));
auto f_vectorized_nchw = [&](auto n, auto c, auto h, auto w) {
#if 0
in_nchw_vec(n, c, h, w) = in_nchw(n, c, h, w);
#elif 0
in_nchw_vec(n, c, h, w) =
vector_t::Pack(in_nchw(n, 2 * c, h, w), in_nchw(n, 2 * c + 1, h, w));
#elif 1
in_nchw_vec(n, c, h, w) = vector_t::Pack(in_nchw(n, 4 * c, h, w),
in_nchw(n, 4 * c + 1, h, w),
in_nchw(n, 4 * c + 2, h, w),
in_nchw(n, 4 * c + 3, h, w));
#endif
};
make_ParallelTensorFunctor(f_vectorized_nchw, N, C / NVector, Hi, Wi)(
std::thread::hardware_concurrency());
// vectorize weight
auto wei_kcyx_vec_desc = make_ConstantTensorDescriptor(Sequence<K, C / NVector, Y, X>{});
ostream_ConstantTensorDescriptor(wei_kcyx_vec_desc, std::cout << "wei_kcyx_vec_desc: ");
Tensor<vector_mem_t> wei_kcyx_vec(make_TensorDescriptor(wei_kcyx_vec_desc));
auto f_vectorized_kcyx = [&](auto k, auto c, auto y, auto x) {
#if 0
wei_kcyx_vec(k, c, y, x) = wei_kcyx(k, c, y, x);
#elif 0
wei_kcyx_vec(k, c, y, x) =
vector_t::Pack(wei_kcyx(k, 2 * c, y, x), wei_kcyx(k, 2 * c + 1, y, x));
#elif 1
wei_kcyx_vec(k, c, y, x) = vector_t::Pack(wei_kcyx(k, 4 * c, y, x),
wei_kcyx(k, 4 * c + 1, y, x),
wei_kcyx(k, 4 * c + 2, y, x),
wei_kcyx(k, 4 * c + 3, y, x));
#endif
};
make_ParallelTensorFunctor(f_vectorized_kcyx, K, C / NVector, Y, X)(
std::thread::hardware_concurrency());
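vector_t::Pack above packs NVector adjacent channels into one vector word, so the C dimension shrinks by NVector and every load moves NVector values at once. A minimal stand-in for the int8x4 case, assuming little-endian byte order and a plain int32_t backing word (both assumptions; the repo's vector_type is not shown in this diff):

#include <cstdint>
// Hypothetical stand-in for vector_type<int8_t, 4>::Pack:
// byte 0 holds channel c, byte 1 holds c+1, and so on.
inline std::int32_t pack_int8x4(std::int8_t x0, std::int8_t x1, std::int8_t x2, std::int8_t x3)
{
    return std::int32_t(std::uint8_t(x0)) | (std::int32_t(std::uint8_t(x1)) << 8) |
           (std::int32_t(std::uint8_t(x2)) << 16) | (std::int32_t(std::uint8_t(x3)) << 24);
}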
//
DeviceMem in_nchw_vec_device_buf(sizeof(vector_mem_t) * in_nchw_vec.mDesc.GetElementSpace());
DeviceMem wei_kcyx_vec_device_buf(sizeof(vector_mem_t) * wei_kcyx_vec.mDesc.GetElementSpace());
DeviceMem out_nkhw_device_buf(sizeof(TOut) * out_nkhw.mDesc.GetElementSpace());
in_nchw_vec_device_buf.ToDevice(in_nchw_vec.mData.data());
wei_kcyx_vec_device_buf.ToDevice(wei_kcyx_vec.mData.data());
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
#if 0
// 3x3, 34x34, 128 thread, fp32, vector = 1
constexpr unsigned NPerBlock = 2;
constexpr unsigned KPerBlock = 32;
constexpr unsigned CPerBlock = 4;
constexpr unsigned HoPerBlock = 2;
constexpr unsigned WoPerBlock = 32;
constexpr unsigned NPerThread = 2;
constexpr unsigned KPerThread = 4;
constexpr unsigned CPerThread = 2;
constexpr unsigned HoPerThread = 2;
constexpr unsigned WoPerThread = 2;
constexpr unsigned InBlockCopyDataPerRead = 2;
constexpr unsigned WeiBlockCopyDataPerRead = 2;
constexpr unsigned BlockSize = 128;
#elif 0
// 3x3, 34x34, 128 thread, fp32, vector = 2
constexpr unsigned NPerBlock = 2;
constexpr unsigned KPerBlock = 32;
constexpr unsigned CPerBlock = 2;
constexpr unsigned HoPerBlock = 2;
constexpr unsigned WoPerBlock = 32;
constexpr unsigned NPerThread = 2;
constexpr unsigned KPerThread = 4;
constexpr unsigned CPerThread = 1;
constexpr unsigned HoPerThread = 2;
constexpr unsigned WoPerThread = 2;
constexpr unsigned InBlockCopyDataPerRead = 2;
constexpr unsigned WeiBlockCopyDataPerRead = 2;
constexpr unsigned BlockSize = 128;
#elif 0
// 3x3, 34x34, 128 thread, int8, vector = 4
constexpr unsigned NPerBlock = 2;
constexpr unsigned KPerBlock = 32;
constexpr unsigned CPerBlock = 8;
constexpr unsigned HoPerBlock = 4;
constexpr unsigned WoPerBlock = 32;
constexpr unsigned NPerThread = 1;
constexpr unsigned KPerThread = 8;
constexpr unsigned CPerThread = 2;
constexpr unsigned HoPerThread = 4;
constexpr unsigned WoPerThread = 2;
constexpr unsigned InBlockCopyDataPerRead = 2;
constexpr unsigned WeiBlockCopyDataPerRead = 2;
constexpr unsigned BlockSize = 128;
#elif 1
// 1x1, 32x32, 128 thread, int8, vector = 4
constexpr unsigned NPerBlock = 1;
constexpr unsigned KPerBlock = 64;
constexpr unsigned CPerBlock = 16;
constexpr unsigned HoPerBlock = 4;
constexpr unsigned WoPerBlock = 32;
constexpr unsigned NPerThread = 1;
constexpr unsigned KPerThread = 8;
constexpr unsigned CPerThread = 2;
constexpr unsigned HoPerThread = 4;
constexpr unsigned WoPerThread = 2;
constexpr unsigned InBlockCopyDataPerRead = 2;
constexpr unsigned WeiBlockCopyDataPerRead = 2;
constexpr unsigned BlockSize = 128;
#endif
constexpr unsigned GridSize =
(N / NPerBlock) * (K / KPerBlock) * (Ho / HoPerBlock) * (Wo / WoPerBlock);
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
for(unsigned i = 0; i < nrepeat; ++i)
{
float time = launch_kernel(
gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw<TInWei,
TOut,
accum_t,
decltype(in_nchw_vec_desc),
decltype(wei_kcyx_vec_desc),
decltype(out_nkhw_desc),
NVector,
NPerBlock,
KPerBlock,
CPerBlock,
HoPerBlock,
WoPerBlock,
NPerThread,
KPerThread,
CPerThread,
HoPerThread,
WoPerThread,
InBlockCopyDataPerRead,
WeiBlockCopyDataPerRead,
BlockSize,
GridSize>,
dim3(GridSize),
dim3(BlockSize),
static_cast<TInWei*>(in_nchw_vec_device_buf.GetDeviceBuffer()),
static_cast<TInWei*>(wei_kcyx_vec_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(out_nkhw_device_buf.GetDeviceBuffer()));
printf("Elapsed time : %f ms\n", time);
usleep(std::min(time * 1000, float(10000)));
}
out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
}
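GridSize above is one workgroup per output tile: each block computes an NPerBlock x KPerBlock x HoPerBlock x WoPerBlock chunk of out_nkhw, so all four ratios must divide evenly. A worked check with made-up tensor sizes (the real N, K, Ho, Wo come from the descriptors; these numbers are illustrative only), using the 1x1 int8 config selected above:

// Illustrative only: suppose N = 64, K = 64, Ho = 32, Wo = 32.
// NPerBlock = 1, KPerBlock = 64, HoPerBlock = 4, WoPerBlock = 32 gives
// (64/1) * (64/64) * (32/4) * (32/32) = 64 * 1 * 8 * 1 = 512 workgroups.
constexpr unsigned GridSize_example = (64 / 1) * (64 / 64) * (32 / 4) * (32 / 32);
static_assert(GridSize_example == 512, "one workgroup per output tile");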
@@ -74,7 +74,7 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn(InDesc,
    wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
    out_khwn_device_buf.ToDevice(out_khwn.mData.data());

-#if 1
+#if 0
    // for 3x3, 34x34
    constexpr unsigned NPerBlock = 16;
    constexpr unsigned KPerBlock = 64;
@@ -213,7 +213,7 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn(InDesc,
    constexpr unsigned WoPerThread = 1;

    constexpr unsigned BlockSize = 128;
-#elif 1
+#elif 0
    // for 1x1, 28x28
    constexpr unsigned NPerBlock = 16;
    constexpr unsigned KPerBlock = 128;

@@ -245,6 +245,39 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn(InDesc,
    constexpr unsigned OutThreadCopyDataPerWrite = 2;
+    constexpr unsigned BlockSize = 128;
+#elif 1
+    // for 1x1, 14x14
+    constexpr unsigned NPerBlock  = 16;
+    constexpr unsigned KPerBlock  = 128;
+    constexpr unsigned CPerBlock  = 8;
+    constexpr unsigned HoPerBlock = 2;
+    constexpr unsigned WoPerBlock = 2;
+
+    constexpr unsigned NPerThread  = 4;
+    constexpr unsigned KPerThread  = 16;
+    constexpr unsigned CPerThread  = 1;
+    constexpr unsigned HoPerThread = 1;
+    constexpr unsigned WoPerThread = 1;
+
+    constexpr unsigned InBlockCopy_ThreadPerDimC = 8;
+    constexpr unsigned InBlockCopy_ThreadPerDimH = 2;
+    constexpr unsigned InBlockCopy_ThreadPerDimW = 2;
+    constexpr unsigned InBlockCopy_ThreadPerDimN = 4;
+    constexpr unsigned InBlockCopyDataPerRead  = 4;
+    constexpr unsigned WeiBlockCopyDataPerRead = 4;
+
+    constexpr unsigned GemmMPerThreadSubC = 4;
+    constexpr unsigned GemmNPerThreadSubC = 4;
+    constexpr unsigned GemmMLevel0Cluster = 4;
+    constexpr unsigned GemmNLevel0Cluster = 2;
+    constexpr unsigned GemmMLevel1Cluster = 2;
+    constexpr unsigned GemmNLevel1Cluster = 4;
+    constexpr unsigned GemmKPerThreadLoop = 1;
+    constexpr unsigned OutThreadCopyDataPerWrite = 2;
+
    constexpr unsigned BlockSize = 128;
#endif
...
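A quick consistency check on the new "1x1, 14x14" parameter set: the per-block output tile must equal BlockSize times the per-thread output tile. The arithmetic, as a standalone sketch:

// NPerBlock * KPerBlock * HoPerBlock * WoPerBlock = 16 * 128 * 2 * 2 = 8192 outputs per block;
// NPerThread * KPerThread * HoPerThread * WoPerThread = 4 * 16 * 1 * 1 = 64 outputs per thread.
constexpr unsigned tile_per_block  = 16 * 128 * 2 * 2;
constexpr unsigned tile_per_thread = 4 * 16 * 1 * 1;
static_assert(tile_per_block / tile_per_thread == 128, "matches BlockSize = 128");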
#pragma once
#include <unistd.h>
#include "device.hpp"
+#include "gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp"
#include "gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp"

template <class T, class InDesc, class WeiDesc, class OutDesc>

@@ -209,39 +210,43 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
    for(unsigned i = 0; i < nrepeat; ++i)
    {
-        float time =
-            launch_kernel(gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer<
-                              GridSize,
-                              BlockSize,
-                              T,
-                              decltype(in_chwn_desc),
-                              decltype(wei_cyxk_desc),
-                              decltype(out_khwn_desc),
-                              BPerBlock,
-                              KPerBlock,
-                              CPerBlock,
-                              BPerThread,
-                              KPerThread,
-                              GemmThreadPerColumnPerCluster,
-                              GemmThreadPerRowPerCluster,
-                              GemmMPerThreadSubC,
-                              GemmNPerThreadSubC,
-                              GemmMLevel0Cluster,
-                              GemmNLevel0Cluster,
-                              GemmMLevel1Cluster,
-                              GemmNLevel1Cluster,
-                              GemmKPerThreadLoop,
-                              InBlockCopyThreadPerDim0,
-                              InBlockCopyThreadPerDim1,
-                              WeiBlockCopyThreadPerDim0,
-                              WeiBlockCopyThreadPerDim1,
-                              InBlockCopyDataPerRead,
-                              WeiBlockCopyDataPerRead>,
-                          dim3(GridSize),
-                          dim3(BlockSize),
-                          static_cast<T*>(in_chwn_device_buf.GetDeviceBuffer()),
-                          static_cast<T*>(wei_cyxk_device_buf.GetDeviceBuffer()),
-                          static_cast<T*>(out_khwn_device_buf.GetDeviceBuffer()));
+        float time = launch_kernel(
+#if 0
+            gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn
+#else
+            gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer
+#endif
+            <GridSize,
+             BlockSize,
+             T,
+             decltype(in_chwn_desc),
+             decltype(wei_cyxk_desc),
+             decltype(out_khwn_desc),
+             BPerBlock,
+             KPerBlock,
+             CPerBlock,
+             BPerThread,
+             KPerThread,
+             GemmThreadPerColumnPerCluster,
+             GemmThreadPerRowPerCluster,
+             GemmMPerThreadSubC,
+             GemmNPerThreadSubC,
+             GemmMLevel0Cluster,
+             GemmNLevel0Cluster,
+             GemmMLevel1Cluster,
+             GemmNLevel1Cluster,
+             GemmKPerThreadLoop,
+             InBlockCopyThreadPerDim0,
+             InBlockCopyThreadPerDim1,
+             WeiBlockCopyThreadPerDim0,
+             WeiBlockCopyThreadPerDim1,
+             InBlockCopyDataPerRead,
+             WeiBlockCopyDataPerRead>,
+            dim3(GridSize),
+            dim3(BlockSize),
+            static_cast<T*>(in_chwn_device_buf.GetDeviceBuffer()),
+            static_cast<T*>(wei_cyxk_device_buf.GetDeviceBuffer()),
+            static_cast<T*>(out_khwn_device_buf.GetDeviceBuffer()));
printf("Elapsed time : %f ms\n", time); printf("Elapsed time : %f ms\n", time);
usleep(std::min(time * 1000, float(10000))); usleep(std::min(time * 1000, float(10000)));
......
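The _lds_double_buffer kernel selected above is not part of this diff; the usual shape of the technique is to overlap the loads for tile i+1 with the math on tile i, using two LDS buffers. A minimal hedged sketch (all names and the one-element-per-thread indexing are hypothetical, not the repo's kernel; launch with blockDim.x == TILE):

constexpr unsigned TILE = 256; // hypothetical tile size, in floats

__global__ void double_buffer_demo(const float* p_global, float* p_out, unsigned num_tiles)
{
    __shared__ float lds[2][TILE];
    float acc = 0;

    lds[0][threadIdx.x] = p_global[threadIdx.x]; // prefetch tile 0
    __syncthreads();

    for(unsigned i = 0; i + 1 < num_tiles; ++i)
    {
        // fill the buffer nobody reads this iteration...
        lds[(i + 1) % 2][threadIdx.x] = p_global[(i + 1) * TILE + threadIdx.x];

        acc += lds[i % 2][threadIdx.x]; // ...while "computing" on the current one

        __syncthreads(); // tile i+1 is complete before the next iteration reads it
    }

    acc += lds[(num_tiles - 1) % 2][threadIdx.x]; // tail tile
    p_out[blockIdx.x * blockDim.x + threadIdx.x] = acc;
}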
@@ -7,10 +7,11 @@
#include "tensor.hpp"
#include "ConstantTensorDescriptor.hip.hpp"
#include "conv_common.hip.hpp"
-#include "device_direct_convolution_1.hpp"
-#include "device_direct_convolution_2.hpp"
+//#include "device_direct_convolution_1.hpp"
+#include "device_direct_convolution_2_nchw_kcyx_nkhw.hpp"
+//#include "device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"
#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp"
-#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp"
+//#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp"
#include "device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp"
struct GeneratorTensor_1

@@ -34,25 +35,6 @@ struct GeneratorTensor_2
    }
};
-struct GeneratorTensor_3
-{
-    template <class... Is>
-    double operator()(Is... is)
-    {
-#if 0
-        std::initializer_list<std::size_t> ls = {static_cast<std::size_t>(is)...};
-        return std::accumulate(ls.begin(), ls.end(), std::size_t(0));
-#elif 1
-        assert(sizeof...(Is) > 0);
-
-        std::initializer_list<std::size_t> ids = {static_cast<std::size_t>(is)...};
-        std::vector<std::size_t> lens(sizeof...(Is), 100);
-        std::vector<std::size_t> strides(sizeof...(Is), 1);
-
-        std::partial_sum(lens.rbegin(), lens.rbegin() + (sizeof...(Is)-1), strides.rbegin() + 1);
-
-        return std::inner_product(ids.begin(), ids.end(), strides.begin(), std::size_t(0)) + 1;
-#endif
-    }
-};
struct GeneratorTensor_Checkboard
{
    template <class... Ts>

@@ -106,9 +88,12 @@ auto make_TensorDescriptor(TConstTensorDesc)
    return TensorDescriptor(lengths, strides);
}
-template <class T, class LowerPads, class UpperPads>
-void host_direct_convolution(
-    const Tensor<T>& in_nchw, const Tensor<T>& wei_kcyx, Tensor<T>& out, LowerPads, UpperPads)
+template <class TIn, class TWei, class TOut, class LowerPads, class UpperPads>
+void host_direct_convolution(const Tensor<TIn>& in_nchw,
+                             const Tensor<TWei>& wei_kcyx,
+                             Tensor<TOut>& out_nkhw,
+                             LowerPads,
+                             UpperPads)
{
    unsigned h_pad_low = LowerPads{}.Get(Number<0>{});
    unsigned w_pad_low = LowerPads{}.Get(Number<1>{});
@@ -129,26 +114,29 @@ void host_direct_convolution(
                    if(hi >= 0 && hi < in_nchw.mDesc.GetLengths()[2] && wi >= 0 &&
                       wi < in_nchw.mDesc.GetLengths()[3])
                    {
-                        v += in_nchw(n, c, hi, wi) * wei_kcyx(k, c, y, x);
+                        v += double(in_nchw(n, c, hi, wi)) * double(wei_kcyx(k, c, y, x));
                    }
                }
            }
        }
-        out(n, k, ho, wo) = v;
+        out_nkhw(n, k, ho, wo) = v;
    };

    auto f_par = make_ParallelTensorFunctor(f,
-                                            out.mDesc.GetLengths()[0],
-                                            out.mDesc.GetLengths()[1],
-                                            out.mDesc.GetLengths()[2],
-                                            out.mDesc.GetLengths()[3]);
+                                            out_nkhw.mDesc.GetLengths()[0],
+                                            out_nkhw.mDesc.GetLengths()[1],
+                                            out_nkhw.mDesc.GetLengths()[2],
+                                            out_nkhw.mDesc.GetLengths()[3]);

    f_par(std::thread::hardware_concurrency());
}
-template <class T, class LowerPads, class UpperPads>
-void host_winograd_3x3_convolution(
-    const Tensor<T>& in_nchw, const Tensor<T>& wei_kcyx, Tensor<T>& out, LowerPads, UpperPads)
+template <class TIn, class TWei, class TOut, class LowerPads, class UpperPads>
+void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
+                                   const Tensor<TWei>& wei_kcyx,
+                                   Tensor<TOut>& out_nkhw,
+                                   LowerPads,
+                                   UpperPads)
{
    constexpr std::size_t HoPerTile = 2;
    constexpr std::size_t WoPerTile = 2;
@@ -162,8 +150,8 @@ void host_winograd_3x3_convolution(
    std::size_t Y = wei_kcyx.mDesc.GetLengths()[2];
    std::size_t X = wei_kcyx.mDesc.GetLengths()[3];

-    std::size_t HO = out.mDesc.GetLengths()[2];
-    std::size_t WO = out.mDesc.GetLengths()[3];
+    std::size_t HO = out_nkhw.mDesc.GetLengths()[2];
+    std::size_t WO = out_nkhw.mDesc.GetLengths()[3];

    unsigned h_pad_low = LowerPads{}.Get(Number<0>{});
    unsigned w_pad_low = LowerPads{}.Get(Number<1>{});
@@ -177,11 +165,11 @@ void host_winograd_3x3_convolution(
    std::size_t HTile = (HO + HoPerTile - 1) / HoPerTile;
    std::size_t WTile = (WO + WoPerTile - 1) / WoPerTile;

-    Tensor<T> in_hold({N, C, HTile, WTile, HiPerTile, WiPerTile});
-    Tensor<T> in_transform({N, C, HTile, WTile, HiPerTile, WiPerTile});
-    Tensor<T> wei_transform({K, C, HiPerTile, WiPerTile});
-    Tensor<T> out_transform({N, K, HTile, WTile, HiPerTile, HiPerTile});
-    Tensor<T> out_hold({N, K, HTile, WTile, HoPerTile, WoPerTile});
+    Tensor<double> in_hold({N, C, HTile, WTile, HiPerTile, WiPerTile});
+    Tensor<double> in_transform({N, C, HTile, WTile, HiPerTile, WiPerTile});
+    Tensor<double> wei_transform({K, C, HiPerTile, WiPerTile});
+    Tensor<double> out_transform({N, K, HTile, WTile, HiPerTile, HiPerTile});
+    Tensor<double> out_hold({N, K, HTile, WTile, HoPerTile, WoPerTile});
    auto f_in_hold = [&](auto n, auto c, auto htile, auto wtile) {
        for(int j = 0; j < HiPerTile; ++j)

@@ -198,7 +186,7 @@ void host_winograd_3x3_convolution(
                }
                else
                {
-                    in_hold(n, c, htile, wtile, j, i) = T(0);
+                    in_hold(n, c, htile, wtile, j, i) = TIn(0);
                }
            }
        }
@@ -259,49 +247,61 @@ void host_winograd_3x3_convolution(
    };

    auto f_wei_transform = [&](auto k, auto c) {
-        wei_transform(k, c, 0, 0) = wei_kcyx(k, c, 0, 0);
-        wei_transform(k, c, 0, 1) =
-            0.5 * wei_kcyx(k, c, 0, 0) + 0.5 * wei_kcyx(k, c, 0, 1) + 0.5 * wei_kcyx(k, c, 0, 2);
-        wei_transform(k, c, 0, 2) =
-            0.5 * wei_kcyx(k, c, 0, 0) - 0.5 * wei_kcyx(k, c, 0, 1) + 0.5 * wei_kcyx(k, c, 0, 2);
-        wei_transform(k, c, 0, 3) = wei_kcyx(k, c, 0, 2);
-
-        wei_transform(k, c, 1, 0) =
-            0.5 * wei_kcyx(k, c, 0, 0) + 0.5 * wei_kcyx(k, c, 1, 0) + 0.5 * wei_kcyx(k, c, 2, 0);
-        wei_transform(k, c, 1, 1) = 0.25 * wei_kcyx(k, c, 0, 0) + 0.25 * wei_kcyx(k, c, 0, 1) +
-                                    0.25 * wei_kcyx(k, c, 0, 2) + 0.25 * wei_kcyx(k, c, 1, 0) +
-                                    0.25 * wei_kcyx(k, c, 1, 1) + 0.25 * wei_kcyx(k, c, 1, 2) +
-                                    0.25 * wei_kcyx(k, c, 2, 0) + 0.25 * wei_kcyx(k, c, 2, 1) +
-                                    0.25 * wei_kcyx(k, c, 2, 2);
-        wei_transform(k, c, 1, 2) = 0.25 * wei_kcyx(k, c, 0, 0) - 0.25 * wei_kcyx(k, c, 0, 1) +
-                                    0.25 * wei_kcyx(k, c, 0, 2) + 0.25 * wei_kcyx(k, c, 1, 0) -
-                                    0.25 * wei_kcyx(k, c, 1, 1) + 0.25 * wei_kcyx(k, c, 1, 2) +
-                                    0.25 * wei_kcyx(k, c, 2, 0) - 0.25 * wei_kcyx(k, c, 2, 1) +
-                                    0.25 * wei_kcyx(k, c, 2, 2);
-        wei_transform(k, c, 1, 3) =
-            0.5 * wei_kcyx(k, c, 0, 2) + 0.5 * wei_kcyx(k, c, 1, 2) + 0.5 * wei_kcyx(k, c, 2, 2);
-
-        wei_transform(k, c, 2, 0) =
-            0.5 * wei_kcyx(k, c, 0, 0) - 0.5 * wei_kcyx(k, c, 1, 0) + 0.5 * wei_kcyx(k, c, 2, 0);
-        wei_transform(k, c, 2, 1) = 0.25 * wei_kcyx(k, c, 0, 0) + 0.25 * wei_kcyx(k, c, 0, 1) +
-                                    0.25 * wei_kcyx(k, c, 0, 2) - 0.25 * wei_kcyx(k, c, 1, 0) -
-                                    0.25 * wei_kcyx(k, c, 1, 1) - 0.25 * wei_kcyx(k, c, 1, 2) +
-                                    0.25 * wei_kcyx(k, c, 2, 0) + 0.25 * wei_kcyx(k, c, 2, 1) +
-                                    0.25 * wei_kcyx(k, c, 2, 2);
-        wei_transform(k, c, 2, 2) = 0.25 * wei_kcyx(k, c, 0, 0) - 0.25 * wei_kcyx(k, c, 0, 1) +
-                                    0.25 * wei_kcyx(k, c, 0, 2) - 0.25 * wei_kcyx(k, c, 1, 0) +
-                                    0.25 * wei_kcyx(k, c, 1, 1) - 0.25 * wei_kcyx(k, c, 1, 2) +
-                                    0.25 * wei_kcyx(k, c, 2, 0) - 0.25 * wei_kcyx(k, c, 2, 1) +
-                                    0.25 * wei_kcyx(k, c, 2, 2);
-        wei_transform(k, c, 2, 3) =
-            0.5 * wei_kcyx(k, c, 0, 2) - 0.5 * wei_kcyx(k, c, 1, 2) + 0.5 * wei_kcyx(k, c, 2, 2);
-
-        wei_transform(k, c, 3, 0) = wei_kcyx(k, c, 2, 0);
-        wei_transform(k, c, 3, 1) =
-            0.5 * wei_kcyx(k, c, 2, 0) + 0.5 * wei_kcyx(k, c, 2, 1) + 0.5 * wei_kcyx(k, c, 2, 2);
-        wei_transform(k, c, 3, 2) =
-            0.5 * wei_kcyx(k, c, 2, 0) - 0.5 * wei_kcyx(k, c, 2, 1) + 0.5 * wei_kcyx(k, c, 2, 2);
-        wei_transform(k, c, 3, 3) = wei_kcyx(k, c, 2, 2);
+        wei_transform(k, c, 0, 0) = double(wei_kcyx(k, c, 0, 0));
+        wei_transform(k, c, 0, 1) = 0.5 * double(wei_kcyx(k, c, 0, 0)) +
+                                    0.5 * double(wei_kcyx(k, c, 0, 1)) +
+                                    0.5 * double(wei_kcyx(k, c, 0, 2));
+        wei_transform(k, c, 0, 2) = 0.5 * double(wei_kcyx(k, c, 0, 0)) -
+                                    0.5 * double(wei_kcyx(k, c, 0, 1)) +
+                                    0.5 * double(wei_kcyx(k, c, 0, 2));
+        wei_transform(k, c, 0, 3) = double(wei_kcyx(k, c, 0, 2));
+
+        wei_transform(k, c, 1, 0) = 0.5 * double(wei_kcyx(k, c, 0, 0)) +
+                                    0.5 * double(wei_kcyx(k, c, 1, 0)) +
+                                    0.5 * double(wei_kcyx(k, c, 2, 0));
+        wei_transform(k, c, 1, 1) =
+            0.25 * double(wei_kcyx(k, c, 0, 0)) + 0.25 * double(wei_kcyx(k, c, 0, 1)) +
+            0.25 * double(wei_kcyx(k, c, 0, 2)) + 0.25 * double(wei_kcyx(k, c, 1, 0)) +
+            0.25 * double(wei_kcyx(k, c, 1, 1)) + 0.25 * double(wei_kcyx(k, c, 1, 2)) +
+            0.25 * double(wei_kcyx(k, c, 2, 0)) + 0.25 * double(wei_kcyx(k, c, 2, 1)) +
+            0.25 * double(wei_kcyx(k, c, 2, 2));
+        wei_transform(k, c, 1, 2) =
+            0.25 * double(wei_kcyx(k, c, 0, 0)) - 0.25 * double(wei_kcyx(k, c, 0, 1)) +
+            0.25 * double(wei_kcyx(k, c, 0, 2)) + 0.25 * double(wei_kcyx(k, c, 1, 0)) -
+            0.25 * double(wei_kcyx(k, c, 1, 1)) + 0.25 * double(wei_kcyx(k, c, 1, 2)) +
+            0.25 * double(wei_kcyx(k, c, 2, 0)) - 0.25 * double(wei_kcyx(k, c, 2, 1)) +
+            0.25 * double(wei_kcyx(k, c, 2, 2));
+        wei_transform(k, c, 1, 3) = 0.5 * double(wei_kcyx(k, c, 0, 2)) +
+                                    0.5 * double(wei_kcyx(k, c, 1, 2)) +
+                                    0.5 * double(wei_kcyx(k, c, 2, 2));
+
+        wei_transform(k, c, 2, 0) = 0.5 * double(wei_kcyx(k, c, 0, 0)) -
+                                    0.5 * double(wei_kcyx(k, c, 1, 0)) +
+                                    0.5 * double(wei_kcyx(k, c, 2, 0));
+        wei_transform(k, c, 2, 1) =
+            0.25 * double(wei_kcyx(k, c, 0, 0)) + 0.25 * double(wei_kcyx(k, c, 0, 1)) +
+            0.25 * double(wei_kcyx(k, c, 0, 2)) - 0.25 * double(wei_kcyx(k, c, 1, 0)) -
+            0.25 * double(wei_kcyx(k, c, 1, 1)) - 0.25 * double(wei_kcyx(k, c, 1, 2)) +
+            0.25 * double(wei_kcyx(k, c, 2, 0)) + 0.25 * double(wei_kcyx(k, c, 2, 1)) +
+            0.25 * double(wei_kcyx(k, c, 2, 2));
+        wei_transform(k, c, 2, 2) =
+            0.25 * double(wei_kcyx(k, c, 0, 0)) - 0.25 * double(wei_kcyx(k, c, 0, 1)) +
+            0.25 * double(wei_kcyx(k, c, 0, 2)) - 0.25 * double(wei_kcyx(k, c, 1, 0)) +
+            0.25 * double(wei_kcyx(k, c, 1, 1)) - 0.25 * double(wei_kcyx(k, c, 1, 2)) +
+            0.25 * double(wei_kcyx(k, c, 2, 0)) - 0.25 * double(wei_kcyx(k, c, 2, 1)) +
+            0.25 * double(wei_kcyx(k, c, 2, 2));
+        wei_transform(k, c, 2, 3) = 0.5 * double(wei_kcyx(k, c, 0, 2)) -
+                                    0.5 * double(wei_kcyx(k, c, 1, 2)) +
+                                    0.5 * double(wei_kcyx(k, c, 2, 2));
+
+        wei_transform(k, c, 3, 0) = double(wei_kcyx(k, c, 2, 0));
+        wei_transform(k, c, 3, 1) = 0.5 * double(wei_kcyx(k, c, 2, 0)) +
+                                    0.5 * double(wei_kcyx(k, c, 2, 1)) +
+                                    0.5 * double(wei_kcyx(k, c, 2, 2));
+        wei_transform(k, c, 3, 2) = 0.5 * double(wei_kcyx(k, c, 2, 0)) -
+                                    0.5 * double(wei_kcyx(k, c, 2, 1)) +
+                                    0.5 * double(wei_kcyx(k, c, 2, 2));
+        wei_transform(k, c, 3, 3) = double(wei_kcyx(k, c, 2, 2));
    };
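For reference, these coefficients are the standard Winograd F(2x2, 3x3) filter transform: per (k, c), the 3x3 filter g becomes the 4x4 tile

\tilde{g} = G\,g\,G^{\mathsf T}, \qquad G = \begin{pmatrix} 1 & 0 & 0 \\ \tfrac{1}{2} & \tfrac{1}{2} & \tfrac{1}{2} \\ \tfrac{1}{2} & -\tfrac{1}{2} & \tfrac{1}{2} \\ 0 & 0 & 1 \end{pmatrix}

e.g. \tilde{g}_{0,1} = \tfrac{1}{2}(g_{0,0} + g_{0,1} + g_{0,2}), which is wei_transform(k, c, 0, 1) above.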
    auto f_out_transform = [&](auto n, auto k, auto htile, auto wtile) {

@@ -354,7 +354,7 @@ void host_winograd_3x3_convolution(
                for(int i = 0; i < WoPerTile; ++i)
                {
                    std::size_t wo = WoPerTile * wtile + i;
-                    out(n, k, ho, wo) = out_hold(n, k, htile, wtile, j, i);
+                    out_nkhw(n, k, ho, wo) = out_hold(n, k, htile, wtile, j, i);
                }
            }
    };
@@ -372,20 +372,25 @@ void host_winograd_3x3_convolution(
template <class T>
void check_error(const Tensor<T>& ref, const Tensor<T>& result)
{
+    // printf("\n");
    float error = 0;
    float max_diff = -1;
    float ref_value = 0, result_value = 0;
    for(int i = 0; i < ref.mData.size(); ++i)
    {
-        error += std::abs(ref.mData[i] - result.mData[i]);
-        float diff = std::abs(ref.mData[i] - result.mData[i]);
+        error += std::abs(double(ref.mData[i]) - double(result.mData[i]));
+        float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
        if(max_diff < diff)
        {
            max_diff = diff;
            ref_value = ref.mData[i];
            result_value = result.mData[i];
        }
+        // printf("{%f, %f}", double(ref.mData[i]), double(result.mData[i]));
    }
+    // printf("\n");

    std::cout << "error: " << error << std::endl;
    std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value << std::endl;
@@ -404,7 +409,7 @@ int main(int argc, char* argv[])
    constexpr unsigned HPad = 0;
    constexpr unsigned WPad = 0;
-#elif 1
+#elif 0
    // 3x3, 34x34
    constexpr unsigned N = 64;
    constexpr unsigned C = 256;

@@ -563,6 +568,30 @@ int main(int argc, char* argv[])
    constexpr unsigned HPad = 2;
    constexpr unsigned WPad = 2;
+#elif 0
+    // 1x1 filter, 32x32 image
+    constexpr unsigned N = 64;
+    constexpr unsigned C = 256;
+    constexpr unsigned HI = 32;
+    constexpr unsigned WI = 32;
+    constexpr unsigned K = 512;
+    constexpr unsigned Y = 1;
+    constexpr unsigned X = 1;
+    constexpr unsigned HPad = 0;
+    constexpr unsigned WPad = 0;
+#elif 1
+    // 1x1 filter, 14x14 image
+    constexpr unsigned N = 128;
+    constexpr unsigned C = 2048;
+    constexpr unsigned HI = 14;
+    constexpr unsigned WI = 14;
+    constexpr unsigned K = 512;
+    constexpr unsigned Y = 1;
+    constexpr unsigned X = 1;
+    constexpr unsigned HPad = 0;
+    constexpr unsigned WPad = 0;
#endif

    auto lower_pads = Sequence<HPad, WPad>{};
@@ -577,10 +606,12 @@ int main(int argc, char* argv[])
    ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: ");
    ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: ");

-    Tensor<float> in_nchw(make_TensorDescriptor(in_nchw_desc));
-    Tensor<float> wei_kcyx(make_TensorDescriptor(wei_kcyx_desc));
-    Tensor<float> out_nkhw_host(make_TensorDescriptor(out_nkhw_desc));
-    Tensor<float> out_nkhw_device(make_TensorDescriptor(out_nkhw_desc));
+    using in_data_t  = float;
+    using out_data_t = float;
+
+    Tensor<in_data_t> in_nchw(make_TensorDescriptor(in_nchw_desc));
+    Tensor<in_data_t> wei_kcyx(make_TensorDescriptor(wei_kcyx_desc));
+    Tensor<out_data_t> out_nkhw_host(make_TensorDescriptor(out_nkhw_desc));
+    Tensor<out_data_t> out_nkhw_device(make_TensorDescriptor(out_nkhw_desc));

    std::size_t num_thread = std::thread::hardware_concurrency();
@@ -601,9 +632,13 @@ int main(int argc, char* argv[])
#elif 1
        in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
-#elif 1
-        in_nchw.GenerateTensorValue(GeneratorTensor_2{-2, 2}, num_thread);
-        wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+#elif 0
+        in_nchw.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
+
+        auto gen_wei = [](auto... is) {
+            return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
+        };
+        wei_kcyx.GenerateTensorValue(gen_wei, num_thread);
#endif
    }
@@ -611,7 +646,9 @@ int main(int argc, char* argv[])
#if 0
    device_direct_convolution_1
#elif 0
-    device_direct_convolution_2
+    device_direct_convolution_2_nchw_kcyx_nkhw
+#elif 0
+    device_direct_convolution_2_vectorized_nchw_kcyx_nkhw
#elif 1
    device_implicit_gemm_convolution_1_chwn_cyxk_khwn
#elif 0
@@ -633,7 +670,6 @@ int main(int argc, char* argv[])
    if(do_verification)
    {
-#if 1
        if(Y == 3 && X == 3)
        {
            host_winograd_3x3_convolution(in_nchw, wei_kcyx, out_nkhw_host, lower_pads, upper_pads);

@@ -643,7 +679,6 @@ int main(int argc, char* argv[])
            host_direct_convolution(in_nchw, wei_kcyx, out_nkhw_host, lower_pads, upper_pads);
        }

        check_error(out_nkhw_host, out_nkhw_device);
-#endif

#if 0
        LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
...
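GeneratorTensor_2 and GeneratorTensor_Checkboard are defined near the top of this file (partly elided above). A hedged sketch of the composition gen_wei relies on, a bounded random magnitude times a +/-1 checkerboard sign; both bodies below are assumptions, not the repo's definitions:

#include <cstdlib>

struct HypotheticalGeneratorTensor_2
{
    int min_value, max_value;

    template <class... Is>
    double operator()(Is...) const
    {
        // uniform integer in [min_value, max_value), independent of the index
        return min_value + std::rand() % (max_value - min_value);
    }
};

struct HypotheticalGeneratorTensor_Checkboard
{
    template <class... Is>
    double operator()(Is... is) const
    {
        // +1 / -1 by parity of the index sum
        const unsigned ids[] = {static_cast<unsigned>(is)...};
        unsigned sum = 0;
        for(unsigned v : ids)
            sum += v;
        return (sum % 2 == 0) ? 1.0 : -1.0;
    }
};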
#pragma once
template <class TData, unsigned NSize>
struct Array
{
using Type = Array<TData, NSize>;
static constexpr unsigned nSize = NSize;
    TData mData[nSize];
template <class... Xs>
__host__ __device__ Array(Xs... xs) : mData{static_cast<TData>(xs)...}
{
}
__host__ __device__ TData operator[](unsigned i) const { return mData[i]; }
};
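A minimal usage sketch for this Array, matching the variadic constructor and operator[] above:

__host__ __device__ unsigned array_demo()
{
    Array<unsigned, 4> multi_id(2u, 3u, 5u, 7u); // mData brace-initialized from the pack
    return multi_id[2];                          // == 5
}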
@@ -65,8 +65,8 @@ __host__ __device__ constexpr auto calculate_default_strides_aligned(Sequence<L0
template <class Lengths, class Strides>
struct ConstantTensorDescriptor
{
+    using Type = ConstantTensorDescriptor<Lengths, Strides>;
    static constexpr unsigned nDim = Lengths::nDim;
-    using NDimConstant = Number<nDim>;

    __host__ __device__ constexpr ConstantTensorDescriptor()
    {
@@ -91,293 +91,70 @@ struct ConstantTensorDescriptor
        return Strides{}.Get(Number<I>{});
    }
-    __host__ __device__ constexpr unsigned GetElementSize() const
-    {
-        static_assert(nDim >= 2 && nDim <= 8, "nDim");
-
-        if(nDim == 2)
-        {
-            constexpr auto I0 = Number<0>{};
-            constexpr auto I1 = Number<1>{};
-
-            return GetLength(I0) * GetLength(I1);
-        }
-        else if(nDim == 3)
-        {
-            constexpr auto I0 = Number<0>{};
-            constexpr auto I1 = Number<1>{};
-            constexpr auto I2 = Number<2>{};
-
-            return GetLength(I0) * GetLength(I1) * GetLength(I2);
-        }
-        else if(nDim == 4)
-        {
-            constexpr auto I0 = Number<0>{};
-            constexpr auto I1 = Number<1>{};
-            constexpr auto I2 = Number<2>{};
-            constexpr auto I3 = Number<3>{};
-
-            return GetLength(I0) * GetLength(I1) * GetLength(I2) * GetLength(I3);
-        }
-        else if(nDim == 5)
-        {
-            constexpr auto I0 = Number<0>{};
-            constexpr auto I1 = Number<1>{};
-            constexpr auto I2 = Number<2>{};
-            constexpr auto I3 = Number<3>{};
-            constexpr auto I4 = Number<4>{};
-
-            return GetLength(I0) * GetLength(I1) * GetLength(I2) * GetLength(I3) * GetLength(I4);
-        }
-        else if(nDim == 6)
-        {
-            constexpr auto I0 = Number<0>{};
-            constexpr auto I1 = Number<1>{};
-            constexpr auto I2 = Number<2>{};
-            constexpr auto I3 = Number<3>{};
-            constexpr auto I4 = Number<4>{};
-            constexpr auto I5 = Number<5>{};
-
-            return GetLength(I0) * GetLength(I1) * GetLength(I2) * GetLength(I3) * GetLength(I4) *
-                   GetLength(I5);
-        }
-        else if(nDim == 7)
-        {
-            constexpr auto I0 = Number<0>{};
-            constexpr auto I1 = Number<1>{};
-            constexpr auto I2 = Number<2>{};
-            constexpr auto I3 = Number<3>{};
-            constexpr auto I4 = Number<4>{};
-            constexpr auto I5 = Number<5>{};
-            constexpr auto I6 = Number<6>{};
-
-            return GetLength(I0) * GetLength(I1) * GetLength(I2) * GetLength(I3) * GetLength(I4) *
-                   GetLength(I5) * GetLength(I6);
-        }
-        else if(nDim == 8)
-        {
-            constexpr auto I0 = Number<0>{};
-            constexpr auto I1 = Number<1>{};
-            constexpr auto I2 = Number<2>{};
-            constexpr auto I3 = Number<3>{};
-            constexpr auto I4 = Number<4>{};
-            constexpr auto I5 = Number<5>{};
-            constexpr auto I6 = Number<6>{};
-            constexpr auto I7 = Number<7>{};
-
-            return GetLength(I0) * GetLength(I1) * GetLength(I2) * GetLength(I3) * GetLength(I4) *
-                   GetLength(I5) * GetLength(I6) * GetLength(I7);
-        }
-        else
-        {
-            assert(false);
-        }
-    }
+    // c++14 doesn't support constexpr lambdas, has to use this trick instead
+    struct GetElementSize_f
+    {
+        template <class IDim>
+        __host__ __device__ constexpr unsigned operator()(IDim idim) const
+        {
+            return Type{}.GetLength(idim);
+        }
+    };
+
+    __host__ __device__ constexpr unsigned GetElementSize() const
+    {
+        // c++14 doesn't support constexpr lambdas, has to use this trick instead
+        struct multiply
+        {
+            __host__ __device__ constexpr unsigned operator()(unsigned a, unsigned b) const
+            {
+                return a * b;
+            }
+        };
+
+        return static_const_reduce_n<nDim>{}(GetElementSize_f{}, multiply{});
+    }
+
+    // c++14 doesn't support constexpr lambdas, has to use this trick instead
+    struct GetElementSpace_f
+    {
+        template <class IDim>
+        __host__ __device__ constexpr unsigned operator()(IDim idim) const
+        {
+            return (Type{}.GetLength(idim) - 1) * Type{}.GetStride(idim);
+        }
+    };
    template <class Align = Number<1>>
    __host__ __device__ constexpr unsigned GetElementSpace(Align align = Align{}) const
    {
-        static_assert(nDim >= 2 && nDim <= 8, "nDim");
-
-        constexpr unsigned align_size = align.Get();
-
-        if(nDim == 2)
-        {
-            constexpr auto I0 = Number<0>{};
-            constexpr auto I1 = Number<1>{};
-
-            return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) +
-                   align_size;
-        }
-        else if(nDim == 3)
-        {
-            constexpr auto I0 = Number<0>{};
-            constexpr auto I1 = Number<1>{};
-            constexpr auto I2 = Number<2>{};
-
-            return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) +
-                   (GetLength(I2) - 1) * GetStride(I2) + align_size;
-        }
-        else if(nDim == 4)
-        {
-            constexpr auto I0 = Number<0>{};
-            constexpr auto I1 = Number<1>{};
-            constexpr auto I2 = Number<2>{};
-            constexpr auto I3 = Number<3>{};
-
-            return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) +
-                   (GetLength(I2) - 1) * GetStride(I2) + (GetLength(I3) - 1) * GetStride(I3) +
-                   align_size;
-        }
-        else if(nDim == 5)
-        {
-            constexpr auto I0 = Number<0>{};
-            constexpr auto I1 = Number<1>{};
-            constexpr auto I2 = Number<2>{};
-            constexpr auto I3 = Number<3>{};
-            constexpr auto I4 = Number<4>{};
-
-            return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) +
-                   (GetLength(I2) - 1) * GetStride(I2) + (GetLength(I3) - 1) * GetStride(I3) +
-                   (GetLength(I4) - 1) * GetStride(I4) + align_size;
-        }
-        else if(nDim == 6)
-        {
-            constexpr auto I0 = Number<0>{};
-            constexpr auto I1 = Number<1>{};
-            constexpr auto I2 = Number<2>{};
-            constexpr auto I3 = Number<3>{};
-            constexpr auto I4 = Number<4>{};
-            constexpr auto I5 = Number<5>{};
-
-            return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) +
-                   (GetLength(I2) - 1) * GetStride(I2) + (GetLength(I3) - 1) * GetStride(I3) +
-                   (GetLength(I4) - 1) * GetStride(I4) + (GetLength(I5) - 1) * GetStride(I5) +
-                   align_size;
-        }
-        else if(nDim == 7)
-        {
-            constexpr auto I0 = Number<0>{};
-            constexpr auto I1 = Number<1>{};
-            constexpr auto I2 = Number<2>{};
-            constexpr auto I3 = Number<3>{};
-            constexpr auto I4 = Number<4>{};
-            constexpr auto I5 = Number<5>{};
-            constexpr auto I6 = Number<6>{};
-
-            return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) +
-                   (GetLength(I2) - 1) * GetStride(I2) + (GetLength(I3) - 1) * GetStride(I3) +
-                   (GetLength(I4) - 1) * GetStride(I4) + (GetLength(I5) - 1) * GetStride(I5) +
-                   (GetLength(I6) - 1) * GetStride(I6) + align_size;
-        }
-        else if(nDim == 8)
-        {
-            constexpr auto I0 = Number<0>{};
-            constexpr auto I1 = Number<1>{};
-            constexpr auto I2 = Number<2>{};
-            constexpr auto I3 = Number<3>{};
-            constexpr auto I4 = Number<4>{};
-            constexpr auto I5 = Number<5>{};
-            constexpr auto I6 = Number<6>{};
-            constexpr auto I7 = Number<7>{};
-
-            return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) +
-                   (GetLength(I2) - 1) * GetStride(I2) + (GetLength(I3) - 1) * GetStride(I3) +
-                   (GetLength(I4) - 1) * GetStride(I4) + (GetLength(I5) - 1) * GetStride(I5) +
-                   (GetLength(I6) - 1) * GetStride(I6) + (GetLength(I7) - 1) * GetStride(I7) +
-                   align_size;
-        }
+        // c++14 doesn't support constexpr lambdas, has to use this trick instead
+        struct add
+        {
+            __host__ __device__ constexpr unsigned operator()(unsigned a, unsigned b) const
+            {
+                return a + b;
+            }
+        };
+
+        return static_const_reduce_n<nDim>{}(GetElementSpace_f{}, add{}) + align.Get();
    }

-    // this is ugly, only for 2d
-    __host__ __device__ unsigned Get1dIndex(unsigned i0, unsigned i1) const
-    {
-        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
-
-        static_assert(nDim == 2, "nDim is not 2");
-
-        return i0 * GetStride(I0) + i1 * GetStride(I1);
-    }
-
-    // this is ugly, only for 3d
-    __host__ __device__ unsigned Get1dIndex(unsigned i0, unsigned i1, unsigned i2) const
-    {
-        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
-        constexpr auto I2 = Number<2>{};
-
-        static_assert(nDim == 3, "nDim is not 3");
-
-        return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2);
-    }
-    // this is ugly, only for 4d
-    __host__ __device__ unsigned
-    Get1dIndex(unsigned i0, unsigned i1, unsigned i2, unsigned i3) const
-    {
-        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
-        constexpr auto I2 = Number<2>{};
-        constexpr auto I3 = Number<3>{};
-
-        static_assert(nDim == 4, "nDim is not 4");
-
-        return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2) + i3 * GetStride(I3);
-    }
-
-    // this is ugly, only for 5d
-    __host__ __device__ unsigned
-    Get1dIndex(unsigned i0, unsigned i1, unsigned i2, unsigned i3, unsigned i4) const
-    {
-        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
-        constexpr auto I2 = Number<2>{};
-        constexpr auto I3 = Number<3>{};
-        constexpr auto I4 = Number<4>{};
-
-        static_assert(nDim == 5, "nDim is not 5");
-
-        return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2) + i3 * GetStride(I3) +
-               i4 * GetStride(I4);
-    }
-
-    // this is ugly, only for 6d
-    __host__ __device__ unsigned
-    Get1dIndex(unsigned i0, unsigned i1, unsigned i2, unsigned i3, unsigned i4, unsigned i5) const
-    {
-        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
-        constexpr auto I2 = Number<2>{};
-        constexpr auto I3 = Number<3>{};
-        constexpr auto I4 = Number<4>{};
-        constexpr auto I5 = Number<5>{};
-
-        static_assert(nDim == 6, "nDim is not 6");
-
-        return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2) + i3 * GetStride(I3) +
-               i4 * GetStride(I4) + i5 * GetStride(I5);
-    }
-
-    // this is ugly, only for 7d
-    __host__ __device__ unsigned Get1dIndex(unsigned i0,
-                                            unsigned i1,
-                                            unsigned i2,
-                                            unsigned i3,
-                                            unsigned i4,
-                                            unsigned i5,
-                                            unsigned i6) const
-    {
-        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
-        constexpr auto I2 = Number<2>{};
-        constexpr auto I3 = Number<3>{};
-        constexpr auto I4 = Number<4>{};
-        constexpr auto I5 = Number<5>{};
-        constexpr auto I6 = Number<6>{};
-
-        static_assert(nDim == 7, "nDim is not 7");
-
-        return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2) + i3 * GetStride(I3) +
-               i4 * GetStride(I4) + i5 * GetStride(I5) + i6 * GetStride(I6);
-    }
-
-    // this is ugly, only for 8d
-    __host__ __device__ unsigned Get1dIndex(unsigned i0,
-                                            unsigned i1,
-                                            unsigned i2,
-                                            unsigned i3,
-                                            unsigned i4,
-                                            unsigned i5,
-                                            unsigned i6,
-                                            unsigned i7) const
-    {
-        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
-        constexpr auto I2 = Number<2>{};
-        constexpr auto I3 = Number<3>{};
-        constexpr auto I4 = Number<4>{};
-        constexpr auto I5 = Number<5>{};
-        constexpr auto I6 = Number<6>{};
-        constexpr auto I7 = Number<7>{};
-
-        static_assert(nDim == 8, "nDim is not 8");
-
-        return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2) + i3 * GetStride(I3) +
-               i4 * GetStride(I4) + i5 * GetStride(I5) + i6 * GetStride(I6) + i7 * GetStride(I7);
-    }
+    template <class... Is>
+    __host__ __device__ unsigned Get1dIndex(Is... is) const
+    {
+        static_assert(sizeof...(Is) == nDim, "number of multi-index is wrong");
+
+        const auto multi_id = Array<unsigned, nDim>(is...);
+
+        unsigned id = 0;
+
+        static_loop_n<nDim>{}([&](auto IDim) {
+            constexpr unsigned idim = IDim.Get();
+            id += multi_id[idim] * GetStride(IDim);
+        });
+
+        return id;
+    }
    __host__ __device__ constexpr auto Condense() const

@@ -385,6 +162,12 @@ struct ConstantTensorDescriptor
        constexpr auto default_strides = calculate_default_strides(Lengths{});
        return ConstantTensorDescriptor<Lengths, decltype(default_strides)>{};
    }
+
+    template <unsigned IDim, unsigned NVector>
+    __host__ __device__ constexpr auto Vectorize(Number<IDim>, Number<NVector>) const
+    {
+        assert(false); // not implemented
+    }
};

template <class Lengths>
...
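static_const_reduce_n and static_loop_n, used above, are not part of this diff (they presumably live in functional.hip.hpp, included by the Sequence header below). A minimal hedged sketch of how such helpers could look, assuming Number<I> is the usual integral-constant wrapper with a constexpr Get(); the repo's actual definitions may differ:

// Hypothetical reduction: r(f(Number<N-1>{}), r(f(Number<N-2>{}), ... f(Number<0>{}) ...)).
template <unsigned N>
struct static_const_reduce_n
{
    template <class F, class R>
    __host__ __device__ constexpr unsigned operator()(F f, R r) const
    {
        return r(f(Number<N - 1>{}), static_const_reduce_n<N - 1>{}(f, r));
    }
};

template <>
struct static_const_reduce_n<1>
{
    template <class F, class R>
    __host__ __device__ constexpr unsigned operator()(F f, R) const
    {
        return f(Number<0>{});
    }
};

// Hypothetical compile-time loop: calls f(Number<0>{}), ..., f(Number<N-1>{}).
template <unsigned N>
struct static_loop_n
{
    template <class F>
    __host__ __device__ void operator()(F f) const
    {
        static_loop_n<N - 1>{}(f);
        f(Number<N - 1>{});
    }
};

template <>
struct static_loop_n<0>
{
    template <class F>
    __host__ __device__ void operator()(F) const
    {
    }
};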
#pragma once
#include "constant_integral.hip.hpp"
#include "functional.hip.hpp"
template <unsigned... Is>
struct Sequence
{
using Type = Sequence<Is...>;
static constexpr unsigned nDim = sizeof...(Is);
const unsigned mData[nDim] = {Is...};
template <unsigned I>
__host__ __device__ constexpr unsigned Get(Number<I>) const
{
return mData[I];
}
// this is ugly, only for nDim = 4
template <unsigned I0, unsigned I1, unsigned I2, unsigned I3>
__host__ __device__ constexpr auto ReorderByGetNewFromOld(Sequence<I0, I1, I2, I3>) const
{
static_assert(nDim == 4, "nDim != 4");
constexpr auto old_sequence = Type{};
constexpr unsigned NR0 = old_sequence.mData[I0];
constexpr unsigned NR1 = old_sequence.mData[I1];
constexpr unsigned NR2 = old_sequence.mData[I2];
constexpr unsigned NR3 = old_sequence.mData[I3];
return Sequence<NR0, NR1, NR2, NR3>{};
}
template <unsigned I0, unsigned I1, unsigned I2, unsigned I3>
__host__ __device__ constexpr auto ReorderByPutOldToNew(Sequence<I0, I1, I2, I3>) const
{
// don't know how to implement this
printf("Sequence::ReorderByPutOldToNew not implemented");
assert(false);
}
template <unsigned I>
__host__ __device__ constexpr auto PushBack(Number<I>) const
{
return Sequence<Is..., I>{};
}
__host__ __device__ constexpr auto PopBack() const;
template <class F>
__host__ __device__ constexpr auto Transform(F f) const
{
return Sequence<f(Is)...>{};
}
};
template <unsigned... Is, unsigned I>
__host__ __device__ constexpr auto sequence_pop_back(Sequence<Is..., I>)
{
static_assert(sizeof...(Is) >= 1, "empty Sequence!");
return Sequence<Is...>{};
}
template <class F, unsigned... Xs, unsigned... Ys>
__host__ __device__ constexpr auto sequence_sequence_op(Sequence<Xs...>, Sequence<Ys...>, F f)
{
static_assert(Sequence<Xs...>::nDim == Sequence<Ys...>::nDim, "Dim not the same");
return Sequence<f(Xs, Ys)...>{};
}
template <unsigned... Xs, unsigned... Ys>
__host__ __device__ constexpr auto sequence_sequence_add(Sequence<Xs...>, Sequence<Ys...>)
{
struct add
{
__host__ __device__ constexpr unsigned operator()(unsigned x, unsigned y) const
{
return x + y;
}
};
return sequence_sequence_op(Sequence<Xs...>{}, Sequence<Ys...>{}, add{});
}
template <unsigned... Is>
__host__ __device__ constexpr auto Sequence<Is...>::PopBack() const
{
return sequence_pop_back(Type{});
}
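A small usage sketch for these helpers, with values worked out by hand (assuming Number<I> is the integral-constant wrapper from constant_integral.hip.hpp):

using S = Sequence<8, 1, 2, 4>;

// Get reads the I-th entry of the pack.
static_assert(S{}.Get(Number<2>{}) == 2, "S.mData[2] is 2");

// PushBack appends: Sequence<8, 1, 2, 4, 16>.
using S5 = decltype(S{}.PushBack(Number<16>{}));
static_assert(S5::nDim == 5, "one entry longer");

// ReorderByGetNewFromOld picks old entries at the given positions:
// positions <3, 0, 1, 2> give Sequence<4, 8, 1, 2>.
using R = decltype(S{}.ReorderByGetNewFromOld(Sequence<3, 0, 1, 2>{}));
static_assert(R{}.Get(Number<0>{}) == 4, "first entry comes from old index 3");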
@@ -373,7 +373,7 @@ template <unsigned BlockSize,
          unsigned DataPerRead>
struct Blockwise2dTensorCopy3
{
-    using vector_t = typename vector_type<Float, DataPerRead>::type;
+    using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;

    unsigned mSrcMyThreadOffset;
    unsigned mDstMyThreadOffset;

@@ -383,8 +383,9 @@ struct Blockwise2dTensorCopy3
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};

-        static_assert(SrcDesc{}.GetStride(I1) == 1 && DstDesc{}.GetStride(I1) == 1,
-                      "wrong! only support stride1 == 1!\n");
+        static_assert(DataPerRead == 1 ||
+                          (SrcDesc{}.GetStride(I1) == 1 && DstDesc{}.GetStride(I1) == 1),
+                      "wrong! only support stride1 == 1 if DataPerRead > 1!\n");

        static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4,
                      "wrong! only support DataPerRead == 1, 2 or 4!\n");
...
@@ -131,11 +131,11 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds
            did[3] = is / ref_desc.GetStride(I3);

-            const unsigned aindex = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]);
-            const unsigned bindex = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]);
+            const unsigned src_index = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]);
+            const unsigned dst_index = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]);

-            f(p_src[aindex], p_dst[bindex]);
+            f(p_src[src_index], p_dst[dst_index]);
        }

        constexpr bool has_tail = (ref_desc.GetElementSize() > NLoop * BlockSize);

@@ -162,11 +162,11 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds
            did[3] = is / ref_desc.GetStride(I3);

-            const unsigned aindex = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]);
-            const unsigned bindex = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]);
+            const unsigned src_index = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]);
+            const unsigned dst_index = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]);

-            f(p_src[aindex], p_dst[bindex]);
+            f(p_src[src_index], p_dst[dst_index]);
        }
    }
}

@@ -199,15 +199,110 @@ blockwise_4d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc,
        SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, DstFromSrcReorder{}, f_copy);
}
-template <unsigned BlockSize, class Float, class SrcDesc, class DstDesc, class SrcOpLengths>
+template <unsigned BlockSize,
+          class Float,
+          class SrcDesc,
+          class DstDesc,
+          class CopyLengths,
+          unsigned DataPerRead>
struct Blockwise4dTensorCopy1
{
+    using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;
+    __device__ constexpr Blockwise4dTensorCopy1()
+    {
+        constexpr auto I0 = Number<0>{};
+        constexpr auto I1 = Number<1>{};
+        constexpr auto I2 = Number<2>{};
+        constexpr auto I3 = Number<3>{};
+
+        static_assert(DataPerRead == 1 ||
+                          (SrcDesc{}.GetStride(I3) == 1 && DstDesc{}.GetStride(I3) == 1),
+                      "wrong! only support stride3 == 1 if DataPerRead > 1!\n");
+
+        static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4,
+                      "wrong! only support DataPerRead == 1, 2 or 4!\n");
+
+        static_assert(SrcDesc{}.GetStride(I2) % DataPerRead == 0 &&
+                          DstDesc{}.GetStride(I2) % DataPerRead == 0,
+                      "src and dst stride2 should be multiple of DataPerRead to keep alignment");
+
+        // we allow out-of-bound read from src in D3 dimension,
+        // but we need to make sure dst stride2 is big enough,
+        // so that the out-of-bound write won't contaminate next line in dst
+        constexpr unsigned L3 = CopyLengths{}.Get(I3);
+        constexpr unsigned read_per_d3 = integer_divide_ceil(L3, DataPerRead);
+
+        static_assert(read_per_d3 * DataPerRead <= DstDesc{}.GetStride(I2),
+                      "wrong! out-of-bound write will contaminate next line!\n");
+    }
+
    __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
    {
-        constexpr auto dst_from_src_reorder = Sequence<0, 1, 2, 3>{};
-
-        blockwise_4d_tensor_copy_reorder_by_get_dst_from_src<BlockSize>(
-            SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, dst_from_src_reorder);
+        constexpr auto I0 = Number<0>{};
+        constexpr auto I1 = Number<1>{};
+        constexpr auto I2 = Number<2>{};
+        constexpr auto I3 = Number<3>{};
+
+        constexpr auto src_desc = SrcDesc{};
+        constexpr auto dst_desc = DstDesc{};
+
+        constexpr unsigned L0 = CopyLengths{}.Get(I0);
+        constexpr unsigned L1 = CopyLengths{}.Get(I1);
+        constexpr unsigned L2 = CopyLengths{}.Get(I2);
+        constexpr unsigned L3 = CopyLengths{}.Get(I3);
+
+        constexpr unsigned read_per_d3 = integer_divide_ceil(L3, DataPerRead);
+
+        constexpr auto ref_desc =
+            make_ConstantTensorDescriptor(Sequence<L0, L1, L2, read_per_d3>{});
+
+        constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize;
+
+        auto f_copy = [&](unsigned is) {
+            unsigned did[4];
+
+            did[0] = is / ref_desc.GetStride(I0);
+            is -= did[0] * ref_desc.GetStride(I0);
+
+            did[1] = is / ref_desc.GetStride(I1);
+            is -= did[1] * ref_desc.GetStride(I1);
+
+            did[2] = is / ref_desc.GetStride(I2);
+            is -= did[2] * ref_desc.GetStride(I2);
+
+            did[3] = is / ref_desc.GetStride(I3);
+
+            const unsigned src_index =
+                src_desc.Get1dIndex(did[0], did[1], did[2], did[3] * DataPerRead);
+            const unsigned dst_index =
+                dst_desc.Get1dIndex(did[0], did[1], did[2], did[3] * DataPerRead);
+
+            *(reinterpret_cast<vector_t*>(p_dst + dst_index)) =
+                *(reinterpret_cast<const vector_t*>(p_src + src_index));
+        };
+
+        for(unsigned iloop = 0; iloop < NLoop; ++iloop)
+        {
+            unsigned is = threadIdx.x + iloop * BlockSize;
+            f_copy(is);
+        }
+
+        constexpr bool has_tail = (ref_desc.GetElementSize() > NLoop * BlockSize);
+
+        if(has_tail)
+        {
+            unsigned is = threadIdx.x + NLoop * BlockSize;
+
+            if(is < ref_desc.GetElementSize())
+            {
+                f_copy(is);
+            }
+        }
    }
};
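vector_type<Float, DataPerRead>::MemoryType (renamed from ::type in this commit) is defined elsewhere in the repo; presumably it maps a scalar type and a lane count to the matching HIP vector type, so that one reinterpret-cast load or store moves DataPerRead elements at once. A hedged sketch of the trait's shape (the repo's definition may differ):

template <class T, unsigned N>
struct hypothetical_vector_type; // stand-in for the repo's vector_type

template <>
struct hypothetical_vector_type<float, 1> { using MemoryType = float;  };
template <>
struct hypothetical_vector_type<float, 2> { using MemoryType = float2; };
template <>
struct hypothetical_vector_type<float, 4> { using MemoryType = float4; };

// One vector store then equals DataPerRead scalar stores, which is why the
// "stride3 == 1" and "stride2 % DataPerRead == 0" checks above are needed:
// reinterpret_cast<float4*>(p_dst + dst_index)[0]
//     = reinterpret_cast<const float4*>(p_src + src_index)[0];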
@@ -350,7 +445,7 @@ template <unsigned BlockSize,
          unsigned DataPerRead>
struct Blockwise4dTensorCopy3
{
-    using vector_t = typename vector_type<Float, DataPerRead>::type;
+    using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;

    unsigned mSrcMyThreadOffset;
    unsigned mDstMyThreadOffset;

@@ -362,8 +457,9 @@ struct Blockwise4dTensorCopy3
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};

-        static_assert(SrcDesc{}.GetStride(I3) == 1 && DstDesc{}.GetStride(I3) == 1,
-                      "wrong! only support stride3 == 1!\n");
+        static_assert(DataPerRead == 1 ||
+                          (SrcDesc{}.GetStride(I3) == 1 && DstDesc{}.GetStride(I3) == 1),
+                      "wrong! only support stride3 == 1 if DataPerRead > 1!\n");

        static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4,
                      "wrong! only support DataPerRead == 1, 2 or 4!\n");

@@ -371,7 +467,7 @@ struct Blockwise4dTensorCopy3
        static_assert(
            SrcDesc{}.GetStride(I2) % DataPerRead == 0 &&
                DstDesc{}.GetStride(I2) % DataPerRead == 0,
-            "wrong! src and dst stride should be multiple of DataPerRead to keep alignment");
+            "wrong! src and dst stride2 should be multiple of DataPerRead to keep alignment");

        constexpr unsigned L0 = CopyLengths{}.Get(I0);
        constexpr unsigned L1 = CopyLengths{}.Get(I1);
...
...@@ -435,11 +435,12 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 ...@@ -435,11 +435,12 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
#pragma unroll #pragma unroll
for(unsigned k_begin = 0; k_begin < KPerBlock; k_begin += KPerThreadLoop) for(unsigned k_begin = 0; k_begin < KPerBlock; k_begin += KPerThreadLoop)
{ {
// read first batch of A, B // read first batch of A, B
// copy A-sub to form A // copy A-sub to form A
#pragma unroll //#pragma unroll
for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat) for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
{ {
#if 0
threadwise_matrix_copy( threadwise_matrix_copy(
a_block_mtx, a_block_mtx,
p_a_block + a_block_mtx.Get1dIndex(k_begin, m_repeat * MPerLevel1Cluster) + p_a_block + a_block_mtx.Get1dIndex(k_begin, m_repeat * MPerLevel1Cluster) +
...@@ -447,12 +448,25 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 ...@@ -447,12 +448,25 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
a_thread_mtx, a_thread_mtx,
p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC), p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC),
a_thread_sub_mtx.GetLengths()); a_thread_sub_mtx.GetLengths());
#else
for(unsigned i = 0; i < a_thread_mtx.NRow(); ++i)
{
for(unsigned j = 0; j < a_thread_mtx.NCol(); ++j)
{
p_a_thread[a_thread_mtx.Get1dIndex(i, m_repeat * MPerThreadSubC + j)] =
p_a_block[a_block_mtx.Get1dIndex(k_begin + i,
m_repeat * MPerLevel1Cluster + j) +
mMyThreadOffsetA];
}
}
#endif
}

// copy B-sub to form B
//#pragma unroll
for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
{
#if 0
threadwise_matrix_copy(
b_block_mtx,
p_b_block + b_block_mtx.Get1dIndex(k_begin, n_repeat * NPerLevel1Cluster) +
@@ -460,13 +474,26 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
b_thread_mtx,
p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
b_thread_sub_mtx.GetLengths());
#else
for(unsigned i = 0; i < b_thread_mtx.NRow(); ++i)
{
for(unsigned j = 0; j < b_thread_mtx.NCol(); ++j)
{
p_b_thread[b_thread_mtx.Get1dIndex(i, n_repeat * NPerThreadSubC + j)] =
p_b_block[b_block_mtx.Get1dIndex(k_begin + i,
n_repeat * NPerLevel1Cluster + j) + // N, not M: matches the reference copy above
mMyThreadOffsetB];
}
}
#endif
}

// loop over batch
//#pragma unroll
for(unsigned ib = 0; ib + 1 < BatchPerThread; ++ib)
{
// do current batch of gemm
#if 0
threadwise_gemm(a_thread_mtx,
True,
p_a_thread,
@@ -477,13 +504,32 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
False,
p_c_thread + ib * ThreadMatrixStrideC,
f_accum);
#else
for(unsigned k = 0; k < a_thread_mtx.NRow(); ++k)
{
for(unsigned i = 0; i < c_thread_mtx.NRow(); ++i)
{
for(unsigned j = 0; j < c_thread_mtx.NCol(); ++j)
{
const unsigned aindex =
a_thread_mtx.Get1dIndex(k, i); // A is transposed
const unsigned bindex = b_thread_mtx.Get1dIndex(k, j);
const unsigned cindex =
c_thread_mtx.Get1dIndex(i, j) + ib * ThreadMatrixStrideC;
f_accum(p_c_thread[cindex], p_a_thread[aindex] * p_b_thread[bindex]);
}
}
}
#endif
// read next batch of a, b
if(BlockMatrixStrideA != 0)
{
//#pragma unroll
for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
{
#if 0
threadwise_matrix_copy(
a_block_mtx,
p_a_block +
@@ -492,14 +538,28 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
a_thread_mtx,
p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC),
a_thread_sub_mtx.GetLengths());
#else
for(unsigned i = 0; i < a_thread_mtx.NRow(); ++i)
{
for(unsigned j = 0; j < a_thread_mtx.NCol(); ++j)
{
p_a_thread[a_thread_mtx.Get1dIndex(i,
m_repeat * MPerThreadSubC + j)] =
p_a_block[a_block_mtx.Get1dIndex(
k_begin + i, m_repeat * MPerLevel1Cluster + j) +
(ib + 1) * BlockMatrixStrideA + mMyThreadOffsetA];
}
}
#endif
}
}

if(BlockMatrixStrideB != 0)
{
//#pragma unroll
for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
{
#if 0
threadwise_matrix_copy(
b_block_mtx,
p_b_block +
@@ -508,11 +568,25 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
b_thread_mtx,
p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
b_thread_sub_mtx.GetLengths());
#else
for(unsigned i = 0; i < b_thread_mtx.NRow(); ++i)
{
for(unsigned j = 0; j < b_thread_mtx.NCol(); ++j)
{
p_b_thread[b_thread_mtx.Get1dIndex(i,
n_repeat * NPerThreadSubC + j)] =
p_b_block[b_block_mtx.Get1dIndex(
k_begin + i, n_repeat * NPerLevel1Cluster + j) + // N, not M: matches the reference copy above
(ib + 1) * BlockMatrixStrideB + mMyThreadOffsetB];
}
}
#endif
}
}
}

// do last batch of gemm
#if 0
threadwise_gemm(a_thread_mtx,
True,
p_a_thread,
@@ -523,6 +597,23 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
False,
p_c_thread + (BatchPerThread - 1) * ThreadMatrixStrideC,
f_accum);
#else
for(unsigned k = 0; k < a_thread_mtx.NRow(); ++k)
{
for(unsigned i = 0; i < c_thread_mtx.NRow(); ++i)
{
for(unsigned j = 0; j < c_thread_mtx.NCol(); ++j)
{
const unsigned aindex = a_thread_mtx.Get1dIndex(k, i); // A is transposed
const unsigned bindex = b_thread_mtx.Get1dIndex(k, j);
const unsigned cindex = c_thread_mtx.Get1dIndex(i, j) +
(BatchPerThread - 1) * ThreadMatrixStrideC;
f_accum(p_c_thread[cindex], p_a_thread[aindex] * p_b_thread[bindex]);
}
}
}
#endif
}
}
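// ---------------------------------------------------------------------------
// Editor's sketch, not part of the commit: a host reference for the unrolled
// loops above. C[MxN] += transpose(A[KxM]) * B[KxN], with A stored K-major
// exactly as a_thread_mtx is indexed ("A is transposed"); gemm_tn_reference is
// an illustrative name:
inline void gemm_tn_reference(const float* a, // K x M, row-major
                              const float* b, // K x N, row-major
                              float* c,       // M x N, row-major
                              unsigned M,
                              unsigned N,
                              unsigned K)
{
    for(unsigned k = 0; k < K; ++k)
        for(unsigned i = 0; i < M; ++i)
            for(unsigned j = 0; j < N; ++j)
                c[i * N + j] += a[k * M + i] * b[k * N + j]; // f_accum(c, a * b)
}
// ---------------------------------------------------------------------------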
@@ -551,9 +642,9 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
c_thread_mtx_begin.batch * BlockMatrixStrideC +
c_block_mtx.Get1dIndex(c_thread_mtx_begin.row, c_thread_mtx_begin.col);

for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
{
for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
{
threadwise_matrix_copy(
c_thread_sub_mtx,
...
#pragma once
#include "data_type.hip.hpp"
#include "constant_integral.hip.hpp"
#include "Sequence.hip.hpp"
#include "Array.hip.hpp"
#include "functional.hip.hpp"
__device__ unsigned get_thread_local_1d_id() { return threadIdx.x; }

@@ -16,129 +21,6 @@ struct is_same<T, T>
static const bool value = true;
};
template <class T, unsigned N>
struct vector_type
{
};
template <>
struct vector_type<float, 1>
{
using type = float;
};
template <>
struct vector_type<float, 2>
{
using type = float2;
};
template <>
struct vector_type<float, 4>
{
using type = float4;
};
#if 0
template <>
struct vector_type<half_float::half, 1>
{
using type = half_float::half;
};
template <>
struct vector_type<half_float::half, 2>
{
using type = float;
};
template <>
struct vector_type<half_float::half, 4>
{
using type = float2;
};
template <>
struct vector_type<half_float::half, 8>
{
using type = float4;
};
#endif
#if 0
template <>
struct vector_type<half, 1>
{
using type = half;
};
template <>
struct vector_type<half, 2>
{
using type = half2;
};
template <>
struct vector_type<half, 4>
{
using type = float2;
};
template <>
struct vector_type<half, 8>
{
using type = float4;
};
#endif
template <class T, T N>
struct integral_constant
{
static const T value = N;
__host__ __device__ constexpr T Get() const { return value; }
};
template <unsigned N>
using Number = integral_constant<unsigned, N>;
template <unsigned... Is>
struct Sequence
{
using Type = Sequence<Is...>;
static constexpr unsigned nDim = sizeof...(Is);
const unsigned mData[nDim] = {Is...};
template <unsigned I>
__host__ __device__ constexpr unsigned Get(Number<I>) const
{
return mData[I];
}
template <unsigned I0, unsigned I1, unsigned I2, unsigned I3>
__host__ __device__ constexpr auto ReorderByGetNewFromOld(Sequence<I0, I1, I2, I3>) const
{
constexpr auto old_sequence = Type{};
constexpr unsigned NR0 = old_sequence.mData[I0];
constexpr unsigned NR1 = old_sequence.mData[I1];
constexpr unsigned NR2 = old_sequence.mData[I2];
constexpr unsigned NR3 = old_sequence.mData[I3];
return Sequence<NR0, NR1, NR2, NR3>{};
}
template <unsigned I0, unsigned I1, unsigned I2, unsigned I3>
__host__ __device__ constexpr auto ReorderByPutOldToNew(Sequence<I0, I1, I2, I3>) const
{
// don't know how to implement this
printf("Sequence::ReorderByPutOldToNew not implemented");
assert(false);
}
};
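// Editor's usage sketch, not part of the commit: position i of the reordered
// sequence takes the old element at map[i]; e.g. NCHW lengths reordered by the
// map <0, 2, 3, 1> become NHWC lengths.
constexpr auto nchw_lengths = Sequence<128, 64, 32, 32>{};                                 // N, C, H, W
constexpr auto nhwc_lengths = nchw_lengths.ReorderByGetNewFromOld(Sequence<0, 2, 3, 1>{}); // N, H, W, C
static_assert(nhwc_lengths.Get(Number<3>{}) == 64, "C moves to the last position");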
#if DEVICE_BACKEND_CUDA
template <typename T>
__host__ __device__ constexpr T max(T a, T b)
...
@@ -4,9 +4,10 @@

#if DEVICE_BACKEND_HIP
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
#elif DEVICE_BACKEND_CUDA
#include "cuda_runtime.h"
#include "cuda_fp16.h"
#include "nvToolsExt.h"
#include "helper_cuda.h"
#endif
#pragma once
template <class T, T N>
struct integral_constant
{
static const T value = N;
__host__ __device__ constexpr T Get() const { return value; }
};
template <unsigned N>
using Number = integral_constant<unsigned, N>;
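// Editor's usage sketch, not part of the commit: Number<> carries an index in
// the type system, so a descriptor can answer GetLength(I0) at compile time.
// TinyDesc and tiny_desc_demo are illustrative names.
template <unsigned N0, unsigned N1>
struct TinyDesc
{
    __host__ __device__ constexpr unsigned GetLength(Number<0>) const { return N0; }
    __host__ __device__ constexpr unsigned GetLength(Number<1>) const { return N1; }
};

__host__ __device__ constexpr unsigned tiny_desc_demo()
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    return TinyDesc<3, 5>{}.GetLength(I0) * TinyDesc<3, 5>{}.GetLength(I1);
}

static_assert(tiny_desc_demo() == 15, "evaluated entirely at compile time");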
#pragma once
#include "config.h"
template <class T, unsigned N>
struct vector_type
{
};
template <>
struct vector_type<float, 1>
{
using MemoryType = float;
};
template <>
struct vector_type<float, 2>
{
using MemoryType = float2;
__host__ __device__ static MemoryType Pack(float s0, float s1)
{
union
{
MemoryType vector;
float scalar[2];
} data;
data.scalar[0] = s0;
data.scalar[1] = s1;
return data.vector;
}
};
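// Editor's usage sketch, not part of the commit: Pack assembles the vector
// memory type from scalars through a union, so two adjacent floats can be
// stored with one float2 write (dst assumed 8-byte aligned; store_packed_pair
// is an illustrative name).
__device__ void store_packed_pair(float* dst, float s0, float s1)
{
    using vec2_t = typename vector_type<float, 2>::MemoryType; // float2

    *reinterpret_cast<vec2_t*>(dst) = vector_type<float, 2>::Pack(s0, s1);
}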
template <>
struct vector_type<float, 4>
{
using MemoryType = float4;
};
template <>
struct vector_type<float2, 2>
{
using MemoryType = float4;
};
#if 0
template <>
struct vector_type<half, 1>
{
using MemoryType = half;
__host__ __device__ static MemoryType Pack(half s) { return s; }
};
template <>
struct vector_type<half, 2>
{
using MemoryType = half2;
__host__ __device__ static MemoryType Pack(half s0, half s1)
{
union
{
MemoryType vector;
half scalar[2];
} data;
data.scalar[0] = s0;
data.scalar[1] = s1;
return data.vector;
}
};
template <>
struct vector_type<half, 4>
{
using MemoryType = float2;
};
template <>
struct vector_type<half, 8>
{
using MemoryType = float4;
};
template <>
struct vector_type<char, 1>
{
using MemoryType = char;
__host__ __device__ static MemoryType Pack(char s) { return s; }
};
template <>
struct vector_type<char, 2>
{
using MemoryType = int16_t;
__host__ __device__ static MemoryType Pack(char s0, char s1)
{
union
{
MemoryType vector;
char scalar[2];
} data;
data.scalar[0] = s0;
data.scalar[1] = s1;
return data.vector;
}
};
template <>
struct vector_type<char, 4>
{
using MemoryType = int32_t;
__host__ __device__ static MemoryType Pack(char s0, char s1, char s2, char s3)
{
union
{
MemoryType vector;
char scalar[4];
} data;
data.scalar[0] = s0;
data.scalar[1] = s1;
data.scalar[2] = s2;
data.scalar[3] = s3;
return data.vector;
}
};
template <>
struct vector_type<char, 8>
{
using MemoryType = int64_t;
};
template <>
struct vector_type<int32_t, 2>
{
using MemoryType = int64_t;
};
template <>
struct vector_type<char2, 2>
{
using MemoryType = char4;
};
template <>
struct vector_type<char2, 4>
{
using MemoryType = int64_t;
};
template <>
struct vector_type<char4, 1>
{
using MemoryType = int;
};
template <>
struct vector_type<char4, 2>
{
using MemoryType = int64_t;
};
#endif
__device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
{
d += s0 * s1;
}
__device__ void fused_multiply_accumulate(float& d, const float2& s0, const float2& s1)
{
d += s0.x * s1.x;
d += s0.y * s1.y;
}
__device__ void fused_multiply_accumulate(float& d, const float4& s0, const float4& s1)
{
d += s0.x * s1.x;
d += s0.y * s1.y;
d += s0.z * s1.z;
d += s0.w * s1.w;
}
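// Editor's usage sketch, not part of the commit: the overloads above let an
// inner product be written once over vector_type<float, N>::MemoryType and
// vectorized by choosing N. Assumes len % 4 == 0 and 16-byte aligned pointers;
// dot_vec4 is an illustrative name.
__device__ float dot_vec4(const float* a, const float* b, unsigned len)
{
    using vec_t = typename vector_type<float, 4>::MemoryType; // float4

    float acc = 0;

    for(unsigned i = 0; i < len; i += 4)
    {
        const vec_t va = *reinterpret_cast<const vec_t*>(a + i);
        const vec_t vb = *reinterpret_cast<const vec_t*>(b + i);

        fused_multiply_accumulate(acc, va, vb); // acc += 4-wide dot product
    }

    return acc;
}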
#if 0
__device__ void fused_multiply_accumulate(half& d, const half& s0, const half& s1) { d += s0 * s1; }
__device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& s1)
{
d += s0.x * s1.x;
d += s0.y * s1.y;
}
__device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2& s1)
{
d += s0.x * s1.x + s0.y * s1.y;
}
__device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s1) { d += s0 * s1; }
// TODO:: this interface is misleading, s0, s1 are actually int8x4
// need to make a better interface
__device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const int32_t& s1)
{
#if DEVICE_BACKEND_CUDA
d = __dp4a(s0, s1, d);
#endif
}
#endif
#pragma once
#include "constant_integral.hip.hpp"
template <unsigned NLoop>
struct static_loop_n
{
template <class F>
__host__ __device__ void operator()(F f) const
{
static_assert(NLoop > 1, "out-of-range");
f(Number<NLoop - 1>{});
static_loop_n<NLoop - 1>{}(f);
}
};
template <>
struct static_loop_n<1>
{
template <class F>
__host__ __device__ void operator()(F f) const
{
f(Number<0>{});
}
};
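// Editor's usage sketch, not part of the commit: static_loop_n<N> (N >= 1)
// expands into N calls of f with a compile-time index, Number<N - 1> down to
// Number<0>, so the body can use the index as a template argument; sum_below
// is an illustrative name.
template <unsigned N>
__host__ __device__ unsigned sum_below()
{
    unsigned acc = 0;
    static_loop_n<N>{}([&](auto num) { acc += decltype(num)::value; });
    return acc; // 0 + 1 + ... + (N - 1)
}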
template <unsigned NLoop>
struct static_const_reduce_n
{
template <class F, class Reduce>
__host__ __device__ constexpr auto operator()(F f, Reduce r) const
{
static_assert(NLoop > 1, "out-of-range");
constexpr auto a = f(Number<NLoop - 1>{});
auto b = static_const_reduce_n<NLoop - 1>{}(f, r); // cannot use constexpr here, weird
return r(a, b);
}
};
template <>
struct static_const_reduce_n<1>
{
template <class F, class Reduce>
__host__ __device__ constexpr auto operator()(F f, Reduce) const
{
return f(Number<0>{});
}
};
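// Editor's usage sketch, not part of the commit: fold a Sequence's lengths
// into an element count. static_const_reduce_n unrolls the recursion at
// compile time, calling f(Number<i>{}) once per dimension and combining the
// results with r; multiplies_u, get_length and element_count are illustrative
// names.
struct multiplies_u
{
    __host__ __device__ constexpr unsigned operator()(unsigned a, unsigned b) const
    {
        return a * b;
    }
};

template <class Lengths>
struct get_length
{
    template <unsigned I>
    __host__ __device__ constexpr unsigned operator()(Number<I>) const
    {
        return Lengths{}.Get(Number<I>{});
    }
};

template <class Lengths>
__host__ __device__ unsigned element_count(Lengths)
{
    // e.g. element_count(Sequence<2, 3, 4>{}) == 24
    return static_const_reduce_n<Lengths::nDim>{}(get_length<Lengths>{}, multiplies_u{});
}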
#if 0
template<class F>
__host__ __device__ constexpr auto unpacker(F f)
{
return [=](auto xs_array){ f(xs...); };
}
#endif
#pragma once
#include "common.hip.hpp"
#include "ConstantTensorDescriptor.hip.hpp"
#include "blockwise_2d_tensor_op.hip.hpp"
#include "blockwise_4d_tensor_op.hip.hpp"
#include "blockwise_direct_convolution.hip.hpp"
#include "threadwise_4d_tensor_op.hip.hpp"

@@ -20,61 +21,86 @@ template <class Float,
unsigned CPerThread,
unsigned HoPerThread,
unsigned WoPerThread,
unsigned InBlockCopyDataPerRead,
unsigned WeiBlockCopyDataPerRead,
unsigned BlockSize,
unsigned GridSize>
__global__ void
gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_in_global,
const Float* const __restrict__ p_wei_global,
Float* const __restrict__ p_out_global)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};

constexpr auto in_nchw_global_desc  = InGlobalDesc{};
constexpr auto wei_kcyx_global_desc = WeiGlobalDesc{};
constexpr auto out_nkhw_global_desc = OutGlobalDesc{};

constexpr unsigned N = in_nchw_global_desc.GetLength(I0);
constexpr unsigned K = wei_kcyx_global_desc.GetLength(I0);
constexpr unsigned C = wei_kcyx_global_desc.GetLength(I1);
constexpr unsigned Y = wei_kcyx_global_desc.GetLength(I2);
constexpr unsigned X = wei_kcyx_global_desc.GetLength(I3);

constexpr auto wei_ke_global_desc = make_ConstantTensorDescriptor(
    Sequence<K, C * Y * X>{}); // 2d view of wei for blockwise copy

constexpr unsigned HiPerBlock = HoPerBlock + Y - 1;
constexpr unsigned WiPerBlock = WoPerBlock + X - 1;

constexpr auto in_nchw_block_desc = make_ConstantTensorDescriptor_aligned(
    Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{}, Number<InBlockCopyDataPerRead>{});

constexpr auto wei_ke_block_desc = make_ConstantTensorDescriptor_aligned(
    Sequence<KPerBlock, CPerBlock * Y * X>{},
    Number<WeiBlockCopyDataPerRead>{}); // 2d view of wei for blockwise copy

constexpr auto wei_kcyx_block_desc =
    make_ConstantTensorDescriptor(Sequence<KPerBlock, CPerBlock, Y, X>{},
                                  Sequence<wei_ke_block_desc.GetStride(I0), Y * X, X, 1>{});

// shared mem
constexpr unsigned in_block_size =
    in_nchw_block_desc.GetElementSpace(Number<InBlockCopyDataPerRead>{});
constexpr unsigned wei_block_size =
    wei_kcyx_block_desc.GetElementSpace(Number<WeiBlockCopyDataPerRead>{});

constexpr unsigned max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
                                   ? InBlockCopyDataPerRead
                                   : WeiBlockCopyDataPerRead;

__shared__ Float p_in_block[max_align * ((in_block_size + max_align - 1) / max_align)];
__shared__ Float p_wei_block[max_align * ((wei_block_size + max_align - 1) / max_align)];

// threadwise tensors
constexpr unsigned HiPerThread = HoPerThread + Y - 1;
constexpr unsigned WiPerThread = WoPerThread + X - 1;

constexpr auto in_nchw_thread_block_desc =
    make_ConstantTensorDescriptor(Sequence<NPerThread, CPerThread, HiPerThread, WiPerThread>{},
                                  in_nchw_block_desc.GetStrides());

constexpr auto wei_kcyx_thread_block_desc = make_ConstantTensorDescriptor(
    Sequence<KPerThread, CPerThread, Y, X>{}, wei_kcyx_block_desc.GetStrides());

constexpr auto out_nkhw_thread_desc = get_convolution_output_default_4d_tensor_descriptor(
    in_nchw_thread_block_desc, wei_kcyx_thread_block_desc);

// register
Float p_out_thread[out_nkhw_thread_desc.GetElementSpace()];

// divide block work
constexpr unsigned NBlockWork =
    (out_nkhw_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
constexpr unsigned KBlockWork =
    (out_nkhw_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
constexpr unsigned HBlockWork =
    (out_nkhw_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
constexpr unsigned WBlockWork =
    (out_nkhw_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;

const unsigned block_id = blockIdx.x;

@@ -121,34 +147,46 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
constexpr auto blockwise_in_copy =
    Blockwise4dTensorCopy1<BlockSize,
                           Float,
                           decltype(in_nchw_global_desc),
                           decltype(in_nchw_block_desc),
                           decltype(in_nchw_block_desc.GetLengths()),
                           InBlockCopyDataPerRead>{};

#if 0
constexpr auto blockwise_wei_copy =
    Blockwise4dTensorCopy1<BlockSize,
                           Float,
                           decltype(wei_kcyx_global_desc),
                           decltype(wei_kcyx_block_desc),
                           decltype(wei_kcyx_block_desc.GetLengths()),
                           1>{};
#elif 1
const auto blockwise_wei_copy = Blockwise2dTensorCopy3<BlockSize,
                                                       Float,
                                                       decltype(wei_ke_global_desc),
                                                       decltype(wei_ke_block_desc),
                                                       decltype(wei_ke_block_desc.GetLengths()),
                                                       WeiBlockCopyDataPerRead>{};
#endif

// set threadwise output tensor to 0
threadwise_4d_tensor_set_zero(out_nkhw_thread_desc, p_out_thread);

for(unsigned c_block_data_begin = 0; c_block_data_begin < C;
    c_block_data_begin += CPerBlock, __syncthreads())
{
// copy input tensor to LDS
blockwise_in_copy.Run(p_in_global +
                          in_nchw_global_desc.Get1dIndex(n_block_data_begin,
                                                         c_block_data_begin,
                                                         hi_block_data_begin,
                                                         wi_block_data_begin),
                      p_in_block);

// copy weight tensor to LDS
blockwise_wei_copy.Run(
    p_wei_global +
        wei_kcyx_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
    p_wei_block);

__syncthreads();

@@ -158,27 +196,29 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
// threadwise convolution
#if 1
threadwise_direct_convolution_2(
    in_nchw_thread_block_desc,
    p_in_block +
        in_nchw_block_desc.Get1dIndex(n_thread_data_begin,
                                      c_thread_data,
                                      hi_thread_data_begin,
                                      wi_thread_data_begin),
    wei_kcyx_thread_block_desc,
    p_wei_block +
        wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
    out_nkhw_thread_desc,
    p_out_thread);
#elif 0
threadwise_direct_convolution_3(
    in_nchw_thread_block_desc,
    p_in_block +
        in_nchw_block_desc.Get1dIndex(n_thread_data_begin,
                                      c_thread_data,
                                      hi_thread_data_begin,
                                      wi_thread_data_begin),
    wei_kcyx_thread_block_desc,
    p_wei_block +
        wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
    out_nkhw_thread_desc,
    p_out_thread);
#endif
}

@@ -186,13 +226,13 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
// copy output tensor from register to global mem
threadwise_4d_tensor_copy(
    out_nkhw_thread_desc,
    p_out_thread,
    out_nkhw_global_desc,
    p_out_global +
        out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
                                        k_block_data_begin + k_thread_data_begin,
                                        ho_block_data_begin + ho_thread_data_begin,
                                        wo_block_data_begin + wo_thread_data_begin),
    out_nkhw_thread_desc.GetLengths());
}
#pragma once
#include "common.hip.hpp"
#include "ConstantTensorDescriptor.hip.hpp"
#include "blockwise_2d_tensor_op.hip.hpp"
#include "blockwise_4d_tensor_op.hip.hpp"
#include "blockwise_direct_convolution.hip.hpp"
#include "threadwise_4d_tensor_op.hip.hpp"
#include "threadwise_direct_convolution.hip.hpp"
template <class TInWei,
class TOut,
class TAccum,
class InGlobalDesc,
class WeiGlobalDesc,
class OutGlobalDesc,
unsigned ScalarPerVector,
unsigned NPerBlock,
unsigned KPerBlock,
unsigned CPerBlock,
unsigned HoPerBlock,
unsigned WoPerBlock,
unsigned NPerThread,
unsigned KPerThread,
unsigned CPerThread,
unsigned HoPerThread,
unsigned WoPerThread,
unsigned InBlockCopyDataPerRead,
unsigned WeiBlockCopyDataPerRead,
unsigned BlockSize,
unsigned GridSize>
__global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
const typename vector_type<TInWei,
ScalarPerVector>::MemoryType* const __restrict__ p_in_vec_global,
const typename vector_type<TInWei,
ScalarPerVector>::MemoryType* const __restrict__ p_wei_vec_global,
TOut* const __restrict__ p_out_global)
{
using in_scalar_t = TInWei;
using in_vector_mem_t = typename vector_type<in_scalar_t, ScalarPerVector>::MemoryType;
using out_scalar_t = TOut;
using accum_t = TAccum;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_nchw_vec_global_desc = InGlobalDesc{};
constexpr auto wei_kcyx_vec_global_desc = WeiGlobalDesc{};
constexpr auto out_nkhw_global_desc = OutGlobalDesc{};
constexpr unsigned N = in_nchw_vec_global_desc.GetLength(I0);
constexpr unsigned K = wei_kcyx_vec_global_desc.GetLength(I0);
constexpr unsigned C = wei_kcyx_vec_global_desc.GetLength(I1);
constexpr unsigned Y = wei_kcyx_vec_global_desc.GetLength(I2);
constexpr unsigned X = wei_kcyx_vec_global_desc.GetLength(I3);
constexpr auto wei_ke_vec_global_desc = make_ConstantTensorDescriptor(
Sequence<K, C * Y * X>{}); // 2d view of wei for blockwise copy
constexpr unsigned HiPerBlock = HoPerBlock + Y - 1;
constexpr unsigned WiPerBlock = WoPerBlock + X - 1;
constexpr auto in_nchw_vec_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{}, Number<InBlockCopyDataPerRead>{});
constexpr auto wei_ke_vec_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<KPerBlock, CPerBlock * Y * X>{},
Number<WeiBlockCopyDataPerRead>{}); // 2d view of wei for blockwise copy
constexpr auto wei_kcyx_vec_block_desc =
make_ConstantTensorDescriptor(Sequence<KPerBlock, CPerBlock, Y, X>{},
Sequence<wei_ke_vec_block_desc.GetStride(I0), Y * X, X, 1>{});
// shared mem
constexpr unsigned in_block_size =
in_nchw_vec_block_desc.GetElementSpace(Number<InBlockCopyDataPerRead>{});
constexpr unsigned wei_block_size =
wei_kcyx_vec_block_desc.GetElementSpace(Number<WeiBlockCopyDataPerRead>{});
constexpr unsigned max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
? InBlockCopyDataPerRead
: WeiBlockCopyDataPerRead;
__shared__ in_vector_mem_t
p_in_vec_block[max_align * ((in_block_size + max_align - 1) / max_align)];
__shared__ in_vector_mem_t
p_wei_vec_block[max_align * ((wei_block_size + max_align - 1) / max_align)];
// threadwise tensors
constexpr unsigned HiPerThread = HoPerThread + Y - 1;
constexpr unsigned WiPerThread = WoPerThread + X - 1;
constexpr auto in_nchw_vec_thread_block_desc =
make_ConstantTensorDescriptor(Sequence<NPerThread, CPerThread, HiPerThread, WiPerThread>{},
in_nchw_vec_block_desc.GetStrides());
constexpr auto wei_kcyx_vec_thread_block_desc = make_ConstantTensorDescriptor(
Sequence<KPerThread, CPerThread, Y, X>{}, wei_kcyx_vec_block_desc.GetStrides());
constexpr auto out_nkhw_thread_desc = get_convolution_output_default_4d_tensor_descriptor(
in_nchw_vec_thread_block_desc, wei_kcyx_vec_thread_block_desc);
// register
out_scalar_t p_out_thread[out_nkhw_thread_desc.GetElementSpace()];
// divide block work
constexpr unsigned NBlockWork =
(out_nkhw_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
constexpr unsigned KBlockWork =
(out_nkhw_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
constexpr unsigned HBlockWork =
(out_nkhw_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
constexpr unsigned WBlockWork =
(out_nkhw_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;
const unsigned block_id = blockIdx.x;
unsigned itmp = block_id;
const unsigned n_block_work_id = itmp / (KBlockWork * HBlockWork * WBlockWork);
itmp -= n_block_work_id * (KBlockWork * HBlockWork * WBlockWork);
const unsigned k_block_work_id = itmp / (HBlockWork * WBlockWork);
itmp -= k_block_work_id * (HBlockWork * WBlockWork);
const unsigned h_block_work_id = itmp / WBlockWork;
const unsigned w_block_work_id = itmp - h_block_work_id * WBlockWork;
const unsigned n_block_data_begin = n_block_work_id * NPerBlock;
const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
const unsigned ho_block_data_begin = h_block_work_id * HoPerBlock;
const unsigned wo_block_data_begin = w_block_work_id * WoPerBlock;
const unsigned hi_block_data_begin = ho_block_data_begin; // minus padding
const unsigned wi_block_data_begin = wo_block_data_begin; // minus padding
// divide thread work
constexpr unsigned NThreadWork = (NPerBlock + NPerThread - 1) / NPerThread;
constexpr unsigned KThreadWork = (KPerBlock + KPerThread - 1) / KPerThread;
constexpr unsigned HThreadWork = (HoPerBlock + HoPerThread - 1) / HoPerThread;
constexpr unsigned WThreadWork = (WoPerBlock + WoPerThread - 1) / WoPerThread;
const unsigned thread_id = threadIdx.x;
itmp = thread_id;
const unsigned n_thread_work_id = itmp / (KThreadWork * HThreadWork * WThreadWork);
itmp -= n_thread_work_id * (KThreadWork * HThreadWork * WThreadWork);
const unsigned k_thread_work_id = itmp / (HThreadWork * WThreadWork);
itmp -= k_thread_work_id * (HThreadWork * WThreadWork);
const unsigned h_thread_work_id = itmp / WThreadWork;
const unsigned w_thread_work_id = itmp - h_thread_work_id * WThreadWork;
const unsigned n_thread_data_begin = n_thread_work_id * NPerThread;
const unsigned k_thread_data_begin = k_thread_work_id * KPerThread;
const unsigned ho_thread_data_begin = h_thread_work_id * HoPerThread;
const unsigned wo_thread_data_begin = w_thread_work_id * WoPerThread;
const unsigned hi_thread_data_begin = ho_thread_data_begin;
const unsigned wi_thread_data_begin = wo_thread_data_begin;
constexpr auto blockwise_in_copy =
Blockwise4dTensorCopy1<BlockSize,
in_vector_mem_t,
decltype(in_nchw_vec_global_desc),
decltype(in_nchw_vec_block_desc),
decltype(in_nchw_vec_block_desc.GetLengths()),
InBlockCopyDataPerRead>{};
#if 0
constexpr auto blockwise_wei_copy =
Blockwise4dTensorCopy1<BlockSize,
in_vector_mem_t,
decltype(wei_kcyx_vec_global_desc),
decltype(wei_kcyx_vec_block_desc),
decltype(wei_kcyx_vec_block_desc.GetLengths()),
1>{};
#elif 1
const auto blockwise_wei_copy =
Blockwise2dTensorCopy3<BlockSize,
in_vector_mem_t,
decltype(wei_ke_vec_global_desc),
decltype(wei_ke_vec_block_desc),
decltype(wei_ke_vec_block_desc.GetLengths()),
WeiBlockCopyDataPerRead>{};
#endif
#if 1 // debug
// set threadwise output tensor to 0
threadwise_4d_tensor_set_zero(out_nkhw_thread_desc, p_out_thread);
#endif
for(unsigned c_block_data_begin = 0; c_block_data_begin < C;
c_block_data_begin += CPerBlock, __syncthreads())
{
// copy input tensor to LDS
blockwise_in_copy.Run(p_in_vec_global +
in_nchw_vec_global_desc.Get1dIndex(n_block_data_begin,
c_block_data_begin,
hi_block_data_begin,
wi_block_data_begin),
p_in_vec_block);
// copy weight tensor to LDS
blockwise_wei_copy.Run(
p_wei_vec_global +
wei_kcyx_vec_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
p_wei_vec_block);
__syncthreads();
for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread)
{
// threadwise convolution
#if 1
threadwise_direct_convolution_2(
in_nchw_vec_thread_block_desc,
p_in_vec_block +
in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin,
c_thread_data,
hi_thread_data_begin,
wi_thread_data_begin),
wei_kcyx_vec_thread_block_desc,
p_wei_vec_block +
wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
out_nkhw_thread_desc,
p_out_thread);
#elif 0
threadwise_direct_convolution_3(
in_nchw_vec_thread_block_desc,
p_in_vec_block +
in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin,
c_thread_data,
hi_thread_data_begin,
wi_thread_data_begin),
wei_kcyx_vec_thread_block_desc,
p_wei_vec_block +
wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
out_nkhw_thread_desc,
p_out_thread);
#endif
}
}
// copy output tensor from register to global mem
threadwise_4d_tensor_copy(
out_nkhw_thread_desc,
p_out_thread,
out_nkhw_global_desc,
p_out_global +
out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin),
out_nkhw_thread_desc.GetLengths());
}
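// ---------------------------------------------------------------------------
// Editor's sketch, not part of the commit: the block/thread id decoding above
// is a mixed-radix decode. decompose_work_id (illustrative name) recovers
// (w0, w1, w2, w3) from id == ((w0 * D1 + w1) * D2 + w2) * D3 + w3; note D0 is
// implied by the grid/block size and never divided by, exactly as in the
// kernels.
#include <array>

inline std::array<unsigned, 4> decompose_work_id(unsigned id,
                                                 const std::array<unsigned, 4>& dims)
{
    std::array<unsigned, 4> w{};
    unsigned itmp = id;

    w[0] = itmp / (dims[1] * dims[2] * dims[3]);
    itmp -= w[0] * (dims[1] * dims[2] * dims[3]);
    w[1] = itmp / (dims[2] * dims[3]);
    itmp -= w[1] * (dims[2] * dims[3]);
    w[2] = itmp / dims[3];
    w[3] = itmp - w[2] * dims[3];

    return w;
}
// ---------------------------------------------------------------------------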
#pragma once
#include "common.hip.hpp"
#include "ConstantTensorDescriptor.hip.hpp"
#include "ConstantMatrixDescriptor.hip.hpp"
#include "blockwise_4d_tensor_op.hip.hpp"
#include "blockwise_2d_tensor_op.hip.hpp"
#include "threadwise_2d_tensor_op.hip.hpp"
#include "blockwise_gemm.hip.hpp"
// define B = flatten(Hi, Wi, N), decoded as (h, w, n) at the end of the kernel
template <unsigned GridSize,
unsigned BlockSize,
class Float,
class InGlobalDesc,
class WeiGlobalDesc,
class OutGlobalDesc,
unsigned BPerBlock,
unsigned KPerBlock,
unsigned CPerBlock,
unsigned BPerThread,
unsigned KPerThread,
unsigned GemmThreadPerColumnPerCluster,
unsigned GemmThreadPerRowPerCluster,
unsigned GemmMPerThreadSubC,
unsigned GemmNPerThreadSubC,
unsigned GemmMLevel0Cluster,
unsigned GemmNLevel0Cluster,
unsigned GemmMLevel1Cluster,
unsigned GemmNLevel1Cluster,
unsigned GemmKPerThreadLoop,
unsigned InBlockCopyThreadPerDim0,
unsigned InBlockCopyThreadPerDim1,
unsigned WeiBlockCopyThreadPerDim0,
unsigned WeiBlockCopyThreadPerDim1,
unsigned InBlockCopyDataPerRead,
unsigned WeiBlockCopyDataPerRead>
__global__ void
gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restrict__ p_in_global,
const Float* const __restrict__ p_wei_global,
Float* const __restrict__ p_out_global)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_chwn_global_desc = InGlobalDesc{};
constexpr auto wei_cyxk_global_desc = WeiGlobalDesc{};
constexpr auto out_khwn_global_desc = OutGlobalDesc{};
constexpr unsigned C = in_chwn_global_desc.GetLength(I0);
constexpr unsigned Hi = in_chwn_global_desc.GetLength(I1);
constexpr unsigned Wi = in_chwn_global_desc.GetLength(I2);
constexpr unsigned N = in_chwn_global_desc.GetLength(I3);
constexpr unsigned K = out_khwn_global_desc.GetLength(I0);
constexpr unsigned Ho = out_khwn_global_desc.GetLength(I1);
constexpr unsigned Wo = out_khwn_global_desc.GetLength(I2);
constexpr unsigned Y = wei_cyxk_global_desc.GetLength(I1);
constexpr unsigned X = wei_cyxk_global_desc.GetLength(I2);
constexpr unsigned B = N * Hi * Wi;
constexpr unsigned BGhostRead = (Y - 1) * Wi + (X - 1);
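// BGhostRead is the halo a block must load past its BPerBlock window so that
// every (y, x) filter tap can read p_in_block + y * Wi + x from LDS:
// (Y - 1) full rows of Wi elements plus (X - 1) trailing elements.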
// divide block work by 2d: [K, B]
constexpr unsigned KBlockWork = (K + KPerBlock - 1) / KPerBlock;
constexpr unsigned BBlockWork = (B + BPerBlock - 1) / BPerBlock;
const unsigned k_block_work_id = get_block_1d_id() / BBlockWork;
const unsigned b_block_work_id = get_block_1d_id() - k_block_work_id * BBlockWork;
const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
const unsigned b_block_data_begin = b_block_work_id * BPerBlock;
// flattened (2d) tensor view of gridwise input
constexpr auto in_cb_global_desc = make_ConstantTensorDescriptor(Sequence<C, B>{});
constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence<C * Y * X, K>{});
// tensor view of blockwise input and weight
// be careful of alignment
constexpr auto in_cb_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<CPerBlock, BPerBlock + BGhostRead>{}, Number<InBlockCopyDataPerRead>{});
constexpr auto wei_ek_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<CPerBlock * Y * X, KPerBlock>{}, Number<WeiBlockCopyDataPerRead>{});
constexpr auto wei_cyxk_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<CPerBlock, Y, X, KPerBlock>{}, Number<WeiBlockCopyDataPerRead>{});
// tensor view of threadwise output in register
constexpr auto out_kb_thread_desc =
make_ConstantTensorDescriptor(Sequence<KPerThread, BPerThread>{});
#if 0
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
{
print_ConstantTensorDescriptor(in_chwn_global_desc, "in_chwn_global_desc");
print_ConstantTensorDescriptor(wei_cyxk_global_desc, "wei_cyxk_global_desc");
print_ConstantTensorDescriptor(out_khwn_global_desc, "out_khwn_global_desc");
print_ConstantTensorDescriptor(in_cb_global_desc, "in_cb_global_desc");
print_ConstantTensorDescriptor(wei_ek_global_desc, "wei_ek_global_desc");
print_ConstantTensorDescriptor(in_cb_block_desc, "in_cb_block_desc");
print_ConstantTensorDescriptor(wei_cyxk_block_desc, "wei_cyxk_block_desc");
print_ConstantTensorDescriptor(wei_ek_block_desc, "wei_ek_block_desc");
print_ConstantTensorDescriptor(out_kb_thread_desc, "out_kb_thread_desc");
printf("KPerBlock %u\n", KPerBlock);
}
#endif
// blockwise in copy
// format is [CPerBlock, BPerBlock + BGhostRead]
#if 0
const auto blockwise_in_copy =
Blockwise2dTensorCopy1<BlockSize,
Float,
decltype(in_cb_global_desc),
decltype(in_cb_block_desc),
decltype(in_cb_block_desc.GetLengths())>{};
#elif 0
const auto blockwise_in_copy = Blockwise2dTensorCopy2<BlockSize,
Float,
decltype(in_cb_global_desc),
decltype(in_cb_block_desc),
decltype(in_cb_block_desc.GetLengths()),
InBlockCopyThreadPerDim0,
InBlockCopyThreadPerDim1>{};
#elif 1
const auto blockwise_in_copy = Blockwise2dTensorCopy3<BlockSize,
Float,
decltype(in_cb_global_desc),
decltype(in_cb_block_desc),
decltype(in_cb_block_desc.GetLengths()),
InBlockCopyDataPerRead>{};
#endif
// blockwise wei copy
// format is [CPerBlock*Y*X,KPerBlock]
#if 0
const auto blockwise_wei_copy =
Blockwise2dTensorCopy1<BlockSize,
Float,
decltype(wei_ek_global_desc),
decltype(wei_ek_block_desc),
decltype(wei_ek_block_desc.GetLengths())>{};
#elif 0
const auto blockwise_wei_copy = Blockwise2dTensorCopy2<BlockSize,
Float,
decltype(wei_ek_global_desc),
decltype(wei_ek_block_desc),
decltype(wei_ek_block_desc.GetLengths()),
WeiBlockCopyThreadPerDim0,
WeiBlockCopyThreadPerDim1>{};
#elif 1
const auto blockwise_wei_copy = Blockwise2dTensorCopy3<BlockSize,
Float,
decltype(wei_ek_global_desc),
decltype(wei_ek_block_desc),
decltype(wei_ek_block_desc.GetLengths()),
WeiBlockCopyDataPerRead>{};
#endif
// a series of blockwise GEMM
// c_mtx += transpose(a_mtx) * b_mtx
// a_mtx and b_mtx saved in LDS, c_mtx saved in register
// a_mtx[C,K] is a sub-matrix of wei_block[C,Y,X,K]
// b_mtx[C,B] is a subset of in_block[C,B + BGhostRead]
// c_mtx[K,B] is out_block[K,B]
constexpr auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{}, Number<KPerBlock>{}, Number<wei_cyxk_block_desc.GetStride(I0)>{});
constexpr auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{}, Number<BPerBlock>{}, Number<in_cb_block_desc.GetStride(I0)>{});
constexpr auto c_kxb_thread_mtx_desc =
make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<BPerThread>{});
const auto blockwise_gemm =
BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2<BlockSize,
decltype(a_cxk_block_mtx_desc),
decltype(b_cxb_block_mtx_desc),
decltype(c_kxb_thread_mtx_desc),
GemmMPerThreadSubC,
GemmNPerThreadSubC,
GemmMLevel0Cluster,
GemmNLevel0Cluster,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmKPerThreadLoop>{};
// LDS: be careful of alignment
constexpr unsigned in_block_size =
in_cb_block_desc.GetElementSpace(Number<InBlockCopyDataPerRead>{});
constexpr unsigned wei_block_size =
wei_cyxk_block_desc.GetElementSpace(Number<WeiBlockCopyDataPerRead>{});
constexpr unsigned max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
? InBlockCopyDataPerRead
: WeiBlockCopyDataPerRead;
// LDS
__shared__ Float p_in_block[max_align * ((in_block_size + max_align - 1) / max_align)];
__shared__ Float p_wei_block[max_align * ((wei_block_size + max_align - 1) / max_align)];
const Float* p_in_global_block_offset =
p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin);
const Float* p_wei_global_block_offset =
p_wei_global + wei_cyxk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
// register
Float p_out_thread[out_kb_thread_desc.GetElementSpace()];
// set threadwise output tensor to 0
threadwise_2d_tensor_set_zero(out_kb_thread_desc, p_out_thread);
for(unsigned c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0),
p_wei_global_block_offset += CPerBlock * wei_cyxk_global_desc.GetStride(I0),
__syncthreads())
{
// load data
blockwise_in_copy.Run(p_in_global_block_offset, p_in_block);
blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block);
__syncthreads();
// compute on current data
// a series of GEMM
for(unsigned y = 0; y < Y; ++y)
{
for(unsigned x = 0; x < X; ++x)
{
auto f_accum = [](auto& acc, const auto& v) { acc += v; };
#if 1
blockwise_gemm.Run
#else
blockwise_gemm.Run_RegisterDoubleBuffer
#endif
(p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
p_in_block + y * Wi + x,
p_out_thread,
f_accum);
}
}
}
// output: register to global mem,
const auto c_thread_mtx_begin =
blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());
const unsigned k_thread_data_begin = k_block_data_begin + c_thread_mtx_begin.row;
const unsigned b_thread_data_begin = b_block_data_begin + c_thread_mtx_begin.col;
for(unsigned k = 0; k < out_kb_thread_desc.GetLength(I0); ++k)
{
for(unsigned b = 0; b < out_kb_thread_desc.GetLength(I1); ++b)
{
const auto c_thread_mtx_distance =
blockwise_gemm.GetDistanceFromBeginOfThreadMatrixC(k, b);
unsigned k_data = k_thread_data_begin + c_thread_mtx_distance.row;
unsigned b_data = b_thread_data_begin + c_thread_mtx_distance.col;
unsigned h_data = b_data / (Wi * N);
unsigned itmp = b_data - h_data * (Wi * N);
unsigned w_data = itmp / N;
unsigned n_data = itmp - w_data * N;
if(n_data < N && h_data < Ho && w_data < Wo)
{
p_out_global[out_khwn_global_desc.Get1dIndex(k_data, h_data, w_data, n_data)] =
p_out_thread[out_kb_thread_desc.Get1dIndex(k, b)];
}
}
}
}
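// ---------------------------------------------------------------------------
// Editor's sketch, not part of the commit: host reference for the "series of
// blockwise GEMM" view above. For each (y, x), out[k, b] accumulates
// wei[c, y, x, k] * in[c, b + y * Wi + x] over c, the same
// p_in_block + y * Wi + x offset the kernel uses. For clarity this takes
// N == 1 (so b = h * Wi + w) and ignores edge handling; the input buffer must
// hold C * (B + BGhostRead) elements to cover the shifts. Illustrative name:
// implicit_gemm_reference.
inline void implicit_gemm_reference(const float* in,  // [C, B + BGhostRead] row-major
                                    const float* wei, // [C, Y, X, K] row-major
                                    float* out,       // [K, B] row-major, zero-initialized
                                    unsigned C,
                                    unsigned K,
                                    unsigned B,
                                    unsigned Y,
                                    unsigned X,
                                    unsigned Wi)
{
    const unsigned b_ghost_read = (Y - 1) * Wi + (X - 1);

    for(unsigned y = 0; y < Y; ++y)
        for(unsigned x = 0; x < X; ++x)
            for(unsigned c = 0; c < C; ++c)
                for(unsigned k = 0; k < K; ++k)
                    for(unsigned b = 0; b < B; ++b)
                        out[k * B + b] += wei[((c * Y + y) * X + x) * K + k] *
                                          in[c * (B + b_ghost_read) + b + y * Wi + x];
}
// ---------------------------------------------------------------------------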
@@ -259,7 +259,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_b
__syncthreads();

// load next data
#if 0
blockwise_in_copy.Run(p_in_global_block_offset, p_in_block_next);
blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block_next);
#elif 1
@@ -292,7 +292,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_b
}
}

#if 1
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_next);
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, p_wei_block_next);
#endif
...