Commit c8d0356a authored by Chao Liu's avatar Chao Liu
Browse files

faster

parent 1812666a
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
#include "nvToolsExt.h" #include "nvToolsExt.h"
#include "tensor.hpp" #include "tensor.hpp"
#include "constant_tensor_descriptor.cuh" #include "constant_tensor_descriptor.cuh"
#include "direct_convolution.cuh" #include "direct_convolution_2.cuh"
template <class T> template <class T>
struct GeneratorConstant struct GeneratorConstant
...@@ -129,16 +129,16 @@ void device_convolution( ...@@ -129,16 +129,16 @@ void device_convolution(
constexpr auto I2 = Index<2>{}; constexpr auto I2 = Index<2>{};
constexpr auto I3 = Index<3>{}; constexpr auto I3 = Index<3>{};
constexpr auto in_desc = InDesc{}; constexpr auto in_desc = InDesc{};
constexpr auto wei_desc = WeiDesc{}; constexpr auto wei_desc = WeiDesc{};
constexpr auto out_desc = OutDesc{}; constexpr auto out_desc = OutDesc{};
constexpr unsigned NPerBlock = 1; constexpr unsigned OutTileSizeH = 2;
constexpr unsigned KPerBlock = 2; constexpr unsigned OutTileSizeW = 2;
constexpr unsigned CPerBlockLoop = 4; constexpr unsigned NPerBlock = 1;
constexpr unsigned OutTileSizeH = 2; constexpr unsigned KPerBlock = 4;
constexpr unsigned OutTileSizeW = 2; constexpr unsigned CPerBlock = 2;
constexpr unsigned YPerBlock = 8; constexpr unsigned YPerBlock = 8;
constexpr unsigned XPerBlock = 16; constexpr unsigned XPerBlock = 16;
constexpr unsigned NBlockCopyLen0 = 1; constexpr unsigned NBlockCopyLen0 = 1;
constexpr unsigned NBlockCopyLen1 = 1; constexpr unsigned NBlockCopyLen1 = 1;
...@@ -167,11 +167,11 @@ void device_convolution( ...@@ -167,11 +167,11 @@ void device_convolution(
InDesc, InDesc,
WeiDesc, WeiDesc,
OutDesc, OutDesc,
NPerBlock,
KPerBlock,
CPerBlockLoop,
OutTileSizeH, OutTileSizeH,
OutTileSizeW, OutTileSizeW,
NPerBlock,
KPerBlock,
CPerBlock,
YPerBlock, YPerBlock,
XPerBlock, XPerBlock,
NBlockCopyLen0, NBlockCopyLen0,
...@@ -248,7 +248,7 @@ int main() ...@@ -248,7 +248,7 @@ int main()
int num_thread = std::thread::hardware_concurrency(); int num_thread = std::thread::hardware_concurrency();
#if 0 #if 1
in.GenerateTensorValue(GeneratorTensor<float>{}, num_thread); in.GenerateTensorValue(GeneratorTensor<float>{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor<float>{}, num_thread); wei.GenerateTensorValue(GeneratorTensor<float>{}, num_thread);
out_host.GenerateTensorValue(GeneratorConstant<float>{0}, num_thread); out_host.GenerateTensorValue(GeneratorConstant<float>{0}, num_thread);
...@@ -258,7 +258,7 @@ int main() ...@@ -258,7 +258,7 @@ int main()
device_convolution(in_desc, in, wei_desc, wei, out_desc, out_device); device_convolution(in_desc, in, wei_desc, wei, out_desc, out_device);
#if 0 #if 1
host_convolution(in, wei, out_host); host_convolution(in, wei, out_host);
float error = 0; float error = 0;
......
...@@ -96,9 +96,14 @@ __device__ void blockwise_4d_tensor_op( ...@@ -96,9 +96,14 @@ __device__ void blockwise_4d_tensor_op(
} }
#endif #endif
#if 0
if(threadIdx.x != 0)
return;
#endif
constexpr unsigned NLoop = desc.GetElementSize() / BlockSize; constexpr unsigned NLoop = desc.GetElementSize() / BlockSize;
for(unsigned iloop = 0; iloop + 1 < NLoop; ++iloop) for(unsigned iloop = 0; iloop < NLoop; ++iloop)
{ {
unsigned is = threadIdx.x + iloop * BlockSize; unsigned is = threadIdx.x + iloop * BlockSize;
......
...@@ -35,7 +35,7 @@ __device__ void blockwise_convolution(InDesc, ...@@ -35,7 +35,7 @@ __device__ void blockwise_convolution(InDesc,
constexpr unsigned YPerBlock = (out_desc.GetLength(I2) + OutTileSizeH - 1) / OutTileSizeH; constexpr unsigned YPerBlock = (out_desc.GetLength(I2) + OutTileSizeH - 1) / OutTileSizeH;
constexpr unsigned XPerBlock = (out_desc.GetLength(I3) + OutTileSizeW - 1) / OutTileSizeW; constexpr unsigned XPerBlock = (out_desc.GetLength(I3) + OutTileSizeW - 1) / OutTileSizeW;
constexpr unsigned CPerBlockLoop = in_desc.GetLength(I1); constexpr unsigned CPerBlock = in_desc.GetLength(I1);
constexpr unsigned InTileSizeH = OutTileSizeH + S - 1; constexpr unsigned InTileSizeH = OutTileSizeH + S - 1;
constexpr unsigned InTileSizeW = OutTileSizeW + R - 1; constexpr unsigned InTileSizeW = OutTileSizeW + R - 1;
...@@ -50,11 +50,10 @@ __device__ void blockwise_convolution(InDesc, ...@@ -50,11 +50,10 @@ __device__ void blockwise_convolution(InDesc,
#endif #endif
constexpr auto in_thread_src_desc = make_ConstantTensorDescriptor( constexpr auto in_thread_src_desc = make_ConstantTensorDescriptor(
Sequence<1, CPerBlockLoop, OutTileSizeH + S - 1, OutTileSizeW + R - 1>{}, Sequence<1, CPerBlock, OutTileSizeH + S - 1, OutTileSizeW + R - 1>{}, in_desc.GetStrides());
in_desc.GetStrides());
constexpr auto wei_thread_src_desc = constexpr auto wei_thread_src_desc =
make_ConstantTensorDescriptor(Sequence<1, CPerBlockLoop, S, R>{}, wei_desc.GetStrides()); make_ConstantTensorDescriptor(Sequence<1, CPerBlock, S, R>{}, wei_desc.GetStrides());
constexpr auto out_thread_src_desc = make_ConstantTensorDescriptor( constexpr auto out_thread_src_desc = make_ConstantTensorDescriptor(
Sequence<1, 1, OutTileSizeH, OutTileSizeW>{}, out_desc.GetStrides()); Sequence<1, 1, OutTileSizeH, OutTileSizeW>{}, out_desc.GetStrides());
...@@ -90,8 +89,8 @@ __device__ void blockwise_convolution(InDesc, ...@@ -90,8 +89,8 @@ __device__ void blockwise_convolution(InDesc,
unsigned hi_thread_work_begin = ho_thread_work_begin; // minus padding unsigned hi_thread_work_begin = ho_thread_work_begin; // minus padding
unsigned wi_thread_work_begin = wo_thread_work_begin; // minus padding unsigned wi_thread_work_begin = wo_thread_work_begin; // minus padding
TFloat p_in_thread[1 * CPerBlockLoop * InTileSizeH * InTileSizeW]; TFloat p_in_thread[1 * CPerBlock * InTileSizeH * InTileSizeW];
TFloat p_wei_thread[1 * CPerBlockLoop * S * R]; TFloat p_wei_thread[1 * CPerBlock * S * R];
TFloat p_out_thread[1 * 1 * OutTileSizeH * OutTileSizeW]; TFloat p_out_thread[1 * 1 * OutTileSizeH * OutTileSizeW];
auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; }; auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; };
...@@ -162,11 +161,11 @@ template <class TFloat, ...@@ -162,11 +161,11 @@ template <class TFloat,
class InDesc, class InDesc,
class WeiDesc, class WeiDesc,
class OutDesc, class OutDesc,
unsigned NPerBlock,
unsigned KPerBlock,
unsigned CPerBlockLoop,
unsigned OutTileSizeH, unsigned OutTileSizeH,
unsigned OutTileSizeW, unsigned OutTileSizeW,
unsigned NPerBlock,
unsigned KPerBlock,
unsigned CPerBlock,
unsigned YPerBlock, unsigned YPerBlock,
unsigned XPerBlock, unsigned XPerBlock,
unsigned NBlockCopyLen0, unsigned NBlockCopyLen0,
...@@ -206,10 +205,10 @@ __global__ void gridwise_convolution(InDesc, ...@@ -206,10 +205,10 @@ __global__ void gridwise_convolution(InDesc,
constexpr unsigned XBlockWork = (out_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock; constexpr unsigned XBlockWork = (out_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;
constexpr auto in_block_glb_desc = make_ConstantTensorDescriptor( constexpr auto in_block_glb_desc = make_ConstantTensorDescriptor(
Sequence<NPerBlock, CPerBlockLoop, HiPerBlock, WiPerBlock>{}, in_desc.GetStrides()); Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{}, in_desc.GetStrides());
constexpr auto wei_block_glb_desc = make_ConstantTensorDescriptor( constexpr auto wei_block_glb_desc = make_ConstantTensorDescriptor(
Sequence<KPerBlock, CPerBlockLoop, S, R>{}, wei_desc.GetStrides()); Sequence<KPerBlock, CPerBlock, S, R>{}, wei_desc.GetStrides());
constexpr auto out_block_glb_desc = make_ConstantTensorDescriptor( constexpr auto out_block_glb_desc = make_ConstantTensorDescriptor(
Sequence<NPerBlock, KPerBlock, HoPerBlock, WoPerBlock>{}, out_desc.GetStrides()); Sequence<NPerBlock, KPerBlock, HoPerBlock, WoPerBlock>{}, out_desc.GetStrides());
...@@ -279,7 +278,7 @@ __global__ void gridwise_convolution(InDesc, ...@@ -279,7 +278,7 @@ __global__ void gridwise_convolution(InDesc,
#endif #endif
for(unsigned c_block_work_begin = 0; c_block_work_begin < in_desc.GetLength(I1); for(unsigned c_block_work_begin = 0; c_block_work_begin < in_desc.GetLength(I1);
c_block_work_begin += CPerBlockLoop) c_block_work_begin += CPerBlock)
{ {
auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; }; auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; };
......
#pragma once
#include "constant_tensor_descriptor.cuh"
#include "blockwise_tensor_op.cuh"
#include "threadwise_tensor_op.cuh"
#include "threadwise_convolution.cuh"
template <class TFloat,
class InDesc,
class WeiDesc,
class OutDesc,
unsigned OutTileSizeH,
unsigned OutTileSizeW,
unsigned BlockSize>
__device__ void blockwise_convolution(InDesc,
TFloat* const __restrict__ p_in,
WeiDesc,
TFloat* const __restrict__ p_wei,
OutDesc,
TFloat* __restrict__ p_out)
{
constexpr auto I0 = Index<0>{};
constexpr auto I1 = Index<1>{};
constexpr auto I2 = Index<2>{};
constexpr auto I3 = Index<3>{};
constexpr auto in_desc = InDesc{};
constexpr auto wei_desc = WeiDesc{};
constexpr auto out_desc = OutDesc{};
constexpr unsigned S = wei_desc.GetLength(I2);
constexpr unsigned R = wei_desc.GetLength(I3);
constexpr unsigned NPerBlock = out_desc.GetLength(I0);
constexpr unsigned KPerBlock = out_desc.GetLength(I1);
constexpr unsigned YPerBlock = (out_desc.GetLength(I2) + OutTileSizeH - 1) / OutTileSizeH;
constexpr unsigned XPerBlock = (out_desc.GetLength(I3) + OutTileSizeW - 1) / OutTileSizeW;
constexpr unsigned CPerBlock = in_desc.GetLength(I1);
constexpr unsigned InTileSizeH = OutTileSizeH + S - 1;
constexpr unsigned InTileSizeW = OutTileSizeW + R - 1;
#if 0
if(threadIdx.x == 0)
{
print_ConstantTensorDescriptor(in_desc);
print_ConstantTensorDescriptor(wei_desc);
print_ConstantTensorDescriptor(out_desc);
}
#endif
constexpr auto in_thread_src_desc = make_ConstantTensorDescriptor(
Sequence<1, CPerBlock, InTileSizeH, InTileSizeW>{}, in_desc.GetStrides());
constexpr auto wei_thread_src_desc =
make_ConstantTensorDescriptor(Sequence<1, CPerBlock, S, R>{}, wei_desc.GetStrides());
constexpr auto out_thread_src_desc = make_ConstantTensorDescriptor(
Sequence<1, 1, OutTileSizeH, OutTileSizeW>{}, out_desc.GetStrides());
constexpr auto in_thread_dst_desc =
make_ConstantTensorDescriptor(in_thread_src_desc.GetLengths());
constexpr auto wei_thread_dst_desc =
make_ConstantTensorDescriptor(wei_thread_src_desc.GetLengths());
constexpr auto out_thread_dst_desc =
make_ConstantTensorDescriptor(out_thread_src_desc.GetLengths());
const unsigned thread_id = threadIdx.x;
for(unsigned thread_work_id = thread_id; thread_work_id < NPerBlock * YPerBlock * XPerBlock;
thread_work_id += BlockSize)
{
unsigned itmp = thread_work_id;
unsigned n_thread_work_id = itmp / (YPerBlock * XPerBlock);
itmp -= n_thread_work_id * (YPerBlock * XPerBlock);
unsigned y_thread_work_id = itmp / XPerBlock;
unsigned x_thread_work_id = itmp - y_thread_work_id * XPerBlock;
unsigned n_thread_work_begin = n_thread_work_id * 1;
unsigned ho_thread_work_begin = y_thread_work_id * OutTileSizeH;
unsigned wo_thread_work_begin = x_thread_work_id * OutTileSizeW;
unsigned hi_thread_work_begin = ho_thread_work_begin; // minus padding
unsigned wi_thread_work_begin = wo_thread_work_begin; // minus padding
TFloat p_in_thread[in_thread_src_desc.GetElementSpace()];
TFloat p_wei_thread[wei_thread_src_desc.GetElementSpace()];
TFloat p_out_thread[out_thread_src_desc.GetElementSpace()];
auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; };
// copy input tensor into register
threadwise_4d_tensor_op<TFloat,
decltype(in_thread_src_desc),
decltype(in_thread_dst_desc),
decltype(f_copy)>(
in_thread_src_desc,
p_in + in_desc.Get1dIndex(
n_thread_work_begin, 0, hi_thread_work_begin, wi_thread_work_begin),
in_thread_dst_desc,
p_in_thread,
f_copy,
false);
for(unsigned k_thread_work_begin = 0; k_thread_work_begin < KPerBlock;
++k_thread_work_begin)
{
// copy weight tensor into register
threadwise_4d_tensor_op<TFloat,
decltype(wei_thread_src_desc),
decltype(wei_thread_dst_desc),
decltype(f_copy)>(
wei_thread_src_desc,
p_wei + wei_desc.Get1dIndex(k_thread_work_begin, 0, 0, 0),
wei_thread_dst_desc,
p_wei_thread,
f_copy,
false);
// copy output tensor into register
threadwise_4d_tensor_op<TFloat,
decltype(out_thread_src_desc),
decltype(out_thread_dst_desc),
decltype(f_copy)>(out_thread_src_desc,
p_out +
out_desc.Get1dIndex(n_thread_work_begin,
k_thread_work_begin,
ho_thread_work_begin,
wo_thread_work_begin),
out_thread_dst_desc,
p_out_thread,
f_copy,
false);
// threadwise convolution
threadwise_direct_convolution<TFloat,
decltype(in_thread_dst_desc),
decltype(wei_thread_dst_desc),
decltype(out_thread_dst_desc)>(in_thread_dst_desc,
p_in_thread,
wei_thread_dst_desc,
p_wei_thread,
out_thread_dst_desc,
p_out_thread);
// accumulate output tensor into LDS
threadwise_4d_tensor_op<TFloat,
decltype(out_thread_dst_desc),
decltype(out_thread_src_desc),
decltype(f_copy)>(out_thread_dst_desc,
p_out_thread,
out_thread_src_desc,
p_out +
out_desc.Get1dIndex(n_thread_work_begin,
k_thread_work_begin,
ho_thread_work_begin,
wo_thread_work_begin),
f_copy,
false);
}
}
}
template <class TFloat,
class InDesc,
class WeiDesc,
class OutDesc,
unsigned OutTileSizeH,
unsigned OutTileSizeW,
unsigned NPerBlock,
unsigned KPerBlock,
unsigned CPerBlock,
unsigned YPerBlock,
unsigned XPerBlock,
unsigned NBlockCopyLen0,
unsigned NBlockCopyLen1,
unsigned NBlockCopyLen2,
unsigned NBlockCopyLen3,
unsigned BlockSize,
unsigned GridSize>
__global__ void gridwise_convolution(InDesc,
TFloat* const __restrict__ p_in,
WeiDesc,
TFloat* const __restrict__ p_wei,
OutDesc,
TFloat* __restrict__ p_out)
{
constexpr auto I0 = Index<0>{};
constexpr auto I1 = Index<1>{};
constexpr auto I2 = Index<2>{};
constexpr auto I3 = Index<3>{};
constexpr auto in_desc = InDesc{};
constexpr auto wei_desc = WeiDesc{};
constexpr auto out_desc = OutDesc{};
constexpr unsigned S = wei_desc.GetLength(I2);
constexpr unsigned R = wei_desc.GetLength(I3);
constexpr unsigned HoPerBlock = OutTileSizeH * YPerBlock;
constexpr unsigned WoPerBlock = OutTileSizeW * XPerBlock;
constexpr unsigned HiPerBlock = YPerBlock * OutTileSizeH + S - 1;
constexpr unsigned WiPerBlock = XPerBlock * OutTileSizeW + R - 1;
constexpr unsigned NBlockWork = (out_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
constexpr unsigned KBlockWork = (out_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
constexpr unsigned YBlockWork = (out_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
constexpr unsigned XBlockWork = (out_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;
constexpr auto in_block_glb_desc = make_ConstantTensorDescriptor(
Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{}, in_desc.GetStrides());
constexpr auto wei_block_glb_desc = make_ConstantTensorDescriptor(
Sequence<KPerBlock, CPerBlock, S, R>{}, wei_desc.GetStrides());
constexpr auto out_block_glb_desc = make_ConstantTensorDescriptor(
Sequence<NPerBlock, KPerBlock, HoPerBlock, WoPerBlock>{}, out_desc.GetStrides());
constexpr auto in_block_lds_desc =
make_ConstantTensorDescriptor(in_block_glb_desc.GetLengths());
constexpr auto wei_block_lds_desc =
make_ConstantTensorDescriptor(wei_block_glb_desc.GetLengths());
constexpr auto out_block_lds_desc =
make_ConstantTensorDescriptor(out_block_glb_desc.GetLengths());
constexpr unsigned in_block_size = in_block_lds_desc.GetElementSize();
constexpr unsigned wei_block_size = wei_block_lds_desc.GetElementSize();
constexpr unsigned out_block_size = out_block_lds_desc.GetElementSize();
__shared__ TFloat p_in_block[in_block_size];
__shared__ TFloat p_wei_block[wei_block_size];
__shared__ TFloat p_out_block[out_block_size];
const unsigned block_id = blockIdx.x;
unsigned itmp = block_id;
unsigned n_block_work_id = itmp / (KBlockWork * YBlockWork * XBlockWork);
itmp -= n_block_work_id * (KBlockWork * YBlockWork * XBlockWork);
unsigned k_block_work_id = itmp / (YBlockWork * XBlockWork);
itmp -= k_block_work_id * (YBlockWork * XBlockWork);
unsigned y_block_work_id = itmp / XBlockWork;
unsigned x_block_work_id = itmp - y_block_work_id * XBlockWork;
unsigned n_block_work_begin = n_block_work_id * NPerBlock;
unsigned k_block_work_begin = k_block_work_id * KPerBlock;
unsigned y_block_work_begin = y_block_work_id * YPerBlock;
unsigned x_block_work_begin = x_block_work_id * XPerBlock;
unsigned ho_block_work_begin = y_block_work_begin * OutTileSizeH;
unsigned wo_block_work_begin = x_block_work_begin * OutTileSizeW;
unsigned hi_block_work_begin = ho_block_work_begin; // minus padding
unsigned wi_block_work_begin = wo_block_work_begin; // minus padding
#if 0
if(threadIdx.x == 0)
{
print_ConstantTensorDescriptor( in_desc, "gridwise_convolution: in_desc: ");
print_ConstantTensorDescriptor(wei_desc, "gridwise_convolution: wei_desc: ");
print_ConstantTensorDescriptor(out_desc, "gridwise_convolution: out_desc: ");
print_ConstantTensorDescriptor( in_block_glb_desc, "gridwise_convolution: in_block_glb_desc: ");
print_ConstantTensorDescriptor(wei_block_glb_desc, "gridwise_convolution: wei_block_glb_desc: ");
print_ConstantTensorDescriptor(out_block_glb_desc, "gridwise_convolution: out_block_glb_desc: ");
print_ConstantTensorDescriptor( in_block_lds_desc, "gridwise_convolution: in_block_lds_desc: ");
print_ConstantTensorDescriptor(wei_block_lds_desc, "gridwise_convolution: wei_block_lds_desc: ");
print_ConstantTensorDescriptor(out_block_lds_desc, "gridwise_convolution: out_block_lds_desc: ");
printf("NBlockWork %u, KBlockWork %u, YBlockWork %u, XBlockWork %u \t"
"block_id %u, n_block_work_id %u, k_block_work_id %u, y_block_work_id %u, "
"x_block_work_id %u\n",
NBlockWork,
KBlockWork,
YBlockWork,
XBlockWork,
block_id,
n_block_work_id,
k_block_work_id,
y_block_work_id,
x_block_work_id);
}
#endif
auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; };
auto f_accu = [](const TFloat& src, TFloat& dst) { dst += src; };
// copy output tensor to LDS
blockwise_4d_tensor_op<TFloat,
decltype(out_block_glb_desc),
decltype(out_block_lds_desc),
NBlockCopyLen0,
NBlockCopyLen1,
NBlockCopyLen2,
NBlockCopyLen3,
decltype(f_copy),
BlockSize>(out_block_glb_desc,
p_out + out_block_glb_desc.Get1dIndex(n_block_work_begin,
k_block_work_begin,
ho_block_work_begin,
wo_block_work_begin),
out_block_lds_desc,
p_out_block,
f_copy);
for(unsigned c_block_work_begin = 0; c_block_work_begin < in_desc.GetLength(I1);
c_block_work_begin += CPerBlock)
{
#if 1
// copy input tensor to LDS
blockwise_4d_tensor_op<TFloat,
decltype(in_block_glb_desc),
decltype(in_block_lds_desc),
NBlockCopyLen0,
NBlockCopyLen1,
NBlockCopyLen2,
NBlockCopyLen3,
decltype(f_copy),
BlockSize>(in_block_glb_desc,
p_in + in_block_glb_desc.Get1dIndex(n_block_work_begin,
c_block_work_begin,
hi_block_work_begin,
wi_block_work_begin),
in_block_lds_desc,
p_in_block,
f_copy);
#endif
#if 1
// copy weight tensor to LDS
blockwise_4d_tensor_op<TFloat,
decltype(wei_block_glb_desc),
decltype(wei_block_lds_desc),
NBlockCopyLen0,
NBlockCopyLen1,
NBlockCopyLen2,
NBlockCopyLen3,
decltype(f_copy),
BlockSize>(
wei_block_glb_desc,
p_wei + wei_block_glb_desc.Get1dIndex(k_block_work_begin, c_block_work_begin, 0, 0),
wei_block_lds_desc,
p_wei_block,
f_copy);
#endif
#if 1
__syncthreads();
#endif
// blockwise convolution
blockwise_convolution<TFloat,
decltype(in_block_lds_desc),
decltype(wei_block_lds_desc),
decltype(out_block_lds_desc),
OutTileSizeH,
OutTileSizeW,
BlockSize>(in_block_lds_desc,
p_in_block,
wei_block_lds_desc,
p_wei_block,
out_block_lds_desc,
p_out_block);
#if 1
__syncthreads();
#endif
}
// copy output tensor from LDS to device mem
blockwise_4d_tensor_op<TFloat,
decltype(out_block_lds_desc),
decltype(out_block_glb_desc),
NBlockCopyLen0,
NBlockCopyLen1,
NBlockCopyLen2,
NBlockCopyLen3,
decltype(f_copy),
BlockSize>(out_block_lds_desc,
p_out_block,
out_block_glb_desc,
p_out + out_block_glb_desc.Get1dIndex(n_block_work_begin,
k_block_work_begin,
ho_block_work_begin,
wo_block_work_begin),
f_copy);
}
#pragma once #pragma once
#include "constant_tensor_descriptor.cuh" #include "constant_tensor_descriptor.cuh"
#define THREADWISE_TENSOR_OP_METHOD 1 #define THREADWISE_TENSOR_OP_METHOD 0
#if THREADWISE_TENSOR_OP_METHOD == 0 #if THREADWISE_TENSOR_OP_METHOD == 0
template <class TFloat, class SrcDesc, class DstDesc, class F> template <class TFloat, class SrcDesc, class DstDesc, class F>
__device__ void threadwise_4d_tensor_op( __device__ void threadwise_4d_tensor_op(SrcDesc,
SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f) TFloat* const __restrict__ p_src,
DstDesc,
TFloat* __restrict__ p_dst,
F f,
bool flag = false)
{ {
constexpr auto I0 = Index<0>{}; constexpr auto I0 = Index<0>{};
constexpr auto I1 = Index<1>{}; constexpr auto I1 = Index<1>{};
...@@ -26,6 +30,11 @@ __device__ void threadwise_4d_tensor_op( ...@@ -26,6 +30,11 @@ __device__ void threadwise_4d_tensor_op(
} }
#endif #endif
#if 1
if(flag && threadIdx.x != 0)
return;
#endif
for(unsigned did0 = 0; did0 < src_desc.GetLength(I0); ++did0) for(unsigned did0 = 0; did0 < src_desc.GetLength(I0); ++did0)
{ {
for(unsigned did1 = 0; did1 < src_desc.GetLength(I1); ++did1) for(unsigned did1 = 0; did1 < src_desc.GetLength(I1); ++did1)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment