Commit 99d05ba7 authored by Chao Liu

refactor

parent 29496c95
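This refactor renames the two-operand blockwise_4d_tensor_op to blockwise_4d_tensor_op_binary and adds a one-operand blockwise_4d_tensor_op_unary; gridwise_convolution now zeroes its output tile in LDS with the unary op instead of copying the tile in from global memory. A minimal sketch of the two call shapes, assuming the descriptor, pointer, and NBlockCopyLen names used elsewhere in this diff (p_in_glb is a placeholder, not a name from the source):

// Zero the output tile held in LDS (unary op: one destination tensor, no source).
auto f_set0 = [](TFloat& v) { v = TFloat(0); };
blockwise_4d_tensor_op_unary<TFloat,
                             decltype(out_block_lds_desc),
                             NBlockCopyLen0, NBlockCopyLen1, NBlockCopyLen2, NBlockCopyLen3,
                             decltype(f_set0),
                             BlockSize>(out_block_lds_desc, p_out_block, f_set0);

// Copy an input tile from global memory into LDS (binary op: source and destination tensors).
// p_in_glb stands for the already-offset global pointer, i.e. p_in + in_block_glb_desc.Get1dIndex(...).
auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; };
blockwise_4d_tensor_op_binary<TFloat,
                              decltype(in_block_glb_desc), decltype(in_block_lds_desc),
                              NBlockCopyLen0, NBlockCopyLen1, NBlockCopyLen2, NBlockCopyLen3,
                              decltype(f_copy),
                              BlockSize>(in_block_glb_desc, p_in_glb, in_block_lds_desc, p_in_block, f_copy);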
@@ -248,7 +248,7 @@ int main()
int num_thread = std::thread::hardware_concurrency();
-#if 1
+#if 0
in.GenerateTensorValue(GeneratorTensor<float>{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor<float>{}, num_thread);
out_host.GenerateTensorValue(GeneratorConstant<float>{0}, num_thread);
@@ -258,7 +258,7 @@ int main()
device_convolution(in_desc, in, wei_desc, wei, out_desc, out_device);
-#if 1
+#if 0
host_convolution(in, wei, out_host);
float error = 0;
...
@@ -13,7 +13,7 @@ template <class TFloat,
unsigned NWorkLen3,
class F,
unsigned BlockSize>
-__device__ void blockwise_4d_tensor_op(
+__device__ void blockwise_4d_tensor_op_binary(
SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f)
{
constexpr auto I0 = Index<0>{};
@@ -31,8 +31,8 @@ __device__ void blockwise_4d_tensor_op(
#if 0
if(threadIdx.x == 0)
{
-print_ConstantTensorDescriptor(src_desc, "blockwise_4d_tensor_op: src_desc: ");
-print_ConstantTensorDescriptor(dst_desc, "blockwise_4d_tensor_op: dst_desc: ");
+print_ConstantTensorDescriptor(src_desc, "blockwise_4d_tensor_op_binary: src_desc: ");
+print_ConstantTensorDescriptor(dst_desc, "blockwise_4d_tensor_op_binary: dst_desc: ");
}
#endif
@@ -73,7 +73,7 @@ template <class TFloat,
unsigned NWorkLen3,
class F,
unsigned BlockSize>
-__device__ void blockwise_4d_tensor_op(
+__device__ void blockwise_4d_tensor_op_binary(
SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f)
{
constexpr auto I0 = Index<0>{};
@@ -91,16 +91,11 @@ __device__ void blockwise_4d_tensor_op(
#if 0
if(threadIdx.x == 0)
{
-print_ConstantTensorDescriptor(src_desc, "blockwise_4d_tensor_op: src_desc: ");
-print_ConstantTensorDescriptor(dst_desc, "blockwise_4d_tensor_op: dst_desc: ");
+print_ConstantTensorDescriptor(src_desc, "blockwise_4d_tensor_op_binary: src_desc: ");
+print_ConstantTensorDescriptor(dst_desc, "blockwise_4d_tensor_op_binary: dst_desc: ");
}
#endif
-#if 0
-if(threadIdx.x != 0)
-return;
-#endif
constexpr unsigned NLoop = desc.GetElementSize() / BlockSize;
for(unsigned iloop = 0; iloop < NLoop; ++iloop)
@@ -158,6 +153,87 @@ __device__ void blockwise_4d_tensor_op(
}
}
}
template <class TFloat,
class DstDesc,
unsigned NWorkLen0,
unsigned NWorkLen1,
unsigned NWorkLen2,
unsigned NWorkLen3,
class F,
unsigned BlockSize>
__device__ void blockwise_4d_tensor_op_unary(DstDesc, TFloat* __restrict__ p_dst, F f)
{
constexpr auto I0 = Index<0>{};
constexpr auto I1 = Index<1>{};
constexpr auto I2 = Index<2>{};
constexpr auto I3 = Index<3>{};
constexpr auto dst_desc = DstDesc{};
constexpr auto desc = make_ConstantTensorDescriptor(dst_desc.GetLengths());
#if 0
if(threadIdx.x == 0)
{
print_ConstantTensorDescriptor(dst_desc, "blockwise_4d_tensor_op_unary: dst_desc: ");
print_ConstantTensorDescriptor(desc, "blockwise_4d_tensor_op_unary: desc: ");
}
#endif
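// Each of the BlockSize threads handles one element per iteration: the flat index
// is = threadIdx.x + iloop * BlockSize is decomposed into (did0, did1, did2, did3)
// using the packed descriptor's strides, then mapped through dst_desc's (possibly
// non-packed) strides via Get1dIndex.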
constexpr unsigned NLoop = desc.GetElementSize() / BlockSize;
for(unsigned iloop = 0; iloop < NLoop; ++iloop)
{
unsigned is = threadIdx.x + iloop * BlockSize;
const unsigned did0 = is / desc.GetStride(I0);
is -= did0 * desc.GetStride(I0);
const unsigned did1 = is / desc.GetStride(I1);
is -= did1 * desc.GetStride(I1);
const unsigned did2 = is / desc.GetStride(I2);
is -= did2 * desc.GetStride(I2);
const unsigned did3 = is / desc.GetStride(I3);
const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3);
f(p_dst[dindex]);
}
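// Tail: when the element count is not a multiple of BlockSize, the remaining
// elements are handled once by the lowest-indexed threads.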
constexpr bool has_tail = (desc.GetElementSize() > NLoop * BlockSize);
if(has_tail)
{
unsigned is = threadIdx.x + NLoop * BlockSize;
if(is < desc.GetElementSize())
{
const unsigned did0 = is / desc.GetStride(I0);
is -= did0 * desc.GetStride(I0);
const unsigned did1 = is / desc.GetStride(I1);
is -= did1 * desc.GetStride(I1);
const unsigned did2 = is / desc.GetStride(I2);
is -= did2 * desc.GetStride(I2);
const unsigned did3 = is / desc.GetStride(I3);
const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3);
f(p_dst[dindex]);
}
}
}
#endif
#if BLOCKWISE_TENSOR_OP_METHOD == 21
@@ -170,7 +246,7 @@ template <class TFloat,
unsigned NWorkLen3,
class F,
unsigned BlockSize>
-__device__ void blockwise_4d_tensor_op(
+__device__ void blockwise_4d_tensor_op_binary(
SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f)
{
constexpr auto I0 = Index<0>{};
@@ -234,7 +310,7 @@ template <class TFloat,
unsigned NWorkLen3,
class F,
unsigned BlockSize>
-__device__ void blockwise_4d_tensor_op(
+__device__ void blockwise_4d_tensor_op_binary(
SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f)
{
constexpr auto I0 = Index<0>{};
@@ -318,7 +394,7 @@ template <class TFloat,
unsigned NWorkLen3,
class F,
unsigned BlockSize>
-__device__ void blockwise_4d_tensor_op(
+__device__ void blockwise_4d_tensor_op_binary(
SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f)
{
constexpr auto I0 = Index<0>{};
@@ -404,7 +480,7 @@ template <class TFloat,
unsigned NWorkLen3,
class F,
unsigned BlockSize>
-__device__ void blockwise_4d_tensor_op(
+__device__ void blockwise_4d_tensor_op_binary(
SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f)
{
constexpr auto I0 = Index<0>{};
...
#pragma once
#include "constant_tensor_descriptor.cuh"
#include "blockwise_tensor_op.cuh"
#include "threadwise_tensor_op.cuh"
#include "threadwise_convolution.cuh"
template <class TFloat,
class InDesc,
class WeiDesc,
class OutDesc,
unsigned OutTileSizeH,
unsigned OutTileSizeW,
unsigned BlockSize>
__device__ void blockwise_convolution(InDesc,
TFloat* const __restrict__ p_in,
WeiDesc,
TFloat* const __restrict__ p_wei,
OutDesc,
TFloat* __restrict__ p_out)
{
constexpr auto I0 = Index<0>{};
constexpr auto I1 = Index<1>{};
constexpr auto I2 = Index<2>{};
constexpr auto I3 = Index<3>{};
constexpr auto in_desc = InDesc{};
constexpr auto wei_desc = WeiDesc{};
constexpr auto out_desc = OutDesc{};
constexpr unsigned S = wei_desc.GetLength(I2);
constexpr unsigned R = wei_desc.GetLength(I3);
constexpr unsigned NPerBlock = out_desc.GetLength(I0);
constexpr unsigned KPerBlock = out_desc.GetLength(I1);
constexpr unsigned YPerBlock = (out_desc.GetLength(I2) + OutTileSizeH - 1) / OutTileSizeH;
constexpr unsigned XPerBlock = (out_desc.GetLength(I3) + OutTileSizeW - 1) / OutTileSizeW;
constexpr unsigned CPerBlock = in_desc.GetLength(I1);
constexpr unsigned InTileSizeH = OutTileSizeH + S - 1;
constexpr unsigned InTileSizeW = OutTileSizeW + R - 1;
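// An OutTileSizeH x OutTileSizeW output tile needs an input window enlarged by the
// filter extent minus one in each spatial dimension (no padding is applied here).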
#if 0
if(threadIdx.x == 0)
{
print_ConstantTensorDescriptor(in_desc);
print_ConstantTensorDescriptor(wei_desc);
print_ConstantTensorDescriptor(out_desc);
}
#endif
constexpr auto in_thread_src_desc = make_ConstantTensorDescriptor(
Sequence<1, CPerBlock, OutTileSizeH + S - 1, OutTileSizeW + R - 1>{}, in_desc.GetStrides());
constexpr auto wei_thread_src_desc =
make_ConstantTensorDescriptor(Sequence<1, CPerBlock, S, R>{}, wei_desc.GetStrides());
constexpr auto out_thread_src_desc = make_ConstantTensorDescriptor(
Sequence<1, 1, OutTileSizeH, OutTileSizeW>{}, out_desc.GetStrides());
constexpr auto in_thread_dst_desc =
make_ConstantTensorDescriptor(in_thread_src_desc.GetLengths());
constexpr auto wei_thread_dst_desc =
make_ConstantTensorDescriptor(wei_thread_src_desc.GetLengths());
constexpr auto out_thread_dst_desc =
make_ConstantTensorDescriptor(out_thread_src_desc.GetLengths());
const unsigned thread_id = threadIdx.x;
for(unsigned thread_work_id = thread_id;
thread_work_id < NPerBlock * KPerBlock * YPerBlock * XPerBlock;
thread_work_id += BlockSize)
{
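// Decompose the flat per-thread work id into (n, k, y, x) tile coordinates;
// each thread produces one OutTileSizeH x OutTileSizeW output tile.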
unsigned itmp = thread_work_id;
unsigned n_thread_work_id = itmp / (KPerBlock * YPerBlock * XPerBlock);
itmp -= n_thread_work_id * (KPerBlock * YPerBlock * XPerBlock);
unsigned k_thread_work_id = itmp / (YPerBlock * XPerBlock);
itmp -= k_thread_work_id * (YPerBlock * XPerBlock);
unsigned y_thread_work_id = itmp / XPerBlock;
unsigned x_thread_work_id = itmp - y_thread_work_id * XPerBlock;
unsigned n_thread_work_begin = n_thread_work_id * 1;
unsigned k_thread_work_begin = k_thread_work_id * 1;
unsigned ho_thread_work_begin = y_thread_work_id * OutTileSizeH;
unsigned wo_thread_work_begin = x_thread_work_id * OutTileSizeW;
unsigned hi_thread_work_begin = ho_thread_work_begin; // minus padding
unsigned wi_thread_work_begin = wo_thread_work_begin; // minus padding
TFloat p_in_thread[1 * CPerBlock * InTileSizeH * InTileSizeW];
TFloat p_wei_thread[1 * CPerBlock * S * R];
TFloat p_out_thread[1 * 1 * OutTileSizeH * OutTileSizeW];
auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; };
// copy input tensor into register
threadwise_4d_tensor_op<TFloat,
decltype(in_thread_src_desc),
decltype(in_thread_dst_desc),
decltype(f_copy)>(
in_thread_src_desc,
p_in + in_desc.Get1dIndex(
n_thread_work_begin, 0, hi_thread_work_begin, wi_thread_work_begin),
in_thread_dst_desc,
p_in_thread,
f_copy);
// copy weight tensor into register
threadwise_4d_tensor_op<TFloat,
decltype(wei_thread_src_desc),
decltype(wei_thread_dst_desc),
decltype(f_copy)>(
wei_thread_src_desc,
p_wei + wei_desc.Get1dIndex(k_thread_work_begin, 0, 0, 0),
wei_thread_dst_desc,
p_wei_thread,
f_copy);
// copy output tensor into register
threadwise_4d_tensor_op<TFloat,
decltype(out_thread_src_desc),
decltype(out_thread_dst_desc),
decltype(f_copy)>(out_thread_src_desc,
p_out + out_desc.Get1dIndex(n_thread_work_begin,
k_thread_work_begin,
ho_thread_work_begin,
wo_thread_work_begin),
out_thread_dst_desc,
p_out_thread,
f_copy);
// threadwise convolution
threadwise_direct_convolution<TFloat,
decltype(in_thread_dst_desc),
decltype(wei_thread_dst_desc),
decltype(out_thread_dst_desc)>(in_thread_dst_desc,
p_in_thread,
wei_thread_dst_desc,
p_wei_thread,
out_thread_dst_desc,
p_out_thread);
// accumulate output tensor into LDS
threadwise_4d_tensor_op<TFloat,
decltype(out_thread_dst_desc),
decltype(out_thread_src_desc),
decltype(f_copy)>(out_thread_dst_desc,
p_out_thread,
out_thread_src_desc,
p_out + out_desc.Get1dIndex(n_thread_work_begin,
k_thread_work_begin,
ho_thread_work_begin,
wo_thread_work_begin),
f_copy);
}
}
template <class TFloat,
class InDesc,
class WeiDesc,
class OutDesc,
unsigned OutTileSizeH,
unsigned OutTileSizeW,
unsigned NPerBlock,
unsigned KPerBlock,
unsigned CPerBlock,
unsigned YPerBlock,
unsigned XPerBlock,
unsigned NBlockCopyLen0,
unsigned NBlockCopyLen1,
unsigned NBlockCopyLen2,
unsigned NBlockCopyLen3,
unsigned BlockSize,
unsigned GridSize>
__global__ void gridwise_convolution(InDesc,
TFloat* const __restrict__ p_in,
WeiDesc,
TFloat* const __restrict__ p_wei,
OutDesc,
TFloat* __restrict__ p_out)
{
constexpr auto I0 = Index<0>{};
constexpr auto I1 = Index<1>{};
constexpr auto I2 = Index<2>{};
constexpr auto I3 = Index<3>{};
constexpr auto in_desc = InDesc{};
constexpr auto wei_desc = WeiDesc{};
constexpr auto out_desc = OutDesc{};
constexpr unsigned S = wei_desc.GetLength(I2);
constexpr unsigned R = wei_desc.GetLength(I3);
constexpr unsigned HoPerBlock = OutTileSizeH * YPerBlock;
constexpr unsigned WoPerBlock = OutTileSizeW * XPerBlock;
constexpr unsigned HiPerBlock = YPerBlock * OutTileSizeH + S - 1;
constexpr unsigned WiPerBlock = XPerBlock * OutTileSizeW + R - 1;
constexpr unsigned NBlockWork = (out_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
constexpr unsigned KBlockWork = (out_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
constexpr unsigned YBlockWork = (out_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
constexpr unsigned XBlockWork = (out_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;
constexpr auto in_block_glb_desc = make_ConstantTensorDescriptor(
Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{}, in_desc.GetStrides());
constexpr auto wei_block_glb_desc = make_ConstantTensorDescriptor(
Sequence<KPerBlock, CPerBlock, S, R>{}, wei_desc.GetStrides());
constexpr auto out_block_glb_desc = make_ConstantTensorDescriptor(
Sequence<NPerBlock, KPerBlock, HoPerBlock, WoPerBlock>{}, out_desc.GetStrides());
constexpr auto in_block_lds_desc =
make_ConstantTensorDescriptor(in_block_glb_desc.GetLengths());
constexpr auto wei_block_lds_desc =
make_ConstantTensorDescriptor(wei_block_glb_desc.GetLengths());
constexpr auto out_block_lds_desc =
make_ConstantTensorDescriptor(out_block_glb_desc.GetLengths());
constexpr unsigned in_block_size = in_block_lds_desc.GetElementSize();
constexpr unsigned wei_block_size = wei_block_lds_desc.GetElementSize();
constexpr unsigned out_block_size = out_block_lds_desc.GetElementSize();
__shared__ TFloat p_in_block[in_block_size];
__shared__ TFloat p_wei_block[wei_block_size];
__shared__ TFloat p_out_block[out_block_size];
const unsigned block_id = blockIdx.x;
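// Decompose the flat block id into (n, k, y, x) block-work coordinates; each block
// covers an NPerBlock x KPerBlock x HoPerBlock x WoPerBlock output sub-tensor.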
unsigned itmp = block_id;
unsigned n_block_work_id = itmp / (KBlockWork * YBlockWork * XBlockWork);
itmp -= n_block_work_id * (KBlockWork * YBlockWork * XBlockWork);
unsigned k_block_work_id = itmp / (YBlockWork * XBlockWork);
itmp -= k_block_work_id * (YBlockWork * XBlockWork);
unsigned y_block_work_id = itmp / XBlockWork;
unsigned x_block_work_id = itmp - y_block_work_id * XBlockWork;
unsigned n_block_work_begin = n_block_work_id * NPerBlock;
unsigned k_block_work_begin = k_block_work_id * KPerBlock;
unsigned y_block_work_begin = y_block_work_id * YPerBlock;
unsigned x_block_work_begin = x_block_work_id * XPerBlock;
unsigned ho_block_work_begin = y_block_work_begin * OutTileSizeH;
unsigned wo_block_work_begin = x_block_work_begin * OutTileSizeW;
unsigned hi_block_work_begin = ho_block_work_begin; // minus padding
unsigned wi_block_work_begin = wo_block_work_begin; // minus padding
#if 0
if(threadIdx.x == 0)
{
print_ConstantTensorDescriptor( in_desc, "gridwise_convolution: in_desc: ");
print_ConstantTensorDescriptor(wei_desc, "gridwise_convolution: wei_desc: ");
print_ConstantTensorDescriptor(out_desc, "gridwise_convolution: out_desc: ");
print_ConstantTensorDescriptor( in_block_glb_desc, "gridwise_convolution: in_block_glb_desc: ");
print_ConstantTensorDescriptor(wei_block_glb_desc, "gridwise_convolution: wei_block_glb_desc: ");
print_ConstantTensorDescriptor(out_block_glb_desc, "gridwise_convolution: out_block_glb_desc: ");
print_ConstantTensorDescriptor( in_block_lds_desc, "gridwise_convolution: in_block_lds_desc: ");
print_ConstantTensorDescriptor(wei_block_lds_desc, "gridwise_convolution: wei_block_lds_desc: ");
print_ConstantTensorDescriptor(out_block_lds_desc, "gridwise_convolution: out_block_lds_desc: ");
printf("NBlockWork %u, KBlockWork %u, YBlockWork %u, XBlockWork %u \t"
"block_id %u, n_block_work_id %u, k_block_work_id %u, y_block_work_id %u, "
"x_block_work_id %u\n",
NBlockWork,
KBlockWork,
YBlockWork,
XBlockWork,
block_id,
n_block_work_id,
k_block_work_id,
y_block_work_id,
x_block_work_id);
}
#endif
for(unsigned c_block_work_begin = 0; c_block_work_begin < in_desc.GetLength(I1);
c_block_work_begin += CPerBlock)
{
auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; };
#if 1
// copy input tensor to LDS
blockwise_4d_tensor_op<TFloat,
decltype(in_block_glb_desc),
decltype(in_block_lds_desc),
NBlockCopyLen0,
NBlockCopyLen1,
NBlockCopyLen2,
NBlockCopyLen3,
decltype(f_copy),
BlockSize>(in_block_glb_desc,
p_in + in_block_glb_desc.Get1dIndex(n_block_work_begin,
c_block_work_begin,
hi_block_work_begin,
wi_block_work_begin),
in_block_lds_desc,
p_in_block,
f_copy);
#endif
#if 1
// copy weight tensor to LDS
blockwise_4d_tensor_op<TFloat,
decltype(wei_block_glb_desc),
decltype(wei_block_lds_desc),
NBlockCopyLen0,
NBlockCopyLen1,
NBlockCopyLen2,
NBlockCopyLen3,
decltype(f_copy),
BlockSize>(
wei_block_glb_desc,
p_wei + wei_block_glb_desc.Get1dIndex(k_block_work_begin, c_block_work_begin, 0, 0),
wei_block_lds_desc,
p_wei_block,
f_copy);
#endif
// copy output tensor to LDS
blockwise_4d_tensor_op<TFloat,
decltype(out_block_glb_desc),
decltype(out_block_lds_desc),
NBlockCopyLen0,
NBlockCopyLen1,
NBlockCopyLen2,
NBlockCopyLen3,
decltype(f_copy),
BlockSize>(out_block_glb_desc,
p_out +
out_block_glb_desc.Get1dIndex(n_block_work_begin,
k_block_work_begin,
ho_block_work_begin,
wo_block_work_begin),
out_block_lds_desc,
p_out_block,
f_copy);
#if 1
__syncthreads();
#endif
// blockwise convolution
blockwise_convolution<TFloat,
decltype(in_block_lds_desc),
decltype(wei_block_lds_desc),
decltype(out_block_lds_desc),
OutTileSizeH,
OutTileSizeW,
BlockSize>(in_block_lds_desc,
p_in_block,
wei_block_lds_desc,
p_wei_block,
out_block_lds_desc,
p_out_block);
#if 1
__syncthreads();
#endif
// accum output tensor from LDS to device mem
blockwise_4d_tensor_op<TFloat,
decltype(out_block_lds_desc),
decltype(out_block_glb_desc),
NBlockCopyLen0,
NBlockCopyLen1,
NBlockCopyLen2,
NBlockCopyLen3,
decltype(f_copy),
BlockSize>(out_block_lds_desc,
p_out_block,
out_block_glb_desc,
p_out +
out_block_glb_desc.Get1dIndex(n_block_work_begin,
k_block_work_begin,
ho_block_work_begin,
wo_block_work_begin),
f_copy);
}
}
@@ -279,34 +279,26 @@ __global__ void gridwise_convolution(InDesc,
}
#endif
+auto f_set0 = [](TFloat& v) { v = TFloat(0); };
auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; };
auto f_accu = [](const TFloat& src, TFloat& dst) { dst += src; };
-// copy output tensor to LDS
-blockwise_4d_tensor_op<TFloat,
-decltype(out_block_glb_desc),
+// set output tensor in LDS to 0
+blockwise_4d_tensor_op_unary<TFloat,
decltype(out_block_lds_desc),
NBlockCopyLen0,
NBlockCopyLen1,
NBlockCopyLen2,
NBlockCopyLen3,
-decltype(f_copy),
-BlockSize>(out_block_glb_desc,
-p_out + out_block_glb_desc.Get1dIndex(n_block_work_begin,
-k_block_work_begin,
-ho_block_work_begin,
-wo_block_work_begin),
-out_block_lds_desc,
-p_out_block,
-f_copy);
+decltype(f_set0),
+BlockSize>(out_block_lds_desc, p_out_block, f_set0);
for(unsigned c_block_work_begin = 0; c_block_work_begin < in_desc.GetLength(I1);
c_block_work_begin += CPerBlock)
{
-#if 1
// copy input tensor to LDS
-blockwise_4d_tensor_op<TFloat,
+blockwise_4d_tensor_op_binary<TFloat,
decltype(in_block_glb_desc),
decltype(in_block_lds_desc),
NBlockCopyLen0,
@@ -314,7 +306,8 @@ __global__ void gridwise_convolution(InDesc,
NBlockCopyLen2,
NBlockCopyLen3,
decltype(f_copy),
-BlockSize>(in_block_glb_desc,
+BlockSize>(
+in_block_glb_desc,
p_in + in_block_glb_desc.Get1dIndex(n_block_work_begin,
c_block_work_begin,
hi_block_work_begin,
@@ -322,11 +315,9 @@ __global__ void gridwise_convolution(InDesc,
in_block_lds_desc,
p_in_block,
f_copy);
-#endif
-#if 1
// copy weight tensor to LDS
-blockwise_4d_tensor_op<TFloat,
+blockwise_4d_tensor_op_binary<TFloat,
decltype(wei_block_glb_desc),
decltype(wei_block_lds_desc),
NBlockCopyLen0,
@@ -340,7 +331,6 @@ __global__ void gridwise_convolution(InDesc,
wei_block_lds_desc,
p_wei_block,
f_copy);
-#endif
#if 1
__syncthreads();
@@ -366,7 +356,7 @@ __global__ void gridwise_convolution(InDesc,
}
// copy output tensor from LDS to device mem
-blockwise_4d_tensor_op<TFloat,
+blockwise_4d_tensor_op_binary<TFloat,
decltype(out_block_lds_desc),
decltype(out_block_glb_desc),
NBlockCopyLen0,
@@ -374,12 +364,12 @@ __global__ void gridwise_convolution(InDesc,
NBlockCopyLen2,
NBlockCopyLen3,
decltype(f_copy),
-BlockSize>(out_block_lds_desc,
+BlockSize>(
+out_block_lds_desc,
p_out_block,
out_block_glb_desc,
-p_out + out_block_glb_desc.Get1dIndex(n_block_work_begin,
-k_block_work_begin,
-ho_block_work_begin,
-wo_block_work_begin),
+p_out +
+out_block_glb_desc.Get1dIndex(
+n_block_work_begin, k_block_work_begin, ho_block_work_begin, wo_block_work_begin),
f_copy);
}