"...git@developer.sourcefind.cn:OpenDAS/mmdetection3d.git" did not exist on "96cab60d84724d710f86c7bcc83bd8f182d0f99f"
Commit ac1f62be authored by Chao Liu
Browse files

refactor

parent 3dbd4725
......@@ -3,7 +3,8 @@
#include "threadwise_tensor_op.cuh"
#include "threadwise_direct_convolution.cuh"
template <class TFloat,
template <unsigned BlockSize,
class TFloat,
class InBlockDesc,
class WeiBlockDesc,
class OutBlockDesc,
......@@ -11,8 +12,7 @@ template <class TFloat,
unsigned OutTileSizeW,
unsigned NPerThread,
unsigned KPerThread,
unsigned CPerThread,
unsigned BlockSize>
unsigned CPerThread>
__device__ void blockwise_direct_convolution(InBlockDesc,
TFloat* const __restrict__ p_in_block,
WeiBlockDesc,
......
#pragma once
#include "constant_tensor_descriptor.cuh"
template <class TFloat, class DstDesc, class F, unsigned BlockSize>
template <unsigned BlockSize, class TFloat, class DstDesc, class F>
__device__ void
blockwise_4d_tensor_pointwise_operation_unary(DstDesc, TFloat* __restrict__ p_dst, F f)
{
......@@ -78,13 +78,13 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, TFloat* __restrict__ p_ds
// TODO: in order to optimize mem access for different mem type,
// need to write specialized version
template <class TFloat,
template <unsigned BlockSize,
class TFloat,
class SrcDesc,
class DstDesc,
class RefDesc,
class Reorder,
class F,
unsigned BlockSize>
class F>
__device__ void
blockwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
TFloat* const __restrict__ p_src,
......@@ -170,21 +170,20 @@ blockwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
}
}
// NOTE(review): this span is raw diff residue — the first template header and
// the first call below are the pre-refactor version; the second of each is the
// post-refactor version (BlockSize moved to the front of the template
// parameter list so the remaining arguments can be deduced at the call site).
template <class TFloat, class DstDesc, unsigned BlockSize>
template <unsigned BlockSize, class TFloat, class DstDesc>
// Zero-fills the tensor described by DstDesc at p_dst, by delegating to the
// blockwise unary pointwise helper with a set-to-zero functor (work
// distribution across the thread block happens inside that helper).
__device__ void blockwise_4d_tensor_set_zero(DstDesc, TFloat* __restrict__ p_dst)
{
auto f_set_zero = [](TFloat& v) { v = TFloat(0); };
// old call: every template argument spelled out explicitly
blockwise_4d_tensor_pointwise_operation_unary<TFloat, DstDesc, decltype(f_set_zero), BlockSize>(
DstDesc{}, p_dst, f_set_zero);
// new call: only BlockSize is explicit; TFloat, DstDesc and F are deduced
blockwise_4d_tensor_pointwise_operation_unary<BlockSize>(DstDesc{}, p_dst, f_set_zero);
}
// NOTE(review): diff residue — the two template headers below are the pre-
// and post-refactor versions of the same declaration (the refactor moves
// BlockSize to the front so the remaining template arguments can be deduced).
template <class TFloat,
template <unsigned BlockSize,
class TFloat,
class SrcDesc,
class DstDesc,
class RefDesc,
class Reorder,
unsigned BlockSize>
class Reorder>
// Blockwise 4d tensor copy with an index reordering (presumably the
// permutation carried by Reorder relative to RefDesc — confirm in the binary
// pointwise helper); implemented as that helper applied with an assignment
// functor. Part of the signature is cut off by the hunk marker below.
__device__ void blockwise_4d_tensor_copy_reorder(SrcDesc,
TFloat* const __restrict__ p_src,
DstDesc,
......@@ -194,34+193,16 @@ __device__ void blockwise_4d_tensor_copy_reorder(SrcDesc,
{
auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; };
// old call: all template arguments explicit
blockwise_4d_tensor_pointwise_operation_binary_reorder<TFloat,
SrcDesc,
DstDesc,
RefDesc,
Reorder,
decltype(f_copy),
BlockSize>(
// new call: only BlockSize explicit, the rest deduced from the arguments
blockwise_4d_tensor_pointwise_operation_binary_reorder<BlockSize>(
SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, Reorder{}, f_copy);
}
// NOTE(review): diff residue — first template header is the pre-refactor
// version, second is the post-refactor one (BlockSize first, rest deduced).
template <class TFloat, class SrcDesc, class DstDesc, class RefDesc, unsigned BlockSize>
template <unsigned BlockSize, class TFloat, class SrcDesc, class DstDesc, class RefDesc>
// Plain (identity-ordered) blockwise 4d tensor copy: forwards to the reorder
// variant with the identity permutation Sequence<0, 1, 2, 3>.
__device__ void blockwise_4d_tensor_copy(
SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, RefDesc)
{
constexpr auto reorder = Sequence<0, 1, 2, 3>{};
// old call: every template argument explicit
blockwise_4d_tensor_copy_reorder<TFloat,
SrcDesc,
DstDesc,
RefDesc,
decltype(reorder),
BlockSize>(
// new call: only BlockSize explicit
blockwise_4d_tensor_copy_reorder<BlockSize>(
SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, reorder);
}
// Stub for a blockwise im2col (image-to-column) expansion of a 4d tensor.
// Intentionally empty — kept so callers can compile against the intended
// interface before the implementation lands.
//
// Fix: `__restrict__` qualifies the pointer, so it must appear after the `*`
// (i.e. `TFloat* __restrict__`). The original `__restrict__ TFloat*` /
// `const __restrict__ TFloat*` forms attempt to restrict-qualify the
// non-pointer element type TFloat and are ill-formed once the template is
// instantiated.
template <class TFloat, class ImDesc, class WDesc, class ColDesc, unsigned BlockSize>
__device__ void blockwise_4d_tensor_im2col(
    ImDesc, const TFloat* __restrict__ p_im, WDesc, ColDesc, TFloat* __restrict__ p_col)
{
    // do nothing (placeholder)
}
......@@ -24,12 +24,55 @@ struct Sequence
}
// NOTE(review): diff residue — the first declaration line is the pre-rename
// version (GetNumber); the second (GetConstant) is the post-rename version.
// Lifts the value Get(I) yields into a compile-time Number<> type
// (presumably the sequence element at index I — confirm against Get).
template <unsigned I>
__host__ __device__ constexpr auto GetNumber(Number<I>) const
__host__ __device__ constexpr auto GetConstant(Number<I>) const
{
constexpr unsigned N = Get(I);
return Number<N>{};
}
// 2-index overload: builds a Sequence from the values Get() yields at
// compile-time indices I0 and I1, in that order.
template <unsigned I0, unsigned I1>
__host__ __device__ constexpr auto Reorder(Number<I0>, Number<I1>) const
{
    constexpr unsigned r0 = Get(Number<I0>{});
    constexpr unsigned r1 = Get(Number<I1>{});
    return Sequence<r0, r1>{};
}
// 3-index overload: builds a Sequence from the values Get() yields at
// compile-time indices I0, I1 and I2, in that order.
template <unsigned I0, unsigned I1, unsigned I2>
__host__ __device__ constexpr auto Reorder(Number<I0>, Number<I1>, Number<I2>) const
{
    constexpr unsigned r0 = Get(Number<I0>{});
    constexpr unsigned r1 = Get(Number<I1>{});
    constexpr unsigned r2 = Get(Number<I2>{});
    return Sequence<r0, r1, r2>{};
}
// 4-index overload: builds a Sequence from the values Get() yields at
// compile-time indices I0..I3, in that order.
template <unsigned I0, unsigned I1, unsigned I2, unsigned I3>
__host__ __device__ constexpr auto Reorder(Number<I0>, Number<I1>, Number<I2>, Number<I3>) const
{
    constexpr unsigned r0 = Get(Number<I0>{});
    constexpr unsigned r1 = Get(Number<I1>{});
    constexpr unsigned r2 = Get(Number<I2>{});
    constexpr unsigned r3 = Get(Number<I3>{});
    return Sequence<r0, r1, r2, r3>{};
}
// 5-index overload: builds a Sequence from the values Get() yields at
// compile-time indices I0..I4, in that order.
template <unsigned I0, unsigned I1, unsigned I2, unsigned I3, unsigned I4>
__host__ __device__ constexpr auto
Reorder(Number<I0>, Number<I1>, Number<I2>, Number<I3>, Number<I4>) const
{
    constexpr unsigned r0 = Get(Number<I0>{});
    constexpr unsigned r1 = Get(Number<I1>{});
    constexpr unsigned r2 = Get(Number<I2>{});
    constexpr unsigned r3 = Get(Number<I3>{});
    constexpr unsigned r4 = Get(Number<I4>{});
    return Sequence<r0, r1, r2, r3, r4>{};
}
};
template <class Lengths, class Strides>
......@@ -99,6 +142,15 @@ struct ConstantTensorDescriptor
static_assert(nDim == 4, "nDim is not 4");
return n * GetStride(I0) + c * GetStride(I1) + h * GetStride(I2) + w * GetStride(I3);
}
// Returns a new descriptor whose dimensions are a permutation of this one's:
// the compile-time indices select positions, and lengths and strides are
// permuted together via the Sequence::Reorder helpers.
template <class... Is>
__host__ __device__ constexpr auto Reorder(Is... is) const
{
    constexpr auto reordered_lengths = Lengths{}.Reorder(is...);
    constexpr auto reordered_strides = Strides{}.Reorder(is...);
    return ConstantTensorDescriptor<decltype(reordered_lengths), decltype(reordered_strides)>{};
}
};
// this is ugly, only for 4d
......@@ -121,28 +173,6 @@ __host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Stride
return ConstantTensorDescriptor<Lengths, Strides>{};
}
// this is ugly, only for 4d
// Builds a 4d descriptor whose k-th length is Desc's length at the position
// Reorder maps k to. Note Desc's strides are NOT propagated: the result is
// constructed from lengths alone via the single-argument
// make_ConstantTensorDescriptor overload (declared elsewhere in this header).
template <class Desc, class Reorder>
__host__ __device__ constexpr auto get_reordered_4d_tensor_descriptor(Desc, Reorder)
{
    // Map each output dimension 0..3 through the reorder sequence.
    constexpr auto idx0 = Reorder{}.GetNumber(Number<0>{});
    constexpr auto idx1 = Reorder{}.GetNumber(Number<1>{});
    constexpr auto idx2 = Reorder{}.GetNumber(Number<2>{});
    constexpr auto idx3 = Reorder{}.GetNumber(Number<3>{});

    // Gather the source lengths in the permuted order.
    constexpr unsigned len0 = Desc{}.GetLength(idx0);
    constexpr unsigned len1 = Desc{}.GetLength(idx1);
    constexpr unsigned len2 = Desc{}.GetLength(idx2);
    constexpr unsigned len3 = Desc{}.GetLength(idx3);

    return make_ConstantTensorDescriptor(Sequence<len0, len1, len2, len3>{});
}
// this is ugly, only for 4d
template <class InDesc, class WeiDesc>
__host__ __device__ constexpr auto get_convolution_output_4d_tensor_descriptor(InDesc, WeiDesc)
......
......@@ -122,18 +122,13 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
#endif
// set output tensor in LDS to 0
blockwise_4d_tensor_set_zero<TFloat, decltype(out_block_desc), BlockSize>(out_block_desc,
p_out_block);
blockwise_4d_tensor_set_zero<BlockSize>(out_block_desc, p_out_block);
for(unsigned c_block_work_begin = 0; c_block_work_begin < in_global_desc.GetLength(I1);
c_block_work_begin += CPerBlock)
{
// copy input tensor to LDS
blockwise_4d_tensor_copy<TFloat,
decltype(in_block_global_desc),
decltype(in_block_desc),
decltype(in_block_desc),
BlockSize>(in_block_global_desc,
blockwise_4d_tensor_copy<BlockSize>(in_block_global_desc,
p_in_global +
in_global_desc.Get1dIndex(n_block_work_begin,
c_block_work_begin,
......@@ -144,11 +139,7 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
in_block_desc);
// copy weight tensor to LDS
blockwise_4d_tensor_copy<TFloat,
decltype(wei_block_global_desc),
decltype(wei_block_desc),
decltype(wei_block_desc),
BlockSize>(
blockwise_4d_tensor_copy<BlockSize>(
wei_block_global_desc,
p_wei_global + wei_global_desc.Get1dIndex(k_block_work_begin, c_block_work_begin, 0, 0),
wei_block_desc,
......@@ -158,7 +149,8 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
__syncthreads();
// blockwise convolution
blockwise_direct_convolution<TFloat,
blockwise_direct_convolution<BlockSize,
TFloat,
decltype(in_block_desc),
decltype(wei_block_desc),
decltype(out_block_desc),
......@@ -166,19 +158,14 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
OutTileSizeW,
NPerThread,
KPerThread,
CPerThread,
BlockSize>(
CPerThread>(
in_block_desc, p_in_block, wei_block_desc, p_wei_block, out_block_desc, p_out_block);
__syncthreads();
}
// copy output tensor from LDS to device mem
blockwise_4d_tensor_copy<TFloat,
decltype(out_block_desc),
decltype(out_block_global_desc),
decltype(out_block_desc),
BlockSize>(
blockwise_4d_tensor_copy<BlockSize>(
out_block_desc,
p_out_block,
out_block_global_desc,
......@@ -186,4 +173,4 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
out_global_desc.Get1dIndex(
n_block_work_begin, k_block_work_begin, ho_block_work_begin, wo_block_work_begin),
out_block_desc);
}
\ No newline at end of file
}
......@@ -151,11 +151,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
c_block_data_begin += CPerBlock, __syncthreads())
{
// copy input tensor to LDS
blockwise_4d_tensor_copy<TFloat,
decltype(in_global_desc),
decltype(in_block_desc),
decltype(in_block_desc),
BlockSize>(in_global_desc,
blockwise_4d_tensor_copy<BlockSize>(in_global_desc,
p_in_global +
in_global_desc.Get1dIndex(n_block_data_begin,
c_block_data_begin,
......@@ -166,11 +162,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
in_block_desc);
// copy weight tensor to LDS
blockwise_4d_tensor_copy<TFloat,
decltype(wei_global_desc),
decltype(wei_block_desc),
decltype(wei_block_desc),
BlockSize>(
blockwise_4d_tensor_copy<BlockSize>(
wei_global_desc,
p_wei_global + wei_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
wei_block_desc,
......@@ -182,7 +174,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread)
{
// threadwise convolution
#if 0
#if 1
threadwise_direct_convolution_2(
in_thread_block_desc,
p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment