Commit 0b8e67ef authored by Chao Liu

refactor

parent ac1f62be
@@ -4,7 +4,7 @@
 #include "threadwise_direct_convolution.cuh"

 template <unsigned BlockSize,
-          class TFloat,
+          class Float,
           class InBlockDesc,
           class WeiBlockDesc,
           class OutBlockDesc,
@@ -14,11 +14,11 @@ template <unsigned BlockSize,
           unsigned KPerThread,
           unsigned CPerThread>
 __device__ void blockwise_direct_convolution(InBlockDesc,
-                                             TFloat* const __restrict__ p_in_block,
+                                             Float* const __restrict__ p_in_block,
                                              WeiBlockDesc,
-                                             TFloat* const __restrict__ p_wei_block,
+                                             Float* const __restrict__ p_wei_block,
                                              OutBlockDesc,
-                                             TFloat* __restrict__ p_out_block)
+                                             Float* __restrict__ p_out_block)
 {
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
@@ -92,7 +92,7 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
     unsigned hi_thread_data_begin = ho_thread_data_begin; // minus padding
     unsigned wi_thread_data_begin = wo_thread_data_begin; // minus padding

-    TFloat p_out_thread[out_thread_desc.GetElementSpace()];
+    Float p_out_thread[out_thread_desc.GetElementSpace()];

     threadwise_4d_tensor_copy(out_block_desc,
                               p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin,
......
 #pragma once
 #include "constant_tensor_descriptor.cuh"

-template <unsigned BlockSize, class TFloat, class DstDesc, class F>
+template <unsigned BlockSize, class Float, class DstDesc, class F>
 __device__ void
-blockwise_4d_tensor_pointwise_operation_unary(DstDesc, TFloat* __restrict__ p_dst, F f)
+blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst, F f)
 {
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
@@ -79,7 +79,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, TFloat* __restrict__ p_ds
 // TODO: in order to optimize mem access for different mem type,
 // need to write specialized version
 template <unsigned BlockSize,
-          class TFloat,
+          class Float,
           class SrcDesc,
           class DstDesc,
           class RefDesc,
@@ -87,9 +87,9 @@ template <unsigned BlockSize,
           class F>
 __device__ void
 blockwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
-                                                       TFloat* const __restrict__ p_src,
+                                                       Float* const __restrict__ p_src,
                                                        DstDesc,
-                                                       TFloat* __restrict__ p_dst,
+                                                       Float* __restrict__ p_dst,
                                                        RefDesc,
                                                        Reorder,
                                                        F f)
@@ -170,36 +170,32 @@ blockwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
     }
 }

-template <unsigned BlockSize, class TFloat, class DstDesc>
-__device__ void blockwise_4d_tensor_set_zero(DstDesc, TFloat* __restrict__ p_dst)
+template <unsigned BlockSize, class Float, class DstDesc>
+__device__ void blockwise_4d_tensor_set_zero(DstDesc, Float* __restrict__ p_dst)
 {
-    auto f_set_zero = [](TFloat& v) { v = TFloat(0); };
+    auto f_set_zero = [](Float& v) { v = Float(0); };

     blockwise_4d_tensor_pointwise_operation_unary<BlockSize>(DstDesc{}, p_dst, f_set_zero);
 }

 template <unsigned BlockSize,
-          class TFloat,
+          class Float,
           class SrcDesc,
           class DstDesc,
           class RefDesc,
           class Reorder>
-__device__ void blockwise_4d_tensor_copy_reorder(SrcDesc,
-                                                 TFloat* const __restrict__ p_src,
-                                                 DstDesc,
-                                                 TFloat* __restrict__ p_dst,
-                                                 RefDesc,
-                                                 Reorder)
+__device__ void blockwise_4d_tensor_copy_reorder(
+    SrcDesc, Float* const __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, RefDesc, Reorder)
 {
-    auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; };
+    auto f_copy = [](const Float& src, Float& dst) { dst = src; };

     blockwise_4d_tensor_pointwise_operation_binary_reorder<BlockSize>(
         SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, Reorder{}, f_copy);
 }

-template <unsigned BlockSize, class TFloat, class SrcDesc, class DstDesc, class RefDesc>
+template <unsigned BlockSize, class Float, class SrcDesc, class DstDesc, class RefDesc>
 __device__ void blockwise_4d_tensor_copy(
-    SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, RefDesc)
+    SrcDesc, Float* const __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, RefDesc)
 {
     constexpr auto reorder = Sequence<0, 1, 2, 3>{};
......
@@ -3,7 +3,7 @@
 #include "blockwise_tensor_op.cuh"
 #include "blockwise_direct_convolution.cuh"

-template <class TFloat,
+template <class Float,
           class InGlobalDesc,
           class WeiGlobalDesc,
           class OutGlobalDesc,
@@ -20,11 +20,11 @@ template <class TFloat,
           unsigned BlockSize,
           unsigned GridSize>
 __global__ void gridwise_direct_convolution_1(InGlobalDesc,
-                                              TFloat* const __restrict__ p_in_global,
+                                              Float* const __restrict__ p_in_global,
                                               WeiGlobalDesc,
-                                              TFloat* const __restrict__ p_wei_global,
+                                              Float* const __restrict__ p_wei_global,
                                               OutGlobalDesc,
-                                              TFloat* __restrict__ p_out_global)
+                                              Float* __restrict__ p_out_global)
 {
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
@@ -68,9 +68,9 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
     constexpr unsigned wei_block_size = wei_block_desc.GetElementSpace();
     constexpr unsigned out_block_size = out_block_desc.GetElementSpace();

-    __shared__ TFloat p_in_block[in_block_size];
-    __shared__ TFloat p_wei_block[wei_block_size];
-    __shared__ TFloat p_out_block[out_block_size];
+    __shared__ Float p_in_block[in_block_size];
+    __shared__ Float p_wei_block[wei_block_size];
+    __shared__ Float p_out_block[out_block_size];

     const unsigned block_id = blockIdx.x;
@@ -150,7 +150,7 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
     // blockwise convolution
     blockwise_direct_convolution<BlockSize,
-                                 TFloat,
+                                 Float,
                                  decltype(in_block_desc),
                                  decltype(wei_block_desc),
                                  decltype(out_block_desc),
......
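The gridwise kernels in this commit all follow the same staging pattern: each block copies an input/weight tile from global memory into __shared__ LDS buffers sized by GetElementSpace(), runs a blockwise computation on the tile, and writes the result back out. The sketch below shows only that staging skeleton under simplified assumptions (a flat 1-D tile, a placeholder elementwise compute step, the hypothetical name gridwise_staging_sketch); it is not the kernel from this commit.

// Standalone sketch of the staging skeleton, not this commit's kernel.
// Assumes blockDim.x == BlockSize and gridDim.x * TileSize input elements.
template <class Float, unsigned BlockSize, unsigned TileSize>
__global__ void gridwise_staging_sketch(const Float* __restrict__ p_in_global,
                                        Float* __restrict__ p_out_global)
{
    __shared__ Float p_in_block[TileSize];
    __shared__ Float p_out_block[TileSize];

    const unsigned block_offset = blockIdx.x * TileSize;

    // cooperative copy into LDS: each thread strides over the tile
    for(unsigned i = threadIdx.x; i < TileSize; i += BlockSize)
        p_in_block[i] = p_in_global[block_offset + i];

    // in the real kernels the blockwise op reads other threads' data,
    // hence the barrier after staging
    __syncthreads();

    // stand-in for the blockwise computation on the staged tile
    for(unsigned i = threadIdx.x; i < TileSize; i += BlockSize)
        p_out_block[i] = p_in_block[i] * p_in_block[i];

    // write the computed tile back to global memory
    for(unsigned i = threadIdx.x; i < TileSize; i += BlockSize)
        p_out_global[block_offset + i] = p_out_block[i];
}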
@@ -5,7 +5,7 @@
 #include "threadwise_tensor_op.cuh"
 #include "threadwise_direct_convolution.cuh"

-template <class TFloat,
+template <class Float,
           class InGlobalDesc,
           class WeiGlobalDesc,
           class OutGlobalDesc,
@@ -22,11 +22,11 @@ template <class TFloat,
           unsigned BlockSize,
           unsigned GridSize>
 __global__ void gridwise_direct_convolution_2(InGlobalDesc,
-                                              TFloat* const __restrict__ p_in_global,
+                                              Float* const __restrict__ p_in_global,
                                               WeiGlobalDesc,
-                                              TFloat* const __restrict__ p_wei_global,
+                                              Float* const __restrict__ p_wei_global,
                                               OutGlobalDesc,
-                                              TFloat* __restrict__ p_out_global)
+                                              Float* __restrict__ p_out_global)
 {
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
@@ -56,8 +56,8 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
     constexpr unsigned in_block_size = in_block_desc.GetElementSpace();
     constexpr unsigned wei_block_size = wei_block_desc.GetElementSpace();

-    __shared__ TFloat p_in_block[in_block_size];
-    __shared__ TFloat p_wei_block[wei_block_size];
+    __shared__ Float p_in_block[in_block_size];
+    __shared__ Float p_wei_block[wei_block_size];

     // threadwise tensors
     constexpr unsigned InTileSizeH = OutTileSizeH + S - 1;
@@ -73,7 +73,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
         get_convolution_output_4d_tensor_descriptor(in_thread_block_desc, wei_thread_block_desc);

     // register
-    TFloat p_out_thread[out_thread_desc.GetElementSpace()];
+    Float p_out_thread[out_thread_desc.GetElementSpace()];

     // divide block work
     constexpr unsigned NBlockWork = (out_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
......
@@ -3,7 +3,7 @@
 #include "blockwise_winograd_transform.cuh"
 #include "threadwise_winograd_transform.cuh"

-template <class TFloat,
+template <class Float,
           class InGlobalDesc,
           class WeiGlobalDesc,
           class OutGlobalDesc,
@@ -20,11 +20,11 @@ template <class TFloat,
           unsigned BlockSize,
           unsigned GridSize>
 __global__ void gridwise_winograd_convolution(InGlobalDesc,
-                                              TFloat* const __restrict__ p_in_global,
+                                              Float* const __restrict__ p_in_global,
                                               WeiGlobalDesc,
-                                              TFloat* const __restrict__ p_wei_global,
+                                              Float* const __restrict__ p_wei_global,
                                               OutGlobalDesc,
-                                              TFloat* __restrict__ p_out_global)
+                                              Float* __restrict__ p_out_global)
 {
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
@@ -102,8 +102,8 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc,
     constexpr auto wei_transform_block_desc =
         make_ConstantTensorDescriptor(Sequence<KPerBlock, CPerBlock, InTileSizeH, InTileSizeW>{});

-    __shared__ TFloat p_in_transform_block[in_transform_block_desc.GetElementSpace()];
-    __shared__ TFloat p_wei_transform_block[wei_transform_block_desc.GetElementSpace()];
+    __shared__ Float p_in_transform_block[in_transform_block_desc.GetElementSpace()];
+    __shared__ Float p_wei_transform_block[wei_transform_block_desc.GetElementSpace()];

     // thread data
     constexpr auto in_transform_thread_block_desc =
@@ -123,8 +123,8 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc,
     constexpr auto out_thread_global_desc =
         make_ConstantTensorDescriptor(out_thread_desc.GetLengths(), out_global_desc.GetStrides());

-    TFloat p_out_transform_thread[out_transform_thread_desc.GetElementSpace()];
-    TFloat p_out_thread[out_thread_desc.GetElementSpace()];
+    Float p_out_transform_thread[out_transform_thread_desc.GetElementSpace()];
+    Float p_out_thread[out_thread_desc.GetElementSpace()];

 #if 0
     if(blockIdx.x == 0 && threadIdx.x == 0)
@@ -146,7 +146,7 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc,
     {
 #if 0
         // blockwise transform input
-        blockwise_winograd_transform_input<TFloat,
+        blockwise_winograd_transform_input<Float,
                                            InTileSizeH,
                                            InTileSizeW,
                                            S,
@@ -166,7 +166,7 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc,
 #endif

         // blockwise transform weights
-        blockwise_winograd_transform_weight<TFloat,
+        blockwise_winograd_transform_weight<Float,
                                             InTileSizeH,
                                             InTileSizeW,
                                             S,
@@ -183,7 +183,7 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc,
         {
             // threadwise point multiplication
             threadwise_winograd_calculate_transformed_output<
-                TFloat,
+                Float,
                 decltype(in_transform_thread_block_desc),
                 decltype(wei_transform_thread_block_desc),
                 decltype(out_transform_thread_desc),
@@ -207,7 +207,7 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc,
         };

         // transform back
-        threadwise_winograd_reverse_transform_output<TFloat,
+        threadwise_winograd_reverse_transform_output<Float,
                                                      decltype(out_transform_thread_desc),
                                                      decltype(out_thread_desc),
                                                      InTileSizeH,
......
@@ -2,13 +2,13 @@
 #include "constant_tensor_descriptor.cuh"

 // optimized for scenario if p_in, p_wei, p_out are in register
-template <class TFloat, class InDesc, class WeiDesc, class OutDesc>
+template <class Float, class InDesc, class WeiDesc, class OutDesc>
 __device__ void threadwise_direct_convolution_1(InDesc,
-                                                TFloat* const __restrict__ p_in,
+                                                Float* const __restrict__ p_in,
                                                 WeiDesc,
-                                                TFloat* const __restrict__ p_wei,
+                                                Float* const __restrict__ p_wei,
                                                 OutDesc,
-                                                TFloat* __restrict__ p_out)
+                                                Float* __restrict__ p_out)
 {
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
@@ -81,13 +81,13 @@ __device__ void threadwise_direct_convolution_1(InDesc,

 // Optimized for scenario if p_in and p_wei are in LDS, p_out are in register
 // Copy in and wei into register before doing convolution
-template <class TFloat, class InDesc, class WeiDesc, class OutDesc>
+template <class Float, class InDesc, class WeiDesc, class OutDesc>
 __device__ void threadwise_direct_convolution_2(InDesc,
-                                                TFloat* const __restrict__ p_in,
+                                                Float* const __restrict__ p_in,
                                                 WeiDesc,
-                                                TFloat* const __restrict__ p_wei,
+                                                Float* const __restrict__ p_wei,
                                                 OutDesc,
-                                                TFloat* __restrict__ p_out)
+                                                Float* __restrict__ p_out)
 {
     constexpr auto in_desc = InDesc{};
     constexpr auto wei_desc = WeiDesc{};
@@ -97,8 +97,8 @@ __device__ void threadwise_direct_convolution_2(InDesc,
     constexpr auto wei_reg_desc = make_ConstantTensorDescriptor(wei_desc.GetLengths());

     // register
-    TFloat p_in_reg[in_reg_desc.GetElementSpace()];
-    TFloat p_wei_reg[wei_reg_desc.GetElementSpace()];
+    Float p_in_reg[in_reg_desc.GetElementSpace()];
+    Float p_wei_reg[wei_reg_desc.GetElementSpace()];

     // copy input tensor into register
     threadwise_4d_tensor_copy(in_desc, p_in, in_reg_desc, p_in_reg, in_reg_desc);
@@ -114,13 +114,13 @@ __device__ void threadwise_direct_convolution_2(InDesc,
 // optimized for scenario where p_in and p_wei are in LDS, p_out is in register
 // break down a non-1x1 convolution into a sequence of 1x1 convolutions,
 // load 1x1 weight into register, and do 1x1 convolution in register.
-template <class TFloat, class InDesc, class WeiDesc, class OutDesc>
+template <class Float, class InDesc, class WeiDesc, class OutDesc>
 __device__ void threadwise_direct_convolution_3(InDesc,
-                                                TFloat* const __restrict__ p_in,
+                                                Float* const __restrict__ p_in,
                                                 WeiDesc,
-                                                TFloat* const __restrict__ p_wei,
+                                                Float* const __restrict__ p_wei,
                                                 OutDesc,
-                                                TFloat* __restrict__ p_out)
+                                                Float* __restrict__ p_out)
 {
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
@@ -139,8 +139,8 @@ __device__ void threadwise_direct_convolution_3(InDesc,
     constexpr auto wei_reg_desc = make_ConstantTensorDescriptor(
         Sequence<wei_desc.GetLength(I0), wei_desc.GetLength(I1), 1, 1>{});

-    TFloat p_in_reg[in_reg_desc.GetElementSpace()];
-    TFloat p_wei_reg[wei_reg_desc.GetElementSpace()];
+    Float p_in_reg[in_reg_desc.GetElementSpace()];
+    Float p_wei_reg[wei_reg_desc.GetElementSpace()];

     constexpr unsigned in_w_new_read = 1;
......
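The comment above threadwise_direct_convolution_3 describes breaking a non-1x1 convolution into a sequence of 1x1 convolutions whose weight slices are staged in registers. A minimal standalone sketch of that decomposition is below; it uses raw compile-time extents instead of ConstantTensorDescriptor, assumes stride-1, unpadded KCSR weights and CHW input/output layouts, expects p_out to be zero-initialized by the caller, and the name thread_conv_as_1x1 is hypothetical rather than part of this commit.

// Standalone sketch of the decomposition described above: an S x R direct
// convolution done as S*R separate 1x1 convolutions, each with its weight
// slice staged in registers. Assumes stride 1, no padding, KCSR weight and
// CHW input/output layouts; p_out must be zero-initialized by the caller.
// The name thread_conv_as_1x1 is hypothetical, not part of this commit.
template <class Float, int C, int Hi, int Wi, int K, int S, int R>
__device__ void thread_conv_as_1x1(const Float* __restrict__ p_in,
                                   const Float* __restrict__ p_wei,
                                   Float* __restrict__ p_out)
{
    constexpr int Ho = Hi - S + 1;
    constexpr int Wo = Wi - R + 1;

    // each filter tap (s, r) is a 1x1 convolution on a shifted input window
    for(int s = 0; s < S; ++s)
    {
        for(int r = 0; r < R; ++r)
        {
            // load the 1x1 weight slice for this tap into registers
            Float wei_reg[K][C];
            for(int k = 0; k < K; ++k)
                for(int c = 0; c < C; ++c)
                    wei_reg[k][c] = p_wei[((k * C + c) * S + s) * R + r];

            // accumulate the 1x1 convolution into the output tile
            for(int k = 0; k < K; ++k)
                for(int ho = 0; ho < Ho; ++ho)
                    for(int wo = 0; wo < Wo; ++wo)
                        for(int c = 0; c < C; ++c)
                            p_out[(k * Ho + ho) * Wo + wo] +=
                                wei_reg[k][c] * p_in[(c * Hi + ho + s) * Wi + wo + r];
        }
    }
}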
 #pragma once
 #include "constant_tensor_descriptor.cuh"

-template <class TFloat, class Desc, class F>
-__device__ void threadwise_4d_tensor_pointwise_operation_unary(Desc, TFloat* __restrict__ p, F f)
+template <class Float, class Desc, class F>
+__device__ void threadwise_4d_tensor_pointwise_operation_unary(Desc, Float* __restrict__ p, F f)
 {
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
@@ -37,12 +37,12 @@ __device__ void threadwise_4d_tensor_pointwise_operation_unary(Desc, TFloat* __r

 // TODO: in order to optimize mem access for different mem type,
 // need to write specialized version
-template <class TFloat, class SrcDesc, class DstDesc, class RefDesc, class Reorder, class F>
+template <class Float, class SrcDesc, class DstDesc, class RefDesc, class Reorder, class F>
 __device__ void
 threadwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
-                                                        TFloat* const __restrict__ p_src,
+                                                        Float* const __restrict__ p_src,
                                                         DstDesc,
-                                                        TFloat* __restrict__ p_dst,
+                                                        Float* __restrict__ p_dst,
                                                         RefDesc,
                                                         Reorder,
                                                         F f)
@@ -83,26 +83,22 @@ threadwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
     }
 }

-template <class TFloat, class Desc>
-__device__ void threadwise_4d_tensor_set_zero(Desc, TFloat* __restrict__ p)
+template <class Float, class Desc>
+__device__ void threadwise_4d_tensor_set_zero(Desc, Float* __restrict__ p)
 {
-    auto f_set_zero = [](TFloat& v) { v = TFloat(0); };
+    auto f_set_zero = [](Float& v) { v = Float(0); };

-    threadwise_4d_tensor_pointwise_operation_unary<TFloat, Desc, decltype(f_set_zero)>(
+    threadwise_4d_tensor_pointwise_operation_unary<Float, Desc, decltype(f_set_zero)>(
         Desc{}, p, f_set_zero);
 }

-template <class TFloat, class SrcDesc, class DstDesc, class RefDesc, class Reorder>
-__device__ void threadwise_4d_tensor_copy_reorder(SrcDesc,
-                                                  TFloat* const __restrict__ p_src,
-                                                  DstDesc,
-                                                  TFloat* __restrict__ p_dst,
-                                                  RefDesc,
-                                                  Reorder)
+template <class Float, class SrcDesc, class DstDesc, class RefDesc, class Reorder>
+__device__ void threadwise_4d_tensor_copy_reorder(
+    SrcDesc, Float* const __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, RefDesc, Reorder)
 {
-    auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; };
+    auto f_copy = [](const Float& src, Float& dst) { dst = src; };

-    threadwise_4d_tensor_pointwise_operation_binary_reorder<TFloat,
+    threadwise_4d_tensor_pointwise_operation_binary_reorder<Float,
                                                             SrcDesc,
                                                             DstDesc,
                                                             RefDesc,
@@ -111,18 +107,18 @@ __device__ void threadwise_4d_tensor_copy_reorder(SrcDesc,
         SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, Reorder{}, f_copy);
 }

-template <class TFloat, class SrcDesc, class DstDesc, class RefDesc>
+template <class Float, class SrcDesc, class DstDesc, class RefDesc>
 __device__ void threadwise_4d_tensor_copy(
-    SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, RefDesc)
+    SrcDesc, Float* const __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, RefDesc)
 {
     auto reorder = Sequence<0, 1, 2, 3>{};

-    threadwise_4d_tensor_copy_reorder<TFloat, SrcDesc, DstDesc, RefDesc, decltype(reorder)>(
+    threadwise_4d_tensor_copy_reorder<Float, SrcDesc, DstDesc, RefDesc, decltype(reorder)>(
         SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, reorder);
 }

-template <class TFloat, class Desc, class IDim, class NShift>
-__device__ void threadwise_4d_tensor_shift_down(Desc, TFloat* __restrict__ p, IDim, NShift)
+template <class Float, class Desc, class IDim, class NShift>
+__device__ void threadwise_4d_tensor_shift_down(Desc, Float* __restrict__ p, IDim, NShift)
 {
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
......
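A recurring idiom in threadwise_tensor_op.cuh and blockwise_tensor_op.cuh is visible in this diff: one generic pointwise loop over a 4d descriptor, with set-zero and copy expressed as small lambdas passed into it. The self-contained sketch below restates that idiom with a hypothetical packed Desc4d in place of ConstantTensorDescriptor; it mirrors the structure of threadwise_4d_tensor_set_zero but is not the library's code.

// Hypothetical stand-in for ConstantTensorDescriptor: packed strides only.
template <unsigned L0, unsigned L1, unsigned L2, unsigned L3>
struct Desc4d
{
    __device__ static constexpr unsigned GetLength(unsigned i)
    {
        return i == 0 ? L0 : i == 1 ? L1 : i == 2 ? L2 : L3;
    }

    __device__ static constexpr unsigned Get1dIndex(unsigned i0, unsigned i1, unsigned i2, unsigned i3)
    {
        return ((i0 * L1 + i1) * L2 + i2) * L3 + i3;
    }
};

// Generic per-thread pointwise loop: visit every element once and apply f.
template <class Float, class Desc, class F>
__device__ void pointwise_unary_sketch(Desc, Float* __restrict__ p, F f)
{
    for(unsigned i0 = 0; i0 < Desc::GetLength(0); ++i0)
        for(unsigned i1 = 0; i1 < Desc::GetLength(1); ++i1)
            for(unsigned i2 = 0; i2 < Desc::GetLength(2); ++i2)
                for(unsigned i3 = 0; i3 < Desc::GetLength(3); ++i3)
                    f(p[Desc::Get1dIndex(i0, i1, i2, i3)]);
}

// Set-zero is just the generic loop plus a lambda, as in the real headers.
template <class Float, class Desc>
__device__ void set_zero_sketch(Desc, Float* __restrict__ p)
{
    auto f_set_zero = [](Float& v) { v = Float(0); };
    pointwise_unary_sketch(Desc{}, p, f_set_zero);
}

// Example instantiation: set_zero_sketch(Desc4d<2, 3, 4, 5>{}, p_data);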