Commit 0b8e67ef authored by Chao Liu's avatar Chao Liu
Browse files

refactor

parent ac1f62be
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
#include "threadwise_direct_convolution.cuh" #include "threadwise_direct_convolution.cuh"
template <unsigned BlockSize, template <unsigned BlockSize,
class TFloat, class Float,
class InBlockDesc, class InBlockDesc,
class WeiBlockDesc, class WeiBlockDesc,
class OutBlockDesc, class OutBlockDesc,
...@@ -14,11 +14,11 @@ template <unsigned BlockSize, ...@@ -14,11 +14,11 @@ template <unsigned BlockSize,
unsigned KPerThread, unsigned KPerThread,
unsigned CPerThread> unsigned CPerThread>
__device__ void blockwise_direct_convolution(InBlockDesc, __device__ void blockwise_direct_convolution(InBlockDesc,
TFloat* const __restrict__ p_in_block, Float* const __restrict__ p_in_block,
WeiBlockDesc, WeiBlockDesc,
TFloat* const __restrict__ p_wei_block, Float* const __restrict__ p_wei_block,
OutBlockDesc, OutBlockDesc,
TFloat* __restrict__ p_out_block) Float* __restrict__ p_out_block)
{ {
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
...@@ -92,7 +92,7 @@ __device__ void blockwise_direct_convolution(InBlockDesc, ...@@ -92,7 +92,7 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
unsigned hi_thread_data_begin = ho_thread_data_begin; // minus padding unsigned hi_thread_data_begin = ho_thread_data_begin; // minus padding
unsigned wi_thread_data_begin = wo_thread_data_begin; // minus padding unsigned wi_thread_data_begin = wo_thread_data_begin; // minus padding
TFloat p_out_thread[out_thread_desc.GetElementSpace()]; Float p_out_thread[out_thread_desc.GetElementSpace()];
threadwise_4d_tensor_copy(out_block_desc, threadwise_4d_tensor_copy(out_block_desc,
p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin, p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin,
......
#pragma once #pragma once
#include "constant_tensor_descriptor.cuh" #include "constant_tensor_descriptor.cuh"
template <unsigned BlockSize, class TFloat, class DstDesc, class F> template <unsigned BlockSize, class Float, class DstDesc, class F>
__device__ void __device__ void
blockwise_4d_tensor_pointwise_operation_unary(DstDesc, TFloat* __restrict__ p_dst, F f) blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst, F f)
{ {
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
...@@ -79,7 +79,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, TFloat* __restrict__ p_ds ...@@ -79,7 +79,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, TFloat* __restrict__ p_ds
// TODO: in order to optimize mem access for different mem type, // TODO: in order to optimize mem access for different mem type,
// need to write specialized version // need to write specialized version
template <unsigned BlockSize, template <unsigned BlockSize,
class TFloat, class Float,
class SrcDesc, class SrcDesc,
class DstDesc, class DstDesc,
class RefDesc, class RefDesc,
...@@ -87,9 +87,9 @@ template <unsigned BlockSize, ...@@ -87,9 +87,9 @@ template <unsigned BlockSize,
class F> class F>
__device__ void __device__ void
blockwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc, blockwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
TFloat* const __restrict__ p_src, Float* const __restrict__ p_src,
DstDesc, DstDesc,
TFloat* __restrict__ p_dst, Float* __restrict__ p_dst,
RefDesc, RefDesc,
Reorder, Reorder,
F f) F f)
...@@ -170,36 +170,32 @@ blockwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc, ...@@ -170,36 +170,32 @@ blockwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
} }
} }
template <unsigned BlockSize, class TFloat, class DstDesc> template <unsigned BlockSize, class Float, class DstDesc>
__device__ void blockwise_4d_tensor_set_zero(DstDesc, TFloat* __restrict__ p_dst) __device__ void blockwise_4d_tensor_set_zero(DstDesc, Float* __restrict__ p_dst)
{ {
auto f_set_zero = [](TFloat& v) { v = TFloat(0); }; auto f_set_zero = [](Float& v) { v = Float(0); };
blockwise_4d_tensor_pointwise_operation_unary<BlockSize>(DstDesc{}, p_dst, f_set_zero); blockwise_4d_tensor_pointwise_operation_unary<BlockSize>(DstDesc{}, p_dst, f_set_zero);
} }
template <unsigned BlockSize, template <unsigned BlockSize,
class TFloat, class Float,
class SrcDesc, class SrcDesc,
class DstDesc, class DstDesc,
class RefDesc, class RefDesc,
class Reorder> class Reorder>
__device__ void blockwise_4d_tensor_copy_reorder(SrcDesc, __device__ void blockwise_4d_tensor_copy_reorder(
TFloat* const __restrict__ p_src, SrcDesc, Float* const __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, RefDesc, Reorder)
DstDesc,
TFloat* __restrict__ p_dst,
RefDesc,
Reorder)
{ {
auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; }; auto f_copy = [](const Float& src, Float& dst) { dst = src; };
blockwise_4d_tensor_pointwise_operation_binary_reorder<BlockSize>( blockwise_4d_tensor_pointwise_operation_binary_reorder<BlockSize>(
SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, Reorder{}, f_copy); SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, Reorder{}, f_copy);
} }
template <unsigned BlockSize, class TFloat, class SrcDesc, class DstDesc, class RefDesc> template <unsigned BlockSize, class Float, class SrcDesc, class DstDesc, class RefDesc>
__device__ void blockwise_4d_tensor_copy( __device__ void blockwise_4d_tensor_copy(
SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, RefDesc) SrcDesc, Float* const __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, RefDesc)
{ {
constexpr auto reorder = Sequence<0, 1, 2, 3>{}; constexpr auto reorder = Sequence<0, 1, 2, 3>{};
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
#include "blockwise_tensor_op.cuh" #include "blockwise_tensor_op.cuh"
#include "blockwise_direct_convolution.cuh" #include "blockwise_direct_convolution.cuh"
template <class TFloat, template <class Float,
class InGlobalDesc, class InGlobalDesc,
class WeiGlobalDesc, class WeiGlobalDesc,
class OutGlobalDesc, class OutGlobalDesc,
...@@ -20,11 +20,11 @@ template <class TFloat, ...@@ -20,11 +20,11 @@ template <class TFloat,
unsigned BlockSize, unsigned BlockSize,
unsigned GridSize> unsigned GridSize>
__global__ void gridwise_direct_convolution_1(InGlobalDesc, __global__ void gridwise_direct_convolution_1(InGlobalDesc,
TFloat* const __restrict__ p_in_global, Float* const __restrict__ p_in_global,
WeiGlobalDesc, WeiGlobalDesc,
TFloat* const __restrict__ p_wei_global, Float* const __restrict__ p_wei_global,
OutGlobalDesc, OutGlobalDesc,
TFloat* __restrict__ p_out_global) Float* __restrict__ p_out_global)
{ {
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
...@@ -68,9 +68,9 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc, ...@@ -68,9 +68,9 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
constexpr unsigned wei_block_size = wei_block_desc.GetElementSpace(); constexpr unsigned wei_block_size = wei_block_desc.GetElementSpace();
constexpr unsigned out_block_size = out_block_desc.GetElementSpace(); constexpr unsigned out_block_size = out_block_desc.GetElementSpace();
__shared__ TFloat p_in_block[in_block_size]; __shared__ Float p_in_block[in_block_size];
__shared__ TFloat p_wei_block[wei_block_size]; __shared__ Float p_wei_block[wei_block_size];
__shared__ TFloat p_out_block[out_block_size]; __shared__ Float p_out_block[out_block_size];
const unsigned block_id = blockIdx.x; const unsigned block_id = blockIdx.x;
...@@ -150,7 +150,7 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc, ...@@ -150,7 +150,7 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
// blockwise convolution // blockwise convolution
blockwise_direct_convolution<BlockSize, blockwise_direct_convolution<BlockSize,
TFloat, Float,
decltype(in_block_desc), decltype(in_block_desc),
decltype(wei_block_desc), decltype(wei_block_desc),
decltype(out_block_desc), decltype(out_block_desc),
......
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
#include "threadwise_tensor_op.cuh" #include "threadwise_tensor_op.cuh"
#include "threadwise_direct_convolution.cuh" #include "threadwise_direct_convolution.cuh"
template <class TFloat, template <class Float,
class InGlobalDesc, class InGlobalDesc,
class WeiGlobalDesc, class WeiGlobalDesc,
class OutGlobalDesc, class OutGlobalDesc,
...@@ -22,11 +22,11 @@ template <class TFloat, ...@@ -22,11 +22,11 @@ template <class TFloat,
unsigned BlockSize, unsigned BlockSize,
unsigned GridSize> unsigned GridSize>
__global__ void gridwise_direct_convolution_2(InGlobalDesc, __global__ void gridwise_direct_convolution_2(InGlobalDesc,
TFloat* const __restrict__ p_in_global, Float* const __restrict__ p_in_global,
WeiGlobalDesc, WeiGlobalDesc,
TFloat* const __restrict__ p_wei_global, Float* const __restrict__ p_wei_global,
OutGlobalDesc, OutGlobalDesc,
TFloat* __restrict__ p_out_global) Float* __restrict__ p_out_global)
{ {
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
...@@ -56,8 +56,8 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc, ...@@ -56,8 +56,8 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
constexpr unsigned in_block_size = in_block_desc.GetElementSpace(); constexpr unsigned in_block_size = in_block_desc.GetElementSpace();
constexpr unsigned wei_block_size = wei_block_desc.GetElementSpace(); constexpr unsigned wei_block_size = wei_block_desc.GetElementSpace();
__shared__ TFloat p_in_block[in_block_size]; __shared__ Float p_in_block[in_block_size];
__shared__ TFloat p_wei_block[wei_block_size]; __shared__ Float p_wei_block[wei_block_size];
// threadwise tensors // threadwise tensors
constexpr unsigned InTileSizeH = OutTileSizeH + S - 1; constexpr unsigned InTileSizeH = OutTileSizeH + S - 1;
...@@ -73,7 +73,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc, ...@@ -73,7 +73,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
get_convolution_output_4d_tensor_descriptor(in_thread_block_desc, wei_thread_block_desc); get_convolution_output_4d_tensor_descriptor(in_thread_block_desc, wei_thread_block_desc);
// register // register
TFloat p_out_thread[out_thread_desc.GetElementSpace()]; Float p_out_thread[out_thread_desc.GetElementSpace()];
// divide block work // divide block work
constexpr unsigned NBlockWork = (out_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock; constexpr unsigned NBlockWork = (out_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
#include "blockwise_winograd_transform.cuh" #include "blockwise_winograd_transform.cuh"
#include "threadwise_winograd_transform.cuh" #include "threadwise_winograd_transform.cuh"
template <class TFloat, template <class Float,
class InGlobalDesc, class InGlobalDesc,
class WeiGlobalDesc, class WeiGlobalDesc,
class OutGlobalDesc, class OutGlobalDesc,
...@@ -20,11 +20,11 @@ template <class TFloat, ...@@ -20,11 +20,11 @@ template <class TFloat,
unsigned BlockSize, unsigned BlockSize,
unsigned GridSize> unsigned GridSize>
__global__ void gridwise_winograd_convolution(InGlobalDesc, __global__ void gridwise_winograd_convolution(InGlobalDesc,
TFloat* const __restrict__ p_in_global, Float* const __restrict__ p_in_global,
WeiGlobalDesc, WeiGlobalDesc,
TFloat* const __restrict__ p_wei_global, Float* const __restrict__ p_wei_global,
OutGlobalDesc, OutGlobalDesc,
TFloat* __restrict__ p_out_global) Float* __restrict__ p_out_global)
{ {
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
...@@ -102,8 +102,8 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc, ...@@ -102,8 +102,8 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc,
constexpr auto wei_transform_block_desc = constexpr auto wei_transform_block_desc =
make_ConstantTensorDescriptor(Sequence<KPerBlock, CPerBlock, InTileSizeH, InTileSizeW>{}); make_ConstantTensorDescriptor(Sequence<KPerBlock, CPerBlock, InTileSizeH, InTileSizeW>{});
__shared__ TFloat p_in_transform_block[in_transform_block_desc.GetElementSpace()]; __shared__ Float p_in_transform_block[in_transform_block_desc.GetElementSpace()];
__shared__ TFloat p_wei_transform_block[wei_transform_block_desc.GetElementSpace()]; __shared__ Float p_wei_transform_block[wei_transform_block_desc.GetElementSpace()];
// thread data // thread data
constexpr auto in_transform_thread_block_desc = constexpr auto in_transform_thread_block_desc =
...@@ -123,8 +123,8 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc, ...@@ -123,8 +123,8 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc,
constexpr auto out_thread_global_desc = constexpr auto out_thread_global_desc =
make_ConstantTensorDescriptor(out_thread_desc.GetLengths(), out_global_desc.GetStrides()); make_ConstantTensorDescriptor(out_thread_desc.GetLengths(), out_global_desc.GetStrides());
TFloat p_out_transform_thread[out_transform_thread_desc.GetElementSpace()]; Float p_out_transform_thread[out_transform_thread_desc.GetElementSpace()];
TFloat p_out_thread[out_thread_desc.GetElementSpace()]; Float p_out_thread[out_thread_desc.GetElementSpace()];
#if 0 #if 0
if(blockIdx.x == 0 && threadIdx.x == 0) if(blockIdx.x == 0 && threadIdx.x == 0)
...@@ -146,7 +146,7 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc, ...@@ -146,7 +146,7 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc,
{ {
#if 0 #if 0
// blockwise transform input // blockwise transform input
blockwise_winograd_transform_input<TFloat, blockwise_winograd_transform_input<Float,
InTileSizeH, InTileSizeH,
InTileSizeW, InTileSizeW,
S, S,
...@@ -166,7 +166,7 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc, ...@@ -166,7 +166,7 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc,
#endif #endif
// blockwise transform weights // blockwise transform weights
blockwise_winograd_transform_weight<TFloat, blockwise_winograd_transform_weight<Float,
InTileSizeH, InTileSizeH,
InTileSizeW, InTileSizeW,
S, S,
...@@ -183,7 +183,7 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc, ...@@ -183,7 +183,7 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc,
{ {
// threadwise point multiplication // threadwise point multiplication
threadwise_winograd_calculate_transformed_output< threadwise_winograd_calculate_transformed_output<
TFloat, Float,
decltype(in_transform_thread_block_desc), decltype(in_transform_thread_block_desc),
decltype(wei_transform_thread_block_desc), decltype(wei_transform_thread_block_desc),
decltype(out_transform_thread_desc), decltype(out_transform_thread_desc),
...@@ -207,7 +207,7 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc, ...@@ -207,7 +207,7 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc,
}; };
// transform back // transform back
threadwise_winograd_reverse_transform_output<TFloat, threadwise_winograd_reverse_transform_output<Float,
decltype(out_transform_thread_desc), decltype(out_transform_thread_desc),
decltype(out_thread_desc), decltype(out_thread_desc),
InTileSizeH, InTileSizeH,
......
...@@ -2,13 +2,13 @@ ...@@ -2,13 +2,13 @@
#include "constant_tensor_descriptor.cuh" #include "constant_tensor_descriptor.cuh"
// optimized for scenario if p_in, p_wei, p_out are in register // optimized for scenario if p_in, p_wei, p_out are in register
template <class TFloat, class InDesc, class WeiDesc, class OutDesc> template <class Float, class InDesc, class WeiDesc, class OutDesc>
__device__ void threadwise_direct_convolution_1(InDesc, __device__ void threadwise_direct_convolution_1(InDesc,
TFloat* const __restrict__ p_in, Float* const __restrict__ p_in,
WeiDesc, WeiDesc,
TFloat* const __restrict__ p_wei, Float* const __restrict__ p_wei,
OutDesc, OutDesc,
TFloat* __restrict__ p_out) Float* __restrict__ p_out)
{ {
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
...@@ -81,13 +81,13 @@ __device__ void threadwise_direct_convolution_1(InDesc, ...@@ -81,13 +81,13 @@ __device__ void threadwise_direct_convolution_1(InDesc,
// Optimized for scenario if p_in and p_wei are in LDS, p_out are in register // Optimized for scenario if p_in and p_wei are in LDS, p_out are in register
// Copy in and wei into register before doing convolution // Copy in and wei into register before doing convolution
template <class TFloat, class InDesc, class WeiDesc, class OutDesc> template <class Float, class InDesc, class WeiDesc, class OutDesc>
__device__ void threadwise_direct_convolution_2(InDesc, __device__ void threadwise_direct_convolution_2(InDesc,
TFloat* const __restrict__ p_in, Float* const __restrict__ p_in,
WeiDesc, WeiDesc,
TFloat* const __restrict__ p_wei, Float* const __restrict__ p_wei,
OutDesc, OutDesc,
TFloat* __restrict__ p_out) Float* __restrict__ p_out)
{ {
constexpr auto in_desc = InDesc{}; constexpr auto in_desc = InDesc{};
constexpr auto wei_desc = WeiDesc{}; constexpr auto wei_desc = WeiDesc{};
...@@ -97,8 +97,8 @@ __device__ void threadwise_direct_convolution_2(InDesc, ...@@ -97,8 +97,8 @@ __device__ void threadwise_direct_convolution_2(InDesc,
constexpr auto wei_reg_desc = make_ConstantTensorDescriptor(wei_desc.GetLengths()); constexpr auto wei_reg_desc = make_ConstantTensorDescriptor(wei_desc.GetLengths());
// register // register
TFloat p_in_reg[in_reg_desc.GetElementSpace()]; Float p_in_reg[in_reg_desc.GetElementSpace()];
TFloat p_wei_reg[wei_reg_desc.GetElementSpace()]; Float p_wei_reg[wei_reg_desc.GetElementSpace()];
// copy input tensor into register // copy input tensor into register
threadwise_4d_tensor_copy(in_desc, p_in, in_reg_desc, p_in_reg, in_reg_desc); threadwise_4d_tensor_copy(in_desc, p_in, in_reg_desc, p_in_reg, in_reg_desc);
...@@ -114,13 +114,13 @@ __device__ void threadwise_direct_convolution_2(InDesc, ...@@ -114,13 +114,13 @@ __device__ void threadwise_direct_convolution_2(InDesc,
// optimized for scenario where p_in and p_wei are in LDS, p_out is in register // optimized for scenario where p_in and p_wei are in LDS, p_out is in register
// break down a non-1x1 convolution into a sequence of 1x1 convolutions, // break down a non-1x1 convolution into a sequence of 1x1 convolutions,
// load 1x1 weight into register, and do 1x1 convolution in register. // load 1x1 weight into register, and do 1x1 convolution in register.
template <class TFloat, class InDesc, class WeiDesc, class OutDesc> template <class Float, class InDesc, class WeiDesc, class OutDesc>
__device__ void threadwise_direct_convolution_3(InDesc, __device__ void threadwise_direct_convolution_3(InDesc,
TFloat* const __restrict__ p_in, Float* const __restrict__ p_in,
WeiDesc, WeiDesc,
TFloat* const __restrict__ p_wei, Float* const __restrict__ p_wei,
OutDesc, OutDesc,
TFloat* __restrict__ p_out) Float* __restrict__ p_out)
{ {
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
...@@ -139,8 +139,8 @@ __device__ void threadwise_direct_convolution_3(InDesc, ...@@ -139,8 +139,8 @@ __device__ void threadwise_direct_convolution_3(InDesc,
constexpr auto wei_reg_desc = make_ConstantTensorDescriptor( constexpr auto wei_reg_desc = make_ConstantTensorDescriptor(
Sequence<wei_desc.GetLength(I0), wei_desc.GetLength(I1), 1, 1>{}); Sequence<wei_desc.GetLength(I0), wei_desc.GetLength(I1), 1, 1>{});
TFloat p_in_reg[in_reg_desc.GetElementSpace()]; Float p_in_reg[in_reg_desc.GetElementSpace()];
TFloat p_wei_reg[wei_reg_desc.GetElementSpace()]; Float p_wei_reg[wei_reg_desc.GetElementSpace()];
constexpr unsigned in_w_new_read = 1; constexpr unsigned in_w_new_read = 1;
......
#pragma once #pragma once
#include "constant_tensor_descriptor.cuh" #include "constant_tensor_descriptor.cuh"
template <class TFloat, class Desc, class F> template <class Float, class Desc, class F>
__device__ void threadwise_4d_tensor_pointwise_operation_unary(Desc, TFloat* __restrict__ p, F f) __device__ void threadwise_4d_tensor_pointwise_operation_unary(Desc, Float* __restrict__ p, F f)
{ {
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
...@@ -37,12 +37,12 @@ __device__ void threadwise_4d_tensor_pointwise_operation_unary(Desc, TFloat* __r ...@@ -37,12 +37,12 @@ __device__ void threadwise_4d_tensor_pointwise_operation_unary(Desc, TFloat* __r
// TODO: in order to optimize mem access for different mem type, // TODO: in order to optimize mem access for different mem type,
// need to write specialized version // need to write specialized version
template <class TFloat, class SrcDesc, class DstDesc, class RefDesc, class Reorder, class F> template <class Float, class SrcDesc, class DstDesc, class RefDesc, class Reorder, class F>
__device__ void __device__ void
threadwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc, threadwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
TFloat* const __restrict__ p_src, Float* const __restrict__ p_src,
DstDesc, DstDesc,
TFloat* __restrict__ p_dst, Float* __restrict__ p_dst,
RefDesc, RefDesc,
Reorder, Reorder,
F f) F f)
...@@ -83,26 +83,22 @@ threadwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc, ...@@ -83,26 +83,22 @@ threadwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
} }
} }
template <class TFloat, class Desc> template <class Float, class Desc>
__device__ void threadwise_4d_tensor_set_zero(Desc, TFloat* __restrict__ p) __device__ void threadwise_4d_tensor_set_zero(Desc, Float* __restrict__ p)
{ {
auto f_set_zero = [](TFloat& v) { v = TFloat(0); }; auto f_set_zero = [](Float& v) { v = Float(0); };
threadwise_4d_tensor_pointwise_operation_unary<TFloat, Desc, decltype(f_set_zero)>( threadwise_4d_tensor_pointwise_operation_unary<Float, Desc, decltype(f_set_zero)>(
Desc{}, p, f_set_zero); Desc{}, p, f_set_zero);
} }
template <class TFloat, class SrcDesc, class DstDesc, class RefDesc, class Reorder> template <class Float, class SrcDesc, class DstDesc, class RefDesc, class Reorder>
__device__ void threadwise_4d_tensor_copy_reorder(SrcDesc, __device__ void threadwise_4d_tensor_copy_reorder(
TFloat* const __restrict__ p_src, SrcDesc, Float* const __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, RefDesc, Reorder)
DstDesc,
TFloat* __restrict__ p_dst,
RefDesc,
Reorder)
{ {
auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; }; auto f_copy = [](const Float& src, Float& dst) { dst = src; };
threadwise_4d_tensor_pointwise_operation_binary_reorder<TFloat, threadwise_4d_tensor_pointwise_operation_binary_reorder<Float,
SrcDesc, SrcDesc,
DstDesc, DstDesc,
RefDesc, RefDesc,
...@@ -111,18 +107,18 @@ __device__ void threadwise_4d_tensor_copy_reorder(SrcDesc, ...@@ -111,18 +107,18 @@ __device__ void threadwise_4d_tensor_copy_reorder(SrcDesc,
SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, Reorder{}, f_copy); SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, Reorder{}, f_copy);
} }
template <class TFloat, class SrcDesc, class DstDesc, class RefDesc> template <class Float, class SrcDesc, class DstDesc, class RefDesc>
__device__ void threadwise_4d_tensor_copy( __device__ void threadwise_4d_tensor_copy(
SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, RefDesc) SrcDesc, Float* const __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, RefDesc)
{ {
auto reorder = Sequence<0, 1, 2, 3>{}; auto reorder = Sequence<0, 1, 2, 3>{};
threadwise_4d_tensor_copy_reorder<TFloat, SrcDesc, DstDesc, RefDesc, decltype(reorder)>( threadwise_4d_tensor_copy_reorder<Float, SrcDesc, DstDesc, RefDesc, decltype(reorder)>(
SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, reorder); SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, reorder);
} }
template <class TFloat, class Desc, class IDim, class NShift> template <class Float, class Desc, class IDim, class NShift>
__device__ void threadwise_4d_tensor_shift_down(Desc, TFloat* __restrict__ p, IDim, NShift) __device__ void threadwise_4d_tensor_shift_down(Desc, Float* __restrict__ p, IDim, NShift)
{ {
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment