Commit a21b0d27 authored by Chao Liu's avatar Chao Liu
Browse files

refactor

parent 6790b8f3
......@@ -178,7 +178,7 @@ int main()
for(int i = 0; i < 20; ++i)
{
device_direct_convolution_2(in_desc, in, wei_desc, wei, out_desc, out_device);
device_direct_convolution_1(in_desc, in, wei_desc, wei, out_desc, out_device);
}
#if 0
......
......@@ -55,23 +55,23 @@ void device_direct_convolution_1(
cudaEventCreate(&start);
cudaEventRecord(start, 0);
gridwise_convolution<T,
InDesc,
WeiDesc,
OutDesc,
OutTileSizeH,
OutTileSizeW,
NPerBlock,
KPerBlock,
CPerBlock,
YPerBlock,
XPerBlock,
NBlockOpLen0,
NBlockOpLen1,
NBlockOpLen2,
NBlockOpLen3,
BlockSize,
GridSize>
gridwise_direct_convolution_1<T,
InDesc,
WeiDesc,
OutDesc,
OutTileSizeH,
OutTileSizeW,
NPerBlock,
KPerBlock,
CPerBlock,
YPerBlock,
XPerBlock,
NBlockOpLen0,
NBlockOpLen1,
NBlockOpLen2,
NBlockOpLen3,
BlockSize,
GridSize>
<<<grid_dim, block_dim>>>(InDesc{},
static_cast<T*>(in_device_buf.GetDeviceBuffer()),
WeiDesc{},
......
......@@ -59,26 +59,26 @@ void device_direct_convolution_2(
cudaEventCreate(&start);
cudaEventRecord(start, 0);
gridwise_convolution<T,
InDesc,
WeiDesc,
OutDesc,
OutTileSizeH,
OutTileSizeW,
NPerBlock,
KPerBlock,
CPerBlock,
YPerBlock,
XPerBlock,
NPerThread,
KPerThread,
CPerThread,
NBlockOpLen0,
NBlockOpLen1,
NBlockOpLen2,
NBlockOpLen3,
BlockSize,
GridSize>
gridwise_direct_convolution_2<T,
InDesc,
WeiDesc,
OutDesc,
OutTileSizeH,
OutTileSizeW,
NPerBlock,
KPerBlock,
CPerBlock,
YPerBlock,
XPerBlock,
NPerThread,
KPerThread,
CPerThread,
NBlockOpLen0,
NBlockOpLen1,
NBlockOpLen2,
NBlockOpLen3,
BlockSize,
GridSize>
<<<grid_dim, block_dim>>>(InDesc{},
static_cast<T*>(in_device_buf.GetDeviceBuffer()),
WeiDesc{},
......
......@@ -20,12 +20,12 @@ template <class TFloat,
unsigned NBlockOpLen3,
unsigned BlockSize,
unsigned GridSize>
__global__ void gridwise_convolution(InGlobalDesc,
TFloat* const __restrict__ p_in_global,
WeiGlobalDesc,
TFloat* const __restrict__ p_wei_global,
OutGlobalDesc,
TFloat* __restrict__ p_out_global)
__global__ void gridwise_direct_convolution_1(InGlobalDesc,
TFloat* const __restrict__ p_in_global,
WeiGlobalDesc,
TFloat* const __restrict__ p_wei_global,
OutGlobalDesc,
TFloat* __restrict__ p_out_global)
{
constexpr auto I0 = Index<0>{};
constexpr auto I1 = Index<1>{};
......
......@@ -25,12 +25,12 @@ template <class TFloat,
unsigned NBlockOpLen3,
unsigned BlockSize,
unsigned GridSize>
__global__ void gridwise_convolution(InGlobalDesc,
TFloat* const __restrict__ p_in_global,
WeiGlobalDesc,
TFloat* const __restrict__ p_wei_global,
OutGlobalDesc,
TFloat* __restrict__ p_out_global)
__global__ void gridwise_direct_convolution_2(InGlobalDesc,
TFloat* const __restrict__ p_in_global,
WeiGlobalDesc,
TFloat* const __restrict__ p_wei_global,
OutGlobalDesc,
TFloat* __restrict__ p_out_global)
{
constexpr auto I0 = Index<0>{};
constexpr auto I1 = Index<1>{};
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment