Commit 6a45afba authored by Chao Liu
Browse files

conv: update tensorDesc calculation

parent 1b648f2f
...@@ -111,9 +111,10 @@ void host_convolution(const Tensor<T>& in, const Tensor<T>& wei, Tensor<T>& out) ...@@ -111,9 +111,10 @@ void host_convolution(const Tensor<T>& in, const Tensor<T>& wei, Tensor<T>& out)
f_par(std::thread::hardware_concurrency()); f_par(std::thread::hardware_concurrency());
} }
template <class T, class InDesc, class WeiDesc, class OutDesc> #if 0
template <class T>
void device_convolution( void device_convolution(
InDesc, const Tensor<T>& in, WeiDesc, const Tensor<T>& wei, OutDesc, Tensor<T>& out) const Tensor<T>& in, const Tensor<T>& wei, Tensor<T>& out)
{ {
DeviceTensorDescriptor<4> in_desc_device(in.mDesc); DeviceTensorDescriptor<4> in_desc_device(in.mDesc);
DeviceTensorDescriptor<4> wei_desc_device(wei.mDesc); DeviceTensorDescriptor<4> wei_desc_device(wei.mDesc);
...@@ -144,7 +145,7 @@ void device_convolution( ...@@ -144,7 +145,7 @@ void device_convolution(
dim3 block_dim(64, 1, 1); dim3 block_dim(64, 1, 1);
dim3 grid_dim(1, 1, 1); dim3 grid_dim(1, 1, 1);
#if 0
gridwise_convolution<T, 3, 3, 4, 4, 2, 2, 1, 1, 8, 8, 1> gridwise_convolution<T, 3, 3, 4, 4, 2, 2, 1, 1, 8, 8, 1>
<<<grid_dim, block_dim>>>(in_desc_device, <<<grid_dim, block_dim>>>(in_desc_device,
static_cast<T*>(in_device_buf.GetDeviceBuffer()), static_cast<T*>(in_device_buf.GetDeviceBuffer()),
...@@ -152,19 +153,78 @@ void device_convolution( ...@@ -152,19 +153,78 @@ void device_convolution(
static_cast<T*>(wei_device_buf.GetDeviceBuffer()), static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
out_desc_device, out_desc_device,
static_cast<T*>(out_device_buf.GetDeviceBuffer())); static_cast<T*>(out_device_buf.GetDeviceBuffer()));
checkCudaErrors(cudaGetLastError());
out_device_buf.FromDevice(out.mData.data());
}
#else #else
gridwise_convolution<T, InDesc, WeiDesc, OutDesc, 4, 4, 2, 2, 1, 1, 8, 8, 1> template <class T, class InDesc, class WeiDesc, class OutDesc>
void const_device_convolution(
InDesc, const Tensor<T>& in, WeiDesc, const Tensor<T>& wei, OutDesc, Tensor<T>& out)
{
std::size_t data_sz = sizeof(T);
DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace());
DeviceMem wei_device_buf(data_sz * wei.mDesc.GetElementSpace());
DeviceMem out_device_buf(data_sz * out.mDesc.GetElementSpace());
int num_thread = std::thread::hardware_concurrency();
out.GenerateTensorValue(GeneratorConstant<float>{0}, num_thread);
in_device_buf.ToDevice(in.mData.data());
wei_device_buf.ToDevice(wei.mData.data());
out_device_buf.ToDevice(out.mData.data());
dim3 block_dim(64, 1, 1);
dim3 grid_dim(1, 1, 1);
constexpr auto I0 = Index<0>{};
constexpr auto I1 = Index<1>{};
constexpr auto I2 = Index<2>{};
constexpr auto I3 = Index<3>{};
constexpr auto in_desc = InDesc{};
constexpr auto wei_desc = WeiDesc{};
constexpr auto out_desc = OutDesc{};
constexpr unsigned NPerBlock = 1;
constexpr unsigned KPerBlock = 1;
constexpr unsigned CPerBlockLoop = 1;
constexpr unsigned OutTileSizeH = 2;
constexpr unsigned OutTileSizeW = 2;
constexpr unsigned YPerBlock = (out_desc.GetLength(I2) + OutTileSizeH - 1) / OutTileSizeH;
constexpr unsigned XPerBlock = (out_desc.GetLength(I3) + OutTileSizeW - 1) / OutTileSizeW;
constexpr unsigned NBlockCopyLen0 = 1;
constexpr unsigned NBlockCopyLen1 = 1;
constexpr unsigned NBlockCopyLen2 = 1;
constexpr unsigned NBlockCopyLen3 = 64;
gridwise_convolution<T,
InDesc,
WeiDesc,
OutDesc,
NPerBlock,
KPerBlock,
CPerBlockLoop,
OutTileSizeH,
OutTileSizeW,
YPerBlock,
XPerBlock,
NBlockCopyLen0,
NBlockCopyLen1,
NBlockCopyLen2,
NBlockCopyLen3>
<<<grid_dim, block_dim>>>(InDesc{}, <<<grid_dim, block_dim>>>(InDesc{},
static_cast<T*>(in_device_buf.GetDeviceBuffer()), static_cast<T*>(in_device_buf.GetDeviceBuffer()),
WeiDesc{}, WeiDesc{},
static_cast<T*>(wei_device_buf.GetDeviceBuffer()), static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
OutDesc{}, OutDesc{},
static_cast<T*>(out_device_buf.GetDeviceBuffer())); static_cast<T*>(out_device_buf.GetDeviceBuffer()));
#endif
checkCudaErrors(cudaGetLastError()); checkCudaErrors(cudaGetLastError());
out_device_buf.FromDevice(out.mData.data()); out_device_buf.FromDevice(out.mData.data());
} }
#endif
int main() int main()
{ {
...@@ -176,14 +236,22 @@ int main() ...@@ -176,14 +236,22 @@ int main()
constexpr unsigned K = 1; constexpr unsigned K = 1;
constexpr unsigned S = 3; constexpr unsigned S = 3;
constexpr unsigned R = 3; constexpr unsigned R = 3;
#elif 0 #elif 1
constexpr unsigned N = 1; constexpr unsigned N = 1;
constexpr unsigned C = 1; constexpr unsigned C = 1;
constexpr unsigned HI = 130; constexpr unsigned HI = 36;
constexpr unsigned WI = 130; constexpr unsigned WI = 36;
constexpr unsigned K = 1; constexpr unsigned K = 1;
constexpr unsigned S = 3; constexpr unsigned S = 3;
constexpr unsigned R = 3; constexpr unsigned R = 3;
#elif 0
constexpr unsigned N = 1;
constexpr unsigned C = 1;
constexpr unsigned HI = 130;
constexpr unsigned WI = 130;
constexpr unsigned K = 1;
constexpr unsigned S = 3;
constexpr unsigned R = 3;
#elif 0 #elif 0
constexpr unsigned N = 3; constexpr unsigned N = 3;
constexpr unsigned C = 16; constexpr unsigned C = 16;
...@@ -214,7 +282,12 @@ int main() ...@@ -214,7 +282,12 @@ int main()
wei.GenerateTensorValue(GeneratorTensor<float>{}, num_thread); wei.GenerateTensorValue(GeneratorTensor<float>{}, num_thread);
host_convolution(in, wei, out_host); host_convolution(in, wei, out_host);
device_convolution(in_desc, in, wei_desc, wei, out_desc, out_device);
#if 0
device_convolution(in, wei, out_device);
#else
const_device_convolution(in_desc, in, wei_desc, wei, out_desc, out_device);
#endif
std::cout << __func__ << ": done" << std::endl; std::cout << __func__ << ": done" << std::endl;
......
...@@ -231,17 +231,8 @@ template <class TFloat, ...@@ -231,17 +231,8 @@ template <class TFloat,
class InDesc, class InDesc,
class WeiDesc, class WeiDesc,
class OutDesc, class OutDesc,
unsigned S,
unsigned R,
unsigned InTileSizeH,
unsigned InTileSizeW,
unsigned OutTileSizeH, unsigned OutTileSizeH,
unsigned OutTileSizeW, unsigned OutTileSizeW>
unsigned NPerBlock,
unsigned KPerBlock,
unsigned YPerBlock,
unsigned XPerBlock,
unsigned CPerBlockLoop>
__device__ void blockwise_convolution(InDesc, __device__ void blockwise_convolution(InDesc,
TFloat* const __restrict__ p_in, TFloat* const __restrict__ p_in,
WeiDesc, WeiDesc,
...@@ -258,6 +249,19 @@ __device__ void blockwise_convolution(InDesc, ...@@ -258,6 +249,19 @@ __device__ void blockwise_convolution(InDesc,
constexpr auto wei_desc = WeiDesc{}; constexpr auto wei_desc = WeiDesc{};
constexpr auto out_desc = OutDesc{}; constexpr auto out_desc = OutDesc{};
constexpr unsigned S = wei_desc.GetLength(I2);
constexpr unsigned R = wei_desc.GetLength(I3);
constexpr unsigned NPerBlock = out_desc.GetLength(I0);
constexpr unsigned KPerBlock = out_desc.GetLength(I1);
constexpr unsigned YPerBlock = (out_desc.GetLength(I2) + OutTileSizeH - 1) / OutTileSizeH;
constexpr unsigned XPerBlock = (out_desc.GetLength(I3) + OutTileSizeW - 1) / OutTileSizeW;
constexpr unsigned CPerBlockLoop = in_desc.GetLength(I1);
constexpr unsigned InTileSizeH = OutTileSizeH + S - 1;
constexpr unsigned InTileSizeW = OutTileSizeW + R - 1;
#if 1 #if 1
if(threadIdx.x == 0) if(threadIdx.x == 0)
{ {
...@@ -383,15 +387,17 @@ template <class TFloat, ...@@ -383,15 +387,17 @@ template <class TFloat,
class InDesc, class InDesc,
class WeiDesc, class WeiDesc,
class OutDesc, class OutDesc,
unsigned InTileSizeH,
unsigned InTileSizeW,
unsigned OutTileSizeH,
unsigned OutTileSizeW,
unsigned NPerBlock, unsigned NPerBlock,
unsigned KPerBlock, unsigned KPerBlock,
unsigned CPerBlockLoop,
unsigned OutTileSizeH,
unsigned OutTileSizeW,
unsigned YPerBlock, unsigned YPerBlock,
unsigned XPerBlock, unsigned XPerBlock,
unsigned CPerBlockLoop> unsigned NBlockCopyLen0,
unsigned NBlockCopyLen1,
unsigned NBlockCopyLen2,
unsigned NBlockCopyLen3>
__global__ void gridwise_convolution(InDesc, __global__ void gridwise_convolution(InDesc,
TFloat* const __restrict__ p_in, TFloat* const __restrict__ p_in,
WeiDesc, WeiDesc,
...@@ -420,11 +426,10 @@ __global__ void gridwise_convolution(InDesc, ...@@ -420,11 +426,10 @@ __global__ void gridwise_convolution(InDesc,
} }
#endif #endif
constexpr unsigned NBlockWork = (in_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock; constexpr unsigned NBlockWork = (out_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
constexpr unsigned YBlockWork = (in_desc.GetLength(I2) + YPerBlock - 1) / YPerBlock; constexpr unsigned KBlockWork = (out_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
constexpr unsigned XBlockWork = (in_desc.GetLength(I3) + XPerBlock - 1) / XPerBlock; constexpr unsigned YBlockWork = (out_desc.GetLength(I2) + YPerBlock - 1) / YPerBlock;
constexpr unsigned XBlockWork = (out_desc.GetLength(I3) + XPerBlock - 1) / XPerBlock;
constexpr unsigned KBlockWork = (wei_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
const unsigned block_id = const unsigned block_id =
blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * (gridDim.y * gridDim.x); blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * (gridDim.y * gridDim.x);
...@@ -434,6 +439,7 @@ __global__ void gridwise_convolution(InDesc, ...@@ -434,6 +439,7 @@ __global__ void gridwise_convolution(InDesc,
CPerBlockLoop, CPerBlockLoop,
YPerBlock * OutTileSizeH + S - 1, YPerBlock * OutTileSizeH + S - 1,
XPerBlock * OutTileSizeW + R - 1>{}); XPerBlock * OutTileSizeW + R - 1>{});
constexpr auto wei_block_desc = constexpr auto wei_block_desc =
make_ConstantTensorDescriptor(Sequence<KPerBlock, CPerBlockLoop, S, R>{}); make_ConstantTensorDescriptor(Sequence<KPerBlock, CPerBlockLoop, S, R>{});
...@@ -474,10 +480,10 @@ __global__ void gridwise_convolution(InDesc, ...@@ -474,10 +480,10 @@ __global__ void gridwise_convolution(InDesc,
blockwise_4d_tensor_op<TFloat, blockwise_4d_tensor_op<TFloat,
decltype(in_desc), decltype(in_desc),
decltype(in_block_desc), decltype(in_block_desc),
1, NBlockCopyLen0,
1, NBlockCopyLen1,
1, NBlockCopyLen2,
64, NBlockCopyLen3,
decltype(f_copy)>(in_desc, decltype(f_copy)>(in_desc,
p_in + in_desc.Get1dIndex(n_block_work_begin, p_in + in_desc.Get1dIndex(n_block_work_begin,
c_block_work_begin, c_block_work_begin,
...@@ -491,10 +497,10 @@ __global__ void gridwise_convolution(InDesc, ...@@ -491,10 +497,10 @@ __global__ void gridwise_convolution(InDesc,
blockwise_4d_tensor_op<TFloat, blockwise_4d_tensor_op<TFloat,
decltype(wei_desc), decltype(wei_desc),
decltype(wei_block_desc), decltype(wei_block_desc),
1, NBlockCopyLen0,
1, NBlockCopyLen1,
1, NBlockCopyLen2,
64, NBlockCopyLen3,
decltype(f_copy)>( decltype(f_copy)>(
wei_desc, wei_desc,
p_wei + wei_desc.Get1dIndex(k_block_work_begin, c_block_work_begin, 0, 0), p_wei + wei_desc.Get1dIndex(k_block_work_begin, c_block_work_begin, 0, 0),
...@@ -506,10 +512,10 @@ __global__ void gridwise_convolution(InDesc, ...@@ -506,10 +512,10 @@ __global__ void gridwise_convolution(InDesc,
blockwise_4d_tensor_op<TFloat, blockwise_4d_tensor_op<TFloat,
decltype(out_desc), decltype(out_desc),
decltype(out_block_desc), decltype(out_block_desc),
1, NBlockCopyLen0,
1, NBlockCopyLen1,
1, NBlockCopyLen2,
64, NBlockCopyLen3,
decltype(f_copy)>(out_desc, decltype(f_copy)>(out_desc,
p_out + out_desc.Get1dIndex(n_block_work_begin, p_out + out_desc.Get1dIndex(n_block_work_begin,
k_block_work_begin, k_block_work_begin,
...@@ -526,17 +532,8 @@ __global__ void gridwise_convolution(InDesc, ...@@ -526,17 +532,8 @@ __global__ void gridwise_convolution(InDesc,
decltype(in_block_desc), decltype(in_block_desc),
decltype(wei_block_desc), decltype(wei_block_desc),
decltype(out_block_desc), decltype(out_block_desc),
S,
R,
InTileSizeH,
InTileSizeW,
OutTileSizeH, OutTileSizeH,
OutTileSizeW, OutTileSizeW>(
NPerBlock,
KPerBlock,
YPerBlock,
XPerBlock,
CPerBlockLoop>(
in_block_desc, p_in_block, wei_block_desc, p_wei_block, out_block_desc, p_out_block); in_block_desc, p_in_block, wei_block_desc, p_wei_block, out_block_desc, p_out_block);
__syncthreads(); __syncthreads();
...@@ -545,10 +542,10 @@ __global__ void gridwise_convolution(InDesc, ...@@ -545,10 +542,10 @@ __global__ void gridwise_convolution(InDesc,
blockwise_4d_tensor_op<TFloat, blockwise_4d_tensor_op<TFloat,
decltype(out_block_desc), decltype(out_block_desc),
decltype(out_desc), decltype(out_desc),
1, NBlockCopyLen0,
1, NBlockCopyLen1,
1, NBlockCopyLen2,
64, NBlockCopyLen3,
decltype(f_copy)>(out_block_desc, decltype(f_copy)>(out_block_desc,
p_out_block, p_out_block,
out_desc, out_desc,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment