"git@developer.sourcefind.cn:yangql/composable_kernel-1.git" did not exist on "0345963eef4f92e9c5eab608bb8557b5463a1dcb"
Commit 6a45afba authored by Chao Liu's avatar Chao Liu
Browse files

conv: update tensorDesc calculation

parent 1b648f2f
......@@ -111,9 +111,10 @@ void host_convolution(const Tensor<T>& in, const Tensor<T>& wei, Tensor<T>& out)
f_par(std::thread::hardware_concurrency());
}
template <class T, class InDesc, class WeiDesc, class OutDesc>
#if 0
template <class T>
void device_convolution(
InDesc, const Tensor<T>& in, WeiDesc, const Tensor<T>& wei, OutDesc, Tensor<T>& out)
const Tensor<T>& in, const Tensor<T>& wei, Tensor<T>& out)
{
DeviceTensorDescriptor<4> in_desc_device(in.mDesc);
DeviceTensorDescriptor<4> wei_desc_device(wei.mDesc);
......@@ -144,7 +145,7 @@ void device_convolution(
dim3 block_dim(64, 1, 1);
dim3 grid_dim(1, 1, 1);
#if 0
gridwise_convolution<T, 3, 3, 4, 4, 2, 2, 1, 1, 8, 8, 1>
<<<grid_dim, block_dim>>>(in_desc_device,
static_cast<T*>(in_device_buf.GetDeviceBuffer()),
......@@ -152,19 +153,78 @@ void device_convolution(
static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
out_desc_device,
static_cast<T*>(out_device_buf.GetDeviceBuffer()));
checkCudaErrors(cudaGetLastError());
out_device_buf.FromDevice(out.mData.data());
}
#else
gridwise_convolution<T, InDesc, WeiDesc, OutDesc, 4, 4, 2, 2, 1, 1, 8, 8, 1>
template <class T, class InDesc, class WeiDesc, class OutDesc>
void const_device_convolution(
InDesc, const Tensor<T>& in, WeiDesc, const Tensor<T>& wei, OutDesc, Tensor<T>& out)
{
std::size_t data_sz = sizeof(T);
DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace());
DeviceMem wei_device_buf(data_sz * wei.mDesc.GetElementSpace());
DeviceMem out_device_buf(data_sz * out.mDesc.GetElementSpace());
int num_thread = std::thread::hardware_concurrency();
out.GenerateTensorValue(GeneratorConstant<float>{0}, num_thread);
in_device_buf.ToDevice(in.mData.data());
wei_device_buf.ToDevice(wei.mData.data());
out_device_buf.ToDevice(out.mData.data());
dim3 block_dim(64, 1, 1);
dim3 grid_dim(1, 1, 1);
constexpr auto I0 = Index<0>{};
constexpr auto I1 = Index<1>{};
constexpr auto I2 = Index<2>{};
constexpr auto I3 = Index<3>{};
constexpr auto in_desc = InDesc{};
constexpr auto wei_desc = WeiDesc{};
constexpr auto out_desc = OutDesc{};
constexpr unsigned NPerBlock = 1;
constexpr unsigned KPerBlock = 1;
constexpr unsigned CPerBlockLoop = 1;
constexpr unsigned OutTileSizeH = 2;
constexpr unsigned OutTileSizeW = 2;
constexpr unsigned YPerBlock = (out_desc.GetLength(I2) + OutTileSizeH - 1) / OutTileSizeH;
constexpr unsigned XPerBlock = (out_desc.GetLength(I3) + OutTileSizeW - 1) / OutTileSizeW;
constexpr unsigned NBlockCopyLen0 = 1;
constexpr unsigned NBlockCopyLen1 = 1;
constexpr unsigned NBlockCopyLen2 = 1;
constexpr unsigned NBlockCopyLen3 = 64;
gridwise_convolution<T,
InDesc,
WeiDesc,
OutDesc,
NPerBlock,
KPerBlock,
CPerBlockLoop,
OutTileSizeH,
OutTileSizeW,
YPerBlock,
XPerBlock,
NBlockCopyLen0,
NBlockCopyLen1,
NBlockCopyLen2,
NBlockCopyLen3>
<<<grid_dim, block_dim>>>(InDesc{},
static_cast<T*>(in_device_buf.GetDeviceBuffer()),
WeiDesc{},
static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
OutDesc{},
static_cast<T*>(out_device_buf.GetDeviceBuffer()));
#endif
checkCudaErrors(cudaGetLastError());
out_device_buf.FromDevice(out.mData.data());
}
#endif
int main()
{
......@@ -176,14 +236,22 @@ int main()
constexpr unsigned K = 1;
constexpr unsigned S = 3;
constexpr unsigned R = 3;
#elif 0
#elif 1
constexpr unsigned N = 1;
constexpr unsigned C = 1;
constexpr unsigned HI = 130;
constexpr unsigned WI = 130;
constexpr unsigned HI = 36;
constexpr unsigned WI = 36;
constexpr unsigned K = 1;
constexpr unsigned S = 3;
constexpr unsigned R = 3;
#elif 0
constexpr unsigned N = 1;
constexpr unsigned C = 1;
constexpr unsigned HI = 130;
constexpr unsigned WI = 130;
constexpr unsigned K = 1;
constexpr unsigned S = 3;
constexpr unsigned R = 3;
#elif 0
constexpr unsigned N = 3;
constexpr unsigned C = 16;
......@@ -214,7 +282,12 @@ int main()
wei.GenerateTensorValue(GeneratorTensor<float>{}, num_thread);
host_convolution(in, wei, out_host);
device_convolution(in_desc, in, wei_desc, wei, out_desc, out_device);
#if 0
device_convolution(in, wei, out_device);
#else
const_device_convolution(in_desc, in, wei_desc, wei, out_desc, out_device);
#endif
std::cout << __func__ << ": done" << std::endl;
......
......@@ -231,17 +231,8 @@ template <class TFloat,
class InDesc,
class WeiDesc,
class OutDesc,
unsigned S,
unsigned R,
unsigned InTileSizeH,
unsigned InTileSizeW,
unsigned OutTileSizeH,
unsigned OutTileSizeW,
unsigned NPerBlock,
unsigned KPerBlock,
unsigned YPerBlock,
unsigned XPerBlock,
unsigned CPerBlockLoop>
unsigned OutTileSizeW>
__device__ void blockwise_convolution(InDesc,
TFloat* const __restrict__ p_in,
WeiDesc,
......@@ -258,6 +249,19 @@ __device__ void blockwise_convolution(InDesc,
constexpr auto wei_desc = WeiDesc{};
constexpr auto out_desc = OutDesc{};
constexpr unsigned S = wei_desc.GetLength(I2);
constexpr unsigned R = wei_desc.GetLength(I3);
constexpr unsigned NPerBlock = out_desc.GetLength(I0);
constexpr unsigned KPerBlock = out_desc.GetLength(I1);
constexpr unsigned YPerBlock = (out_desc.GetLength(I2) + OutTileSizeH - 1) / OutTileSizeH;
constexpr unsigned XPerBlock = (out_desc.GetLength(I3) + OutTileSizeW - 1) / OutTileSizeW;
constexpr unsigned CPerBlockLoop = in_desc.GetLength(I1);
constexpr unsigned InTileSizeH = OutTileSizeH + S - 1;
constexpr unsigned InTileSizeW = OutTileSizeW + R - 1;
#if 1
if(threadIdx.x == 0)
{
......@@ -383,15 +387,17 @@ template <class TFloat,
class InDesc,
class WeiDesc,
class OutDesc,
unsigned InTileSizeH,
unsigned InTileSizeW,
unsigned OutTileSizeH,
unsigned OutTileSizeW,
unsigned NPerBlock,
unsigned KPerBlock,
unsigned CPerBlockLoop,
unsigned OutTileSizeH,
unsigned OutTileSizeW,
unsigned YPerBlock,
unsigned XPerBlock,
unsigned CPerBlockLoop>
unsigned NBlockCopyLen0,
unsigned NBlockCopyLen1,
unsigned NBlockCopyLen2,
unsigned NBlockCopyLen3>
__global__ void gridwise_convolution(InDesc,
TFloat* const __restrict__ p_in,
WeiDesc,
......@@ -420,11 +426,10 @@ __global__ void gridwise_convolution(InDesc,
}
#endif
constexpr unsigned NBlockWork = (in_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
constexpr unsigned YBlockWork = (in_desc.GetLength(I2) + YPerBlock - 1) / YPerBlock;
constexpr unsigned XBlockWork = (in_desc.GetLength(I3) + XPerBlock - 1) / XPerBlock;
constexpr unsigned KBlockWork = (wei_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
constexpr unsigned NBlockWork = (out_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
constexpr unsigned KBlockWork = (out_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
constexpr unsigned YBlockWork = (out_desc.GetLength(I2) + YPerBlock - 1) / YPerBlock;
constexpr unsigned XBlockWork = (out_desc.GetLength(I3) + XPerBlock - 1) / XPerBlock;
const unsigned block_id =
blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * (gridDim.y * gridDim.x);
......@@ -434,6 +439,7 @@ __global__ void gridwise_convolution(InDesc,
CPerBlockLoop,
YPerBlock * OutTileSizeH + S - 1,
XPerBlock * OutTileSizeW + R - 1>{});
constexpr auto wei_block_desc =
make_ConstantTensorDescriptor(Sequence<KPerBlock, CPerBlockLoop, S, R>{});
......@@ -474,10 +480,10 @@ __global__ void gridwise_convolution(InDesc,
blockwise_4d_tensor_op<TFloat,
decltype(in_desc),
decltype(in_block_desc),
1,
1,
1,
64,
NBlockCopyLen0,
NBlockCopyLen1,
NBlockCopyLen2,
NBlockCopyLen3,
decltype(f_copy)>(in_desc,
p_in + in_desc.Get1dIndex(n_block_work_begin,
c_block_work_begin,
......@@ -491,10 +497,10 @@ __global__ void gridwise_convolution(InDesc,
blockwise_4d_tensor_op<TFloat,
decltype(wei_desc),
decltype(wei_block_desc),
1,
1,
1,
64,
NBlockCopyLen0,
NBlockCopyLen1,
NBlockCopyLen2,
NBlockCopyLen3,
decltype(f_copy)>(
wei_desc,
p_wei + wei_desc.Get1dIndex(k_block_work_begin, c_block_work_begin, 0, 0),
......@@ -506,10 +512,10 @@ __global__ void gridwise_convolution(InDesc,
blockwise_4d_tensor_op<TFloat,
decltype(out_desc),
decltype(out_block_desc),
1,
1,
1,
64,
NBlockCopyLen0,
NBlockCopyLen1,
NBlockCopyLen2,
NBlockCopyLen3,
decltype(f_copy)>(out_desc,
p_out + out_desc.Get1dIndex(n_block_work_begin,
k_block_work_begin,
......@@ -526,17 +532,8 @@ __global__ void gridwise_convolution(InDesc,
decltype(in_block_desc),
decltype(wei_block_desc),
decltype(out_block_desc),
S,
R,
InTileSizeH,
InTileSizeW,
OutTileSizeH,
OutTileSizeW,
NPerBlock,
KPerBlock,
YPerBlock,
XPerBlock,
CPerBlockLoop>(
OutTileSizeW>(
in_block_desc, p_in_block, wei_block_desc, p_wei_block, out_block_desc, p_out_block);
__syncthreads();
......@@ -545,10 +542,10 @@ __global__ void gridwise_convolution(InDesc,
blockwise_4d_tensor_op<TFloat,
decltype(out_block_desc),
decltype(out_desc),
1,
1,
1,
64,
NBlockCopyLen0,
NBlockCopyLen1,
NBlockCopyLen2,
NBlockCopyLen3,
decltype(f_copy)>(out_block_desc,
p_out_block,
out_desc,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment