Commit 5872b710 authored by Chao Liu

refactor

parent 2b52fbd2
@@ -8,7 +8,8 @@
 #include "conv_common.cuh"
 #include "device_direct_convolution_1.cuh"
 #include "device_direct_convolution_2.cuh"
-#include "device_implicit_gemm_convolution.cuh"
+#include "device_implicit_gemm_convolution_nchw_kcsr.cuh"
+#include "device_implicit_gemm_convolution_nchw_srck.cuh"
 //#include "device_winograd_convolution.cuh"
 struct GeneratorTensor_1
@@ -393,18 +394,6 @@ int main()
     wei_kcsr.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
 #endif
-#if 1
-    auto wei_srck_desc = make_ConstantTensorDescriptor(Sequence<S, R, C, K>{});
-    ostream_ConstantTensorDescriptor(wei_srck_desc, std::cout << "wei_srck_desc: ");
-    Tensor<float> wei_srck(make_TensorDescriptor(wei_srck_desc));
-    auto f_reorder_kcsr2srck = [&](auto k, auto c, auto s, auto r) {
-        wei_srck(s, r, c, k) = wei_kcsr(k, c, s, r);
-    };
-    make_ParallelTensorFunctor(f_reorder_kcsr2srck, K, C, S, R)(num_thread);
-#endif
     for(int i = 0; i < 40; ++i)
     {
 #if 0
@@ -413,11 +402,11 @@ int main()
         device_direct_convolution_2(
             in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device);
 #elif 0
-        device_implicit_gemm_convolution(
+        device_implicit_gemm_convolution_nchw_kcsr(
             in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device);
 #elif 1
-        device_implicit_gemm_convolution(
-            in_nchw_desc, in_nchw, wei_srck_desc, wei_srck, out_nkhw_desc, out_nkhw_device);
+        device_implicit_gemm_convolution_nchw_srck(
+            in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device);
 #elif 0
         device_winograd_convolution(
             in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device);
...
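With this refactor the driver keeps only the KCSR weight tensor and passes it to device_implicit_gemm_convolution_nchw_srck; the KCSR-to-SRCK reorder that used to sit in main() now happens inside that wrapper (see the new file below). For reference, a minimal host-side sketch of the same reorder, written with plain std::vector and row-major flat indexing instead of the repo's Tensor and make_ParallelTensorFunctor helpers; the function name and the packing assumption are illustrative only, not code from this commit:

#include <cstddef>
#include <vector>

// Reorder weights from [K, C, S, R] to [S, R, C, K]; both buffers are assumed
// densely packed in row-major order.
inline void reorder_kcsr_to_srck(const std::vector<float>& wei_kcsr,
                                 std::vector<float>& wei_srck,
                                 std::size_t K, std::size_t C, std::size_t S, std::size_t R)
{
    for(std::size_t k = 0; k < K; ++k)
        for(std::size_t c = 0; c < C; ++c)
            for(std::size_t s = 0; s < S; ++s)
                for(std::size_t r = 0; r < R; ++r)
                {
                    // wei_srck(s, r, c, k) = wei_kcsr(k, c, s, r)
                    const std::size_t src = ((k * C + c) * S + s) * R + r;
                    const std::size_t dst = ((s * R + r) * C + c) * K + k;
                    wei_srck[dst] = wei_kcsr[src];
                }
}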
 #pragma once
 #include "gridwise_implicit_gemm_convolution_nchw_kcsr.cuh"
-#include "gridwise_implicit_gemm_convolution_nchw_srck.cuh"
 template <class T, class InDesc, class WeiDesc, class OutDesc>
-void device_implicit_gemm_convolution(
+void device_implicit_gemm_convolution_nchw_kcsr(
     InDesc, const Tensor<T>& in, WeiDesc, const Tensor<T>& wei, OutDesc, Tensor<T>& out)
 {
     std::size_t data_sz = sizeof(T);
@@ -82,12 +81,7 @@ void device_implicit_gemm_convolution(
     cudaEventCreate(&start);
     cudaEventRecord(start, 0);
-#if 0
-    gridwise_implicit_gemm_convolution_nchw_kcsr
-#elif 1
-    gridwise_implicit_gemm_convolution_nchw_srck
-#endif
-        <GridSize,
+    gridwise_implicit_gemm_convolution_nchw_kcsr<GridSize,
         BlockSize,
         T,
         InDesc,
@@ -101,7 +95,8 @@ void device_implicit_gemm_convolution(
         KPerThread,
         CPerThread,
         HoPerThread,
-        WoPerThread><<<grid_dim, block_dim>>>(InDesc{},
+        WoPerThread>
+        <<<grid_dim, block_dim>>>(InDesc{},
             static_cast<T*>(in_device_buf.GetDeviceBuffer()),
             WeiDesc{},
             static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
...
#pragma once
#include "gridwise_implicit_gemm_convolution_nchw_srck.cuh"

template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_implicit_gemm_convolution_nchw_srck(InDesc,
                                                const Tensor<T>& in_nchw,
                                                WeiDesc,
                                                const Tensor<T>& wei_kcsr,
                                                OutDesc,
                                                Tensor<T>& out_nkhw)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    constexpr auto in_nchw_desc = InDesc{};
    constexpr auto wei_kcsr_desc = WeiDesc{};
    constexpr auto out_nkhw_desc = OutDesc{};

    constexpr unsigned N = out_nkhw_desc.GetLength(I0);
    constexpr unsigned Ho = out_nkhw_desc.GetLength(I2);
    constexpr unsigned Wo = out_nkhw_desc.GetLength(I3);

    constexpr unsigned K = wei_kcsr_desc.GetLength(I0);
    constexpr unsigned C = wei_kcsr_desc.GetLength(I1);
    constexpr unsigned S = wei_kcsr_desc.GetLength(I2);
    constexpr unsigned R = wei_kcsr_desc.GetLength(I3);

    auto wei_srck_desc = make_ConstantTensorDescriptor(Sequence<S, R, C, K>{});
    ostream_ConstantTensorDescriptor(wei_srck_desc, std::cout << "wei_srck_desc: ");

    Tensor<T> wei_srck(make_TensorDescriptor(wei_srck_desc));

    auto f_reorder_kcsr2srck = [&](auto k, auto c, auto s, auto r) {
        wei_srck(s, r, c, k) = wei_kcsr(k, c, s, r);
    };

    make_ParallelTensorFunctor(f_reorder_kcsr2srck, K, C, S, R)(
        std::thread::hardware_concurrency());

    std::size_t data_sz = sizeof(T);
    DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
    DeviceMem wei_srck_device_buf(data_sz * wei_srck.mDesc.GetElementSpace());
    DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());

    int num_thread = std::thread::hardware_concurrency();

    in_nchw_device_buf.ToDevice(in_nchw.mData.data());
    wei_srck_device_buf.ToDevice(wei_srck.mData.data());
    out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

#if 0
    constexpr unsigned NPerBlock = 1;
    constexpr unsigned KPerBlock = 1;
    constexpr unsigned CPerBlock = 1;
    constexpr unsigned HoPerBlock = 2;
    constexpr unsigned WoPerBlock = 32;

    constexpr unsigned KPerThread = 1;
    constexpr unsigned CPerThread = 1;
    constexpr unsigned HoPerThread = 2;
    constexpr unsigned WoPerThread = 2;

    constexpr unsigned BlockSize = 16;
#elif 1
    constexpr unsigned NPerBlock = 2;
    constexpr unsigned KPerBlock = 32;
    constexpr unsigned CPerBlock = 4;
    constexpr unsigned HoPerBlock = 2;
    constexpr unsigned WoPerBlock = 32;

    constexpr unsigned KPerThread = 4;
    constexpr unsigned CPerThread = 2;
    constexpr unsigned HoPerThread = 2;
    constexpr unsigned WoPerThread = 2;

    constexpr unsigned BlockSize = 128;
#elif 0
    constexpr unsigned NPerBlock = 2;
    constexpr unsigned KPerBlock = 64;
    constexpr unsigned CPerBlock = 4;
    constexpr unsigned HoPerBlock = 2;
    constexpr unsigned WoPerBlock = 32;

    constexpr unsigned KPerThread = 4;
    constexpr unsigned CPerThread = 2;
    constexpr unsigned HoPerThread = 2;
    constexpr unsigned WoPerThread = 2;

    constexpr unsigned BlockSize = 256;
#endif

    constexpr unsigned GridSize =
        ((N + NPerBlock - 1) / NPerBlock) * ((K + KPerBlock - 1) / KPerBlock) *
        ((Ho + HoPerBlock - 1) / HoPerBlock) * ((Wo + WoPerBlock - 1) / WoPerBlock);

    dim3 block_dim(BlockSize);
    dim3 grid_dim(GridSize);

    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);

    cudaEvent_t start, stop;
    float elapsedTime;

    cudaEventCreate(&start);
    cudaEventRecord(start, 0);

    gridwise_implicit_gemm_convolution_nchw_srck<GridSize,
                                                 BlockSize,
                                                 T,
                                                 decltype(in_nchw_desc),
                                                 decltype(wei_srck_desc),
                                                 decltype(out_nkhw_desc),
                                                 NPerBlock,
                                                 KPerBlock,
                                                 CPerBlock,
                                                 HoPerBlock,
                                                 WoPerBlock,
                                                 KPerThread,
                                                 CPerThread,
                                                 HoPerThread,
                                                 WoPerThread>
        <<<grid_dim, block_dim>>>(in_nchw_desc,
                                  static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
                                  wei_srck_desc,
                                  static_cast<T*>(wei_srck_device_buf.GetDeviceBuffer()),
                                  out_nkhw_desc,
                                  static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));

    cudaEventCreate(&stop);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);
    printf("Elapsed time : %f ms\n", elapsedTime);

    checkCudaErrors(cudaGetLastError());

    out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
}
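GridSize above is the product of the ceiling-divided tile counts along N, K, Ho and Wo, so each CUDA block owns one (N, K, Ho, Wo) output tile. A standalone sketch of that arithmetic, using the "#elif 1" tile sizes from this file and made-up problem dimensions (the N, K, Ho, Wo values below are hypothetical, not taken from this commit):

#include <cstdio>

// Ceiling division, the same (x + tile - 1) / tile pattern used above.
constexpr unsigned ceil_div(unsigned x, unsigned tile) { return (x + tile - 1) / tile; }

int main()
{
    // Hypothetical problem sizes, for illustration only.
    constexpr unsigned N = 16, K = 64, Ho = 28, Wo = 28;

    // Per-block tile sizes from the "#elif 1" configuration.
    constexpr unsigned NPerBlock = 2, KPerBlock = 32, HoPerBlock = 2, WoPerBlock = 32;

    // 8 * 2 * 14 * 1 = 224 blocks of 128 threads each.
    constexpr unsigned GridSize = ceil_div(N, NPerBlock) * ceil_div(K, KPerBlock) *
                                  ceil_div(Ho, HoPerBlock) * ceil_div(Wo, WoPerBlock);

    printf("GridSize = %u\n", GridSize); // prints 224
    return 0;
}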
@@ -135,20 +135,6 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds
         const unsigned bindex = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]);
-#if 0
-        printf("did %u %u %u %u, did_IR %u %u %u %u, index %u %u\n",
-               did[0],
-               did[1],
-               did[2],
-               did[3],
-               did[IR0],
-               did[IR1],
-               did[IR2],
-               did[IR3],
-               aindex,
-               bindex);
-#endif
         f(p_src[aindex], p_dst[bindex]);
     }
...
@@ -157,7 +157,7 @@ __global__ void gridwise_implicit_gemm_convolution_nchw_kcsr(InGlobalDesc,
         c_block_data_begin += CPerBlock, __syncthreads())
     {
         // input: global mem to LDS,
-        // convert 4d-tensor in[N,C,Hi,Wi] to matrix in_matrix[C,Hi*Wi*N]
+        // convert [N,C,Hi,Wi] to [C,Hi,Wi,N]
         blockwise_4d_tensor_copy_reorder_by_get_dst_from_src<BlockSize>(
             in_nchw_global_desc,
             p_in_global + in_nchw_global_desc.Get1dIndex(n_block_data_begin,
@@ -170,6 +170,7 @@ __global__ void gridwise_implicit_gemm_convolution_nchw_kcsr(InGlobalDesc,
             reorder_chwn_from_nchw);
         // weight: global mem to LDS,
+        // convert [K,C,S,R] to [S,R,C,K]
         blockwise_4d_tensor_copy_reorder_by_get_dst_from_src<BlockSize>(
             wei_kcsr_global_desc,
             p_wei_global +
@@ -217,10 +218,4 @@ __global__ void gridwise_implicit_gemm_convolution_nchw_kcsr(InGlobalDesc,
                 wo_block_data_begin + wo_thread_data_begin),
             out_hkwn_thread_desc.GetLengths(),
             reorder_nkhw_from_hkwn);
-    // printf("%f %f %f %f\n", p_out_thread[0], p_out_thread[1], p_out_thread[2], p_out_thread[3]);
-    // printf("%u %u, %u %u %u\n", get_block_1d_id(), get_thread_local_1d_id(),
-    // matrix_c_index.batch_begin, matrix_c_index.row_begin, matrix_c_index.col_begin); printf("%u
-    // %u, %u %u %u\n", get_block_1d_id(), get_thread_local_1d_id(), ho_thread_data_begin,
-    // k_thread_data_begin, wo_thread_data_begin);
 }
@@ -153,7 +153,7 @@ __global__ void gridwise_implicit_gemm_convolution_nchw_srck(InGlobalDesc,
         c_block_data_begin += CPerBlock, __syncthreads())
     {
         // input: global mem to LDS,
-        // convert 4d-tensor in[N,C,Hi,Wi] to matrix in_matrix[C,Hi*Wi*N]
+        // convert [N,C,Hi,Wi] to [C,Hi,Wi,N]
         blockwise_4d_tensor_copy_reorder_by_get_dst_from_src<BlockSize>(
             in_nchw_global_desc,
             p_in_global + in_nchw_global_desc.Get1dIndex(n_block_data_begin,
@@ -166,6 +166,7 @@ __global__ void gridwise_implicit_gemm_convolution_nchw_srck(InGlobalDesc,
             reorder_chwn_from_nchw);
         // weight: global mem to LDS,
+        // format is [S,R,C,K], no conversion needed
         blockwise_4d_tensor_copy<BlockSize>(
             wei_srck_global_desc,
             p_wei_global +
@@ -212,10 +213,4 @@ __global__ void gridwise_implicit_gemm_convolution_nchw_srck(InGlobalDesc,
                 wo_block_data_begin + wo_thread_data_begin),
             out_hkwn_thread_desc.GetLengths(),
             reorder_nkhw_from_hkwn);
-    // printf("%f %f %f %f\n", p_out_thread[0], p_out_thread[1], p_out_thread[2], p_out_thread[3]);
-    // printf("%u %u, %u %u %u\n", get_block_1d_id(), get_thread_local_1d_id(),
-    // matrix_c_index.batch_begin, matrix_c_index.row_begin, matrix_c_index.col_begin); printf("%u
-    // %u, %u %u %u\n", get_block_1d_id(), get_thread_local_1d_id(), ho_thread_data_begin,
-    // k_thread_data_begin, wo_thread_data_begin);
 }
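Both kernels now describe the input staging as a [N,C,Hi,Wi] to [C,Hi,Wi,N] reorder; the comment they replace called the result the matrix in_matrix[C,Hi*Wi*N]. The two descriptions agree because a densely packed [C,Hi,Wi,N] block, read row-major, is a [C, Hi*Wi*N] matrix. A small host-side check of that equivalence (the plain arrays, row-major packing, and tile sizes below are assumptions for illustration, not code from these kernels):

#include <cassert>
#include <cstddef>
#include <vector>

int main()
{
    // Tiny hypothetical tile sizes, for illustration only.
    constexpr std::size_t N = 2, C = 3, Hi = 4, Wi = 5;

    // Fill an NCHW tensor with distinct values.
    std::vector<float> in_nchw(N * C * Hi * Wi);
    for(std::size_t i = 0; i < in_nchw.size(); ++i)
        in_nchw[i] = static_cast<float>(i);

    // Reorder [N,C,Hi,Wi] -> [C,Hi,Wi,N], both densely packed row-major.
    std::vector<float> in_chwn(in_nchw.size());
    for(std::size_t n = 0; n < N; ++n)
        for(std::size_t c = 0; c < C; ++c)
            for(std::size_t h = 0; h < Hi; ++h)
                for(std::size_t w = 0; w < Wi; ++w)
                    in_chwn[((c * Hi + h) * Wi + w) * N + n] =
                        in_nchw[((n * C + c) * Hi + h) * Wi + w];

    // Row c, column (h*Wi + w)*N + n of the [C, Hi*Wi*N] view is in_nchw(n, c, h, w).
    for(std::size_t n = 0; n < N; ++n)
        for(std::size_t c = 0; c < C; ++c)
            for(std::size_t h = 0; h < Hi; ++h)
                for(std::size_t w = 0; w < Wi; ++w)
                {
                    const std::size_t col = (h * Wi + w) * N + n;
                    assert(in_chwn[c * (Hi * Wi * N) + col] ==
                           in_nchw[((n * C + c) * Hi + h) * Wi + w]);
                }
    return 0;
}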