"docs/vscode:/vscode.git/clone" did not exist on "7ba9cbc63e337f207c0707d1018d0d4e1c83ca51"
Commit 1de6fd07 authored by Chao Liu's avatar Chao Liu
Browse files

fixed a bug, and refactored

parent 1410850e
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
#include "conv_common.cuh" #include "conv_common.cuh"
#include "device_direct_convolution_1.cuh" #include "device_direct_convolution_1.cuh"
#include "device_direct_convolution_2.cuh" #include "device_direct_convolution_2.cuh"
//#include "device_implicit_gemm_convolution_1_nchw_kcsr.cuh" #include "device_implicit_gemm_convolution_1_nchw_kcsr.cuh"
#include "device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh" #include "device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh"
#include "device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh" #include "device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh"
#include "device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh" #include "device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh"
...@@ -361,7 +361,7 @@ int main() ...@@ -361,7 +361,7 @@ int main()
constexpr unsigned K = 1; constexpr unsigned K = 1;
constexpr unsigned S = 3; constexpr unsigned S = 3;
constexpr unsigned R = 3; constexpr unsigned R = 3;
#elif 0 #elif 1
// 3x3, 34x34 // 3x3, 34x34
constexpr unsigned N = 64; constexpr unsigned N = 64;
constexpr unsigned C = 256; constexpr unsigned C = 256;
...@@ -370,7 +370,7 @@ int main() ...@@ -370,7 +370,7 @@ int main()
constexpr unsigned K = 64; constexpr unsigned K = 64;
constexpr unsigned S = 3; constexpr unsigned S = 3;
constexpr unsigned R = 3; constexpr unsigned R = 3;
#elif 1 #elif 0
// 3x3, 54x54 // 3x3, 54x54
constexpr unsigned N = 64; constexpr unsigned N = 64;
constexpr unsigned C = 64; constexpr unsigned C = 64;
...@@ -388,7 +388,7 @@ int main() ...@@ -388,7 +388,7 @@ int main()
constexpr unsigned K = 64; constexpr unsigned K = 64;
constexpr unsigned S = 3; constexpr unsigned S = 3;
constexpr unsigned R = 3; constexpr unsigned R = 3;
#elif 1 #elif 0
// 3x3, 58x58 // 3x3, 58x58
constexpr unsigned N = 64; constexpr unsigned N = 64;
constexpr unsigned C = 64; constexpr unsigned C = 64;
...@@ -449,7 +449,7 @@ int main() ...@@ -449,7 +449,7 @@ int main()
device_direct_convolution_2 device_direct_convolution_2
#elif 0 #elif 0
device_implicit_gemm_convolution_1_nchw_kcsr device_implicit_gemm_convolution_1_nchw_kcsr
#elif 0 #elif 1
device_implicit_gemm_convolution_1_nchw_srck_nkhw device_implicit_gemm_convolution_1_nchw_srck_nkhw
#elif 1 #elif 1
device_implicit_gemm_convolution_1_chwn_csrk_khwn device_implicit_gemm_convolution_1_chwn_csrk_khwn
......
#pragma once #pragma once
#include "gridwise_implicit_gemm_convolution_1_nchw_kcsr.cuh" #include "gridwise_implicit_gemm_convolution_1_nchw_kcsr.cuh"
#include <unistd.h>
template <class T, class InDesc, class WeiDesc, class OutDesc> template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_implicit_gemm_convolution_1_nchw_kcsr( void device_implicit_gemm_convolution_1_nchw_kcsr(InDesc,
InDesc, const Tensor<T>& in, WeiDesc, const Tensor<T>& wei, OutDesc, Tensor<T>& out) const Tensor<T>& in,
WeiDesc,
const Tensor<T>& wei,
OutDesc,
Tensor<T>& out,
unsigned nrepeat)
{ {
std::size_t data_sz = sizeof(T); std::size_t data_sz = sizeof(T);
DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace()); DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace());
...@@ -75,40 +81,45 @@ void device_implicit_gemm_convolution_1_nchw_kcsr( ...@@ -75,40 +81,45 @@ void device_implicit_gemm_convolution_1_nchw_kcsr(
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
cudaEvent_t start, stop; for(unsigned i = 0; i < nrepeat; ++i)
float elapsedTime; {
cudaEvent_t start, stop;
cudaEventCreate(&start); float elapsedTime;
cudaEventRecord(start, 0);
cudaEventCreate(&start);
gridwise_implicit_gemm_convolution_1_nchw_kcsr<GridSize, cudaEventRecord(start, 0);
BlockSize,
T, gridwise_implicit_gemm_convolution_1_nchw_kcsr<GridSize,
InDesc, BlockSize,
WeiDesc, T,
OutDesc, InDesc,
NPerBlock, WeiDesc,
KPerBlock, OutDesc,
CPerBlock, NPerBlock,
HoPerBlock, KPerBlock,
WoPerBlock, CPerBlock,
KPerThread, HoPerBlock,
CPerThread, WoPerBlock,
HoPerThread, KPerThread,
WoPerThread> CPerThread,
<<<grid_dim, block_dim>>>(InDesc{}, HoPerThread,
static_cast<T*>(in_device_buf.GetDeviceBuffer()), WoPerThread>
WeiDesc{}, <<<grid_dim, block_dim>>>(InDesc{},
static_cast<T*>(wei_device_buf.GetDeviceBuffer()), static_cast<T*>(in_device_buf.GetDeviceBuffer()),
OutDesc{}, WeiDesc{},
static_cast<T*>(out_device_buf.GetDeviceBuffer())); static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
OutDesc{},
cudaEventCreate(&stop); static_cast<T*>(out_device_buf.GetDeviceBuffer()));
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop); cudaEventCreate(&stop);
cudaEventRecord(stop, 0);
cudaEventElapsedTime(&elapsedTime, start, stop); cudaEventSynchronize(stop);
printf("Elapsed time : %f ms\n", elapsedTime);
cudaEventElapsedTime(&elapsedTime, start, stop);
printf("Elapsed time : %f ms\n", elapsedTime);
usleep(10000);
}
checkCudaErrors(cudaGetLastError()); checkCudaErrors(cudaGetLastError());
out_device_buf.FromDevice(out.mData.data()); out_device_buf.FromDevice(out.mData.data());
......
...@@ -65,20 +65,8 @@ void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc, ...@@ -65,20 +65,8 @@ void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc,
constexpr unsigned WoPerThread = 2; constexpr unsigned WoPerThread = 2;
constexpr unsigned BlockSize = 16; constexpr unsigned BlockSize = 16;
#elif 0
constexpr unsigned NPerBlock = 1;
constexpr unsigned KPerBlock = 64;
constexpr unsigned CPerBlock = 2;
constexpr unsigned HoPerBlock = 4;
constexpr unsigned WoPerBlock = 32;
constexpr unsigned KPerThread = 8;
constexpr unsigned CPerThread = 1;
constexpr unsigned HoPerThread = 2;
constexpr unsigned WoPerThread = 4;
constexpr unsigned BlockSize = 128;
#elif 1 #elif 1
// for 3x3, 34x34
constexpr unsigned NPerBlock = 1; constexpr unsigned NPerBlock = 1;
constexpr unsigned KPerBlock = 64; constexpr unsigned KPerBlock = 64;
constexpr unsigned CPerBlock = 2; constexpr unsigned CPerBlock = 2;
...@@ -92,6 +80,7 @@ void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc, ...@@ -92,6 +80,7 @@ void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc,
constexpr unsigned BlockSize = 128; constexpr unsigned BlockSize = 128;
#elif 0 #elif 0
// for 3x3, 34x34
constexpr unsigned NPerBlock = 2; constexpr unsigned NPerBlock = 2;
constexpr unsigned KPerBlock = 64; constexpr unsigned KPerBlock = 64;
constexpr unsigned CPerBlock = 2; constexpr unsigned CPerBlock = 2;
......
...@@ -104,7 +104,7 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr(InGlobalDesc, ...@@ -104,7 +104,7 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr(InGlobalDesc,
const unsigned n_block_data_begin = n_block_work_id * NPerBlock; const unsigned n_block_data_begin = n_block_work_id * NPerBlock;
const unsigned k_block_data_begin = k_block_work_id * KPerBlock; const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
const unsigned ho_block_data_begin = h_block_work_id * HoPerBlock; const unsigned ho_block_data_begin = h_block_work_id * HoPerBlock;
const unsigned wo_block_data_begin = w_block_work_id * HoPerBlock; const unsigned wo_block_data_begin = w_block_work_id * WoPerBlock;
const unsigned hi_block_data_begin = ho_block_data_begin; const unsigned hi_block_data_begin = ho_block_data_begin;
const unsigned wi_block_data_begin = wo_block_data_begin; const unsigned wi_block_data_begin = wo_block_data_begin;
......
...@@ -70,7 +70,7 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(InGlobalDesc, ...@@ -70,7 +70,7 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(InGlobalDesc,
const unsigned n_block_data_begin = n_block_work_id * NPerBlock; const unsigned n_block_data_begin = n_block_work_id * NPerBlock;
const unsigned k_block_data_begin = k_block_work_id * KPerBlock; const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
const unsigned ho_block_data_begin = h_block_work_id * HoPerBlock; const unsigned ho_block_data_begin = h_block_work_id * HoPerBlock;
const unsigned wo_block_data_begin = w_block_work_id * HoPerBlock; const unsigned wo_block_data_begin = w_block_work_id * WoPerBlock;
const unsigned hi_block_data_begin = ho_block_data_begin; const unsigned hi_block_data_begin = ho_block_data_begin;
const unsigned wi_block_data_begin = wo_block_data_begin; const unsigned wi_block_data_begin = wo_block_data_begin;
...@@ -162,7 +162,7 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(InGlobalDesc, ...@@ -162,7 +162,7 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(InGlobalDesc,
for(unsigned c_block_data_begin = 0; c_block_data_begin < in_nchw_global_desc.GetLength(I1); for(unsigned c_block_data_begin = 0; c_block_data_begin < in_nchw_global_desc.GetLength(I1);
c_block_data_begin += CPerBlock, __syncthreads()) c_block_data_begin += CPerBlock, __syncthreads())
{ {
#if 0 #if 1
// input: global mem to LDS, // input: global mem to LDS,
// convert [N,C,Hi,Wi] to [C,Hi,Wi,N] // convert [N,C,Hi,Wi] to [C,Hi,Wi,N]
blockwise_4d_tensor_copy_reorder_by_get_dst_from_src<BlockSize>( blockwise_4d_tensor_copy_reorder_by_get_dst_from_src<BlockSize>(
...@@ -177,7 +177,7 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(InGlobalDesc, ...@@ -177,7 +177,7 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(InGlobalDesc,
reorder_chwn_from_nchw); reorder_chwn_from_nchw);
#endif #endif
#if 0 #if 1
// weight: global mem to LDS, // weight: global mem to LDS,
// format is [S,R,C,K], no conversion needed // format is [S,R,C,K], no conversion needed
blockwise_wei_copy.run(p_wei_global + wei_srck_global_desc.Get1dIndex( blockwise_wei_copy.run(p_wei_global + wei_srck_global_desc.Get1dIndex(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment