"git@developer.sourcefind.cn:modelzoo/resnet50_tensorflow.git" did not exist on "e5eca36a4fb045cc0715eeaf10086cc57ffcbe39"
Commit e80fbbdd authored by Chao Liu's avatar Chao Liu
Browse files

refactor build, clean up

parent 28354a0f
cmake_minimum_required(VERSION 3.9) cmake_minimum_required(VERSION 2.8.3)
project(convolution LANGUAGES CXX CUDA) project(modular_convolution)
#c++ #c++
enable_language(CXX)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}") message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")
add_compile_options(-std=c++14)
#boost #boost
find_package(Boost REQUIRED) find_package(Boost REQUIRED)
...@@ -28,20 +30,11 @@ if( NOT( ${CMAKE_CXX_COMPILER_ID} STREQUAL "AppleClang") ) ...@@ -28,20 +30,11 @@ if( NOT( ${CMAKE_CXX_COMPILER_ID} STREQUAL "AppleClang") )
link_libraries(${OpenMP_pthread_LIBRARY}) link_libraries(${OpenMP_pthread_LIBRARY})
endif( NOT( ${CMAKE_CXX_COMPILER_ID} STREQUAL "AppleClang") ) endif( NOT( ${CMAKE_CXX_COMPILER_ID} STREQUAL "AppleClang") )
#python
find_package(PythonLibs 3 REQUIRED)
message("PYTHON_INCLUDE_DIRS: ${PYTHON_INCLUDE_DIRS}")
message("PYTHON_LIBRARIES: ${PYTHON_LIBRARIES}")
include_directories(BEFORE ${PYTHON_INCLUDE_DIRS})
link_libraries(${PYTHON_LIBRARIES})
#cuda #cuda
enable_language(CUDA)
include_directories(BEFORE ${CUDA_COMMON_INCLUDE_DIR}) include_directories(BEFORE ${CUDA_COMMON_INCLUDE_DIR})
# #
include_directories(BEFORE src/include) include_directories(BEFORE src/include)
add_subdirectory(src) add_subdirectory(src)
add_subdirectory(driver) add_subdirectory(driver)
add_executable(conv EXCLUDE_FROM_ALL conv.cu) add_executable(conv conv.cu)
target_link_libraries(conv convolution) target_link_libraries(conv tensor device)
...@@ -2,13 +2,12 @@ ...@@ -2,13 +2,12 @@
#include <numeric> #include <numeric>
#include <initializer_list> #include <initializer_list>
#include <cstdlib> #include <cstdlib>
#include "nvToolsExt.h"
#include "tensor.hpp" #include "tensor.hpp"
#include "ConstantTensorDescriptor.cuh" #include "ConstantTensorDescriptor.cuh"
#include "conv_common.cuh" #include "conv_common.cuh"
#include "device_direct_convolution_1.cuh" #include "device_direct_convolution_1.cuh"
#include "device_direct_convolution_2.cuh" #include "device_direct_convolution_2.cuh"
#include "device_implicit_gemm_convolution_1_nchw_kcsr.cuh" #include "device_implicit_gemm_convolution_1_nchw_kcsr_nkhw.cuh"
#include "device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh" #include "device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh"
#include "device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh" #include "device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh"
#include "device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh" #include "device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh"
...@@ -590,7 +589,7 @@ int main() ...@@ -590,7 +589,7 @@ int main()
#elif 0 #elif 0
device_direct_convolution_2 device_direct_convolution_2
#elif 0 #elif 0
device_implicit_gemm_convolution_1_nchw_kcsr device_implicit_gemm_convolution_1_nchw_kcsr_nkhw
#elif 0 #elif 0
device_implicit_gemm_convolution_1_nchw_srck_nkhw device_implicit_gemm_convolution_1_nchw_srck_nkhw
#elif 0 #elif 0
...@@ -602,7 +601,7 @@ int main() ...@@ -602,7 +601,7 @@ int main()
#endif #endif
(in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device, nrepeat); (in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device, nrepeat);
#elif 1 #elif 0
device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(in_nchw_desc, device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(in_nchw_desc,
in_nchw, in_nchw,
wei_kcsr_desc, wei_kcsr_desc,
...@@ -614,7 +613,7 @@ int main() ...@@ -614,7 +613,7 @@ int main()
nrepeat); nrepeat);
#endif #endif
#if 0 #if 1
if(S == 3 && R == 3) if(S == 3 && R == 3)
{ {
host_winograd_3x3_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads); host_winograd_3x3_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads);
......
#pragma once #pragma once
#include "gridwise_direct_convolution_1.cuh"
#include <unistd.h> #include <unistd.h>
#include "device.hpp"
#include "gridwise_direct_convolution_1.cuh"
template <class T, class InDesc, class WeiDesc, class OutDesc> template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_direct_convolution_1(InDesc, void device_direct_convolution_1(InDesc,
...@@ -32,6 +33,7 @@ void device_direct_convolution_1(InDesc, ...@@ -32,6 +33,7 @@ void device_direct_convolution_1(InDesc,
constexpr auto out_desc = OutDesc{}; constexpr auto out_desc = OutDesc{};
#if 1 #if 1
// 3x3, 34x34
constexpr unsigned OutTileSizeH = 2; constexpr unsigned OutTileSizeH = 2;
constexpr unsigned OutTileSizeW = 2; constexpr unsigned OutTileSizeW = 2;
constexpr unsigned NPerBlock = 2; constexpr unsigned NPerBlock = 2;
...@@ -45,20 +47,6 @@ void device_direct_convolution_1(InDesc, ...@@ -45,20 +47,6 @@ void device_direct_convolution_1(InDesc,
constexpr unsigned CPerThread = 2; constexpr unsigned CPerThread = 2;
constexpr unsigned BlockSize = 128; constexpr unsigned BlockSize = 128;
#elif 1
constexpr unsigned OutTileSizeH = 2;
constexpr unsigned OutTileSizeW = 2;
constexpr unsigned NPerBlock = 2;
constexpr unsigned KPerBlock = 16;
constexpr unsigned CPerBlock = 2;
constexpr unsigned YPerBlock = 2;
constexpr unsigned XPerBlock = 27;
constexpr unsigned NPerThread = 2;
constexpr unsigned KPerThread = 4;
constexpr unsigned CPerThread = 2;
constexpr unsigned BlockSize = 216;
#endif #endif
constexpr unsigned GridSize = (out_desc.GetLength(I0) / NPerBlock) * constexpr unsigned GridSize = (out_desc.GetLength(I0) / NPerBlock) *
...@@ -73,45 +61,36 @@ void device_direct_convolution_1(InDesc, ...@@ -73,45 +61,36 @@ void device_direct_convolution_1(InDesc,
for(unsigned i = 0; i < nrepeat; ++i) for(unsigned i = 0; i < nrepeat; ++i)
{ {
cudaEvent_t start, stop; const void* f = reinterpret_cast<const void*>(gridwise_direct_convolution_1<T,
float elapsedTime; InDesc,
WeiDesc,
cudaEventCreate(&start); OutDesc,
cudaEventRecord(start, 0); OutTileSizeH,
OutTileSizeW,
gridwise_direct_convolution_1<T, NPerBlock,
InDesc, KPerBlock,
WeiDesc, CPerBlock,
OutDesc, YPerBlock,
OutTileSizeH, XPerBlock,
OutTileSizeW, NPerThread,
NPerBlock, KPerThread,
KPerBlock, CPerThread,
CPerBlock, BlockSize,
YPerBlock, GridSize>);
XPerBlock,
NPerThread, T* in_dev_ptr = static_cast<T*>(in_device_buf.GetDeviceBuffer());
KPerThread, T* wei_dev_ptr = static_cast<T*>(wei_device_buf.GetDeviceBuffer());
CPerThread, T* out_dev_ptr = static_cast<T*>(out_device_buf.GetDeviceBuffer());
BlockSize,
GridSize> void* args[] = {&in_dev_ptr, &wei_dev_ptr, &out_dev_ptr};
<<<grid_dim, block_dim>>>(InDesc{},
static_cast<T*>(in_device_buf.GetDeviceBuffer()), float time = 0;
WeiDesc{},
static_cast<T*>(wei_device_buf.GetDeviceBuffer()), launch_kernel(f, grid_dim, block_dim, args, time);
OutDesc{},
static_cast<T*>(out_device_buf.GetDeviceBuffer())); printf("Elapsed time : %f ms\n", time);
usleep(std::min(time * 1000, float(10000)));
cudaEventCreate(&stop);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
printf("Elapsed time : %f ms\n", elapsedTime);
usleep(10000);
} }
checkCudaErrors(cudaGetLastError());
out_device_buf.FromDevice(out.mData.data()); out_device_buf.FromDevice(out.mData.data());
} }
#pragma once #pragma once
#include "gridwise_direct_convolution_2.cuh"
#include <unistd.h> #include <unistd.h>
#include "device.hpp"
#include "gridwise_direct_convolution_2.cuh"
template <class T, class InDesc, class WeiDesc, class OutDesc> template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_direct_convolution_2(InDesc, void device_direct_convolution_2(InDesc,
...@@ -32,6 +33,7 @@ void device_direct_convolution_2(InDesc, ...@@ -32,6 +33,7 @@ void device_direct_convolution_2(InDesc,
constexpr auto out_desc = OutDesc{}; constexpr auto out_desc = OutDesc{};
#if 1 #if 1
// 3x3, 34x34, 128 thread
constexpr unsigned OutTileSizeH = 2; constexpr unsigned OutTileSizeH = 2;
constexpr unsigned OutTileSizeW = 2; constexpr unsigned OutTileSizeW = 2;
constexpr unsigned NPerBlock = 2; constexpr unsigned NPerBlock = 2;
...@@ -46,20 +48,7 @@ void device_direct_convolution_2(InDesc, ...@@ -46,20 +48,7 @@ void device_direct_convolution_2(InDesc,
constexpr unsigned BlockSize = 128; constexpr unsigned BlockSize = 128;
#elif 0 #elif 0
constexpr unsigned OutTileSizeH = 2; // 3x3, 34x34, 256 thread
constexpr unsigned OutTileSizeW = 2;
constexpr unsigned NPerBlock = 2;
constexpr unsigned KPerBlock = 32;
constexpr unsigned CPerBlock = 4;
constexpr unsigned YPerBlock = 1;
constexpr unsigned XPerBlock = 27;
constexpr unsigned NPerThread = 2;
constexpr unsigned KPerThread = 4;
constexpr unsigned CPerThread = 2;
constexpr unsigned BlockSize = 216;
#elif 0
constexpr unsigned OutTileSizeH = 2; constexpr unsigned OutTileSizeH = 2;
constexpr unsigned OutTileSizeW = 2; constexpr unsigned OutTileSizeW = 2;
constexpr unsigned NPerBlock = 2; constexpr unsigned NPerBlock = 2;
...@@ -87,45 +76,36 @@ void device_direct_convolution_2(InDesc, ...@@ -87,45 +76,36 @@ void device_direct_convolution_2(InDesc,
for(unsigned i = 0; i < nrepeat; ++i) for(unsigned i = 0; i < nrepeat; ++i)
{ {
cudaEvent_t start, stop; const void* f = reinterpret_cast<const void*>(gridwise_direct_convolution_2<T,
float elapsedTime; InDesc,
WeiDesc,
cudaEventCreate(&start); OutDesc,
cudaEventRecord(start, 0); OutTileSizeH,
OutTileSizeW,
gridwise_direct_convolution_2<T, NPerBlock,
InDesc, KPerBlock,
WeiDesc, CPerBlock,
OutDesc, YPerBlock,
OutTileSizeH, XPerBlock,
OutTileSizeW, NPerThread,
NPerBlock, KPerThread,
KPerBlock, CPerThread,
CPerBlock, BlockSize,
YPerBlock, GridSize>);
XPerBlock,
NPerThread, T* in_dev_ptr = static_cast<T*>(in_device_buf.GetDeviceBuffer());
KPerThread, T* wei_dev_ptr = static_cast<T*>(wei_device_buf.GetDeviceBuffer());
CPerThread, T* out_dev_ptr = static_cast<T*>(out_device_buf.GetDeviceBuffer());
BlockSize,
GridSize> void* args[] = {&in_dev_ptr, &wei_dev_ptr, &out_dev_ptr};
<<<grid_dim, block_dim>>>(InDesc{},
static_cast<T*>(in_device_buf.GetDeviceBuffer()), float time = 0;
WeiDesc{},
static_cast<T*>(wei_device_buf.GetDeviceBuffer()), launch_kernel(f, grid_dim, block_dim, args, time);
OutDesc{},
static_cast<T*>(out_device_buf.GetDeviceBuffer())); printf("Elapsed time : %f ms\n", time);
usleep(std::min(time * 1000, float(10000)));
cudaEventCreate(&stop);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
printf("Elapsed time : %f ms\n", elapsedTime);
usleep(10000);
} }
checkCudaErrors(cudaGetLastError());
out_device_buf.FromDevice(out.mData.data()); out_device_buf.FromDevice(out.mData.data());
} }
#pragma once #pragma once
#include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh"
#include <unistd.h> #include <unistd.h>
#include "device.hpp"
#include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh"
template <class T, class InDesc, class WeiDesc, class OutDesc> template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc, void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
...@@ -73,21 +74,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc, ...@@ -73,21 +74,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
wei_csrk_device_buf.ToDevice(wei_csrk.mData.data()); wei_csrk_device_buf.ToDevice(wei_csrk.mData.data());
out_khwn_device_buf.ToDevice(out_khwn.mData.data()); out_khwn_device_buf.ToDevice(out_khwn.mData.data());
#if 0 #if 1
constexpr unsigned NPerBlock = 1;
constexpr unsigned KPerBlock = 1;
constexpr unsigned CPerBlock = 1;
constexpr unsigned HoPerBlock = 2;
constexpr unsigned WoPerBlock = 4;
constexpr unsigned NPerThread = 1;
constexpr unsigned KPerThread = 1;
constexpr unsigned CPerThread = 1;
constexpr unsigned HoPerThread = 1;
constexpr unsigned WoPerThread = 1;
constexpr unsigned BlockSize = 8;
#elif 0
// for 3x3, 34x34 | 3x3 58x58, NKC = 64, 64, 256 // for 3x3, 34x34 | 3x3 58x58, NKC = 64, 64, 256
constexpr unsigned NPerBlock = 16; constexpr unsigned NPerBlock = 16;
constexpr unsigned KPerBlock = 64; constexpr unsigned KPerBlock = 64;
...@@ -214,50 +201,42 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc, ...@@ -214,50 +201,42 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
for(unsigned i = 0; i < nrepeat; ++i) for(unsigned i = 0; i < nrepeat; ++i)
{ {
cudaEvent_t start, stop; const void* f = reinterpret_cast<const void*>(
float elapsedTime; gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn<GridSize,
BlockSize,
cudaEventCreate(&start); T,
cudaEventRecord(start, 0); decltype(in_chwn_desc),
decltype(wei_csrk_desc),
gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn<GridSize, decltype(out_khwn_desc),
BlockSize, NPerBlock,
T, KPerBlock,
decltype(in_chwn_desc), CPerBlock,
decltype(wei_csrk_desc), HoPerBlock,
decltype(out_khwn_desc), WoPerBlock,
NPerBlock, NPerThread,
KPerBlock, KPerThread,
CPerBlock, CPerThread,
HoPerBlock, HoPerThread,
WoPerBlock, WoPerThread,
NPerThread, WeiBlockCopyThreadPerDim0,
KPerThread, WeiBlockCopyThreadPerDim1,
CPerThread, InBlockCopyDataPerRead,
HoPerThread, WeiBlockCopyDataPerRead>);
WoPerThread,
WeiBlockCopyThreadPerDim0, T* in_dev_ptr = static_cast<T*>(in_chwn_device_buf.GetDeviceBuffer());
WeiBlockCopyThreadPerDim1, T* wei_dev_ptr = static_cast<T*>(wei_csrk_device_buf.GetDeviceBuffer());
InBlockCopyDataPerRead, T* out_dev_ptr = static_cast<T*>(out_khwn_device_buf.GetDeviceBuffer());
WeiBlockCopyDataPerRead>
<<<grid_dim, block_dim>>>(in_chwn_desc, void* args[] = {&in_dev_ptr, &wei_dev_ptr, &out_dev_ptr};
static_cast<T*>(in_chwn_device_buf.GetDeviceBuffer()),
wei_csrk_desc, float time = 0;
static_cast<T*>(wei_csrk_device_buf.GetDeviceBuffer()),
out_khwn_desc, launch_kernel(f, grid_dim, block_dim, args, time);
static_cast<T*>(out_khwn_device_buf.GetDeviceBuffer()));
printf("Elapsed time : %f ms\n", time);
cudaEventCreate(&stop); usleep(std::min(time * 1000, float(10000)));
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
printf("Elapsed time : %f ms\n", elapsedTime);
usleep(std::min(elapsedTime * 1000, float(10000)));
} }
checkCudaErrors(cudaGetLastError());
out_khwn_device_buf.FromDevice(out_khwn.mData.data()); out_khwn_device_buf.FromDevice(out_khwn.mData.data());
// reorder output // reorder output
......
#pragma once #pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh" #include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh"
#include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.cuh" #include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.cuh"
#include <unistd.h>
#include <algorithm>
template <class T, class InDesc, class WeiDesc, class OutDesc, class LowerPads, class UpperPads> template <class T, class InDesc, class WeiDesc, class OutDesc, class LowerPads, class UpperPads>
void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc, void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc,
...@@ -172,7 +172,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc, ...@@ -172,7 +172,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc,
constexpr unsigned WoPerThread = 1; constexpr unsigned WoPerThread = 1;
constexpr unsigned BlockSize = 128; constexpr unsigned BlockSize = 128;
#elif 0 #elif 1
// 3x3 56x56, NKC = 16,256,128, with padding // 3x3 56x56, NKC = 16,256,128, with padding
// 3x3 28x28, NKC = 16,512,256, with padding // 3x3 28x28, NKC = 16,512,256, with padding
// 3x3 20x84, NKC = 16,256,256, with padding // 3x3 20x84, NKC = 16,256,256, with padding
...@@ -222,7 +222,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc, ...@@ -222,7 +222,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc,
constexpr unsigned WoPerThread = 1; constexpr unsigned WoPerThread = 1;
constexpr unsigned BlockSize = 128; constexpr unsigned BlockSize = 128;
#elif 1 #elif 0
// for 1x1, 28x28 // for 1x1, 28x28
constexpr unsigned NPerBlock = 16; constexpr unsigned NPerBlock = 16;
constexpr unsigned KPerBlock = 128; constexpr unsigned KPerBlock = 128;
...@@ -253,16 +253,11 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc, ...@@ -253,16 +253,11 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc,
for(unsigned i = 0; i < nrepeat; ++i) for(unsigned i = 0; i < nrepeat; ++i)
{ {
cudaEvent_t start, stop; const void* f = reinterpret_cast<const void*>(
float elapsedTime; #if 0
cudaEventCreate(&start);
cudaEventRecord(start, 0);
#if 1
gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded
#elif 1 #elif 1
gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline
#endif #endif
<GridSize, <GridSize,
BlockSize, BlockSize,
...@@ -283,22 +278,22 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc, ...@@ -283,22 +278,22 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc,
HoPerThread, HoPerThread,
WoPerThread, WoPerThread,
WeiBlockCopyThreadPerDim0, WeiBlockCopyThreadPerDim0,
WeiBlockCopyThreadPerDim1> WeiBlockCopyThreadPerDim1>);
<<<grid_dim, block_dim>>>(static_cast<T*>(in_chwn_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_csrk_device_buf.GetDeviceBuffer()), T* in_dev_ptr = static_cast<T*>(in_chwn_device_buf.GetDeviceBuffer());
static_cast<T*>(out_khwn_device_buf.GetDeviceBuffer())); T* wei_dev_ptr = static_cast<T*>(wei_csrk_device_buf.GetDeviceBuffer());
T* out_dev_ptr = static_cast<T*>(out_khwn_device_buf.GetDeviceBuffer());
void* args[] = {&in_dev_ptr, &wei_dev_ptr, &out_dev_ptr};
cudaEventCreate(&stop); float time = 0;
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop); launch_kernel(f, grid_dim, block_dim, args, time);
printf("Elapsed time : %f ms\n", elapsedTime);
usleep(std::min(elapsedTime * 1000, float(10000))); printf("Elapsed time : %f ms\n", time);
usleep(std::min(time * 1000, float(10000)));
} }
checkCudaErrors(cudaGetLastError());
out_khwn_device_buf.FromDevice(out_khwn.mData.data()); out_khwn_device_buf.FromDevice(out_khwn.mData.data());
// reorder output // reorder output
......
#pragma once #pragma once
#include "gridwise_implicit_gemm_convolution_1_nchw_kcsr.cuh"
#include <unistd.h> #include <unistd.h>
#include "device.hpp"
#include "gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw.cuh"
template <class T, class InDesc, class WeiDesc, class OutDesc> template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_implicit_gemm_convolution_1_nchw_kcsr(InDesc, void device_implicit_gemm_convolution_1_nchw_kcsr_nkhw(InDesc,
const Tensor<T>& in, const Tensor<T>& in,
WeiDesc, WeiDesc,
const Tensor<T>& wei, const Tensor<T>& wei,
OutDesc, OutDesc,
Tensor<T>& out, Tensor<T>& out,
unsigned nrepeat) unsigned nrepeat)
{ {
std::size_t data_sz = sizeof(T); std::size_t data_sz = sizeof(T);
DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace()); DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace());
...@@ -31,20 +32,8 @@ void device_implicit_gemm_convolution_1_nchw_kcsr(InDesc, ...@@ -31,20 +32,8 @@ void device_implicit_gemm_convolution_1_nchw_kcsr(InDesc,
constexpr auto wei_desc = WeiDesc{}; constexpr auto wei_desc = WeiDesc{};
constexpr auto out_desc = OutDesc{}; constexpr auto out_desc = OutDesc{};
#if 0 #if 1
constexpr unsigned NPerBlock = 1; // 3x3, 34x34
constexpr unsigned KPerBlock = 1;
constexpr unsigned CPerBlock = 1;
constexpr unsigned HoPerBlock = 2;
constexpr unsigned WoPerBlock = 32;
constexpr unsigned KPerThread = 1;
constexpr unsigned CPerThread = 1;
constexpr unsigned HoPerThread = 2;
constexpr unsigned WoPerThread = 2;
constexpr unsigned BlockSize = 16;
#elif 1
constexpr unsigned NPerBlock = 1; constexpr unsigned NPerBlock = 1;
constexpr unsigned KPerBlock = 64; constexpr unsigned KPerBlock = 64;
constexpr unsigned CPerBlock = 2; constexpr unsigned CPerBlock = 2;
...@@ -56,19 +45,6 @@ void device_implicit_gemm_convolution_1_nchw_kcsr(InDesc, ...@@ -56,19 +45,6 @@ void device_implicit_gemm_convolution_1_nchw_kcsr(InDesc,
constexpr unsigned HoPerThread = 2; constexpr unsigned HoPerThread = 2;
constexpr unsigned WoPerThread = 2; constexpr unsigned WoPerThread = 2;
constexpr unsigned BlockSize = 128;
#elif 0
constexpr unsigned NPerBlock = 1;
constexpr unsigned KPerBlock = 64;
constexpr unsigned CPerBlock = 4;
constexpr unsigned HoPerBlock = 4;
constexpr unsigned WoPerBlock = 32;
constexpr unsigned KPerThread = 8;
constexpr unsigned CPerThread = 2;
constexpr unsigned HoPerThread = 2;
constexpr unsigned WoPerThread = 4;
constexpr unsigned BlockSize = 128; constexpr unsigned BlockSize = 128;
#endif #endif
...@@ -83,44 +59,36 @@ void device_implicit_gemm_convolution_1_nchw_kcsr(InDesc, ...@@ -83,44 +59,36 @@ void device_implicit_gemm_convolution_1_nchw_kcsr(InDesc,
for(unsigned i = 0; i < nrepeat; ++i) for(unsigned i = 0; i < nrepeat; ++i)
{ {
cudaEvent_t start, stop; const void* f = reinterpret_cast<const void*>(
float elapsedTime; gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw<GridSize,
BlockSize,
cudaEventCreate(&start); T,
cudaEventRecord(start, 0); InDesc,
WeiDesc,
gridwise_implicit_gemm_convolution_1_nchw_kcsr<GridSize, OutDesc,
BlockSize, NPerBlock,
T, KPerBlock,
InDesc, CPerBlock,
WeiDesc, HoPerBlock,
OutDesc, WoPerBlock,
NPerBlock, KPerThread,
KPerBlock, CPerThread,
CPerBlock, HoPerThread,
HoPerBlock, WoPerThread>);
WoPerBlock,
KPerThread, T* in_dev_ptr = static_cast<T*>(in_device_buf.GetDeviceBuffer());
CPerThread, T* wei_dev_ptr = static_cast<T*>(wei_device_buf.GetDeviceBuffer());
HoPerThread, T* out_dev_ptr = static_cast<T*>(out_device_buf.GetDeviceBuffer());
WoPerThread>
<<<grid_dim, block_dim>>>(InDesc{}, void* args[] = {&in_dev_ptr, &wei_dev_ptr, &out_dev_ptr};
static_cast<T*>(in_device_buf.GetDeviceBuffer()),
WeiDesc{}, float time = 0;
static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
OutDesc{}, launch_kernel(f, grid_dim, block_dim, args, time);
static_cast<T*>(out_device_buf.GetDeviceBuffer()));
printf("Elapsed time : %f ms\n", time);
cudaEventCreate(&stop); usleep(std::min(time * 1000, float(10000)));
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
printf("Elapsed time : %f ms\n", elapsedTime);
usleep(10000);
} }
checkCudaErrors(cudaGetLastError());
out_device_buf.FromDevice(out.mData.data()); out_device_buf.FromDevice(out.mData.data());
} }
#pragma once #pragma once
#include "gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh"
#include <unistd.h> #include <unistd.h>
#include "device.hpp"
#include "gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh"
template <class T, class InDesc, class WeiDesc, class OutDesc> template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc, void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc,
...@@ -52,20 +53,7 @@ void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc, ...@@ -52,20 +53,7 @@ void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc,
wei_srck_device_buf.ToDevice(wei_srck.mData.data()); wei_srck_device_buf.ToDevice(wei_srck.mData.data());
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
#if 0 #if 1
constexpr unsigned NPerBlock = 1;
constexpr unsigned KPerBlock = 1;
constexpr unsigned CPerBlock = 1;
constexpr unsigned HoPerBlock = 2;
constexpr unsigned WoPerBlock = 32;
constexpr unsigned KPerThread = 1;
constexpr unsigned CPerThread = 1;
constexpr unsigned HoPerThread = 2;
constexpr unsigned WoPerThread = 2;
constexpr unsigned BlockSize = 16;
#elif 0
// for 3x3, 34x34 // for 3x3, 34x34
constexpr unsigned NPerBlock = 1; constexpr unsigned NPerBlock = 1;
constexpr unsigned KPerBlock = 64; constexpr unsigned KPerBlock = 64;
...@@ -123,45 +111,37 @@ void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc, ...@@ -123,45 +111,37 @@ void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc,
for(unsigned i = 0; i < nrepeat; ++i) for(unsigned i = 0; i < nrepeat; ++i)
{ {
cudaEvent_t start, stop; const void* f = reinterpret_cast<const void*>(
float elapsedTime; gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw<GridSize,
BlockSize,
cudaEventCreate(&start); T,
cudaEventRecord(start, 0); decltype(in_nchw_desc),
decltype(wei_srck_desc),
gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw<GridSize, decltype(out_nkhw_desc),
BlockSize, NPerBlock,
T, KPerBlock,
decltype(in_nchw_desc), CPerBlock,
decltype(wei_srck_desc), HoPerBlock,
decltype(out_nkhw_desc), WoPerBlock,
NPerBlock, NPerThread,
KPerBlock, KPerThread,
CPerBlock, CPerThread,
HoPerBlock, HoPerThread,
WoPerBlock, WoPerThread>);
NPerThread,
KPerThread, T* in_dev_ptr = static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer());
CPerThread, T* wei_dev_ptr = static_cast<T*>(wei_srck_device_buf.GetDeviceBuffer());
HoPerThread, T* out_dev_ptr = static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer());
WoPerThread>
<<<grid_dim, block_dim>>>(in_nchw_desc, void* args[] = {&in_dev_ptr, &wei_dev_ptr, &out_dev_ptr};
static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
wei_srck_desc, float time = 0;
static_cast<T*>(wei_srck_device_buf.GetDeviceBuffer()),
out_nkhw_desc, launch_kernel(f, grid_dim, block_dim, args, time);
static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
printf("Elapsed time : %f ms\n", time);
cudaEventCreate(&stop); usleep(std::min(time * 1000, float(10000)));
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
printf("Elapsed time : %f ms\n", elapsedTime);
usleep(10000);
} }
checkCudaErrors(cudaGetLastError());
out_nkhw_device_buf.FromDevice(out_nkhw.mData.data()); out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
} }
#pragma once #pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh" #include "gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh"
#include "gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.cuh" #include "gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.cuh"
#include <unistd.h>
template <class T, class InDesc, class WeiDesc, class OutDesc> template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc, void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
...@@ -69,6 +70,7 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc, ...@@ -69,6 +70,7 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
#if 0 #if 0
// 3x3, 34x34 // 3x3, 34x34
// need to use register double buffer for GEMM
constexpr unsigned BPerBlock = 128; constexpr unsigned BPerBlock = 128;
constexpr unsigned KPerBlock = 64; constexpr unsigned KPerBlock = 64;
constexpr unsigned CPerBlock = 4; constexpr unsigned CPerBlock = 4;
...@@ -211,60 +213,53 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc, ...@@ -211,60 +213,53 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
for(unsigned i = 0; i < nrepeat; ++i) for(unsigned i = 0; i < nrepeat; ++i)
{ {
cudaEvent_t start, stop; const void* f = reinterpret_cast<const void*>(
float elapsedTime;
cudaEventCreate(&start);
cudaEventRecord(start, 0);
#if 0 #if 0
gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw
#else #else
gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer
#endif #endif
<GridSize, <GridSize,
BlockSize, BlockSize,
T, T,
decltype(in_cnhw_desc), decltype(in_cnhw_desc),
decltype(wei_csrk_desc), decltype(wei_csrk_desc),
decltype(out_knhw_desc), decltype(out_knhw_desc),
BPerBlock, BPerBlock,
KPerBlock, KPerBlock,
CPerBlock, CPerBlock,
BPerThread, BPerThread,
KPerThread, KPerThread,
GemmThreadPerColumnPerCluster, GemmThreadPerColumnPerCluster,
GemmThreadPerRowPerCluster, GemmThreadPerRowPerCluster,
GemmMPerThreadSubC, GemmMPerThreadSubC,
GemmNPerThreadSubC, GemmNPerThreadSubC,
GemmMLevel0Cluster, GemmMLevel0Cluster,
GemmNLevel0Cluster, GemmNLevel0Cluster,
GemmMLevel1Cluster, GemmMLevel1Cluster,
GemmNLevel1Cluster, GemmNLevel1Cluster,
GemmKPerThreadLoop, GemmKPerThreadLoop,
InBlockCopyThreadPerDim0, InBlockCopyThreadPerDim0,
InBlockCopyThreadPerDim1, InBlockCopyThreadPerDim1,
WeiBlockCopyThreadPerDim0, WeiBlockCopyThreadPerDim0,
WeiBlockCopyThreadPerDim1, WeiBlockCopyThreadPerDim1,
InBlockCopyDataPerRead, InBlockCopyDataPerRead,
WeiBlockCopyDataPerRead> WeiBlockCopyDataPerRead>);
<<<grid_dim, block_dim>>>(in_cnhw_desc,
static_cast<T*>(in_cnhw_device_buf.GetDeviceBuffer()), T* in_dev_ptr = static_cast<T*>(in_cnhw_device_buf.GetDeviceBuffer());
wei_csrk_desc, T* wei_dev_ptr = static_cast<T*>(wei_csrk_device_buf.GetDeviceBuffer());
static_cast<T*>(wei_csrk_device_buf.GetDeviceBuffer()), T* out_dev_ptr = static_cast<T*>(out_knhw_device_buf.GetDeviceBuffer());
out_knhw_desc,
static_cast<T*>(out_knhw_device_buf.GetDeviceBuffer())); void* args[] = {&in_dev_ptr, &wei_dev_ptr, &out_dev_ptr};
cudaEventCreate(&stop); float time;
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop); launch_kernel(f, grid_dim, block_dim, args, time);
cudaEventElapsedTime(&elapsedTime, start, stop); printf("Elapsed time : %f ms\n", time);
printf("Elapsed time : %f ms\n", elapsedTime); usleep(std::min(time * 1000, float(10000)));
usleep(std::min(elapsedTime * 1000, float(10000)));
} }
checkCudaErrors(cudaGetLastError());
out_knhw_device_buf.FromDevice(out_knhw.mData.data()); out_knhw_device_buf.FromDevice(out_knhw.mData.data());
// convert out_knhw to out_nkhw // convert out_knhw to out_nkhw
......
#pragma once #pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh" #include "gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh"
#include "gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.cuh" #include "gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.cuh"
#include <unistd.h>
template <class T, class InDesc, class WeiDesc, class OutDesc> template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc, void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
...@@ -100,7 +101,7 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc, ...@@ -100,7 +101,7 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
constexpr unsigned InBlockCopyThreadPerDim1 = 16; constexpr unsigned InBlockCopyThreadPerDim1 = 16;
constexpr unsigned BlockSize = 128; constexpr unsigned BlockSize = 128;
#elif 1 #elif 0
// 1x1, 28x28 // 1x1, 28x28
constexpr unsigned BPerBlock = 64; constexpr unsigned BPerBlock = 64;
constexpr unsigned KPerBlock = 64; constexpr unsigned KPerBlock = 64;
...@@ -140,50 +141,43 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc, ...@@ -140,50 +141,43 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
for(unsigned i = 0; i < nrepeat; ++i) for(unsigned i = 0; i < nrepeat; ++i)
{ {
cudaEvent_t start, stop; const void* f = reinterpret_cast<const void*>(
float elapsedTime; #if 1
cudaEventCreate(&start); gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw
cudaEventRecord(start, 0);
#if 0
gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw
#else #else
gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline
#endif #endif
<GridSize, <GridSize,
BlockSize, BlockSize,
T, T,
decltype(in_cnhw_desc), decltype(in_cnhw_desc),
decltype(wei_srck_desc), decltype(wei_srck_desc),
decltype(out_knhw_desc), decltype(out_knhw_desc),
BPerBlock, BPerBlock,
KPerBlock, KPerBlock,
CPerBlock, CPerBlock,
BPerThread, BPerThread,
KPerThread, KPerThread,
CPerThread, CPerThread,
GemmThreadPerColumnPerCluster, GemmThreadPerColumnPerCluster,
GemmThreadPerRowPerCluster, GemmThreadPerRowPerCluster,
InBlockCopyThreadPerDim0, InBlockCopyThreadPerDim0,
InBlockCopyThreadPerDim1> InBlockCopyThreadPerDim1>);
<<<grid_dim, block_dim>>>(in_cnhw_desc,
static_cast<T*>(in_cnhw_device_buf.GetDeviceBuffer()), T* in_dev_ptr = static_cast<T*>(in_cnhw_device_buf.GetDeviceBuffer());
wei_srck_desc, T* wei_dev_ptr = static_cast<T*>(wei_srck_device_buf.GetDeviceBuffer());
static_cast<T*>(wei_srck_device_buf.GetDeviceBuffer()), T* out_dev_ptr = static_cast<T*>(out_knhw_device_buf.GetDeviceBuffer());
out_knhw_desc,
static_cast<T*>(out_knhw_device_buf.GetDeviceBuffer())); void* args[] = {&in_dev_ptr, &wei_dev_ptr, &out_dev_ptr};
cudaEventCreate(&stop); float time = 0;
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop); launch_kernel(f, grid_dim, block_dim, args, time);
cudaEventElapsedTime(&elapsedTime, start, stop); printf("Elapsed time : %f ms\n", time);
printf("Elapsed time : %f ms\n", elapsedTime); usleep(std::min(time * 1000, float(10000)));
usleep(std::min(elapsedTime * 1000, float(10000)));
} }
checkCudaErrors(cudaGetLastError());
out_knhw_device_buf.FromDevice(out_knhw.mData.data()); out_knhw_device_buf.FromDevice(out_knhw.mData.data());
// convert out_knhw to out_nkhw // convert out_knhw to out_nkhw
......
set(SOURCE set(TENSOR_SOURCE
tensor.cpp; tensor.cpp;
) )
add_library(convolution SHARED ${SOURCE}) add_library(tensor SHARED ${TENSOR_SOURCE})
set_target_properties(convolution PROPERTIES PREFIX "") set_target_properties(tensor PROPERTIES PREFIX "")
target_compile_features(tensor PUBLIC)
set_target_properties(tensor PROPERTIES POSITION_INDEPENDENT_CODE ON)
install(TARGETS tensor LIBRARY DESTINATION lib)
# boost.python
target_link_libraries(convolution boost_python3)
# cuda set(DEVICE_SOURCE
target_link_libraries(convolution nvToolsExt cudart) device.cu;
target_compile_features(convolution PUBLIC) )
set_target_properties(convolution PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(convolution PROPERTIES CUDA_SEPARABLE_COMPILATION OFF)
install(TARGETS convolution LIBRARY DESTINATION lib) add_library(device SHARED ${DEVICE_SOURCE})
set_target_properties(device PROPERTIES PREFIX "")
target_compile_features(device PUBLIC)
set_target_properties(device PROPERTIES POSITION_INDEPENDENT_CODE ON)
install(TARGETS device LIBRARY DESTINATION lib)
target_link_libraries(device nvToolsExt cudart)
#include "device.hpp"
#include "cuda_runtime.h"
#include "nvToolsExt.h"
#include "helper_cuda.h"
// Allocate mem_size bytes of device memory; the allocation is owned by this
// object and released in the destructor.
DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
{
    // &mpDeviceBuf is already void**; the original static_cast<void**> was a no-op.
    checkCudaErrors(cudaMalloc(&mpDeviceBuf, mMemSize));
}
// Return the raw device pointer (no ownership transfer).
void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; }
// Copy mMemSize bytes from host memory at p into the device buffer.
// p must point to at least mMemSize readable bytes.
void DeviceMem::ToDevice(const void* p)
{
    // cudaMemcpy's src parameter is "const void*", so the original
    // const_cast<void*>(p) was unnecessary (and misleading).
    checkCudaErrors(cudaMemcpy(mpDeviceBuf, p, mMemSize, cudaMemcpyHostToDevice));
}
// Copy mMemSize bytes from the device buffer back into host memory at p.
// p must point to at least mMemSize writable bytes.
void DeviceMem::FromDevice(void* p)
{
    checkCudaErrors(cudaMemcpy(p, mpDeviceBuf, mMemSize, cudaMemcpyDeviceToHost));
}
// Release the device allocation.
// NOTE(review): DeviceMem holds a raw pointer and (as declared) is copyable;
// copying an instance would lead to a double cudaFree here — confirm no
// copies are made, or delete the copy operations in the header.
DeviceMem::~DeviceMem() { checkCudaErrors(cudaFree(mpDeviceBuf)); }
struct KernelTimerImpl
{
KernelTimerImpl()
{
cudaEventCreate(&mStart);
cudaEventCreate(&mEnd);
}
~KernelTimerImpl()
{
cudaEventDestroy(mStart);
cudaEventDestroy(mEnd);
}
void Start() { cudaEventRecord(mStart, 0); }
void End()
{
cudaEventRecord(mEnd, 0);
cudaEventSynchronize(mEnd);
}
float GetElapsedTime() const
{
float time;
cudaEventElapsedTime(&time, mStart, mEnd);
return time;
}
cudaEvent_t mStart, mEnd;
};
// KernelTimer forwards everything to KernelTimerImpl (pimpl idiom).
KernelTimer::KernelTimer() : impl(new KernelTimerImpl()) {}
// Out-of-line destructor: required so the unique_ptr<KernelTimerImpl> is
// destroyed here, where KernelTimerImpl is a complete type.
KernelTimer::~KernelTimer() {}
void KernelTimer::Start() { impl->Start(); }
void KernelTimer::End() { impl->End(); }
float KernelTimer::GetElapsedTime() const { return impl->GetElapsedTime(); }
void launch_kernel(const void* func, dim3 grid_dim, dim3 block_dim, void** args, float& time)
{
KernelTimer timer;
timer.Start();
cudaError_t error = cudaLaunchKernel(func, grid_dim, block_dim, args, 0, 0);
timer.End();
time = timer.GetElapsedTime();
checkCudaErrors(error);
}
...@@ -513,7 +513,6 @@ struct Blockwise2dTensorCopy3 ...@@ -513,7 +513,6 @@ struct Blockwise2dTensorCopy3
} }
} }
#if 1
__device__ constexpr unsigned GetRegisterClipboardSize() const __device__ constexpr unsigned GetRegisterClipboardSize() const
{ {
static_assert(is_same<Float, float>::value, "wrong! only support float!\n"); static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
...@@ -703,5 +702,4 @@ struct Blockwise2dTensorCopy3 ...@@ -703,5 +702,4 @@ struct Blockwise2dTensorCopy3
} }
} }
} }
#endif
}; };
...@@ -88,7 +88,7 @@ template <unsigned BlockSize, ...@@ -88,7 +88,7 @@ template <unsigned BlockSize,
class F> class F>
__device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src( __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src(
SrcDesc, SrcDesc,
Float* const __restrict__ p_src, const Float* __restrict__ p_src,
DstDesc, DstDesc,
Float* __restrict__ p_dst, Float* __restrict__ p_dst,
SrcOpLengths, SrcOpLengths,
...@@ -187,7 +187,7 @@ template <unsigned BlockSize, ...@@ -187,7 +187,7 @@ template <unsigned BlockSize,
class DstFromSrcReorder> class DstFromSrcReorder>
__device__ void __device__ void
blockwise_4d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc, blockwise_4d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc,
Float* const __restrict__ p_src, const Float* __restrict__ p_src,
DstDesc, DstDesc,
Float* __restrict__ p_dst, Float* __restrict__ p_dst,
SrcOpLengths, SrcOpLengths,
...@@ -202,7 +202,7 @@ blockwise_4d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc, ...@@ -202,7 +202,7 @@ blockwise_4d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc,
template <unsigned BlockSize, class Float, class SrcDesc, class DstDesc, class SrcOpLengths> template <unsigned BlockSize, class Float, class SrcDesc, class DstDesc, class SrcOpLengths>
struct Blockwise4dTensorCopy1 struct Blockwise4dTensorCopy1
{ {
__device__ void Run(Float* const __restrict__ p_src, Float* __restrict__ p_dst) const __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
{ {
constexpr auto dst_from_src_reorder = Sequence<0, 1, 2, 3>{}; constexpr auto dst_from_src_reorder = Sequence<0, 1, 2, 3>{};
...@@ -219,7 +219,7 @@ template <unsigned BlockSize, ...@@ -219,7 +219,7 @@ template <unsigned BlockSize,
class GlobalLowerPads> class GlobalLowerPads>
struct BlockwiseChwnTensorCopyPadded struct BlockwiseChwnTensorCopyPadded
{ {
__device__ void Run(Float* const __restrict__ p_src, __device__ void Run(const Float* __restrict__ p_src,
unsigned c_block_data_begin, unsigned c_block_data_begin,
unsigned ho_block_data_begin, unsigned ho_block_data_begin,
unsigned wo_block_data_begin, unsigned wo_block_data_begin,
...@@ -244,7 +244,7 @@ struct BlockwiseChwnTensorCopyPadded ...@@ -244,7 +244,7 @@ struct BlockwiseChwnTensorCopyPadded
constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize; constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize;
Float* const p_src_tmp = const Float* p_src_tmp =
p_src + src_desc.Get1dIndex(c_block_data_begin, p_src + src_desc.Get1dIndex(c_block_data_begin,
(ho_block_data_begin + h_block_pad_low) - h_global_pad_low, (ho_block_data_begin + h_block_pad_low) - h_global_pad_low,
(wo_block_data_begin + w_block_pad_low) - w_global_pad_low, (wo_block_data_begin + w_block_pad_low) - w_global_pad_low,
...@@ -336,4 +336,4 @@ struct BlockwiseChwnTensorCopyPadded ...@@ -336,4 +336,4 @@ struct BlockwiseChwnTensorCopyPadded
} }
} }
} }
}; };
\ No newline at end of file
#pragma once
#include <memory>
// Owning wrapper around a raw CUDA device allocation.
struct DeviceMem
{
    DeviceMem() = delete;
    // Allocate mem_size bytes of device memory.
    DeviceMem(std::size_t mem_size);
    // Non-copyable: this object owns mpDeviceBuf, and a shallow copy would
    // cause a double cudaFree in the destructor (rule of three).
    DeviceMem(const DeviceMem&) = delete;
    DeviceMem& operator=(const DeviceMem&) = delete;
    // Raw device pointer (no ownership transfer).
    void* GetDeviceBuffer();
    // Copy mMemSize bytes from host memory p into the device buffer.
    void ToDevice(const void* p);
    // Copy mMemSize bytes from the device buffer into host memory p.
    void FromDevice(void* p);
    ~DeviceMem();

    void* mpDeviceBuf;    // owned device allocation
    std::size_t mMemSize; // allocation size in bytes
};
// Forward declaration; the definition lives in device.cu so this header does
// not need CUDA types (pimpl idiom).
struct KernelTimerImpl;
// Host-side timer for measuring GPU kernel execution time.
struct KernelTimer
{
    KernelTimer();
    // Out-of-line destructor is required because impl points to an incomplete type.
    ~KernelTimer();
    // Begin timing.
    void Start();
    // Stop timing and wait for the timed work to finish.
    void End();
    // Elapsed time between Start() and End(), in milliseconds.
    float GetElapsedTime() const;
    std::unique_ptr<KernelTimerImpl> impl;
};
// Launch a kernel given as a type-erased function pointer (as accepted by
// cudaLaunchKernel) and report its measured execution time (ms) through time.
void launch_kernel(const void* func, dim3 grid_dim, dim3 block_dim, void** args, float& time);
...@@ -19,12 +19,9 @@ template <class Float, ...@@ -19,12 +19,9 @@ template <class Float,
unsigned CPerThread, unsigned CPerThread,
unsigned BlockSize, unsigned BlockSize,
unsigned GridSize> unsigned GridSize>
__global__ void gridwise_direct_convolution_1(InGlobalDesc, __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_in_global,
Float* const __restrict__ p_in_global, const Float* const __restrict__ p_wei_global,
WeiGlobalDesc, Float* const __restrict__ p_out_global)
Float* const __restrict__ p_wei_global,
OutGlobalDesc,
Float* __restrict__ p_out_global)
{ {
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
......
...@@ -21,12 +21,9 @@ template <class Float, ...@@ -21,12 +21,9 @@ template <class Float,
unsigned CPerThread, unsigned CPerThread,
unsigned BlockSize, unsigned BlockSize,
unsigned GridSize> unsigned GridSize>
__global__ void gridwise_direct_convolution_2(InGlobalDesc, __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_in_global,
Float* const __restrict__ p_in_global, const Float* const __restrict__ p_wei_global,
WeiGlobalDesc, Float* const __restrict__ p_out_global)
Float* const __restrict__ p_wei_global,
OutGlobalDesc,
Float* __restrict__ p_out_global)
{ {
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
......
...@@ -28,12 +28,9 @@ template <unsigned GridSize, ...@@ -28,12 +28,9 @@ template <unsigned GridSize,
unsigned InBlockCopyDataPerRead, unsigned InBlockCopyDataPerRead,
unsigned WeiBlockCopyDataPerRead> unsigned WeiBlockCopyDataPerRead>
__global__ void __global__ void
gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(InGlobalDesc, gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restrict__ p_in_global,
Float* const __restrict__ p_in_global, const Float* const __restrict__ p_wei_global,
WeiGlobalDesc, Float* const __restrict__ p_out_global)
Float* const __restrict__ p_wei_global,
OutGlobalDesc,
Float* __restrict__ p_out_global)
{ {
// NPerThread == NPerBlock, because the format of input in LDS [C,Hi,Wi,N] // NPerThread == NPerBlock, because the format of input in LDS [C,Hi,Wi,N]
// for GEMM trans([C,K]) * [C,Wo*N], we need a thread to do all the "N" // for GEMM trans([C,K]) * [C,Wo*N], we need a thread to do all the "N"
......
...@@ -27,10 +27,10 @@ template <unsigned GridSize, ...@@ -27,10 +27,10 @@ template <unsigned GridSize,
unsigned WoPerThread, unsigned WoPerThread,
unsigned WeiBlockCopyThreadPerDim0, unsigned WeiBlockCopyThreadPerDim0,
unsigned WeiBlockCopyThreadPerDim1> unsigned WeiBlockCopyThreadPerDim1>
__global__ void __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(Float* const __restrict__ p_in_global, const Float* const __restrict__ p_in_global,
Float* const __restrict__ p_wei_global, const Float* const __restrict__ p_wei_global,
Float* __restrict__ p_out_global) Float* const __restrict__ p_out_global)
{ {
// NPerThread == NPerBlock, because the format of input in LDS [C,Hi,Wi,N] // NPerThread == NPerBlock, because the format of input in LDS [C,Hi,Wi,N]
// for GEMM trans([C,K]) * [C,Wo*N], we need a thread to do all the "N" // for GEMM trans([C,K]) * [C,Wo*N], we need a thread to do all the "N"
...@@ -143,7 +143,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(Float* const __restri ...@@ -143,7 +143,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(Float* const __restri
decltype(in_chwn_block_desc.GetLengths()), decltype(in_chwn_block_desc.GetLengths()),
LowerPads>{}; LowerPads>{};
#if 1 #if 0
// weight: format is [C,S,R,K] // weight: format is [C,S,R,K]
constexpr auto blockwise_wei_copy = constexpr auto blockwise_wei_copy =
Blockwise4dTensorCopy1<BlockSize, Blockwise4dTensorCopy1<BlockSize,
...@@ -151,7 +151,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(Float* const __restri ...@@ -151,7 +151,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(Float* const __restri
decltype(wei_csrk_global_desc), decltype(wei_csrk_global_desc),
decltype(wei_csrk_block_desc), decltype(wei_csrk_block_desc),
decltype(wei_csrk_block_desc.GetLengths())>{}; decltype(wei_csrk_block_desc.GetLengths())>{};
#elif 1 #elif 0
// weight: format is [C*S*R,K] // weight: format is [C*S*R,K]
constexpr auto blockwise_wei_copy = constexpr auto blockwise_wei_copy =
Blockwise2dTensorCopy1<BlockSize, Blockwise2dTensorCopy1<BlockSize,
...@@ -216,7 +216,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(Float* const __restri ...@@ -216,7 +216,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(Float* const __restri
// set threadwise output tensor to 0 // set threadwise output tensor to 0
threadwise_4d_tensor_set_zero(out_hkwn_thread_desc, p_out_thread); threadwise_4d_tensor_set_zero(out_hkwn_thread_desc, p_out_thread);
Float* p_wei_global_block_begin = const Float* p_wei_global_block_begin =
p_wei_global + wei_ek_global_desc.Get1dIndex(0, k_block_data_begin); p_wei_global + wei_ek_global_desc.Get1dIndex(0, k_block_data_begin);
for(unsigned c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock, for(unsigned c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment