Commit 284e7bb3 authored by Chao Liu's avatar Chao Liu
Browse files

refactored implicit gemm v1r3

parent efd419ec
...@@ -98,8 +98,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn ...@@ -98,8 +98,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
const index_t wi_block_data_begin = wo_block_data_begin; const index_t wi_block_data_begin = wo_block_data_begin;
// global tensor view // global tensor view
constexpr auto wei_c_k_global_desc = constexpr auto wei_c_k_global_desc = wei_c_y_x_k_global_desc.Extract(I0, I3);
make_ConstantTensorDescriptor(Sequence<C, K>{}, Sequence<Y * X * K, 1>{});
// LDS tensor view // LDS tensor view
// be careful of alignment // be careful of alignment
...@@ -212,44 +211,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn ...@@ -212,44 +211,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
// set threadwise output tensor to 0 // set threadwise output tensor to 0
threadwise_matrix_set_zero(c_k_wn_thread_mtx_desc, p_out_thread); threadwise_matrix_set_zero(c_k_wn_thread_mtx_desc, p_out_thread);
#if 1
const Float* p_in_global_block_offset =
p_in_global +
in_c_h_w_n_global_desc.GetOffsetFromMultiIndex(
0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin);
const Float* p_wei_global_block_offset =
p_wei_global +
wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, 0, 0, k_block_data_begin);
for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
p_in_global_block_offset += CPerBlock * in_c_h_w_n_global_desc.GetStride(I0),
p_wei_global_block_offset += CPerBlock * wei_c_y_x_k_global_desc.GetStride(I0))
{
for(index_t y = 0; y < Y; ++y)
{
#pragma unroll
for(index_t x = 0; x < X; ++x)
{
blockwise_in_copy.Run(
p_in_global_block_offset +
in_c_h_w_n_global_desc.GetOffsetFromMultiIndex(0, y, x, 0),
p_in_block);
blockwise_wei_copy.Run(
p_wei_global_block_offset +
wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, 0),
p_wei_block);
__syncthreads();
blockwise_batch_gemm.Run(p_wei_block, p_in_block, p_out_thread);
__syncthreads();
}
}
}
#else
for(index_t y = 0; y < Y; ++y) for(index_t y = 0; y < Y; ++y)
{ {
for(index_t x = 0; x < X; ++x) for(index_t x = 0; x < X; ++x)
...@@ -282,7 +243,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn ...@@ -282,7 +243,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
} }
} }
} }
#endif
// output: register to global mem, // output: register to global mem,
const auto c_thread_mtx_begin = const auto c_thread_mtx_begin =
......
...@@ -128,17 +128,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer ...@@ -128,17 +128,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed( constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed(
Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{}); Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{});
// blockwise copy // blockwise copy
// input: format is [C, Hi, Wi, N] // input: format is [C, Hi, Wi, N]
#if 0
const auto blockwise_in_copy =
Blockwise4dTensorCopy1<BlockSize,
Float,
decltype(in_c_h_w_n_global_desc),
decltype(in_c_h_w_n_block_desc),
decltype(in_c_h_w_n_block_desc.GetLengths()),
InBlockCopyDataPerRead_N>{};
#else
const auto blockwise_in_copy = const auto blockwise_in_copy =
Blockwise4dTensorCopy3<BlockSize, Blockwise4dTensorCopy3<BlockSize,
Float, Float,
...@@ -147,7 +138,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer ...@@ -147,7 +138,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
decltype(in_c_h_w_n_block_desc.GetLengths()), decltype(in_c_h_w_n_block_desc.GetLengths()),
InBlockCopyClusterLengths_CHWN, InBlockCopyClusterLengths_CHWN,
InBlockCopyDataPerRead_N>{}; InBlockCopyDataPerRead_N>{};
#endif
// blockwise wei copy // blockwise wei copy
// format is [CPerBlock, X * KPerBlock] // format is [CPerBlock, X * KPerBlock]
......
...@@ -478,9 +478,9 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, ...@@ -478,9 +478,9 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
#elif 0 #elif 0
GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
#elif 1
GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
#elif 0 #elif 0
GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
#elif 1
GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
#endif #endif
<GridSize, <GridSize,
......
#!/bin/bash #!/bin/bash
rm -f CMakeCache.txt
rm -f *.cmake
rm -rf CMakeFiles
MY_PROJECT_SOURCE=../../../ MY_PROJECT_SOURCE=../../../
MY_PROJECT_INSTALL=../install.dir MY_PROJECT_INSTALL=../install.dir
...@@ -18,12 +14,11 @@ cmake ...@@ -18,12 +14,11 @@ cmake
-D CMAKE_BUILD_TYPE=Release \ -D CMAKE_BUILD_TYPE=Release \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \ -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
-D DEVICE_BACKEND=NVIDIA \ -D DEVICE_BACKEND=NVIDIA \
-D CUDA_COMMON_INCLUDE_DIR="/root/workspace/NVIDIA_CUDA-10.1_Samples/common/inc" \ -D CUDA_COMMON_INCLUDE_DIR="/root/NVIDIA_CUDA-10.1_Samples/common/inc" \
-D CMAKE_CUDA_FLAGS="-ccbin clang++-6.0 -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_60,code=sm_60 -Xptxas -v -gencode=arch=compute_70,code=sm_70" \ -D CMAKE_CUDA_FLAGS="-ccbin clang++-6.0 -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_60,code=sm_60 -Xptxas -v -gencode=arch=compute_70,code=sm_70" \
${MY_PROJECT_SOURCE} ${MY_PROJECT_SOURCE}
#-D CMAKE_CUDA_COMPILER="/package/install/cuda_10.0/bin/nvcc" \
#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61" \ #-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61" \
#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61 -Xptxas -v -Xptxas -v -maxrregcount=128" \ #-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61 -Xptxas -v -Xptxas -v -maxrregcount=128" \
#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61 -Xptxas -v -gencode=arch=compute_70,code=sm_70" \ #-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61 -Xptxas -v -gencode=arch=compute_70,code=sm_70" \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment