Commit 90abf427 authored by Chao Liu's avatar Chao Liu
Browse files

refactor

parent b57d60c0
...@@ -191,7 +191,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, ...@@ -191,7 +191,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
constexpr index_t WeiBlockCopyDataPerRead = 4; constexpr index_t WeiBlockCopyDataPerRead = 4;
constexpr index_t BlockSize = 256; constexpr index_t BlockSize = 256;
#elif 0 #elif 1
// 1x1, 14x14, Pascal, enable lds_double_buffer, disable register double buffer // 1x1, 14x14, Pascal, enable lds_double_buffer, disable register double buffer
constexpr index_t BPerBlock = 64; constexpr index_t BPerBlock = 64;
constexpr index_t KPerBlock = 128; constexpr index_t KPerBlock = 128;
...@@ -216,9 +216,10 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, ...@@ -216,9 +216,10 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
constexpr index_t InBlockCopyDataPerRead = 4; constexpr index_t InBlockCopyDataPerRead = 4;
constexpr index_t WeiBlockCopyDataPerRead = 4; constexpr index_t WeiBlockCopyDataPerRead = 4;
constexpr index_t OutThreadCopyDataPerWrite = 4;
constexpr index_t BlockSize = 128; constexpr index_t BlockSize = 128;
#elif 1 #elif 0
// 1x1, 14x14, Vega 20, enable lds_double_buffer, disable register_double_buffer // 1x1, 14x14, Vega 20, enable lds_double_buffer, disable register_double_buffer
constexpr index_t BPerBlock = 128; constexpr index_t BPerBlock = 128;
constexpr index_t KPerBlock = 128; constexpr index_t KPerBlock = 128;
...@@ -243,7 +244,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, ...@@ -243,7 +244,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
constexpr index_t InBlockCopyDataPerRead = 4; constexpr index_t InBlockCopyDataPerRead = 4;
constexpr index_t WeiBlockCopyDataPerRead = 4; constexpr index_t WeiBlockCopyDataPerRead = 4;
constexpr index_t OutThreadCopyDataPerWrite = 4; constexpr index_t OutThreadCopyDataPerWrite = 4;
constexpr index_t BlockSize = 256; constexpr index_t BlockSize = 256;
......
...@@ -169,20 +169,19 @@ struct GridwiseConvolutionImplicitGemm_v1_chwn_cyxk_khwn ...@@ -169,20 +169,19 @@ struct GridwiseConvolutionImplicitGemm_v1_chwn_cyxk_khwn
HoPerThread>{}; HoPerThread>{};
// LDS: be careful of alignment // LDS: be careful of alignment
constexpr index_t in_block_element_size = constexpr index_t max_align =
in_chwn_block_desc.GetElementSpace(Number<InBlockCopyDataPerRead>{}); mod_conv::max(index_t(4), InBlockCopyDataPerRead, WeiBlockCopyDataPerRead);
constexpr index_t wei_block_element_size = constexpr index_t in_block_space =
wei_cyxk_block_desc.GetElementSpace(Number<WeiBlockCopyDataPerRead>{}); in_chwn_block_desc.GetElementSpace(Number<max_align>{});
constexpr index_t max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead constexpr index_t wei_block_space =
? InBlockCopyDataPerRead wei_cyxk_block_desc.GetElementSpace(Number<max_align>{});
: WeiBlockCopyDataPerRead;
__shared__ Float __shared__ Float
p_in_block[max_align * ((in_block_element_size + max_align - 1) / max_align)]; p_in_block[in_block_space];
__shared__ Float __shared__ Float
p_wei_block[max_align * ((wei_block_element_size + max_align - 1) / max_align)]; p_wei_block[wei_block_space];
// register // register
Float p_out_thread[out_khwn_thread_desc.GetElementSpace()]; Float p_out_thread[out_khwn_thread_desc.GetElementSpace()];
......
...@@ -208,11 +208,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer ...@@ -208,11 +208,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset,
p_wei_register_clipboard); p_wei_register_clipboard);
#if 0 #if 1
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_double); blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_double);
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
p_wei_block_double); p_wei_block_double);
#elif 1 #else
vmcnt(0); vmcnt(0);
blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard, blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard,
p_in_block_double); p_in_block_double);
...@@ -266,11 +266,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer ...@@ -266,11 +266,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
{ {
for(index_t x = 0; x < X; ++x) for(index_t x = 0; x < X; ++x)
{ {
#if 0 #if 1
blockwise_gemm.Run blockwise_gemm.Run
#elif 0 #elif 0
blockwise_gemm.Run_RegisterDoubleBuffer blockwise_gemm.Run_RegisterDoubleBuffer
#elif 1 #elif 0
blockwise_gemm.Run_asm blockwise_gemm.Run_asm
#endif #endif
(p_wei_block_now + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), (p_wei_block_now + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
...@@ -279,12 +279,12 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer ...@@ -279,12 +279,12 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
} }
} }
#if 0 #if 1
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
p_in_block_next); p_in_block_next);
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
p_wei_block_next); p_wei_block_next);
#elif 1 #else
vmcnt(0); vmcnt(0);
blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard, blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard,
p_in_block_next); p_in_block_next);
...@@ -315,11 +315,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer ...@@ -315,11 +315,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
{ {
for(index_t x = 0; x < X; ++x) for(index_t x = 0; x < X; ++x)
{ {
#if 0 #if 1
blockwise_gemm.Run blockwise_gemm.Run
#elif 0 #elif 0
blockwise_gemm.Run_RegisterDoubleBuffer blockwise_gemm.Run_RegisterDoubleBuffer
#elif 1 #elif 0
blockwise_gemm.Run_asm blockwise_gemm.Run_asm
#endif #endif
(p_wei_block_double + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), (p_wei_block_double + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
...@@ -328,7 +328,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer ...@@ -328,7 +328,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
} }
} }
#if 0 #if 1
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
p_in_block_double + in_block_space); p_in_block_double + in_block_space);
...@@ -349,11 +349,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer ...@@ -349,11 +349,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
{ {
for(index_t x = 0; x < X; ++x) for(index_t x = 0; x < X; ++x)
{ {
#if 0 #if 1
blockwise_gemm.Run blockwise_gemm.Run
#elif 0 #elif 0
blockwise_gemm.Run_RegisterDoubleBuffer blockwise_gemm.Run_RegisterDoubleBuffer
#elif 1 #elif 0
blockwise_gemm.Run_asm blockwise_gemm.Run_asm
#endif #endif
(p_wei_block_double + wei_block_space + (p_wei_block_double + wei_block_space +
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment