Commit 0983d205 authored by Chao Liu
Browse files

debugging

parent bae23337
...@@ -191,7 +191,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, ...@@ -191,7 +191,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
constexpr index_t WeiBlockCopyDataPerRead = 4; constexpr index_t WeiBlockCopyDataPerRead = 4;
constexpr index_t BlockSize = 256; constexpr index_t BlockSize = 256;
#elif 0 #elif 1
// 1x1, 14x14, Vega 20, disable lds_double_buffer, enable register double buffer // 1x1, 14x14, Vega 20, disable lds_double_buffer, enable register double buffer
constexpr index_t BPerBlock = 64; constexpr index_t BPerBlock = 64;
constexpr index_t KPerBlock = 128; constexpr index_t KPerBlock = 128;
...@@ -208,9 +208,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, ...@@ -208,9 +208,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
constexpr index_t GemmNLevel1Cluster = 4; constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1; constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmThreadPerColumnPerCluster = 8;
constexpr index_t GemmThreadPerRowPerCluster = 8;
constexpr index_t InBlockCopyThreadPerDim0 = 4; constexpr index_t InBlockCopyThreadPerDim0 = 4;
constexpr index_t InBlockCopyThreadPerDim1 = 16; constexpr index_t InBlockCopyThreadPerDim1 = 16;
......
...@@ -580,7 +580,7 @@ int main(int argc, char* argv[]) ...@@ -580,7 +580,7 @@ int main(int argc, char* argv[])
constexpr index_t HPad = 0; constexpr index_t HPad = 0;
constexpr index_t WPad = 0; constexpr index_t WPad = 0;
#elif 1 #elif 0
// 1x1 filter, 14x14 image, C = 2048 // 1x1 filter, 14x14 image, C = 2048
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 2048; constexpr index_t C = 2048;
...@@ -592,7 +592,7 @@ int main(int argc, char* argv[]) ...@@ -592,7 +592,7 @@ int main(int argc, char* argv[])
constexpr index_t HPad = 0; constexpr index_t HPad = 0;
constexpr index_t WPad = 0; constexpr index_t WPad = 0;
#elif 0 #elif 1
// 1x1 filter, 14x14 image, C = 512 // 1x1 filter, 14x14 image, C = 512
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 512; constexpr index_t C = 512;
......
...@@ -19,8 +19,6 @@ template <index_t GridSize, ...@@ -19,8 +19,6 @@ template <index_t GridSize,
index_t CPerBlock, index_t CPerBlock,
index_t BPerThread, index_t BPerThread,
index_t KPerThread, index_t KPerThread,
index_t GemmThreadPerColumnPerCluster,
index_t GemmThreadPerRowPerCluster,
index_t GemmMPerThreadSubC, index_t GemmMPerThreadSubC,
index_t GemmNPerThreadSubC, index_t GemmNPerThreadSubC,
index_t GemmMLevel0Cluster, index_t GemmMLevel0Cluster,
...@@ -95,25 +93,6 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn ...@@ -95,25 +93,6 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
constexpr auto out_kb_thread_desc = constexpr auto out_kb_thread_desc =
make_ConstantTensorDescriptor(Sequence<KPerThread, BPerThread>{}); make_ConstantTensorDescriptor(Sequence<KPerThread, BPerThread>{});
#if 0
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
{
print_ConstantTensorDescriptor(in_chwn_global_desc, "in_chwn_global_desc");
print_ConstantTensorDescriptor(wei_cyxk_global_desc, "wei_cyxk_global_desc");
print_ConstantTensorDescriptor(out_khwn_global_desc, "out_khwn_global_desc");
print_ConstantTensorDescriptor(in_cb_global_desc, "in_cb_global_desc");
print_ConstantTensorDescriptor(wei_ek_global_desc, "wei_ek_global_desc");
print_ConstantTensorDescriptor(in_cb_block_desc, "in_cb_block_desc");
print_ConstantTensorDescriptor(wei_cyxk_block_desc, "wei_cyxk_block_desc");
print_ConstantTensorDescriptor(wei_ek_block_desc, "wei_ek_block_desc");
print_ConstantTensorDescriptor(out_kb_thread_desc, "out_kb_thread_desc");
printf("KPerBlock %u\n", KPerBlock);
}
#endif
// blockwise in copy // blockwise in copy
// formmat is [CPerBlock,BPerBlock + BGhostRead] // formmat is [CPerBlock,BPerBlock + BGhostRead]
#if 0 #if 0
...@@ -202,14 +181,13 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn ...@@ -202,14 +181,13 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
constexpr index_t max_align = constexpr index_t max_align =
mod_conv::max(index_t(4), InBlockCopyDataPerRead, WeiBlockCopyDataPerRead); mod_conv::max(index_t(4), InBlockCopyDataPerRead, WeiBlockCopyDataPerRead);
constexpr index_t in_block_element_space = constexpr index_t in_block_space = in_cb_block_desc.GetElementSpace(Number<max_align>{});
in_cb_block_desc.GetElementSpace(Number<max_align>{});
constexpr index_t wei_block_element_space = constexpr index_t wei_block_space =
wei_cyxk_block_desc.GetElementSpace(Number<max_align>{}); wei_cyxk_block_desc.GetElementSpace(Number<max_align>{});
__shared__ Float p_in_block[in_block_element_space]; __shared__ Float p_in_block[in_block_space];
__shared__ Float p_wei_block[wei_block_element_space]; __shared__ Float p_wei_block[wei_block_space];
const Float* p_in_global_block_offset = const Float* p_in_global_block_offset =
p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin); p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin);
...@@ -229,7 +207,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn ...@@ -229,7 +207,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
__syncthreads()) __syncthreads())
{ {
// load data // load data
#if 0 #if 1
blockwise_in_copy.Run(p_in_global_block_offset, p_in_block); blockwise_in_copy.Run(p_in_global_block_offset, p_in_block);
blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block); blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block);
#elif 0 #elif 0
......
...@@ -67,6 +67,8 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer ...@@ -67,6 +67,8 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
constexpr index_t B = N * Hi * Wi; constexpr index_t B = N * Hi * Wi;
constexpr index_t BGhostRead = (Y - 1) * Wi + (X - 1); constexpr index_t BGhostRead = (Y - 1) * Wi + (X - 1);
static_assert(C % (2 * CPerBlock) == 0, "C cannot be evenly divided");
// divide block work by 2d: [K, B] // divide block work by 2d: [K, B]
constexpr index_t KBlockWork = (K + KPerBlock - 1) / KPerBlock; constexpr index_t KBlockWork = (K + KPerBlock - 1) / KPerBlock;
constexpr index_t BBlockWork = (B + BPerBlock - 1) / BPerBlock; constexpr index_t BBlockWork = (B + BPerBlock - 1) / BPerBlock;
...@@ -184,15 +186,14 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer ...@@ -184,15 +186,14 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
constexpr index_t max_align = constexpr index_t max_align =
mod_conv::max(index_t(4), InBlockCopyDataPerRead, WeiBlockCopyDataPerRead); mod_conv::max(index_t(4), InBlockCopyDataPerRead, WeiBlockCopyDataPerRead);
constexpr index_t in_block_element_space = constexpr index_t in_block_space = in_cb_block_desc.GetElementSpace(Number<max_align>{});
in_cb_block_desc.GetElementSpace(Number<max_align>{});
constexpr index_t wei_block_element_space = constexpr index_t wei_block_space =
wei_cyxk_block_desc.GetElementSpace(Number<max_align>{}); wei_cyxk_block_desc.GetElementSpace(Number<max_align>{});
// LDS double buffer // LDS double buffer
__shared__ Float p_in_block_double[2 * in_block_element_space]; __shared__ Float p_in_block_double[2 * in_block_space];
__shared__ Float p_wei_block_double[2 * wei_block_element_space]; __shared__ Float p_wei_block_double[2 * wei_block_space];
const Float* p_in_global_block_offset = const Float* p_in_global_block_offset =
p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin); p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin);
...@@ -202,10 +203,10 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer ...@@ -202,10 +203,10 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
// preload data into LDS // preload data into LDS
{ {
#if 0 #if 1
blockwise_in_copy.Run(p_in_global_block_offset, p_in_block_double); blockwise_in_copy.Run(p_in_global_block_offset, p_in_block_double);
blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block_double); blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block_double);
#elif 1 #elif 0
Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()]; Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()];
Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
...@@ -237,22 +238,22 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer ...@@ -237,22 +238,22 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
const bool even_loop = (iloop % 2 == 0); const bool even_loop = (iloop % 2 == 0);
Float* p_in_block_now = Float* p_in_block_now =
even_loop ? p_in_block_double : p_in_block_double + in_block_element_space; even_loop ? p_in_block_double : p_in_block_double + in_block_space;
Float* p_wei_block_now = Float* p_wei_block_now =
even_loop ? p_wei_block_double : p_wei_block_double + wei_block_element_space; even_loop ? p_wei_block_double : p_wei_block_double + wei_block_space;
Float* p_in_block_next = Float* p_in_block_next =
even_loop ? p_in_block_double + in_block_element_space : p_in_block_double; even_loop ? p_in_block_double + in_block_space : p_in_block_double;
Float* p_wei_block_next = Float* p_wei_block_next =
even_loop ? p_wei_block_double + wei_block_element_space : p_wei_block_double; even_loop ? p_wei_block_double + wei_block_space : p_wei_block_double;
p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0);
p_wei_global_block_offset += CPerBlock * wei_cyxk_global_desc.GetStride(I0);
// load next data // load next data
Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()]; Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()];
Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0);
p_wei_global_block_offset += CPerBlock * wei_cyxk_global_desc.GetStride(I0);
__syncthreads(); __syncthreads();
blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset, blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset,
...@@ -267,7 +268,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer ...@@ -267,7 +268,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
{ {
for(index_t x = 0; x < X; ++x) for(index_t x = 0; x < X; ++x)
{ {
#if 0 #if 1
blockwise_gemm.Run blockwise_gemm.Run
#elif 0 #elif 0
blockwise_gemm.Run_RegisterDoubleBuffer blockwise_gemm.Run_RegisterDoubleBuffer
...@@ -280,12 +281,12 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer ...@@ -280,12 +281,12 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
} }
} }
#if 0 #if 1
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
p_in_block_next); p_in_block_next);
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
p_wei_block_next); p_wei_block_next);
#elif 1 #elif 0
// if work with RunLoadRegisterClipboard_asm, need to wait // if work with RunLoadRegisterClipboard_asm, need to wait
vmcnt(0); vmcnt(0);
...@@ -298,7 +299,6 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer ...@@ -298,7 +299,6 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
} }
// tail // tail
if(C % 2 == 0)
{ {
// even // even
p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0); p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0);
...@@ -319,7 +319,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer ...@@ -319,7 +319,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
{ {
for(index_t x = 0; x < X; ++x) for(index_t x = 0; x < X; ++x)
{ {
#if 0 #if 1
blockwise_gemm.Run blockwise_gemm.Run
#elif 0 #elif 0
blockwise_gemm.Run_RegisterDoubleBuffer blockwise_gemm.Run_RegisterDoubleBuffer
...@@ -332,21 +332,21 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer ...@@ -332,21 +332,21 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
} }
} }
#if 0 #if 1
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
p_in_block_double + in_block_element_space); p_in_block_double + in_block_space);
blockwise_wei_copy.RunStoreRegisterClipboard( blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
p_wei_register_clipboard, p_wei_block_double + wei_block_element_space); p_wei_block_double + wei_block_space);
#else #else
// if work with RunLoadRegisterClipboard_asm, need to wait // if work with RunLoadRegisterClipboard_asm, need to wait
vmcnt(0); vmcnt(0);
blockwise_in_copy.RunStoreRegisterClipboard_asm( blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard,
p_in_register_clipboard, p_in_block_double + in_block_element_space); p_in_block_double + in_block_space);
blockwise_wei_copy.RunStoreRegisterClipboard_asm( blockwise_wei_copy.RunStoreRegisterClipboard_asm(p_wei_register_clipboard,
p_wei_register_clipboard, p_wei_block_double + wei_block_element_space); p_wei_block_double + wei_block_space);
#endif #endif
// odd // odd
...@@ -356,25 +356,20 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer ...@@ -356,25 +356,20 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
{ {
for(index_t x = 0; x < X; ++x) for(index_t x = 0; x < X; ++x)
{ {
#if 0 #if 1
blockwise_gemm.Run blockwise_gemm.Run
#elif 0 #elif 0
blockwise_gemm.Run_RegisterDoubleBuffer blockwise_gemm.Run_RegisterDoubleBuffer
#elif 1 #elif 1
blockwise_gemm.Run_asm blockwise_gemm.Run_asm
#endif #endif
(p_wei_block_double + in_block_element_space + (p_wei_block_double + in_block_space +
wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
p_in_block_double + wei_block_element_space + y * Wi + x, p_in_block_double + wei_block_space + y * Wi + x,
p_out_thread); p_out_thread);
} }
} }
} }
else
{
// not implemented
assert(false);
}
// output: register to global mem, // output: register to global mem,
const auto c_thread_mtx_begin = const auto c_thread_mtx_begin =
......
...@@ -13,22 +13,6 @@ __device__ void threadwise_matrix_copy(SrcMatrix, ...@@ -13,22 +13,6 @@ __device__ void threadwise_matrix_copy(SrcMatrix,
constexpr auto dst_mtx = DstMatrix{}; constexpr auto dst_mtx = DstMatrix{};
for(index_t i = 0; i < NRow; ++i) for(index_t i = 0; i < NRow; ++i)
{
// optimize for vector-4 load
if(NCol % 4 == 0)
{
using vector_t = typename vector_type<Float, 4>::MemoryType;
for(index_t j = 0; j < NCol / 4; ++j)
{
const index_t src_index = src_mtx.Get1dIndex(i, 4 * j);
const index_t dst_index = dst_mtx.Get1dIndex(i, 4 * j);
*reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
*reinterpret_cast<const vector_t*>(&p_src[src_index]);
}
}
else
{ {
for(index_t j = 0; j < NCol; ++j) for(index_t j = 0; j < NCol; ++j)
{ {
...@@ -38,7 +22,6 @@ __device__ void threadwise_matrix_copy(SrcMatrix, ...@@ -38,7 +22,6 @@ __device__ void threadwise_matrix_copy(SrcMatrix,
p_dst[dst_index] = p_src[src_index]; p_dst[dst_index] = p_src[src_index];
} }
} }
}
} }
template <class MatrixA, template <class MatrixA,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment