Commit 0983d205 authored by Chao Liu's avatar Chao Liu
Browse files

debugging

parent bae23337
......@@ -191,7 +191,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
constexpr index_t WeiBlockCopyDataPerRead = 4;
constexpr index_t BlockSize = 256;
#elif 0
#elif 1
// 1x1, 14x14, Vega 20, disable lds_double_buffer, enable register double buffer
constexpr index_t BPerBlock = 64;
constexpr index_t KPerBlock = 128;
......@@ -208,9 +208,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmThreadPerColumnPerCluster = 8;
constexpr index_t GemmThreadPerRowPerCluster = 8;
constexpr index_t InBlockCopyThreadPerDim0 = 4;
constexpr index_t InBlockCopyThreadPerDim1 = 16;
......
......@@ -580,7 +580,7 @@ int main(int argc, char* argv[])
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#elif 1
#elif 0
// 1x1 filter, 14x14 image, C = 2048
constexpr index_t N = 128;
constexpr index_t C = 2048;
......@@ -592,7 +592,7 @@ int main(int argc, char* argv[])
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#elif 0
#elif 1
// 1x1 filter, 14x14 image, C = 512
constexpr index_t N = 128;
constexpr index_t C = 512;
......
......@@ -19,8 +19,6 @@ template <index_t GridSize,
index_t CPerBlock,
index_t BPerThread,
index_t KPerThread,
index_t GemmThreadPerColumnPerCluster,
index_t GemmThreadPerRowPerCluster,
index_t GemmMPerThreadSubC,
index_t GemmNPerThreadSubC,
index_t GemmMLevel0Cluster,
......@@ -95,34 +93,15 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
constexpr auto out_kb_thread_desc =
make_ConstantTensorDescriptor(Sequence<KPerThread, BPerThread>{});
#if 0
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
{
print_ConstantTensorDescriptor(in_chwn_global_desc, "in_chwn_global_desc");
print_ConstantTensorDescriptor(wei_cyxk_global_desc, "wei_cyxk_global_desc");
print_ConstantTensorDescriptor(out_khwn_global_desc, "out_khwn_global_desc");
print_ConstantTensorDescriptor(in_cb_global_desc, "in_cb_global_desc");
print_ConstantTensorDescriptor(wei_ek_global_desc, "wei_ek_global_desc");
print_ConstantTensorDescriptor(in_cb_block_desc, "in_cb_block_desc");
print_ConstantTensorDescriptor(wei_cyxk_block_desc, "wei_cyxk_block_desc");
print_ConstantTensorDescriptor(wei_ek_block_desc, "wei_ek_block_desc");
print_ConstantTensorDescriptor(out_kb_thread_desc, "out_kb_thread_desc");
printf("KPerBlock %u\n", KPerBlock);
}
#endif
// blockwise in copy
// formmat is [CPerBlock,BPerBlock + BGhostRead]
#if 0
const auto blockwise_in_copy =
Blockwise2dTensorCopy1<BlockSize,
Float,
decltype(in_cb_global_desc),
decltype(in_cb_block_desc),
decltype(in_cb_block_desc.GetLengths())>{};
const auto blockwise_in_copy =
Blockwise2dTensorCopy1<BlockSize,
Float,
decltype(in_cb_global_desc),
decltype(in_cb_block_desc),
decltype(in_cb_block_desc.GetLengths())>{};
#elif 0
const auto blockwise_in_copy =
Blockwise2dTensorCopy2<BlockSize,
......@@ -145,12 +124,12 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
// blockwise wei copy
// format is [CPerBlock*Y*X,KPerBlock]
#if 0
const auto blockwise_wei_copy =
Blockwise2dTensorCopy1<BlockSize,
Float,
decltype(wei_ek_global_desc),
decltype(wei_ek_block_desc),
decltype(wei_ek_block_desc.GetLengths())>{};
const auto blockwise_wei_copy =
Blockwise2dTensorCopy1<BlockSize,
Float,
decltype(wei_ek_global_desc),
decltype(wei_ek_block_desc),
decltype(wei_ek_block_desc.GetLengths())>{};
#elif 0
const auto blockwise_wei_copy =
Blockwise2dTensorCopy2<BlockSize,
......@@ -202,14 +181,13 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
constexpr index_t max_align =
mod_conv::max(index_t(4), InBlockCopyDataPerRead, WeiBlockCopyDataPerRead);
constexpr index_t in_block_element_space =
in_cb_block_desc.GetElementSpace(Number<max_align>{});
constexpr index_t in_block_space = in_cb_block_desc.GetElementSpace(Number<max_align>{});
constexpr index_t wei_block_element_space =
constexpr index_t wei_block_space =
wei_cyxk_block_desc.GetElementSpace(Number<max_align>{});
__shared__ Float p_in_block[in_block_element_space];
__shared__ Float p_wei_block[wei_block_element_space];
__shared__ Float p_in_block[in_block_space];
__shared__ Float p_wei_block[wei_block_space];
const Float* p_in_global_block_offset =
p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin);
......@@ -229,7 +207,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
__syncthreads())
{
// load data
#if 0
#if 1
blockwise_in_copy.Run(p_in_global_block_offset, p_in_block);
blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block);
#elif 0
......
......@@ -67,6 +67,8 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
constexpr index_t B = N * Hi * Wi;
constexpr index_t BGhostRead = (Y - 1) * Wi + (X - 1);
static_assert(C % (2 * CPerBlock) == 0, "C cannot be evenly divided");
// divide block work by 2d: [K, B]
constexpr index_t KBlockWork = (K + KPerBlock - 1) / KPerBlock;
constexpr index_t BBlockWork = (B + BPerBlock - 1) / BPerBlock;
......@@ -184,15 +186,14 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
constexpr index_t max_align =
mod_conv::max(index_t(4), InBlockCopyDataPerRead, WeiBlockCopyDataPerRead);
constexpr index_t in_block_element_space =
in_cb_block_desc.GetElementSpace(Number<max_align>{});
constexpr index_t in_block_space = in_cb_block_desc.GetElementSpace(Number<max_align>{});
constexpr index_t wei_block_element_space =
constexpr index_t wei_block_space =
wei_cyxk_block_desc.GetElementSpace(Number<max_align>{});
// LDS double buffer
__shared__ Float p_in_block_double[2 * in_block_element_space];
__shared__ Float p_wei_block_double[2 * wei_block_element_space];
__shared__ Float p_in_block_double[2 * in_block_space];
__shared__ Float p_wei_block_double[2 * wei_block_space];
const Float* p_in_global_block_offset =
p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin);
......@@ -202,10 +203,10 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
// preload data into LDS
{
#if 0
#if 1
blockwise_in_copy.Run(p_in_global_block_offset, p_in_block_double);
blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block_double);
#elif 1
#elif 0
Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()];
Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
......@@ -237,22 +238,22 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
const bool even_loop = (iloop % 2 == 0);
Float* p_in_block_now =
even_loop ? p_in_block_double : p_in_block_double + in_block_element_space;
even_loop ? p_in_block_double : p_in_block_double + in_block_space;
Float* p_wei_block_now =
even_loop ? p_wei_block_double : p_wei_block_double + wei_block_element_space;
even_loop ? p_wei_block_double : p_wei_block_double + wei_block_space;
Float* p_in_block_next =
even_loop ? p_in_block_double + in_block_element_space : p_in_block_double;
even_loop ? p_in_block_double + in_block_space : p_in_block_double;
Float* p_wei_block_next =
even_loop ? p_wei_block_double + wei_block_element_space : p_wei_block_double;
p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0);
p_wei_global_block_offset += CPerBlock * wei_cyxk_global_desc.GetStride(I0);
even_loop ? p_wei_block_double + wei_block_space : p_wei_block_double;
// load next data
Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()];
Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0);
p_wei_global_block_offset += CPerBlock * wei_cyxk_global_desc.GetStride(I0);
__syncthreads();
blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset,
......@@ -267,25 +268,25 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
{
for(index_t x = 0; x < X; ++x)
{
#if 0
#if 1
blockwise_gemm.Run
#elif 0
blockwise_gemm.Run_RegisterDoubleBuffer
#elif 1
blockwise_gemm.Run_asm
#endif
(p_wei_block_now + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
p_in_block_now + y * Wi + x,
p_out_thread);
(p_wei_block_now + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
p_in_block_now + y * Wi + x,
p_out_thread);
}
}
#if 0
#if 1
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
p_in_block_next);
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
p_wei_block_next);
#elif 1
#elif 0
// if work with RunLoadRegisterClipboard_asm, need to wait
vmcnt(0);
......@@ -298,7 +299,6 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
}
// tail
if(C % 2 == 0)
{
// even
p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0);
......@@ -319,34 +319,34 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
{
for(index_t x = 0; x < X; ++x)
{
#if 0
#if 1
blockwise_gemm.Run
#elif 0
blockwise_gemm.Run_RegisterDoubleBuffer
#elif 1
blockwise_gemm.Run_asm
#endif
(p_wei_block_double + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
p_in_block_double + y * Wi + x,
p_out_thread);
(p_wei_block_double + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
p_in_block_double + y * Wi + x,
p_out_thread);
}
}
#if 0
#if 1
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
p_in_block_double + in_block_element_space);
p_in_block_double + in_block_space);
blockwise_wei_copy.RunStoreRegisterClipboard(
p_wei_register_clipboard, p_wei_block_double + wei_block_element_space);
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
p_wei_block_double + wei_block_space);
#else
// if work with RunLoadRegisterClipboard_asm, need to wait
vmcnt(0);
blockwise_in_copy.RunStoreRegisterClipboard_asm(
p_in_register_clipboard, p_in_block_double + in_block_element_space);
blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard,
p_in_block_double + in_block_space);
blockwise_wei_copy.RunStoreRegisterClipboard_asm(
p_wei_register_clipboard, p_wei_block_double + wei_block_element_space);
blockwise_wei_copy.RunStoreRegisterClipboard_asm(p_wei_register_clipboard,
p_wei_block_double + wei_block_space);
#endif
// odd
......@@ -356,25 +356,20 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
{
for(index_t x = 0; x < X; ++x)
{
#if 0
#if 1
blockwise_gemm.Run
#elif 0
blockwise_gemm.Run_RegisterDoubleBuffer
#elif 1
blockwise_gemm.Run_asm
#endif
(p_wei_block_double + in_block_element_space +
wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
p_in_block_double + wei_block_element_space + y * Wi + x,
p_out_thread);
(p_wei_block_double + in_block_space +
wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
p_in_block_double + wei_block_space + y * Wi + x,
p_out_thread);
}
}
}
else
{
// not implemented
assert(false);
}
// output: register to global mem,
const auto c_thread_mtx_begin =
......
......@@ -14,29 +14,12 @@ __device__ void threadwise_matrix_copy(SrcMatrix,
for(index_t i = 0; i < NRow; ++i)
{
// optimize for vector-4 load
if(NCol % 4 == 0)
for(index_t j = 0; j < NCol; ++j)
{
using vector_t = typename vector_type<Float, 4>::MemoryType;
const index_t src_index = src_mtx.Get1dIndex(i, j);
const index_t dst_index = dst_mtx.Get1dIndex(i, j);
for(index_t j = 0; j < NCol / 4; ++j)
{
const index_t src_index = src_mtx.Get1dIndex(i, 4 * j);
const index_t dst_index = dst_mtx.Get1dIndex(i, 4 * j);
*reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
*reinterpret_cast<const vector_t*>(&p_src[src_index]);
}
}
else
{
for(index_t j = 0; j < NCol; ++j)
{
const index_t src_index = src_mtx.Get1dIndex(i, j);
const index_t dst_index = dst_mtx.Get1dIndex(i, j);
p_dst[dst_index] = p_src[src_index];
}
p_dst[dst_index] = p_src[src_index];
}
}
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment