Commit 5e776504 authored by Chao Liu
Browse files

fixed LDS alignment bug

parent 079d63a7
...@@ -391,7 +391,7 @@ int main() ...@@ -391,7 +391,7 @@ int main()
constexpr unsigned HPad = 0; constexpr unsigned HPad = 0;
constexpr unsigned WPad = 0; constexpr unsigned WPad = 0;
#elif 0 #elif 1
// 3x3, 34x34 // 3x3, 34x34
constexpr unsigned N = 64; constexpr unsigned N = 64;
constexpr unsigned C = 256; constexpr unsigned C = 256;
...@@ -484,7 +484,7 @@ int main() ...@@ -484,7 +484,7 @@ int main()
constexpr unsigned HPad = 1; constexpr unsigned HPad = 1;
constexpr unsigned WPad = 1; constexpr unsigned WPad = 1;
#elif 1 #elif 0
// 1x1 filter, 28x28 image // 1x1 filter, 28x28 image
constexpr unsigned N = 16; constexpr unsigned N = 16;
constexpr unsigned C = 256; constexpr unsigned C = 256;
...@@ -608,7 +608,7 @@ int main() ...@@ -608,7 +608,7 @@ int main()
nrepeat); nrepeat);
#endif #endif
#if 0 #if 1
if(S == 3 && R == 3) if(S == 3 && R == 3)
{ {
host_winograd_3x3_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads); host_winograd_3x3_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads);
......
...@@ -67,7 +67,7 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc, ...@@ -67,7 +67,7 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
Tensor<T> out_knhw(make_TensorDescriptor(out_knhw_desc)); Tensor<T> out_knhw(make_TensorDescriptor(out_knhw_desc));
#if 0 #if 1
// 3x3, 34x34 // 3x3, 34x34
constexpr unsigned BPerBlock = 128; constexpr unsigned BPerBlock = 128;
constexpr unsigned KPerBlock = 64; constexpr unsigned KPerBlock = 64;
...@@ -86,11 +86,11 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc, ...@@ -86,11 +86,11 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
constexpr unsigned WeiBlockCopyThreadPerDim0 = 4; constexpr unsigned WeiBlockCopyThreadPerDim0 = 4;
constexpr unsigned WeiBlockCopyThreadPerDim1 = 16; constexpr unsigned WeiBlockCopyThreadPerDim1 = 16;
constexpr unsigned InBlockCopyDataPerRead = 2; constexpr unsigned InBlockCopyDataPerRead = 4;
constexpr unsigned WeiBlockCopyDataPerRead = 4; constexpr unsigned WeiBlockCopyDataPerRead = 4;
constexpr unsigned BlockSize = 128; constexpr unsigned BlockSize = 128;
#elif 1 #elif 0
// 1x1, 28x28 // 1x1, 28x28
constexpr unsigned BPerBlock = 64; constexpr unsigned BPerBlock = 64;
constexpr unsigned KPerBlock = 64; constexpr unsigned KPerBlock = 64;
...@@ -112,6 +112,29 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc, ...@@ -112,6 +112,29 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
constexpr unsigned InBlockCopyDataPerRead = 4; constexpr unsigned InBlockCopyDataPerRead = 4;
constexpr unsigned WeiBlockCopyDataPerRead = 4; constexpr unsigned WeiBlockCopyDataPerRead = 4;
constexpr unsigned BlockSize = 64;
#elif 1
// 1x1, 28x28 try
constexpr unsigned BPerBlock = 64;
constexpr unsigned KPerBlock = 64;
constexpr unsigned CPerBlock = 8;
constexpr unsigned BPerThread = 4;
constexpr unsigned KPerThread = 16;
constexpr unsigned CPerThread = 1;
constexpr unsigned GemmThreadPerColumnPerCluster = 4;
constexpr unsigned GemmThreadPerRowPerCluster = 8;
constexpr unsigned InBlockCopyThreadPerDim0 = 4;
constexpr unsigned InBlockCopyThreadPerDim1 = 16;
constexpr unsigned WeiBlockCopyThreadPerDim0 = 4;
constexpr unsigned WeiBlockCopyThreadPerDim1 = 16;
constexpr unsigned InBlockCopyDataPerRead = 4;
constexpr unsigned WeiBlockCopyDataPerRead = 4;
constexpr unsigned BlockSize = 64; constexpr unsigned BlockSize = 64;
#endif #endif
......
...@@ -33,15 +33,6 @@ __host__ __device__ constexpr auto calculate_default_strides_aligned(Sequence<L0 ...@@ -33,15 +33,6 @@ __host__ __device__ constexpr auto calculate_default_strides_aligned(Sequence<L0
return Sequence<L1 * L2 * L3_align, L2 * L3_align, L3_align, 1>{}; return Sequence<L1 * L2 * L3_align, L2 * L3_align, L3_align, 1>{};
} }
// this is ugly, only for 4d
template <unsigned S0, unsigned S1, unsigned S2, unsigned S3>
__host__ __device__ constexpr auto calculate_full_lengths(Sequence<S0, S1, S2, S3>)
{
static_assert((S0 % S1 == 0) && (S1 % S2 == 0) && (S2 % S3 == 0), "cannot be evenly divided!");
return Sequence<1, S0 / S1, S1 / S2, S2 / S3>{};
}
template <class Lengths, class Strides> template <class Lengths, class Strides>
struct ConstantTensorDescriptor struct ConstantTensorDescriptor
{ {
...@@ -71,7 +62,6 @@ struct ConstantTensorDescriptor ...@@ -71,7 +62,6 @@ struct ConstantTensorDescriptor
return Strides{}.Get(Number<I>{}); return Strides{}.Get(Number<I>{});
} }
// this is ugly, only for 4d
__host__ __device__ constexpr unsigned GetElementSize() const __host__ __device__ constexpr unsigned GetElementSize() const
{ {
static_assert(nDim >= 2 && nDim <= 4, "nDim"); static_assert(nDim >= 2 && nDim <= 4, "nDim");
...@@ -102,16 +92,20 @@ struct ConstantTensorDescriptor ...@@ -102,16 +92,20 @@ struct ConstantTensorDescriptor
} }
} }
__host__ __device__ constexpr unsigned GetElementSpace() const template <class Align = Number<1>>
__host__ __device__ constexpr unsigned GetElementSpace(Align align = Align{}) const
{ {
static_assert(nDim >= 2 && nDim <= 4, "nDim"); static_assert(nDim >= 2 && nDim <= 4, "nDim");
constexpr unsigned align_size = align.Get();
if(nDim == 2) if(nDim == 2)
{ {
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) + 1; return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) +
align_size;
} }
else if(nDim == 3) else if(nDim == 3)
{ {
...@@ -120,7 +114,7 @@ struct ConstantTensorDescriptor ...@@ -120,7 +114,7 @@ struct ConstantTensorDescriptor
constexpr auto I2 = Number<2>{}; constexpr auto I2 = Number<2>{};
return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) + return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) +
(GetLength(I2) - 1) * GetStride(I2) + 1; (GetLength(I2) - 1) * GetStride(I2) + align_size;
} }
else if(nDim == 4) else if(nDim == 4)
{ {
...@@ -130,7 +124,8 @@ struct ConstantTensorDescriptor ...@@ -130,7 +124,8 @@ struct ConstantTensorDescriptor
constexpr auto I3 = Number<3>{}; constexpr auto I3 = Number<3>{};
return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) + return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) +
(GetLength(I2) - 1) * GetStride(I2) + (GetLength(I3) - 1) * GetStride(I3) + 1; (GetLength(I2) - 1) * GetStride(I2) + (GetLength(I3) - 1) * GetStride(I3) +
align_size;
} }
} }
......
...@@ -399,11 +399,21 @@ struct Blockwise2dTensorCopy3 ...@@ -399,11 +399,21 @@ struct Blockwise2dTensorCopy3
// we allow out-of-bound read from src in D1 dimension, // we allow out-of-bound read from src in D1 dimension,
// but we need to make sure dst stride is big enough, // but we need to make sure dst stride is big enough,
// so that the out-of-bound write won't overwrite next line // so that the out-of-bound write won't contaminate next line in dst
static_assert(thread_per_d1 * DataPerRead <= DstDesc{}.GetStride(I0), static_assert(thread_per_d1 * DataPerRead <= DstDesc{}.GetStride(I0),
"wrong! out-of-bound write will overwrite next line!\n"); "wrong! out-of-bound write will contaminate next line!\n");
static_assert(thread_per_d0 >= 1, "wrong! not enough threads to cover L1 dimension\n"); static_assert(thread_per_d0 >= 1, "wrong! not enough threads to cover one line\n");
constexpr unsigned num_active_thread = thread_per_d0 * thread_per_d1;
if(BlockSize > num_active_thread)
{
if(get_thread_local_1d_id() >= num_active_thread)
{
return;
}
}
const unsigned thread_id_d0 = get_thread_local_1d_id() / thread_per_d1; const unsigned thread_id_d0 = get_thread_local_1d_id() / thread_per_d1;
const unsigned thread_id_d1 = get_thread_local_1d_id() - thread_id_d0 * thread_per_d1; const unsigned thread_id_d1 = get_thread_local_1d_id() - thread_id_d0 * thread_per_d1;
......
...@@ -75,27 +75,14 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc, ...@@ -75,27 +75,14 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc,
constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence<C * S * R, K>{}); constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence<C * S * R, K>{});
// tensor view of blockwise input and weight // tensor view of blockwise input and weight
#if 0
constexpr auto in_cb_block_desc =
make_ConstantTensorDescriptor(Sequence<CPerBlock, BPerBlock + BGhostRead>{});
#else
constexpr auto in_cb_block_desc = make_ConstantTensorDescriptor_aligned( constexpr auto in_cb_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<CPerBlock, BPerBlock + BGhostRead>{}, Number<InBlockCopyDataPerRead>{}); Sequence<CPerBlock, BPerBlock + BGhostRead>{}, Number<InBlockCopyDataPerRead>{});
#endif
#if 0
constexpr auto wei_ek_block_desc =
make_ConstantTensorDescriptor(Sequence<CPerBlock * S * R, KPerBlock>{});
constexpr auto wei_csrk_block_desc =
make_ConstantTensorDescriptor(Sequence<CPerBlock, S, R, KPerBlock>{});
#else
constexpr auto wei_ek_block_desc = make_ConstantTensorDescriptor_aligned( constexpr auto wei_ek_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<CPerBlock * S * R, KPerBlock>{}, Number<WeiBlockCopyDataPerRead>{}); Sequence<CPerBlock * S * R, KPerBlock>{}, Number<WeiBlockCopyDataPerRead>{});
constexpr auto wei_csrk_block_desc = make_ConstantTensorDescriptor_aligned( constexpr auto wei_csrk_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<CPerBlock, S, R, KPerBlock>{}, Number<WeiBlockCopyDataPerRead>{}); Sequence<CPerBlock, S, R, KPerBlock>{}, Number<WeiBlockCopyDataPerRead>{});
#endif
// tensor view of threadwise output in register // tensor view of threadwise output in register
constexpr auto out_kb_thread_desc = constexpr auto out_kb_thread_desc =
...@@ -203,12 +190,19 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc, ...@@ -203,12 +190,19 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc,
GemmThreadPerRowPerCluster, GemmThreadPerRowPerCluster,
true>{}; true>{};
// LDS // LDS: be careful of alignment
constexpr unsigned in_block_size = in_cb_block_desc.GetElementSpace(); constexpr unsigned in_block_size =
constexpr unsigned wei_block_size = wei_csrk_block_desc.GetElementSpace(); in_cb_block_desc.GetElementSpace(Number<InBlockCopyDataPerRead>{});
constexpr unsigned wei_block_size =
wei_csrk_block_desc.GetElementSpace(Number<WeiBlockCopyDataPerRead>{});
constexpr unsigned max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
? InBlockCopyDataPerRead
: WeiBlockCopyDataPerRead;
__shared__ Float p_in_block[in_block_size]; __shared__ Float p_in_block[max_align * ((in_block_size + max_align - 1) / max_align)];
__shared__ Float p_wei_block[wei_block_size]; __shared__ Float p_wei_block[max_align * ((wei_block_size + max_align - 1) / max_align)];
// register // register
Float p_out_thread[out_kb_thread_desc.GetElementSpace()]; Float p_out_thread[out_kb_thread_desc.GetElementSpace()];
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment