Commit 11b848da authored by Jing Zhang

fixed stride division to use mod_conv::integer_divide_ceil (ceiling division)

parent 06810ad4
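The change replaces plain integer division by the stride with ceiling division (mod_conv::integer_divide_ceil) when sizing the GEMM B dimension and the sliced Ho/Wo descriptor lengths. A minimal standalone sketch of the intent, assuming integer_divide_ceil has the usual (a + b - 1) / b semantics; the helper and the concrete numbers below are illustrative, not taken from the repository:

#include <cassert>

// Assumed semantics of mod_conv::integer_divide_ceil.
template <class T>
constexpr T integer_divide_ceil(T a, T b)
{
    return (a + b - 1) / b;
}

int main()
{
    // With a 7x7 output grid and stride 2, the strided positions are 0, 2, 4, 6.
    constexpr int N = 128, N1 = 2, N2 = 4;
    constexpr int Ho = 7, Wo = 7, stride_h = 2, stride_w = 2;

    // Old: plain division undercounts the strided positions (7 / 2 == 3).
    constexpr int B_old = N * (Ho / stride_h) * (Wo / stride_w) / (N1 * N2);

    // New: ceiling division counts all of them (ceil(7 / 2) == 4).
    constexpr int B_new =
        N * integer_divide_ceil(Ho, stride_h) * integer_divide_ceil(Wo, stride_w) / (N1 * N2);

    static_assert(B_old == 144, "");
    static_assert(B_new == 256, "");
    assert(B_new > B_old);

    return 0;
}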
@@ -49,7 +49,8 @@ void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc,
constexpr index_t N1 = 2;
constexpr index_t N2 = 4;
constexpr index_t B = (N * (Ho/Strides::Get(I0)) * (Wo/Strides::Get(I1))) / (N1 * N2);
constexpr index_t B = N * mod_conv::integer_divide_ceil(Ho, Strides::Get(I0)) *
mod_conv::integer_divide_ceil(Wo, Strides::Get(I1)) / (N1 * N2);
#if 1
constexpr index_t BlockSize = 256;
......
@@ -25,8 +25,6 @@ struct GeneratorTensor_0
}
};
struct GeneratorTensor_1
{
template <class... Is>
@@ -122,12 +120,12 @@ template <class TIn,
class Strides,
class Dilations>
void host_direct_convolution_forw(const Tensor<TIn>& in_nchw,
const Tensor<TWei>& wei_kcyx,
Tensor<TOut>& out_nkhw,
LowerPads,
UpperPads,
Strides,
Dilations)
const Tensor<TWei>& wei_kcyx,
Tensor<TOut>& out_nkhw,
LowerPads,
UpperPads,
Strides,
Dilations)
{
index_t h_pad_low = LowerPads{}.Get(Number<0>{});
index_t w_pad_low = LowerPads{}.Get(Number<1>{});
@@ -179,13 +177,12 @@ template <class TIn,
class Strides,
class Dilations>
void host_direct_convolution_back(Tensor<TOut>& in_nchw,
const Tensor<TWei>& wei_kcyx,
const Tensor<TIn>& out_nkhw,
LowerPads,
UpperPads,
Strides,
Dilations
)
const Tensor<TWei>& wei_kcyx,
const Tensor<TIn>& out_nkhw,
LowerPads,
UpperPads,
Strides,
Dilations)
{
index_t h_pad_low = LowerPads{}.Get(Number<0>{});
index_t w_pad_low = LowerPads{}.Get(Number<1>{});
@@ -199,22 +196,23 @@ void host_direct_convolution_back(Tensor<TOut>& in_nchw,
index_t dilation_h = Dilations{}.Get(Number<0>{});
index_t dilation_w = Dilations{}.Get(Number<1>{});
//loop n,c,hi,wi
// loop n,c,hi,wi
auto f = [&](auto n, auto c, auto hi, auto wi) {
double v = 0;
//loop k,y,x
// loop k,y,x
for(int k = 0; k < wei_kcyx.mDesc.GetLengths()[0]; ++k)
{
for(int y = 0; y < wei_kcyx.mDesc.GetLengths()[2]; ++y)
{
int ho_ = (hi - y * dilation_h + h_pad_low);
int ho = ho_ / stride_h;
int ho = ho_ / stride_h;
for(int x = 0; x < wei_kcyx.mDesc.GetLengths()[3]; ++x)
{
int wo_ = (wi - x * dilation_w + w_pad_low);
int wo = wo_ / stride_w;
int wo = wo_ / stride_w;
if(ho >= 0 && ho < out_nkhw.mDesc.GetLengths()[2] && wo >= 0 &&
wo < out_nkhw.mDesc.GetLengths()[3] && ho_ % stride_h == 0 && wo_ % stride_w == 0)
wo < out_nkhw.mDesc.GetLengths()[3] && ho_ % stride_h == 0 &&
wo_ % stride_w == 0)
{
v += double(out_nkhw(n, k, ho, wo)) * double(wei_kcyx(k, c, y, x));
}
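The guard in the backward loop above inverts the forward indexing of a strided convolution: input row hi was touched by output row ho only if hi == ho * stride_h + y * dilation_h - h_pad_low, so the rebuilt numerator must be non-negative, in range, and divisible by the stride. A small self-contained illustration of that check, with illustrative values that are not taken from the driver:

#include <cstdio>

int main()
{
    // A forward conv with stride s reads input row hi = ho * s + y * d - pad for output row ho.
    // Inverting: ho = (hi - y * d + pad) / s, valid only when the numerator is
    // divisible by s and the resulting ho lies inside the output height.
    constexpr int stride_h = 2, dilation_h = 1, h_pad_low = 0, y = 0, Ho = 4;

    for(int hi = 0; hi < 7; ++hi)
    {
        int ho_ = hi - y * dilation_h + h_pad_low;
        int ho  = ho_ / stride_h;

        bool contributes = ho >= 0 && ho < Ho && ho_ % stride_h == 0;
        std::printf("hi = %d -> %s\n", hi, contributes ? "accumulates from ho" : "no contribution");
    }

    return 0;
}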
@@ -501,7 +499,7 @@ int main(int argc, char* argv[])
constexpr index_t HDilation = 1;
constexpr index_t WDilation = 1;
constexpr index_t Direction = 2; //1: Forward; 2:Backward
constexpr index_t Direction = 1; // 1: Forward; 2:Backward
#if 0
constexpr index_t N = 32;
constexpr index_t C = 128;
@@ -553,8 +551,8 @@ int main(int argc, char* argv[])
// 1x1 filter, 28x28 image
constexpr index_t N = 128;
constexpr index_t C = 128;
constexpr index_t HI = 28;
constexpr index_t WI = 28;
constexpr index_t HI = 7;
constexpr index_t WI = 7;
constexpr index_t K = 128;
constexpr index_t Y = 1;
constexpr index_t X = 1;
@@ -716,8 +714,8 @@ int main(int argc, char* argv[])
#elif 0
in_nchw.GenerateTensorValue(GeneratorTensor_0{}, num_thread);
wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
//out_nkhw.GenerateTensorValue(GeneratorTensor_Checkboard{}, num_thread);
//out_nkhw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
// out_nkhw.GenerateTensorValue(GeneratorTensor_Checkboard{}, num_thread);
// out_nkhw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
out_nkhw.GenerateTensorValue(GeneratorTensor_4{}, num_thread);
#elif 0
in_nchw.GenerateTensorValue(GeneratorTensor_3{}, num_thread);
@@ -764,8 +762,7 @@ int main(int argc, char* argv[])
strides,
dilations,
in_nchw_device,
nrepeat
);
nrepeat);
#elif 1
device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(in_nchw_desc,
......
@@ -112,27 +112,28 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
// input tensor
// tensor descriptor in device memory [N0, N1, N2, Ho, Wo]
constexpr auto in_n0_n1_n2_h_w_global_desc = in_n_c_h_w_global_desc.Slice(I2,
Number<Ho/Strides::Get(I0)>{})
.Slice(I3, Number<Wo/Strides::Get(I1)>{})
.Fold(I0, Number<N1>{}, Number<N2>{})
.Extract(Sequence<0, 1, 2, 4, 5>{});
//constexpr auto in_n0_n1_n2_h_w_global_desc =
//in_n_c_h_w_global_desc.Fold(I0, Number<N1>{}, Number<N2>{})
//.Extract(Sequence<0, 1, 2, 4, 5>{});
//constexpr auto in_lengths_new = Sequence<N0, N1, N2, Ho, Wo>{};
//constexpr auto in_strides_new =
//Sequence<in_n0_n1_n2_h_w_global_desc.GetStride(I0),
//in_n0_n1_n2_h_w_global_desc.GetStride(I1),
//in_n0_n1_n2_h_w_global_desc.GetStride(I2),
//in_n0_n1_n2_h_w_global_desc.GetStride(I3),
//in_n0_n1_n2_h_w_global_desc.GetStride(I4)>{};
//constexpr auto in_n0_n1_n2_h_w_new_global_desc =
//make_ConstantTensorDescriptor(in_lengths_new, in_strides_new);
constexpr auto in_n0_n1_n2_h_w_global_desc =
in_n_c_h_w_global_desc
.Slice(I2, Number<mod_conv::integer_divide_ceil(Ho, Strides::Get(I0))>{})
.Slice(I3, Number<mod_conv::integer_divide_ceil(Wo, Strides::Get(I1))>{})
.Fold(I0, Number<N1>{}, Number<N2>{})
.Extract(Sequence<0, 1, 2, 4, 5>{});
// constexpr auto in_n0_n1_n2_h_w_global_desc =
// in_n_c_h_w_global_desc.Fold(I0, Number<N1>{}, Number<N2>{})
//.Extract(Sequence<0, 1, 2, 4, 5>{});
// constexpr auto in_lengths_new = Sequence<N0, N1, N2, Ho, Wo>{};
// constexpr auto in_strides_new =
// Sequence<in_n0_n1_n2_h_w_global_desc.GetStride(I0),
// in_n0_n1_n2_h_w_global_desc.GetStride(I1),
// in_n0_n1_n2_h_w_global_desc.GetStride(I2),
// in_n0_n1_n2_h_w_global_desc.GetStride(I3),
// in_n0_n1_n2_h_w_global_desc.GetStride(I4)>{};
// constexpr auto in_n0_n1_n2_h_w_new_global_desc =
// make_ConstantTensorDescriptor(in_lengths_new, in_strides_new);
constexpr auto in_n0_n1_n2_h_w_new_global_desc = in_n0_n1_n2_h_w_global_desc;
// batch descriptor for device memory
@@ -140,17 +141,17 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
constexpr auto in_c_y_x_global_desc = in_n_c_h_w_global_desc.Slice(I2, Number<Y>{})
.Slice(I3, Number<X>{})
.Extract(Sequence<1, 2, 3>{});
//constexpr auto in_win_lengths_new = Sequence<in_c_y_x_global_desc.GetLength(I0),
//in_c_y_x_global_desc.GetLength(I1),
//in_c_y_x_global_desc.GetLength(I2)>{};
// constexpr auto in_win_lengths_new = Sequence<in_c_y_x_global_desc.GetLength(I0),
// in_c_y_x_global_desc.GetLength(I1),
// in_c_y_x_global_desc.GetLength(I2)>{};
//constexpr auto in_win_strides_new =
//Sequence<in_c_y_x_global_desc.GetStride(I0),
//in_c_y_x_global_desc.GetStride(I1),
//in_c_y_x_global_desc.GetStride(I2)>{};
// constexpr auto in_win_strides_new =
// Sequence<in_c_y_x_global_desc.GetStride(I0),
// in_c_y_x_global_desc.GetStride(I1),
// in_c_y_x_global_desc.GetStride(I2)>{};
//constexpr auto in_c_y_x_new_global_desc =
//make_ConstantTensorDescriptor(in_win_lengths_new, in_win_strides_new);
// constexpr auto in_c_y_x_new_global_desc =
// make_ConstantTensorDescriptor(in_win_lengths_new, in_win_strides_new);
// merged tensor descriptor in device memory [E, N1, B, N2], src of blockwise copy
constexpr auto in_e_n1_b_n2_global_merged_desc = make_ConstantMergedTensorDescriptor(
@@ -189,16 +190,14 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
InBlockCopyDstDataPerWrite_N2>(
{0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});
// weight tensor
// tensor descriptor in device memory, src of blockwise copy
// weight tensor
// tensor descriptor in device memory, src of blockwise copy
#if 0
constexpr auto wei_e_k_global_desc =
wei_k_c_y_x_global_desc.Unfold(I1, I3).ReorderGivenNew2Old(Sequence<1, 0>{});
#else
constexpr auto wei_e_k_global_desc =
make_ConstantMergedTensorDescriptor(wei_k_c_y_x_global_desc,
Sequence<1, 2, 3>{},
Sequence<0>{});
constexpr auto wei_e_k_global_desc = make_ConstantMergedTensorDescriptor(
wei_k_c_y_x_global_desc, Sequence<1, 2, 3>{}, Sequence<0>{});
#endif
// tensor descriptor in LDS, dst of blockwise copy
@@ -426,9 +425,10 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetLength(I3),
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetLength(I4),
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetLength(I5),
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetLength(I6) / Strides{}.Get(I0),
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetLength(I7) / Strides{}.Get(I1)
>{};
mod_conv::integer_divide_ceil(
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetLength(I6), Strides{}.Get(I0)),
mod_conv::integer_divide_ceil(
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetLength(I7), Strides{}.Get(I1))>{};
constexpr auto out_strides_new = Sequence<
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetStride(I0),
@@ -438,12 +438,10 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetStride(I4),
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetStride(I5),
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetStride(I6) * Strides{}.Get(I0),
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetStride(I7) * Strides{}.Get(I1)
>{};
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetStride(I7) * Strides{}.Get(I1)>{};
constexpr auto out_n0_n1_n2_k0_k1_k2_h_w_new_global_mem_desc = make_ConstantTensorDescriptor(
out_lengths_new, out_strides_new
);
constexpr auto out_n0_n1_n2_k0_k1_k2_h_w_new_global_mem_desc =
make_ConstantTensorDescriptor(out_lengths_new, out_strides_new);
// calculate origin of thread output tensor on global memory
// blockwise GEMM c matrix starting index
......
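The output-descriptor rewrite in the last two hunks builds a strided view of the H/W dimensions: each length shrinks to ceil(length / stride) while the corresponding memory stride is multiplied by the stride, so only every stride-th position is addressed. A minimal sketch of the same idea on a plain row-major row; the names view_len and view_stride are hypothetical, not from the repo:

#include <cstdio>

// Hypothetical helper mirroring the assumed semantics of mod_conv::integer_divide_ceil.
constexpr int integer_divide_ceil(int a, int b) { return (a + b - 1) / b; }

int main()
{
    constexpr int W = 7, stride = 2;

    // Strided view over a length-7 row: it should visit elements 0, 2, 4, 6.
    constexpr int view_len    = integer_divide_ceil(W, stride); // 4, whereas 7 / 2 == 3
    constexpr int view_stride = 1 * stride;                     // original element stride * stride

    int row[W] = {10, 11, 12, 13, 14, 15, 16};

    for(int i = 0; i < view_len; ++i)
        std::printf("%d ", row[i * view_stride]); // prints: 10 12 14 16
    std::printf("\n");

    return 0;
}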