Commit 11b848da authored by Jing Zhang

fixed stride division to use mod_conv::integer_divide_ceil (ceiling division)

parent 06810ad4
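The change replaces plain integer division by the stride with ceiling division (mod_conv::integer_divide_ceil) when sizing the GEMM B dimension and the sliced Ho/Wo descriptor lengths. A minimal standalone sketch of the intent, assuming integer_divide_ceil has the usual (a + b - 1) / b semantics; the helper and the concrete numbers below are illustrative, not taken from the repository:

#include <cassert>

// Assumed semantics of mod_conv::integer_divide_ceil.
template <class T>
constexpr T integer_divide_ceil(T a, T b)
{
    return (a + b - 1) / b;
}

int main()
{
    // With a 7x7 output grid and stride 2, the strided positions are 0, 2, 4, 6.
    constexpr int N = 128, N1 = 2, N2 = 4;
    constexpr int Ho = 7, Wo = 7, stride_h = 2, stride_w = 2;

    // Old: plain division undercounts the strided positions (7 / 2 == 3).
    constexpr int B_old = N * (Ho / stride_h) * (Wo / stride_w) / (N1 * N2);

    // New: ceiling division counts all of them (ceil(7 / 2) == 4).
    constexpr int B_new =
        N * integer_divide_ceil(Ho, stride_h) * integer_divide_ceil(Wo, stride_w) / (N1 * N2);

    static_assert(B_old == 144, "");
    static_assert(B_new == 256, "");
    assert(B_new > B_old);

    return 0;
}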
@@ -49,7 +49,8 @@ void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc,
constexpr index_t N1 = 2;
constexpr index_t N2 = 4;
constexpr index_t B = (N * (Ho/Strides::Get(I0)) * (Wo/Strides::Get(I1))) / (N1 * N2);
constexpr index_t B = N * mod_conv::integer_divide_ceil(Ho, Strides::Get(I0)) *
mod_conv::integer_divide_ceil(Wo, Strides::Get(I1)) / (N1 * N2);
#if 1
constexpr index_t BlockSize = 256;
......
@@ -25,8 +25,6 @@ struct GeneratorTensor_0
}
};
struct GeneratorTensor_1
{
template <class... Is>
@@ -122,12 +120,12 @@ template <class TIn,
class Strides,
class Dilations>
void host_direct_convolution_forw(const Tensor<TIn>& in_nchw,
const Tensor<TWei>& wei_kcyx,
Tensor<TOut>& out_nkhw,
LowerPads,
UpperPads,
Strides,
Dilations)
const Tensor<TWei>& wei_kcyx,
Tensor<TOut>& out_nkhw,
LowerPads,
UpperPads,
Strides,
Dilations)
{
index_t h_pad_low = LowerPads{}.Get(Number<0>{});
index_t w_pad_low = LowerPads{}.Get(Number<1>{});
@@ -179,13 +177,12 @@ template <class TIn,
class Strides,
class Dilations>
void host_direct_convolution_back(Tensor<TOut>& in_nchw,
const Tensor<TWei>& wei_kcyx,
const Tensor<TIn>& out_nkhw,
LowerPads,
UpperPads,
Strides,
Dilations
)
const Tensor<TWei>& wei_kcyx,
const Tensor<TIn>& out_nkhw,
LowerPads,
UpperPads,
Strides,
Dilations)
{
index_t h_pad_low = LowerPads{}.Get(Number<0>{});
index_t w_pad_low = LowerPads{}.Get(Number<1>{});
@@ -199,22 +196,23 @@ void host_direct_convolution_back(Tensor<TOut>& in_nchw,
index_t dilation_h = Dilations{}.Get(Number<0>{});
index_t dilation_w = Dilations{}.Get(Number<1>{});
//loop n,c,hi,wi
// loop n,c,hi,wi
auto f = [&](auto n, auto c, auto hi, auto wi) {
double v = 0;
//loop k,y,x
// loop k,y,x
for(int k = 0; k < wei_kcyx.mDesc.GetLengths()[0]; ++k)
{
for(int y = 0; y < wei_kcyx.mDesc.GetLengths()[2]; ++y)
{
int ho_ = (hi - y * dilation_h + h_pad_low);
int ho = ho_ / stride_h;
int ho = ho_ / stride_h;
for(int x = 0; x < wei_kcyx.mDesc.GetLengths()[3]; ++x)
{
int wo_ = (wi - x * dilation_w + w_pad_low);
int wo = wo_ / stride_w;
int wo = wo_ / stride_w;
if(ho >= 0 && ho < out_nkhw.mDesc.GetLengths()[2] && wo >= 0 &&
wo < out_nkhw.mDesc.GetLengths()[3] && ho_ % stride_h == 0 && wo_ % stride_w == 0)
wo < out_nkhw.mDesc.GetLengths()[3] && ho_ % stride_h == 0 &&
wo_ % stride_w == 0)
{
v += double(out_nkhw(n, k, ho, wo)) * double(wei_kcyx(k, c, y, x));
}
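The guard in the backward loop above inverts the forward indexing of a strided convolution: input row hi was touched by output row ho only if hi == ho * stride_h + y * dilation_h - h_pad_low, so the rebuilt numerator must be non-negative, in range, and divisible by the stride. A small self-contained illustration of that check, with illustrative values that are not taken from the driver:

#include <cstdio>

int main()
{
    // A forward conv with stride s reads input row hi = ho * s + y * d - pad for output row ho.
    // Inverting: ho = (hi - y * d + pad) / s, valid only when the numerator is
    // divisible by s and the resulting ho lies inside the output height.
    constexpr int stride_h = 2, dilation_h = 1, h_pad_low = 0, y = 0, Ho = 4;

    for(int hi = 0; hi < 7; ++hi)
    {
        int ho_ = hi - y * dilation_h + h_pad_low;
        int ho  = ho_ / stride_h;

        bool contributes = ho >= 0 && ho < Ho && ho_ % stride_h == 0;
        std::printf("hi = %d -> %s\n", hi, contributes ? "accumulates from ho" : "no contribution");
    }

    return 0;
}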
@@ -501,7 +499,7 @@ int main(int argc, char* argv[])
constexpr index_t HDilation = 1;
constexpr index_t WDilation = 1;
constexpr index_t Direction = 2; //1: Forward; 2:Backward
constexpr index_t Direction = 1; // 1: Forward; 2:Backward
#if 0
constexpr index_t N = 32;
constexpr index_t C = 128;
@@ -553,8 +551,8 @@ int main(int argc, char* argv[])
// 1x1 filter, 28x28 image
constexpr index_t N = 128;
constexpr index_t C = 128;
constexpr index_t HI = 28;
constexpr index_t WI = 28;
constexpr index_t HI = 7;
constexpr index_t WI = 7;
constexpr index_t K = 128;
constexpr index_t Y = 1;
constexpr index_t X = 1;
@@ -716,8 +714,8 @@ int main(int argc, char* argv[])
#elif 0
in_nchw.GenerateTensorValue(GeneratorTensor_0{}, num_thread);
wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
//out_nkhw.GenerateTensorValue(GeneratorTensor_Checkboard{}, num_thread);
//out_nkhw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
// out_nkhw.GenerateTensorValue(GeneratorTensor_Checkboard{}, num_thread);
// out_nkhw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
out_nkhw.GenerateTensorValue(GeneratorTensor_4{}, num_thread);
#elif 0
in_nchw.GenerateTensorValue(GeneratorTensor_3{}, num_thread);
@@ -764,8 +762,7 @@ int main(int argc, char* argv[])
strides,
dilations,
in_nchw_device,
nrepeat
);
nrepeat);
#elif 1
device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(in_nchw_desc,
......
@@ -112,27 +112,28 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
// input tensor
// tensor descriptor in device memory [N0, N1, N2, Ho, Wo]
constexpr auto in_n0_n1_n2_h_w_global_desc = in_n_c_h_w_global_desc.Slice(I2,
Number<Ho/Strides::Get(I0)>{})
.Slice(I3, Number<Wo/Strides::Get(I1)>{})
.Fold(I0, Number<N1>{}, Number<N2>{})
.Extract(Sequence<0, 1, 2, 4, 5>{});
//constexpr auto in_n0_n1_n2_h_w_global_desc =
//in_n_c_h_w_global_desc.Fold(I0, Number<N1>{}, Number<N2>{})
//.Extract(Sequence<0, 1, 2, 4, 5>{});
//constexpr auto in_lengths_new = Sequence<N0, N1, N2, Ho, Wo>{};
//constexpr auto in_strides_new =
//Sequence<in_n0_n1_n2_h_w_global_desc.GetStride(I0),
//in_n0_n1_n2_h_w_global_desc.GetStride(I1),
//in_n0_n1_n2_h_w_global_desc.GetStride(I2),
//in_n0_n1_n2_h_w_global_desc.GetStride(I3),
//in_n0_n1_n2_h_w_global_desc.GetStride(I4)>{};
//constexpr auto in_n0_n1_n2_h_w_new_global_desc =
//make_ConstantTensorDescriptor(in_lengths_new, in_strides_new);
constexpr auto in_n0_n1_n2_h_w_global_desc =
in_n_c_h_w_global_desc
.Slice(I2, Number<mod_conv::integer_divide_ceil(Ho, Strides::Get(I0))>{})
.Slice(I3, Number<mod_conv::integer_divide_ceil(Wo, Strides::Get(I1))>{})
.Fold(I0, Number<N1>{}, Number<N2>{})
.Extract(Sequence<0, 1, 2, 4, 5>{});
// constexpr auto in_n0_n1_n2_h_w_global_desc =
// in_n_c_h_w_global_desc.Fold(I0, Number<N1>{}, Number<N2>{})
//.Extract(Sequence<0, 1, 2, 4, 5>{});
// constexpr auto in_lengths_new = Sequence<N0, N1, N2, Ho, Wo>{};
// constexpr auto in_strides_new =
// Sequence<in_n0_n1_n2_h_w_global_desc.GetStride(I0),
// in_n0_n1_n2_h_w_global_desc.GetStride(I1),
// in_n0_n1_n2_h_w_global_desc.GetStride(I2),
// in_n0_n1_n2_h_w_global_desc.GetStride(I3),
// in_n0_n1_n2_h_w_global_desc.GetStride(I4)>{};
// constexpr auto in_n0_n1_n2_h_w_new_global_desc =
// make_ConstantTensorDescriptor(in_lengths_new, in_strides_new);
constexpr auto in_n0_n1_n2_h_w_new_global_desc = in_n0_n1_n2_h_w_global_desc;
// batch descriptor for device memory
@@ -140,17 +141,17 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
constexpr auto in_c_y_x_global_desc = in_n_c_h_w_global_desc.Slice(I2, Number<Y>{})
.Slice(I3, Number<X>{})
.Extract(Sequence<1, 2, 3>{});
//constexpr auto in_win_lengths_new = Sequence<in_c_y_x_global_desc.GetLength(I0),
//in_c_y_x_global_desc.GetLength(I1),
//in_c_y_x_global_desc.GetLength(I2)>{};
// constexpr auto in_win_lengths_new = Sequence<in_c_y_x_global_desc.GetLength(I0),
// in_c_y_x_global_desc.GetLength(I1),
// in_c_y_x_global_desc.GetLength(I2)>{};
//constexpr auto in_win_strides_new =
//Sequence<in_c_y_x_global_desc.GetStride(I0),
//in_c_y_x_global_desc.GetStride(I1),
//in_c_y_x_global_desc.GetStride(I2)>{};
// constexpr auto in_win_strides_new =
// Sequence<in_c_y_x_global_desc.GetStride(I0),
// in_c_y_x_global_desc.GetStride(I1),
// in_c_y_x_global_desc.GetStride(I2)>{};
//constexpr auto in_c_y_x_new_global_desc =
//make_ConstantTensorDescriptor(in_win_lengths_new, in_win_strides_new);
// constexpr auto in_c_y_x_new_global_desc =
// make_ConstantTensorDescriptor(in_win_lengths_new, in_win_strides_new);
// merged tensor descriptor in device memory [E, N1, B, N2], src of blockwise copy
constexpr auto in_e_n1_b_n2_global_merged_desc = make_ConstantMergedTensorDescriptor(
@@ -189,16 +190,14 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
InBlockCopyDstDataPerWrite_N2>(
{0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});
// weight tensor
// tensor descriptor in device memory, src of blockwise copy
// weight tensor
// tensor descriptor in device memory, src of blockwise copy
#if 0
constexpr auto wei_e_k_global_desc =
wei_k_c_y_x_global_desc.Unfold(I1, I3).ReorderGivenNew2Old(Sequence<1, 0>{});
#else
constexpr auto wei_e_k_global_desc =
make_ConstantMergedTensorDescriptor(wei_k_c_y_x_global_desc,
Sequence<1, 2, 3>{},
Sequence<0>{});
constexpr auto wei_e_k_global_desc = make_ConstantMergedTensorDescriptor(
wei_k_c_y_x_global_desc, Sequence<1, 2, 3>{}, Sequence<0>{});
#endif
// tensor descriptor in LDS, dst of blockwise copy
@@ -426,9 +425,10 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetLength(I3),
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetLength(I4),
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetLength(I5),
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetLength(I6) / Strides{}.Get(I0),
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetLength(I7) / Strides{}.Get(I1)
>{};
mod_conv::integer_divide_ceil(
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetLength(I6), Strides{}.Get(I0)),
mod_conv::integer_divide_ceil(
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetLength(I7), Strides{}.Get(I1))>{};
constexpr auto out_strides_new = Sequence<
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetStride(I0),
@@ -438,12 +438,10 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetStride(I4),
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetStride(I5),
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetStride(I6) * Strides{}.Get(I0),
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetStride(I7) * Strides{}.Get(I1)
>{};
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetStride(I7) * Strides{}.Get(I1)>{};
constexpr auto out_n0_n1_n2_k0_k1_k2_h_w_new_global_mem_desc = make_ConstantTensorDescriptor(
out_lengths_new, out_strides_new
);
constexpr auto out_n0_n1_n2_k0_k1_k2_h_w_new_global_mem_desc =
make_ConstantTensorDescriptor(out_lengths_new, out_strides_new);
// calculate origin of thread output tensor on global memory
// blockwise GEMM c matrix starting index
......
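The output-descriptor rewrite in the last two hunks builds a strided view of the H/W dimensions: each length shrinks to ceil(length / stride) while the corresponding memory stride is multiplied by the stride, so only every stride-th position is addressed. A minimal sketch of the same idea on a plain row-major row; the names view_len and view_stride are hypothetical, not from the repo:

#include <cstdio>

// Hypothetical helper mirroring the assumed semantics of mod_conv::integer_divide_ceil.
constexpr int integer_divide_ceil(int a, int b) { return (a + b - 1) / b; }

int main()
{
    constexpr int W = 7, stride = 2;

    // Strided view over a length-7 row: it should visit elements 0, 2, 4, 6.
    constexpr int view_len    = integer_divide_ceil(W, stride); // 4, whereas 7 / 2 == 3
    constexpr int view_stride = 1 * stride;                     // original element stride * stride

    int row[W] = {10, 11, 12, 13, 14, 15, 16};

    for(int i = 0; i < view_len; ++i)
        std::printf("%d ", row[i * view_stride]); // prints: 10 12 14 16
    std::printf("\n");

    return 0;
}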