Add test for large batches in DeformConv2d (#2040)

* Add test for large batches in DeformConv2d
* Clean-up and (try) fix DeformConv2d
* Simplifications and bugfixes
* Try fix CUDA now

Add test for large batches in DeformConv2d (#2040)
* Add test for large batches in DeformConv2d
* Clean-up and (try) fix DeformConv2d
* Simplifications and bugfixes
* Try fix CUDA now
ccd797dd · Francisco Massa · GitHub · 979bb72e · ccd797dd · ccd797dd
Unverified Commit ccd797dd authored Apr 02, 2020 by Francisco Massa Committed by GitHub Apr 02, 2020
3 changed files
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -454,7 +454,7 @@ class DeformConvTester(OpTester, unittest.TestCase):
        return out

    def get_fn_args(self, device, contiguous):
-        batch_sz = 1
+        batch_sz = 33
        n_in_channels = 6
        n_out_channels = 2
        n_weight_grps = 2

--- a/torchvision/csrc/cpu/DeformConv_cpu.cpp
+++ b/torchvision/csrc/cpu/DeformConv_cpu.cpp
@@ -713,55 +713,49 @@ static std::tuple<at::Tensor, at::Tensor> deform_conv2d_backward_input_cpu(

  auto grad_input = at::zeros_like(input);
  auto grad_offset = at::zeros_like(offset);
-  auto columns = at::zeros(
+  auto columns = at::empty(
      {n_in_channels * weight_w * weight_h, n_parallel_imgs * out_h * out_w},
      input.options());

  // Separate into blocks
-  grad_input = grad_input.view(
+  grad_input = grad_input.reshape(
      {batch_sz / n_parallel_imgs, n_parallel_imgs, n_in_channels, in_h, in_w});
-  input = input.view(
+  input = input.reshape(
      {batch_sz / n_parallel_imgs, n_parallel_imgs, n_in_channels, in_h, in_w});
-  grad_offset = grad_offset.view({batch_sz / n_parallel_imgs,
-                                  n_parallel_imgs,
-                                  n_offset_grps * 2 * weight_h * weight_w,
-                                  out_h,
-                                  out_w});
-  offset = offset.view({batch_sz / n_parallel_imgs,
-                        n_parallel_imgs,
-                        n_offset_grps * 2 * weight_h * weight_w,
-                        out_h,
-                        out_w});
-
-  grad_out = grad_out.view({batch_sz / n_parallel_imgs,
-                            n_parallel_imgs,
-                            n_out_channels,
-                            out_h,
-                            out_w});
-  grad_out.transpose_(1, 2);
-  grad_out = grad_out.view({grad_out.size(0),
-                            n_weight_grps,
-                            grad_out.size(1) / n_weight_grps,
-                            grad_out.size(2),
-                            grad_out.size(3),
-                            grad_out.size(4)});
-
-  weight = weight.view({n_weight_grps,
-                        weight.size(0) / n_weight_grps,
-                        weight.size(1),
-                        weight.size(2),
-                        weight.size(3)});
+  grad_offset = grad_offset.reshape({batch_sz / n_parallel_imgs,
+                                     n_parallel_imgs,
+                                     n_offset_grps * 2 * weight_h * weight_w,
+                                     out_h,
+                                     out_w});
+  offset = offset.reshape({batch_sz / n_parallel_imgs,
+                           n_parallel_imgs,
+                           n_offset_grps * 2 * weight_h * weight_w,
+                           out_h,
+                           out_w});
+
+  grad_out = grad_out.reshape({batch_sz / n_parallel_imgs,
+                               n_parallel_imgs,
+                               n_weight_grps,
+                               n_out_channels / n_weight_grps,
+                               out_h,
+                               out_w}).permute({0, 2, 3, 1, 4, 5});
+
+  weight = weight.reshape({n_weight_grps,
+                           weight.size(0) / n_weight_grps,
+                           weight.size(1),
+                           weight.size(2),
+                           weight.size(3)});
+
+  columns = columns.view(
+      {n_weight_grps, columns.size(0) / n_weight_grps, columns.size(1)});

  for (int elt = 0; elt < batch_sz / n_parallel_imgs; elt++) {
+    columns.zero_();
    // Separate into weight groups
-    columns = columns.view(
-        {n_weight_grps, columns.size(0) / n_weight_grps, columns.size(1)});
    for (int g = 0; g < n_weight_grps; g++) {
      columns[g] = columns[g].addmm_(
          weight[g].flatten(1).transpose(0, 1), grad_out[elt][g].flatten(1));
    }
-    columns =
-        columns.view({columns.size(0) * columns.size(1), columns.size(2)});

    compute_grad_offset(
        columns,
@@ -801,20 +795,9 @@ static std::tuple<at::Tensor, at::Tensor> deform_conv2d_backward_input_cpu(
        grad_input[elt]);
  }

-  grad_out = grad_out.view({grad_out.size(0),
-                            grad_out.size(1) * grad_out.size(2),
-                            grad_out.size(3),
-                            grad_out.size(4),
-                            grad_out.size(5)});
-  grad_out.transpose_(1, 2);
-  grad_out = grad_out.view({batch_sz, n_out_channels, out_h, out_w});
-
  grad_input = grad_input.view({batch_sz, n_in_channels, in_h, in_w});
-  input = input.view({batch_sz, n_in_channels, in_h, in_w});
  grad_offset = grad_offset.view(
      {batch_sz, n_offset_grps * 2 * weight_h * weight_w, out_h, out_w});
-  offset = offset.view(
-      {batch_sz, n_offset_grps * 2 * weight_h * weight_w, out_h, out_w});

  return std::make_tuple(grad_input, grad_offset);
 }
@@ -854,46 +837,36 @@ static at::Tensor deform_conv2d_backward_parameters_cpu(
  long out_w = grad_out.size(3);

  auto grad_weight = at::zeros_like(weight);
-  ;
-  auto columns = at::zeros(
-      {n_in_channels * weight_w * weight_h, n_parallel_imgs * out_h * out_w},
-      input.options());

-  grad_out = grad_out.view({batch_sz / n_parallel_imgs,
-                            n_parallel_imgs,
-                            n_out_channels,
-                            out_h,
-                            out_w});
-  grad_out.transpose_(1, 2);
-
-  at::Tensor grad_out_buf = at::zeros_like(grad_out);
-  grad_out_buf.copy_(grad_out);
-  grad_out_buf = grad_out_buf.view({batch_sz / n_parallel_imgs,
-                                    n_out_channels,
-                                    n_parallel_imgs * out_h,
-                                    out_w});
-  grad_out_buf = grad_out_buf.view({grad_out_buf.size(0),
-                                    n_weight_grps,
-                                    grad_out_buf.size(1) / n_weight_grps,
-                                    grad_out_buf.size(2),
-                                    grad_out_buf.size(3)});
-
-  grad_out.transpose_(1, 2);
-  grad_out = grad_out.view({batch_sz, n_out_channels, out_h, out_w});
-
-  input = input.view(
+  at::Tensor grad_out_buf = grad_out.reshape(
+      {batch_sz / n_parallel_imgs,
+       n_parallel_imgs,
+       n_weight_grps,
+       n_out_channels / n_weight_grps,
+       out_h,
+       out_w}
+  ).permute({0, 2, 3, 1, 4, 5}).contiguous();
+
+  input = input.reshape(
      {batch_sz / n_parallel_imgs, n_parallel_imgs, n_in_channels, in_h, in_w});
-  offset = offset.view({batch_sz / n_parallel_imgs,
-                        n_parallel_imgs,
-                        n_offset_grps * 2 * weight_h * weight_w,
-                        out_h,
-                        out_w});
+  offset = offset.reshape({batch_sz / n_parallel_imgs,
+                           n_parallel_imgs,
+                           n_offset_grps * 2 * weight_h * weight_w,
+                           out_h,
+                           out_w});

  grad_weight = grad_weight.view({n_weight_grps,
                                  grad_weight.size(0) / n_weight_grps,
                                  grad_weight.size(1),
                                  grad_weight.size(2),
                                  grad_weight.size(3)});
+
+  auto columns = at::empty(
+      {n_weight_grps,
+       n_in_channels * weight_w * weight_h / n_weight_grps,
+       n_parallel_imgs * out_h * out_w},
+      input.options());
+
  for (int elt = 0; elt < batch_sz / n_parallel_imgs; elt++) {
    deformable_im2col(
        input[elt],
@@ -915,8 +888,6 @@ static at::Tensor deform_conv2d_backward_parameters_cpu(
        n_offset_grps,
        columns);

-    columns = columns.view(
-        {n_weight_grps, columns.size(0) / n_weight_grps, columns.size(1)});
    for (int g = 0; g < n_weight_grps; g++) {
      grad_weight[g] =
          grad_weight[g]
@@ -925,14 +896,8 @@ static at::Tensor deform_conv2d_backward_parameters_cpu(
                  grad_out_buf[elt][g].flatten(1), columns[g].transpose(1, 0))
              .view_as(grad_weight[g]);
    }
-    columns =
-        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
  }

-  input = input.view({batch_sz, n_in_channels, in_h, in_w});
-  offset = offset.view(
-      {batch_sz, n_offset_grps * 2 * weight_h * weight_w, out_h, out_w});
-
  grad_weight = grad_weight.view({grad_weight.size(0) * grad_weight.size(1),
                                  grad_weight.size(2),
                                  grad_weight.size(3),

--- a/torchvision/csrc/cuda/DeformConv_cuda.cu
+++ b/torchvision/csrc/cuda/DeformConv_cuda.cu
@@ -744,55 +744,48 @@ static std::tuple<at::Tensor, at::Tensor> deform_conv_backward_input_cuda(

  auto grad_input = at::zeros_like(input);
  auto grad_offset = at::zeros_like(offset);
-  auto columns = at::zeros(
+  auto columns = at::empty(
      {n_in_channels * weight_w * weight_h, n_parallel_imgs * out_h * out_w},
      input.options());

  // Separate into blocks
-  grad_input = grad_input.view(
+  grad_input = grad_input.reshape(
      {batch_sz / n_parallel_imgs, n_parallel_imgs, n_in_channels, in_h, in_w});
-  input = input.view(
+  input = input.reshape(
      {batch_sz / n_parallel_imgs, n_parallel_imgs, n_in_channels, in_h, in_w});
-  grad_offset = grad_offset.view({batch_sz / n_parallel_imgs,
-                                  n_parallel_imgs,
-                                  n_offset_grps * 2 * weight_h * weight_w,
-                                  out_h,
-                                  out_w});
-  offset = offset.view({batch_sz / n_parallel_imgs,
-                        n_parallel_imgs,
-                        n_offset_grps * 2 * weight_h * weight_w,
-                        out_h,
-                        out_w});
-
-  grad_out = grad_out.view({batch_sz / n_parallel_imgs,
-                            n_parallel_imgs,
-                            n_out_channels,
-                            out_h,
-                            out_w});
-  grad_out.transpose_(1, 2);
-  grad_out = grad_out.view({grad_out.size(0),
-                            n_weight_grps,
-                            grad_out.size(1) / n_weight_grps,
-                            grad_out.size(2),
-                            grad_out.size(3),
-                            grad_out.size(4)});
-
-  weight = weight.view({n_weight_grps,
-                        weight.size(0) / n_weight_grps,
-                        weight.size(1),
-                        weight.size(2),
-                        weight.size(3)});
+  grad_offset = grad_offset.reshape({batch_sz / n_parallel_imgs,
+                                     n_parallel_imgs,
+                                     n_offset_grps * 2 * weight_h * weight_w,
+                                     out_h,
+                                     out_w});
+  offset = offset.reshape({batch_sz / n_parallel_imgs,
+                           n_parallel_imgs,
+                           n_offset_grps * 2 * weight_h * weight_w,
+                           out_h,
+                           out_w});
+
+  grad_out = grad_out.reshape({batch_sz / n_parallel_imgs,
+                               n_parallel_imgs,
+                               n_weight_grps,
+                               n_out_channels / n_weight_grps,
+                               out_h,
+                               out_w}).permute({0, 2, 3, 1, 4, 5});
+
+  weight = weight.reshape({n_weight_grps,
+                           weight.size(0) / n_weight_grps,
+                           weight.size(1),
+                           weight.size(2),
+                           weight.size(3)});
  
+  columns = columns.view(
+      {n_weight_grps, columns.size(0) / n_weight_grps, columns.size(1)});
  for (int elt = 0; elt < batch_sz / n_parallel_imgs; elt++) {
+    columns.zero_();
    // Separate into weight groups
-    columns = columns.view(
-        {n_weight_grps, columns.size(0) / n_weight_grps, columns.size(1)});
    for (int g = 0; g < n_weight_grps; g++) {
      columns[g] = columns[g].addmm_(
          weight[g].flatten(1).transpose(0, 1), grad_out[elt][g].flatten(1));
    }
-    columns =
-        columns.view({columns.size(0) * columns.size(1), columns.size(2)});

    compute_grad_offset(
        columns,
@@ -832,20 +825,10 @@ static std::tuple<at::Tensor, at::Tensor> deform_conv_backward_input_cuda(
        grad_input[elt]);
  }

-  grad_out = grad_out.view({grad_out.size(0),
-                            grad_out.size(1) * grad_out.size(2),
-                            grad_out.size(3),
-                            grad_out.size(4),
-                            grad_out.size(5)});
-  grad_out.transpose_(1, 2);
-  grad_out = grad_out.view({batch_sz, n_out_channels, out_h, out_w});

  grad_input = grad_input.view({batch_sz, n_in_channels, in_h, in_w});
-  input = input.view({batch_sz, n_in_channels, in_h, in_w});
  grad_offset = grad_offset.view(
      {batch_sz, n_offset_grps * 2 * weight_h * weight_w, out_h, out_w});
-  offset = offset.view(
-      {batch_sz, n_offset_grps * 2 * weight_h * weight_w, out_h, out_w});

  return std::make_tuple(grad_input, grad_offset);
 }
@@ -887,46 +870,36 @@ static at::Tensor deform_conv_backward_parameters_cuda(
  long out_w = grad_out.size(3);

  auto grad_weight = at::zeros_like(weight);
-  ;
-  auto columns = at::zeros(
-      {n_in_channels * weight_w * weight_h, n_parallel_imgs * out_h * out_w},
-      input.options());

-  grad_out = grad_out.view({batch_sz / n_parallel_imgs,
-                            n_parallel_imgs,
-                            n_out_channels,
-                            out_h,
-                            out_w});
-  grad_out.transpose_(1, 2);
-
-  at::Tensor grad_out_buf = at::zeros_like(grad_out);
-  grad_out_buf.copy_(grad_out);
-  grad_out_buf = grad_out_buf.view({batch_sz / n_parallel_imgs,
-                                    n_out_channels,
-                                    n_parallel_imgs * out_h,
-                                    out_w});
-  grad_out_buf = grad_out_buf.view({grad_out_buf.size(0),
-                                    n_weight_grps,
-                                    grad_out_buf.size(1) / n_weight_grps,
-                                    grad_out_buf.size(2),
-                                    grad_out_buf.size(3)});
-
-  grad_out.transpose_(1, 2);
-  grad_out = grad_out.view({batch_sz, n_out_channels, out_h, out_w});
-
-  input = input.view(
+  at::Tensor grad_out_buf = grad_out.reshape(
+      {batch_sz / n_parallel_imgs,
+       n_parallel_imgs,
+       n_weight_grps,
+       n_out_channels / n_weight_grps,
+       out_h,
+       out_w}
+  ).permute({0, 2, 3, 1, 4, 5}).contiguous();
+
+  input = input.reshape(
      {batch_sz / n_parallel_imgs, n_parallel_imgs, n_in_channels, in_h, in_w});
-  offset = offset.view({batch_sz / n_parallel_imgs,
-                        n_parallel_imgs,
-                        n_offset_grps * 2 * weight_h * weight_w,
-                        out_h,
-                        out_w});
+  offset = offset.reshape({batch_sz / n_parallel_imgs,
+                           n_parallel_imgs,
+                           n_offset_grps * 2 * weight_h * weight_w,
+                           out_h,
+                           out_w});
+
+  grad_weight = grad_weight.reshape({n_weight_grps,
+                                     grad_weight.size(0) / n_weight_grps,
+                                     grad_weight.size(1),
+                                     grad_weight.size(2),
+                                     grad_weight.size(3)});
+
+  auto columns = at::empty(
+      {n_weight_grps,
+       n_in_channels * weight_w * weight_h / n_weight_grps,
+       n_parallel_imgs * out_h * out_w},
+      input.options());

-  grad_weight = grad_weight.view({n_weight_grps,
-                                  grad_weight.size(0) / n_weight_grps,
-                                  grad_weight.size(1),
-                                  grad_weight.size(2),
-                                  grad_weight.size(3)});
  for (int elt = 0; elt < batch_sz / n_parallel_imgs; elt++) {
    deformable_im2col(
        input[elt],
@@ -948,8 +921,6 @@ static at::Tensor deform_conv_backward_parameters_cuda(
        n_offset_grps,
        columns);

-    columns = columns.view(
-        {n_weight_grps, columns.size(0) / n_weight_grps, columns.size(1)});
    for (int g = 0; g < n_weight_grps; g++) {
      grad_weight[g] =
          grad_weight[g]
@@ -958,14 +929,8 @@ static at::Tensor deform_conv_backward_parameters_cuda(
                  grad_out_buf[elt][g].flatten(1), columns[g].transpose(1, 0))
              .view_as(grad_weight[g]);
    }
-    columns =
-        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
  }

-  input = input.view({batch_sz, n_in_channels, in_h, in_w});
-  offset = offset.view(
-      {batch_sz, n_offset_grps * 2 * weight_h * weight_w, out_h, out_w});
-
  grad_weight = grad_weight.view({grad_weight.size(0) * grad_weight.size(1),
                                  grad_weight.size(2),
                                  grad_weight.size(3),