support device dispatch in parrots (#1588)

Commit a4dc2a72 (unverified) · author: pc · committer: GitHub · related commit (likely parent): 0bcbeadb
Authored Dec 24, 2021 by pc; committed by GitHub on Dec 24, 2021.
6 changed files
--- a/mmcv/ops/csrc/parrots/three_nn.cpp
+++ b/mmcv/ops/csrc/parrots/three_nn.cpp
@@ -2,29 +2,17 @@
 // https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate.cpp

 #include "pytorch_cpp_helper.hpp"
+#include "pytorch_device_registry.hpp"

-#ifdef MMCV_WITH_CUDA
-void ThreeNNForwardCUDAKernelLauncher(int b, int n, int m, const Tensor unknown,
-                                      const Tensor known, Tensor dist2,
-                                      Tensor idx);
-
-void three_nn_forward_cuda(int b, int n, int m, const Tensor unknown,
+void three_nn_forward_impl(int b, int n, int m, const Tensor unknown,
                           const Tensor known, Tensor dist2, Tensor idx) {
-  ThreeNNForwardCUDAKernelLauncher(b, n, m, unknown, known, dist2, idx);
-};
-#endif
+  DISPATCH_DEVICE_IMPL(three_nn_forward_impl, b, n, m, unknown, known, dist2,
+                       idx);
+}

 void three_nn_forward(Tensor unknown_tensor, Tensor known_tensor,
                      Tensor dist2_tensor, Tensor idx_tensor, int b, int n,
                      int m) {
-  if (unknown_tensor.device().is_cuda()) {
-#ifdef MMCV_WITH_CUDA
-    three_nn_forward_cuda(b, n, m, unknown_tensor, known_tensor, dist2_tensor,
-                          idx_tensor);
-#else
-    AT_ERROR("three_nn is not compiled with GPU support");
-#endif
-  } else {
-    AT_ERROR("three_nn is not implemented on CPU");
-  }
+  three_nn_forward_impl(b, n, m, unknown_tensor, known_tensor, dist2_tensor,
+                        idx_tensor);
 }
--- a/mmcv/ops/csrc/parrots/tin_shift.cpp
+++ b/mmcv/ops/csrc/parrots/tin_shift.cpp
 // Copyright (c) OpenMMLab. All rights reserved
 #include "pytorch_cpp_helper.hpp"
+#include "pytorch_device_registry.hpp"

-#ifdef MMCV_WITH_CUDA
-void TINShiftForwardCUDAKernelLauncher(Tensor input, Tensor shift,
-                                       Tensor output);
-
-void TINShiftBackwardCUDAKernelLauncher(Tensor grad_output, Tensor shift,
-                                        Tensor grad_input);
-
-void tin_shift_forward_cuda(Tensor input, Tensor shift, Tensor output) {
-  TINShiftForwardCUDAKernelLauncher(input, shift, output);
+void tin_shift_forward_impl(Tensor input, Tensor shift, Tensor output) {
+  DISPATCH_DEVICE_IMPL(tin_shift_forward_impl, input, shift, output);
 }

-void tin_shift_backward_cuda(Tensor grad_output, Tensor shift,
+void tin_shift_backward_impl(Tensor grad_output, Tensor shift,
                             Tensor grad_input) {
-  TINShiftBackwardCUDAKernelLauncher(grad_output, shift, grad_input);
+  DISPATCH_DEVICE_IMPL(tin_shift_backward_impl, grad_output, shift, grad_input);
 }

-#endif
-
 void tin_shift_forward(Tensor input, Tensor shift, Tensor output) {
-  if (input.device().is_cuda()) {
-#ifdef MMCV_WITH_CUDA
-    CHECK_CUDA_INPUT(input);
-    CHECK_CUDA_INPUT(shift);
-    CHECK_CUDA_INPUT(output);
-
-    tin_shift_forward_cuda(input, shift, output);
-#else
-    AT_ERROR("TINShift is not compiled with GPU support");
-#endif
-  } else {
-    AT_ERROR("TINShift is not implemented on CPU");
-  }
+  tin_shift_forward_impl(input, shift, output);
 }

 void tin_shift_backward(Tensor grad_output, Tensor shift, Tensor grad_input) {
-  if (grad_output.device().is_cuda()) {
-#ifdef MMCV_WITH_CUDA
-    CHECK_CUDA_INPUT(grad_output);
-    CHECK_CUDA_INPUT(shift);
-    CHECK_CUDA_INPUT(grad_input);
-
-    tin_shift_backward_cuda(grad_output, shift, grad_input);
-#else
-    AT_ERROR("TINShift is not compiled with GPU support");
-#endif
-  } else {
-    AT_ERROR("TINShift is not implemented on CPU");
-  }
+  tin_shift_backward_impl(grad_output, shift, grad_input);
 }
--- a/mmcv/ops/csrc/parrots/upfirdn2d.cpp
+++ b/mmcv/ops/csrc/parrots/upfirdn2d.cpp
-// Copyright (c) OpenMMLab. All rights reserved
-// from
+// Modified from
 // https://github.com/rosinality/stylegan2-pytorch/blob/master/op/upfirdn2d.cpp
-#include "pytorch_cpp_helper.hpp"

-#ifdef MMCV_WITH_CUDA
-torch::Tensor upfirdn2d_op(const torch::Tensor &input,
-                           const torch::Tensor &kernel, int up_x, int up_y,
-                           int down_x, int down_y, int pad_x0, int pad_x1,
-                           int pad_y0, int pad_y1);
+/*
+Copyright (c) 2021, NVIDIA Corporation. All rights reserved.
+
+NVIDIA Source Code License for StyleGAN2 with Adaptive Discriminator
+Augmentation (ADA)
+=======================================================================
+
+1. Definitions
+
+"Licensor" means any person or entity that distributes its Work.
+
+"Software" means the original work of authorship made available under
+this License.
+
+"Work" means the Software and any additions to or derivative works of
+the Software that are made available under this License.
+
+The terms "reproduce," "reproduction," "derivative works," and
+"distribution" have the meaning as provided under U.S. copyright law;
+provided, however, that for the purposes of this License, derivative
+works shall not include works that remain separable from, or merely
+link (or bind by name) to the interfaces of, the Work.
+
+Works, including the Software, are "made available" under this License
+by including in or with the Work either (a) a copyright notice
+referencing the applicability of this License to the Work, or (b) a
+copy of this License.
+
+2. License Grants
+
+    2.1 Copyright Grant. Subject to the terms and conditions of this
+    License, each Licensor grants to you a perpetual, worldwide,
+    non-exclusive, royalty-free, copyright license to reproduce,
+    prepare derivative works of, publicly display, publicly perform,
+    sublicense and distribute its Work and any resulting derivative
+    works in any form.
+
+3. Limitations
+
+    3.1 Redistribution. You may reproduce or distribute the Work only
+    if (a) you do so under this License, (b) you include a complete
+    copy of this License with your distribution, and (c) you retain
+    without modification any copyright, patent, trademark, or
+    attribution notices that are present in the Work.
+
+    3.2 Derivative Works. You may specify that additional or different
+    terms apply to the use, reproduction, and distribution of your
+    derivative works of the Work ("Your Terms") only if (a) Your Terms
+    provide that the use limitation in Section 3.3 applies to your
+    derivative works, and (b) you identify the specific derivative
+    works that are subject to Your Terms. Notwithstanding Your Terms,
+    this License (including the redistribution requirements in Section
+    3.1) will continue to apply to the Work itself.

-#endif
+    3.3 Use Limitation. The Work and any derivative works thereof only
+    may be used or intended for use non-commercially. Notwithstanding
+    the foregoing, NVIDIA and its affiliates may use the Work and any
+    derivative works commercially. As used herein, "non-commercially"
+    means for research or evaluation purposes only.
+
+    3.4 Patent Claims. If you bring or threaten to bring a patent claim
+    against any Licensor (including any claim, cross-claim or
+    counterclaim in a lawsuit) to enforce any patents that you allege
+    are infringed by any Work, then your rights under this License from
+    such Licensor (including the grant in Section 2.1) will terminate
+    immediately.
+
+    3.5 Trademarks. This License does not grant any rights to use any
+    Licensor’s or its affiliates’ names, logos, or trademarks, except
+    as necessary to reproduce the notices described in this License.
+
+    3.6 Termination. If you violate any term of this License, then your
+    rights under this License (including the grant in Section 2.1) will
+    terminate immediately.
+
+4. Disclaimer of Warranty.
+
+THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR
+NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER
+THIS LICENSE.
+
+5. Limitation of Liability.
+
+EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL
+THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE
+SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
+INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF
+OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK
+(INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,
+LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER
+COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF
+THE POSSIBILITY OF SUCH DAMAGES.
+
+=======================================================================
+*/
+
+#include "pytorch_cpp_helper.hpp"
+#include "pytorch_device_registry.hpp"
+
+torch::Tensor upfirdn2d_op_impl(const torch::Tensor& input,
+                                const torch::Tensor& kernel, int up_x, int up_y,
+                                int down_x, int down_y, int pad_x0, int pad_x1,
+                                int pad_y0, int pad_y1) {
+  return DISPATCH_DEVICE_IMPL(upfirdn2d_op_impl, input, kernel, up_x, up_y,
+                              down_x, down_y, pad_x0, pad_x1, pad_y0, pad_y1);
+}

-torch::Tensor upfirdn2d(const torch::Tensor &input, const torch::Tensor &kernel,
+torch::Tensor upfirdn2d(const torch::Tensor& input, const torch::Tensor& kernel,
                        int up_x, int up_y, int down_x, int down_y, int pad_x0,
                        int pad_x1, int pad_y0, int pad_y1) {
-#ifdef MMCV_WITH_CUDA
-  CHECK_CUDA(input);
-  CHECK_CUDA(kernel);
-
-  return upfirdn2d_op(input, kernel, up_x, up_y, down_x, down_y, pad_x0, pad_x1,
-                      pad_y0, pad_y1);
-#else
-  AT_ERROR("UpFirDn2d is not compiled with GPU support");
-#endif
+  return upfirdn2d_op_impl(input, kernel, up_x, up_y, down_x, down_y, pad_x0,
+                           pad_x1, pad_y0, pad_y1);
 }
--- a/mmcv/ops/csrc/parrots/voxelization.cpp
+++ b/mmcv/ops/csrc/parrots/voxelization.cpp
 // Copyright (c) OpenMMLab. All rights reserved.
 #include "pytorch_cpp_helper.hpp"
+#include "pytorch_device_registry.hpp"

-#ifdef MMCV_WITH_CUDA
-int HardVoxelizeForwardCUDAKernelLauncher(
-    const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,
-    at::Tensor &num_points_per_voxel, const std::vector<float> voxel_size,
-    const std::vector<float> coors_range, const int max_points,
-    const int max_voxels, const int NDim = 3);
-
-int hard_voxelize_forward_cuda(const at::Tensor &points, at::Tensor &voxels,
+int hard_voxelize_forward_impl(const at::Tensor &points, at::Tensor &voxels,
                               at::Tensor &coors,
                               at::Tensor &num_points_per_voxel,
                               const std::vector<float> voxel_size,
                               const std::vector<float> coors_range,
                               const int max_points, const int max_voxels,
                               const int NDim = 3) {
-  return HardVoxelizeForwardCUDAKernelLauncher(
-      points, voxels, coors, num_points_per_voxel, voxel_size, coors_range,
-      max_points, max_voxels, NDim);
-};
-
-void DynamicVoxelizeForwardCUDAKernelLauncher(
-    const at::Tensor &points, at::Tensor &coors,
-    const std::vector<float> voxel_size, const std::vector<float> coors_range,
-    const int NDim = 3);
+  return DISPATCH_DEVICE_IMPL(hard_voxelize_forward_impl, points, voxels, coors,
+                              num_points_per_voxel, voxel_size, coors_range,
+                              max_points, max_voxels, NDim);
+}

-void dynamic_voxelize_forward_cuda(const at::Tensor &points, at::Tensor &coors,
+void dynamic_voxelize_forward_impl(const at::Tensor &points, at::Tensor &coors,
                                   const std::vector<float> voxel_size,
                                   const std::vector<float> coors_range,
                                   const int NDim = 3) {
-  DynamicVoxelizeForwardCUDAKernelLauncher(points, coors, voxel_size,
-                                           coors_range, NDim);
-};
-#endif
-
-int hard_voxelize_forward_cpu(const at::Tensor &points, at::Tensor &voxels,
-                              at::Tensor &coors,
-                              at::Tensor &num_points_per_voxel,
-                              const std::vector<float> voxel_size,
-                              const std::vector<float> coors_range,
-                              const int max_points, const int max_voxels,
-                              const int NDim = 3);
-
-void dynamic_voxelize_forward_cpu(const at::Tensor &points, at::Tensor &coors,
-                                  const std::vector<float> voxel_size,
-                                  const std::vector<float> coors_range,
-                                  const int NDim = 3);
+  DISPATCH_DEVICE_IMPL(dynamic_voxelize_forward_impl, points, coors, voxel_size,
+                       coors_range, NDim);
+}

 void hard_voxelize_forward(const at::Tensor &points,
                           const at::Tensor &voxel_size,
@@ -60,21 +35,10 @@ void hard_voxelize_forward(const at::Tensor &points,
  std::vector<float> coors_range_v(
      coors_range.data_ptr<float>(),
      coors_range.data_ptr<float>() + coors_range.numel());
-  if (points.device().is_cuda()) {
-#ifdef MMCV_WITH_CUDA
-    CHECK_CUDA_INPUT(points);

-    *voxel_num_data = hard_voxelize_forward_cuda(
-        points, voxels, coors, num_points_per_voxel, voxel_size_v,
-        coors_range_v, max_points, max_voxels, NDim);
-#else
-    AT_ERROR("hard_voxelize is not compiled with GPU support");
-#endif
-  } else {
-    *voxel_num_data = hard_voxelize_forward_cpu(
-        points, voxels, coors, num_points_per_voxel, voxel_size_v,
-        coors_range_v, max_points, max_voxels, NDim);
-  }
+  *voxel_num_data = hard_voxelize_forward_impl(
+      points, voxels, coors, num_points_per_voxel, voxel_size_v, coors_range_v,
+      max_points, max_voxels, NDim);
 }

 void dynamic_voxelize_forward(const at::Tensor &points,
@@ -87,17 +51,6 @@ void dynamic_voxelize_forward(const at::Tensor &points,
  std::vector<float> coors_range_v(
      coors_range.data_ptr<float>(),
      coors_range.data_ptr<float>() + coors_range.numel());
-  if (points.device().is_cuda()) {
-#ifdef MMCV_WITH_CUDA
-    CHECK_CUDA_INPUT(points);
-
-    dynamic_voxelize_forward_cuda(points, coors, voxel_size_v, coors_range_v,
-                                  NDim);
-#else
-    AT_ERROR("dynamic_voxelize is not compiled with GPU support");
-#endif
-  } else {
-    dynamic_voxelize_forward_cpu(points, coors, voxel_size_v, coors_range_v,
-                                 NDim);
-  }
+  dynamic_voxelize_forward_impl(points, coors, voxel_size_v, coors_range_v,
+                                NDim);
 }
--- a/mmcv/ops/csrc/parrots/voxelization_cpu.cpp
+++ b/mmcv/ops/csrc/parrots/voxelization_cpu.cpp
-// Copyright (c) OpenMMLab. All rights reserved.
-#include "pytorch_cpp_helper.hpp"
-
-template <typename T, typename T_int>
-void dynamic_voxelize_forward_cpu_kernel(
-    const torch::TensorAccessor<T, 2> points,
-    torch::TensorAccessor<T_int, 2> coors, const std::vector<float> voxel_size,
-    const std::vector<float> coors_range, const std::vector<int> grid_size,
-    const int num_points, const int num_features, const int NDim) {
-  const int ndim_minus_1 = NDim - 1;
-  bool failed = false;
-  // int coor[NDim];
-  int* coor = new int[NDim]();
-  int c;
-
-  for (int i = 0; i < num_points; ++i) {
-    failed = false;
-    for (int j = 0; j < NDim; ++j) {
-      c = floor((points[i][j] - coors_range[j]) / voxel_size[j]);
-      // necessary to rm points out of range
-      if ((c < 0 || c >= grid_size[j])) {
-        failed = true;
-        break;
-      }
-      coor[ndim_minus_1 - j] = c;
-    }
-
-    if (failed)
-      memset(&coors[i][0], -1, NDim * sizeof(T_int));
-    else
-      memcpy(&coors[i][0], &coor[0], NDim * sizeof(T_int));
-  }
-
-  delete[] coor;
-}
-
-template <typename T, typename T_int>
-void hard_voxelize_forward_cpu_kernel(
-    const torch::TensorAccessor<T, 2> points,
-    torch::TensorAccessor<T, 3> voxels, torch::TensorAccessor<T_int, 2> coors,
-    torch::TensorAccessor<T_int, 1> num_points_per_voxel,
-    torch::TensorAccessor<T_int, 3> coor_to_voxelidx, int& voxel_num,
-    const std::vector<float> voxel_size, const std::vector<float> coors_range,
-    const std::vector<int> grid_size, const int max_points,
-    const int max_voxels, const int num_points, const int num_features,
-    const int NDim) {
-  // declare a temp coors
-  at::Tensor temp_coors = at::zeros(
-      {num_points, NDim}, at::TensorOptions().dtype(at::kInt).device(at::kCPU));
-
-  // First use dynamic voxelization to get coors,
-  // then check max points/voxels constraints
-  dynamic_voxelize_forward_cpu_kernel<T, int>(
-      points, temp_coors.accessor<int, 2>(), voxel_size, coors_range, grid_size,
-      num_points, num_features, NDim);
-
-  int voxelidx, num;
-  auto coor = temp_coors.accessor<int, 2>();
-
-  for (int i = 0; i < num_points; ++i) {
-    // T_int* coor = temp_coors.data_ptr<int>() + i * NDim;
-
-    if (coor[i][0] == -1) continue;
-
-    voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];
-
-    // record voxel
-    if (voxelidx == -1) {
-      voxelidx = voxel_num;
-      if (max_voxels != -1 && voxel_num >= max_voxels) continue;
-      voxel_num += 1;
-
-      coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]] = voxelidx;
-      memcpy(&coors[voxelidx][0], &coor[i][0], NDim * sizeof(T_int));
-    }
-
-    // put points into voxel
-    num = num_points_per_voxel[voxelidx];
-    if (max_points == -1 || num < max_points) {
-      memcpy(&voxels[voxelidx][num][0], &points[i][0],
-             num_features * sizeof(T));
-      num_points_per_voxel[voxelidx] += 1;
-    }
-  }
-
-  return;
-}
-
-void dynamic_voxelize_forward_cpu(const at::Tensor& points, at::Tensor& coors,
-                                  const std::vector<float> voxel_size,
-                                  const std::vector<float> coors_range,
-                                  const int NDim = 3) {
-  // check device
-  AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");
-
-  std::vector<int> grid_size(NDim);
-  const int num_points = points.size(0);
-  const int num_features = points.size(1);
-
-  for (int i = 0; i < NDim; ++i) {
-    grid_size[i] =
-        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
-  }
-
-  // coors, num_points_per_voxel, coor_to_voxelidx are int Tensor
-  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      points.scalar_type(), "dynamic_voxelize_forward_cpu_kernel", [&] {
-        dynamic_voxelize_forward_cpu_kernel<scalar_t, int>(
-            points.accessor<scalar_t, 2>(), coors.accessor<int, 2>(),
-            voxel_size, coors_range, grid_size, num_points, num_features, NDim);
-      });
-}
-
-int hard_voxelize_forward_cpu(const at::Tensor& points, at::Tensor& voxels,
-                              at::Tensor& coors,
-                              at::Tensor& num_points_per_voxel,
-                              const std::vector<float> voxel_size,
-                              const std::vector<float> coors_range,
-                              const int max_points, const int max_voxels,
-                              const int NDim = 3) {
-  // current version tooks about 0.02s_0.03s for one frame on cpu
-  // check device
-  AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");
-
-  std::vector<int> grid_size(NDim);
-  const int num_points = points.size(0);
-  const int num_features = points.size(1);
-
-  for (int i = 0; i < NDim; ++i) {
-    grid_size[i] =
-        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
-  }
-
-  // coors, num_points_per_voxel, coor_to_voxelidx are int Tensor
-  // printf("cpu coor_to_voxelidx size: [%d, %d, %d]\n", grid_size[2],
-  // grid_size[1], grid_size[0]);
-  at::Tensor coor_to_voxelidx =
-      -at::ones({grid_size[2], grid_size[1], grid_size[0]}, coors.options());
-
-  int voxel_num = 0;
-  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      points.scalar_type(), "hard_voxelize_forward_cpu_kernel", [&] {
-        hard_voxelize_forward_cpu_kernel<scalar_t, int>(
-            points.accessor<scalar_t, 2>(), voxels.accessor<scalar_t, 3>(),
-            coors.accessor<int, 2>(), num_points_per_voxel.accessor<int, 1>(),
-            coor_to_voxelidx.accessor<int, 3>(), voxel_num, voxel_size,
-            coors_range, grid_size, max_points, max_voxels, num_points,
-            num_features, NDim);
-      });
-
-  return voxel_num;
-}
--- a/setup.py
+++ b/setup.py
@@ -189,13 +189,14 @@ def get_extensions():
        define_macros = []
        include_dirs = []
        op_files = glob.glob('./mmcv/ops/csrc/pytorch/cuda/*.cu') +\
+            glob.glob('./mmcv/ops/csrc/pytorch/cpu/*.cpp') +\
            glob.glob('./mmcv/ops/csrc/parrots/*.cpp')
        include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common'))
        include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common/cuda'))
        cuda_args = os.getenv('MMCV_CUDA_ARGS')
        extra_compile_args = {
-            'nvcc': [cuda_args] if cuda_args else [],
-            'cxx': [],
+            'nvcc': [cuda_args, '-std=c++14'] if cuda_args else ['-std=c++14'],
+            'cxx': ['-std=c++14'],
        }
        if torch.cuda.is_available() or os.getenv('FORCE_CUDA', '0') == '1':
            define_macros += [('MMCV_WITH_CUDA', None)]