"git@developer.sourcefind.cn:change/sglang.git" did not exist on "ca9291181df4b6a27c371eb71bf740263e5eb3ef"
Unverified Commit 4c5c4299 authored by Tai-Wang, committed by GitHub

Merge branch 'master' into v1.0.0.dev0

parents b420cd1d 86cc487c
@@ -3,74 +3,74 @@
   {
    "cell_type": "code",
    "execution_count": 7,
+   "source": [
+    "from mmdet3d.apis import init_model, inference_detector, show_result_meshlab"
+   ],
+   "outputs": [],
    "metadata": {
     "pycharm": {
      "is_executing": false
     }
-   },
-   "outputs": [],
-   "source": [
-    "from mmdet3d.apis import init_detector, inference_detector, show_result_meshlab"
-   ]
+   }
  },
  {
   "cell_type": "code",
   "execution_count": 8,
-  "metadata": {
-   "pycharm": {
-    "is_executing": false
-   }
-  },
-  "outputs": [],
   "source": [
    "config_file = '../configs/second/hv_second_secfpn_6x8_80e_kitti-3d-car.py'\n",
    "# download the checkpoint from model zoo and put it in `checkpoints/`\n",
    "checkpoint_file = '../work_dirs/second/epoch_40.pth'"
-  ]
+  ],
+  "outputs": [],
+  "metadata": {
+   "pycharm": {
+    "is_executing": false
+   }
+  }
  },
  {
   "cell_type": "code",
   "execution_count": 9,
+  "source": [
+   "# build the model from a config file and a checkpoint file\n",
+   "model = init_model(config_file, checkpoint_file, device='cuda:0')"
+  ],
+  "outputs": [],
   "metadata": {
    "pycharm": {
     "is_executing": false
    }
-  },
-  "outputs": [],
-  "source": [
-   "# build the model from a config file and a checkpoint file\n",
-   "model = init_detector(config_file, checkpoint_file, device='cuda:0')"
-  ]
+  }
  },
  {
   "cell_type": "code",
   "execution_count": 10,
-  "metadata": {
-   "pycharm": {
-    "is_executing": false
-   }
-  },
-  "outputs": [],
   "source": [
    "# test a single sample\n",
    "pcd = 'kitti_000008.bin'\n",
    "result, data = inference_detector(model, pcd)"
-  ]
- },
- {
-  "cell_type": "code",
-  "execution_count": 11,
+  ],
+  "outputs": [],
   "metadata": {
    "pycharm": {
     "is_executing": false
    }
-  },
-  "outputs": [],
+  }
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 11,
   "source": [
    "# show the results\n",
    "out_dir = './'\n",
    "show_result_meshlab(data, result, out_dir)"
-  ]
+  ],
+  "outputs": [],
+  "metadata": {
+   "pycharm": {
+    "is_executing": false
+   }
+  }
  }
 ],
 "metadata": {
...
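Aside from nbformat reordering the cell keys, the substantive change in this notebook is the rename init_detector -> init_model. For reference, the demo flow after the merge boils down to the following sketch, assembled from the notebook cells above (it assumes the SECOND checkpoint exists at the path the notebook uses):

    from mmdet3d.apis import init_model, inference_detector, show_result_meshlab

    config_file = '../configs/second/hv_second_secfpn_6x8_80e_kitti-3d-car.py'
    # assumed to exist locally, as in the notebook
    checkpoint_file = '../work_dirs/second/epoch_40.pth'

    # build the model, run inference on one point cloud, dump MeshLab files
    model = init_model(config_file, checkpoint_file, device='cuda:0')
    result, data = inference_detector(model, 'kitti_000008.bin')
    show_result_meshlab(data, result, './')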
@@ -28,7 +28,7 @@ class GatherPoints(Function):
         B, npoint = indices.size()
         _, C, N = features.size()
-        output = torch.cuda.FloatTensor(B, C, npoint)
+        output = features.new_zeros((B, C, npoint))

         gather_points_ext.gather_points_wrapper(B, C, N, npoint, features,
                                                 indices, output)
@@ -42,7 +42,7 @@ class GatherPoints(Function):
         idx, C, N = ctx.for_backwards
         B, npoint = idx.size()

-        grad_features = torch.cuda.FloatTensor(B, C, N).zero_()
+        grad_features = grad_out.new_zeros((B, C, N))
         grad_out_data = grad_out.data.contiguous()

         gather_points_ext.gather_points_grad_wrapper(B, C, N, npoint,
                                                      grad_out_data, idx,
...
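The switch from torch.cuda.FloatTensor to new_zeros is what enables the half-precision path added later in this commit: new_zeros allocates zero-initialized memory with the same dtype and device as the source tensor, whereas torch.cuda.FloatTensor is always float32 (and uninitialized unless .zero_() is called). A minimal illustration of the difference, not the mmdet3d code itself:

    import torch

    features = torch.rand(2, 3, 8, device='cuda', dtype=torch.half)

    # inherits float16 and the CUDA device from `features`, zero-filled
    output = features.new_zeros((2, 3, 4))
    assert output.dtype == torch.half and output.device == features.device

    # torch.cuda.FloatTensor(2, 3, 4) would instead be float32, uninitialized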
 #include <ATen/cuda/CUDAContext.h>
+#include <ATen/TensorUtils.h>
 #include <THC/THC.h>
 #include <torch/extension.h>
 #include <torch/serialize/tensor.h>

 #include <vector>

 extern THCState *state;

 int gather_points_wrapper(int b, int c, int n, int npoints,
-                          at::Tensor points_tensor, at::Tensor idx_tensor,
-                          at::Tensor out_tensor);
+                          at::Tensor& points_tensor, at::Tensor& idx_tensor,
+                          at::Tensor& out_tensor);

 void gather_points_kernel_launcher(int b, int c, int n, int npoints,
-                                   const float *points, const int *idx,
-                                   float *out, cudaStream_t stream);
+                                   const at::Tensor& points_tensor,
+                                   const at::Tensor& idx_tensor,
+                                   at::Tensor& out_tensor);

 int gather_points_grad_wrapper(int b, int c, int n, int npoints,
-                               at::Tensor grad_out_tensor,
-                               at::Tensor idx_tensor,
-                               at::Tensor grad_points_tensor);
+                               at::Tensor& grad_out_tensor,
+                               at::Tensor& idx_tensor,
+                               at::Tensor& grad_points_tensor);

 void gather_points_grad_kernel_launcher(int b, int c, int n, int npoints,
-                                        const float *grad_out, const int *idx,
-                                        float *grad_points,
-                                        cudaStream_t stream);
+                                        const at::Tensor& grad_out_tensor,
+                                        const at::Tensor& idx_tensor,
+                                        at::Tensor& grad_points_tensor);

 int gather_points_wrapper(int b, int c, int n, int npoints,
-                          at::Tensor points_tensor, at::Tensor idx_tensor,
-                          at::Tensor out_tensor) {
-  const float *points = points_tensor.data_ptr<float>();
-  const int *idx = idx_tensor.data_ptr<int>();
-  float *out = out_tensor.data_ptr<float>();
-
-  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
-  gather_points_kernel_launcher(b, c, n, npoints, points, idx, out, stream);
+                          at::Tensor& points_tensor, at::Tensor& idx_tensor,
+                          at::Tensor& out_tensor)
+{
+  gather_points_kernel_launcher(b, c, n, npoints, points_tensor, idx_tensor, out_tensor);
   return 1;
 }

 int gather_points_grad_wrapper(int b, int c, int n, int npoints,
-                               at::Tensor grad_out_tensor,
-                               at::Tensor idx_tensor,
-                               at::Tensor grad_points_tensor) {
-  const float *grad_out = grad_out_tensor.data_ptr<float>();
-  const int *idx = idx_tensor.data_ptr<int>();
-  float *grad_points = grad_points_tensor.data_ptr<float>();
-
-  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
-  gather_points_grad_kernel_launcher(b, c, n, npoints, grad_out, idx,
-                                     grad_points, stream);
+                               at::Tensor& grad_out_tensor,
+                               at::Tensor& idx_tensor,
+                               at::Tensor& grad_points_tensor)
+{
+  gather_points_grad_kernel_launcher(b, c, n, npoints, grad_out_tensor, idx_tensor,
+                                     grad_points_tensor);
   return 1;
 }

-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+{
   m.def("gather_points_wrapper", &gather_points_wrapper,
         "gather_points_wrapper");
   m.def("gather_points_grad_wrapper", &gather_points_grad_wrapper,
...
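After this refactor the wrappers no longer extract raw float pointers; they hand the tensors straight to the launchers, which now own the dtype dispatch and stream lookup. A hedged sketch of JIT-building such an extension for local testing with torch.utils.cpp_extension (the source file names here are assumptions, not paths from this commit):

    from torch.utils.cpp_extension import load

    # compiles the wrapper and kernel sources into an importable module
    gather_points_ext = load(
        name='gather_points_ext',
        sources=['gather_points.cpp', 'gather_points_cuda.cu'],  # assumed names
        verbose=True,
    )
    # then, as in the autograd Function above:
    # gather_points_ext.gather_points_wrapper(B, C, N, npoint, features, idx, out)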
 #include <stdio.h>
 #include <stdlib.h>

+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/types.h>
+#include <ATen/cuda/CUDAApplyUtils.cuh>
+
 #define TOTAL_THREADS 1024
 #define THREADS_PER_BLOCK 256
 #define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

+template <typename scalar_t>
 __global__ void gather_points_kernel(int b, int c, int n, int m,
-                                     const float *__restrict__ points,
+                                     const scalar_t *__restrict__ points,
                                      const int *__restrict__ idx,
-                                     float *__restrict__ out) {
+                                     scalar_t *__restrict__ out) {
   // points: (B, C, N)
   // idx: (B, M)
   // output:
@@ -26,8 +33,10 @@ __global__ void gather_points_kernel(int b, int c, int n, int m,
 }

 void gather_points_kernel_launcher(int b, int c, int n, int npoints,
-                                   const float *points, const int *idx,
-                                   float *out, cudaStream_t stream) {
+                                   const at::Tensor& points_tensor,
+                                   const at::Tensor& idx_tensor,
+                                   at::Tensor& out_tensor)
+{
   // points: (B, C, N)
   // idx: (B, npoints)
   // output:
@@ -35,23 +44,33 @@ void gather_points_kernel_launcher(int b, int c, int n, int npoints,
   cudaError_t err;

   dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c,
               b);  // blockIdx.x(col), blockIdx.y(row)
   dim3 threads(THREADS_PER_BLOCK);
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();

-  gather_points_kernel<<<blocks, threads, 0, stream>>>(b, c, n, npoints, points,
-                                                       idx, out);
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      out_tensor.scalar_type(), "gather_points_kernel",
+      [&]
+      {
+        const scalar_t *points = points_tensor.data_ptr<scalar_t>();
+        const int *idx = idx_tensor.data_ptr<int>();
+        scalar_t *out = out_tensor.data_ptr<scalar_t>();
+        gather_points_kernel<<<blocks, threads, 0, stream>>>(b, c, n, npoints, points,
+                                                             idx, out);
+      });

   err = cudaGetLastError();
-  if (cudaSuccess != err) {
+  if (cudaSuccess != err)
+  {
     fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
     exit(-1);
   }
 }

+template <typename scalar_t>
 __global__ void gather_points_grad_kernel(int b, int c, int n, int m,
-                                          const float *__restrict__ grad_out,
+                                          const scalar_t *__restrict__ grad_out,
                                           const int *__restrict__ idx,
-                                          float *__restrict__ grad_points) {
+                                          scalar_t *__restrict__ grad_points) {
   // grad_out: (B, C, M)
   // idx: (B, M)
   // output:
@@ -70,9 +89,10 @@ __global__ void gather_points_grad_kernel(int b, int c, int n, int m,
 }

 void gather_points_grad_kernel_launcher(int b, int c, int n, int npoints,
-                                        const float *grad_out, const int *idx,
-                                        float *grad_points,
-                                        cudaStream_t stream) {
+                                        const at::Tensor& grad_out_tensor,
+                                        const at::Tensor& idx_tensor,
+                                        at::Tensor& grad_points_tensor)
+{
   // grad_out: (B, C, npoints)
   // idx: (B, npoints)
   // output:
@@ -80,14 +100,24 @@ void gather_points_grad_kernel_launcher(int b, int c, int n, int npoints,
   cudaError_t err;

   dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c,
               b);  // blockIdx.x(col), blockIdx.y(row)
   dim3 threads(THREADS_PER_BLOCK);

-  gather_points_grad_kernel<<<blocks, threads, 0, stream>>>(
-      b, c, n, npoints, grad_out, idx, grad_points);
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      grad_points_tensor.scalar_type(), "gather_points_grad_kernel",
+      [&]
+      {
+        const scalar_t *grad_out = grad_out_tensor.data_ptr<scalar_t>();
+        const int *idx = idx_tensor.data_ptr<int>();
+        scalar_t *grad_points = grad_points_tensor.data_ptr<scalar_t>();
+        gather_points_grad_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
+            b, c, n, npoints, grad_out, idx, grad_points);
+      });

   err = cudaGetLastError();
-  if (cudaSuccess != err) {
+  if (cudaSuccess != err)
+  {
     fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
     exit(-1);
   }
...
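AT_DISPATCH_FLOATING_TYPES_AND_HALF instantiates the templated kernels for float, double, and half, picking one at runtime from the tensor's scalar type, which is how the op gains fp16 support without duplicating kernel code. For sanity-checking what the forward kernel computes, a pure-PyTorch reference can be handy (a sketch; the real op also implements the custom backward shown earlier):

    import torch

    def gather_points_ref(features: torch.Tensor, idx: torch.Tensor) -> torch.Tensor:
        """Reference gather: features (B, C, N), idx (B, M) -> (B, C, M)."""
        B, C, _ = features.shape
        # expand idx across the channel dim so torch.gather picks idx[b, m]
        # from the N axis for every channel
        index = idx.long().unsqueeze(1).expand(B, C, idx.shape[1])
        return features.gather(2, index)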
@@ -2,6 +2,7 @@
 from typing import Tuple

 import torch
+from mmcv.runner import force_fp32
 from torch import nn as nn
 from torch.autograd import Function

@@ -62,7 +63,9 @@ class QueryAndGroup(nn.Module):
         if self.max_radius is None:
             assert not self.normalize_xyz, \
                 'can not normalize grouped xyz when max_radius is None'
+        self.fp16_enabled = False

+    @force_fp32()
     def forward(self, points_xyz, center_xyz, features=None):
         """forward.

@@ -143,7 +146,9 @@ class GroupAll(nn.Module):
     def __init__(self, use_xyz: bool = True):
         super().__init__()
         self.use_xyz = use_xyz
+        self.fp16_enabled = False

+    @force_fp32()
     def forward(self,
                 xyz: torch.Tensor,
                 new_xyz: torch.Tensor,
...
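@force_fp32() only takes effect once mmcv's fp16 utilities flip fp16_enabled to True; it then casts the decorated forward's half-precision tensor arguments back to float32, protecting the precision-sensitive grouping math while the rest of the model runs in fp16. A minimal illustration of the mechanism, not the mmdet3d module itself:

    import torch
    from mmcv.runner import force_fp32
    from torch import nn

    class Demo(nn.Module):
        def __init__(self):
            super().__init__()
            # mmcv's fp16 training wrappers set this to True
            self.fp16_enabled = False

        @force_fp32()
        def forward(self, x):
            # x has been cast back to float32 when fp16_enabled is True
            return x.dtype

    m = Demo()
    m.fp16_enabled = True
    print(m(torch.rand(2).half()))  # torch.float32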
@@ -2,9 +2,16 @@
 import pytest
 import torch

-from mmdet3d.ops import (ball_query, furthest_point_sample,
-                         furthest_point_sample_with_dist, gather_points,
-                         grouping_operation, knn, three_interpolate, three_nn)
+from mmdet3d.ops import (
+    ball_query,
+    furthest_point_sample,
+    furthest_point_sample_with_dist,
+    gather_points,
+    grouping_operation,
+    knn,
+    three_interpolate,
+    three_nn,
+)


 def test_fps():
@@ -236,6 +243,8 @@ def test_gather_points():
          [-0.7172, 0.0462, -0.6227, -0.7172, -0.7172, -0.7172]]]).cuda()
     assert torch.allclose(output, expected_output)

+    output_half = gather_points(features.half(), idx)
+    assert torch.allclose(output_half, expected_output.half())


 def test_three_interpolate():
...
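The added assertions exercise the new half-precision path end to end against the fp32 expectation. A standalone version of the same check, assuming gather_points accepts int32 CUDA indices as in the existing test:

    import torch
    from mmdet3d.ops import gather_points

    features = torch.rand(1, 4, 16, device='cuda')
    idx = torch.randint(0, 16, (1, 6), device='cuda', dtype=torch.int32)

    out_fp32 = gather_points(features, idx)         # (1, 4, 6), float32
    out_fp16 = gather_points(features.half(), idx)  # (1, 4, 6), float16
    assert torch.allclose(out_fp16.float(), out_fp32, atol=1e-3)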