fix ci

e0b3223b · q.yao · Zaida Zhou · d1690cee · e0b3223b · e0b3223b
Commit e0b3223b authored Oct 11, 2022 by q.yao Committed by Zaida Zhou Oct 22, 2022
9 changed files
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
+name: build
+on:
+  push:
+    paths-ignore:
+      - 'README.md'
+      - 'README_zh-CN.md'
+      - 'docs/**'
+      - 'examples/**'
+      - '.dev_scripts/**'
+      - 'docker/**'
+  pull_request:
+    paths-ignore:
+      - 'README.md'
+      - 'README_zh-CN.md'
+      - 'docs/**'
+      - 'examples/**'
+      - '.dev_scripts/**'
+      - 'docker/**'
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+env:
+  MMCV_WITH_OPS: 1
+jobs:
+  build_without_torch:
+    runs-on: ubuntu-18.04
+    strategy:
+      matrix:
+        python-version: [3.7]
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install system dependencies
+        run: sudo apt-get update && sudo apt-get install -y ffmpeg libturbojpeg
+      - name: Build and install
+        run: rm -rf .eggs && pip install -e .
+      - name: Validate the installation
+        run: python -c "import mmcv"
+      - name: Run unittests and generate coverage report
+        run: |
+          pip install -r requirements/test.txt
+          pytest tests/ \
+              --ignore=tests/test_runner \
+              --ignore=tests/test_device/test_ipu \
+              --ignore=tests/test_optimizer.py \
+              --ignore=tests/test_cnn \
+              --ignore=tests/test_parallel.py \
+              --ignore=tests/test_ops \
+              --ignore=tests/test_load_model_zoo.py \
+              --ignore=tests/test_utils/test_logging.py \
+              --ignore=tests/test_image/test_io.py \
+              --ignore=tests/test_utils/test_registry.py \
+              --ignore=tests/test_utils/test_parrots_jit.py \
+              --ignore=tests/test_utils/test_trace.py \
+              --ignore=tests/test_utils/test_hub.py \
+              --ignore=tests/test_device \
+              --ignore=tests/test_utils/test_torch_ops.py
+  build_without_ops:
+    runs-on: ubuntu-18.04
+    env:
+      MMCV_WITH_OPS: 0
+    strategy:
+      matrix:
+        python-version: [3.7]
+        torch: [1.7.0, 1.8.0, 1.9.0]
+        include:
+          - torch: 1.7.0
+            torchvision: 0.8.1
+          - torch: 1.8.0
+            torchvision: 0.9.0
+          - torch: 1.9.0
+            torchvision: 0.10.0
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install system dependencies
+        run: sudo apt-get update && sudo apt-get install -y ffmpeg libturbojpeg
+      - name: Install PyTorch
+        run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html
+      - name: Build and install
+        run: rm -rf .eggs && pip install -e .
+      - name: Validate the installation
+        run: python -c "import mmcv"
+      - name: Run unittests
+        run: |
+          pip install -r requirements/test.txt
+          pytest tests/ --ignore=tests/test_ops
+  build_cpu:
+    runs-on: ubuntu-18.04
+    strategy:
+      matrix:
+        python-version: [3.7]
+        torch: [1.5.1, 1.6.0, 1.7.0, 1.8.0, 1.9.0]
+        include:
+          - torch: 1.5.1
+            torchvision: 0.6.1
+          - torch: 1.6.0
+            torchvision: 0.7.0
+          - torch: 1.7.0
+            torchvision: 0.8.1
+          - torch: 1.8.0
+            torchvision: 0.9.0
+          - torch: 1.9.0
+            torchvision: 0.10.0
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install system dependencies
+        run: sudo apt-get update && sudo apt-get install -y ffmpeg libturbojpeg
+      - name: Install PyTorch
+        run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html
+      # psutil is an optional package used to detect the number of CPUs when compiling mmcv
+      - name: Install psutil
+        run: pip install psutil
+      - name: Create sdist and untar
+        run: |
+          MMCV_WITH_OPS=1 python setup.py sdist
+          tar zxvf dist/mmcv-full* -C /tmp
+          rm -r mmcv
+      - name: Build and install from sdist
+        run: |
+          pushd /tmp/mmcv-full*
+          pip install -e .
+          popd
+      - name: Validate the installation
+        run: python -c "import mmcv"
+      - name: Run unittests and generate coverage report
+        run: |
+          pip install -r requirements/test.txt
+          coverage run --branch --source=mmcv -m pytest tests/
+          coverage xml
+          coverage report -m
+  build_cu101:
+    runs-on: ubuntu-18.04
+    container:
+      image: pytorch/pytorch:1.6.0-cuda10.1-cudnn7-devel
+    env:
+      FORCE_CUDA: 1
+      MMCV_CUDA_ARGS: -gencode=arch=compute_61,code=sm_61
+    strategy:
+      matrix:
+        python-version: [3.7]
+        torch: [1.5.1+cu101, 1.6.0+cu101, 1.7.0+cu101, 1.8.0+cu101]
+        include:
+          - torch: 1.5.1+cu101
+            torchvision: 0.6.1+cu101
+          - torch: 1.6.0+cu101
+            torchvision: 0.7.0+cu101
+          - torch: 1.7.0+cu101
+            torchvision: 0.8.1+cu101
+          - torch: 1.8.0+cu101
+            torchvision: 0.9.0+cu101
+          - python-version: 3.6
+            torch: 1.8.0+cu101
+            torchvision: 0.9.0+cu101
+          - python-version: 3.8
+            torch: 1.8.0+cu101
+            torchvision: 0.9.0+cu101
+          - python-version: 3.9
+            torch: 1.8.0+cu101
+            torchvision: 0.9.0+cu101
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Fetch GPG keys
+        run: |
+          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
+          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub
+      - name: Install python-dev
+        run: apt-get update && apt-get install -y python${{matrix.python-version}}-dev
+        if: ${{matrix.python-version != '3.9'}}
+      - name: Install Pillow
+        run: python -m pip install Pillow==6.2.2
+        if: ${{matrix.torchvision == '0.4.2'}}
+      # When we use a third-party container, we need to add python -m to call
+      # the user-installed pip when we use the pip command, otherwise it will
+      # call the system pip
+      - name: Install PyTorch
+        run: python -m pip install torch==${{matrix.torch}} torchvision==${{matrix.torchvision}} -f https://download.pytorch.org/whl/torch_stable.html
+      - name: Install system dependencies
+        run: apt-get update && apt-get install -y ffmpeg libturbojpeg ninja-build
+      - name: Install dependencies for compiling onnx when python=3.9
+        run: python -m pip install protobuf && apt-get -y install libprotobuf-dev protobuf-compiler cmake
+        if: ${{matrix.python-version == '3.9'}}
+      # psutil is an optional package used to detect the number of CPUs when compiling mmcv
+      - name: Install psutil
+        run: python -m pip install psutil
+      - name: Build and install
+        run: rm -rf .eggs && python -m pip install -e .
+      - name: Validate the installation
+        run: python -c "import mmcv"
+      - name: Run unittests and generate coverage report
+        run: |
+          python -m pip install -r requirements/test.txt
+          coverage run --branch --source=mmcv -m pytest tests/
+          coverage xml
+          coverage report -m
+      # Only upload coverage report for python3.7 && pytorch1.6
+      - name: Upload coverage to Codecov
+        if: ${{matrix.torch == '1.6.0+cu101' && matrix.python-version == '3.7'}}
+        uses: codecov/codecov-action@v1.0.14
+        with:
+          file: ./coverage.xml
+          flags: unittests
+          env_vars: OS,PYTHON
+          name: codecov-umbrella
+          fail_ci_if_error: false
+  build_cu102:
+    runs-on: ubuntu-18.04
+    container:
+      image: pytorch/pytorch:1.9.0-cuda10.2-cudnn7-devel
+    env:
+      FORCE_CUDA: 1
+      MMCV_CUDA_ARGS: -gencode=arch=compute_61,code=sm_61
+    strategy:
+      matrix:
+        python-version: [3.7]
+        torch: [1.9.0+cu102, 1.10.0+cu102]
+        include:
+          - torch: 1.9.0+cu102
+            torchvision: 0.10.0+cu102
+          - torch: 1.10.0+cu102
+            torchvision: 0.11.0+cu102
+          - python-version: '3.10'
+            torch: 1.11.0+cu102
+            torchvision: 0.12.0+cu102
+          - python-version: '3.10'
+            torch: 1.12.0+cu102
+            torchvision: 0.13.0+cu102
+          - python-version: 3.6
+            torch: 1.9.0+cu102
+            torchvision: 0.10.0+cu102
+          - python-version: 3.8
+            torch: 1.9.0+cu102
+            torchvision: 0.10.0+cu102
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Fetch GPG keys
+        run: |
+          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
+          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub
+      - name: Add PPA
+        run: |
+          apt-get update && apt-get install -y software-properties-common
+          add-apt-repository -y ppa:deadsnakes/ppa
+      - name: Install python-dev
+        run: apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y python${{matrix.python-version}}-dev
+      - name: Install PyTorch
+        run: python -m pip install torch==${{matrix.torch}} torchvision==${{matrix.torchvision}} -f https://download.pytorch.org/whl/torch_stable.html
+      - name: Install system dependencies
+        run: apt-get update && apt-get install -y ffmpeg libturbojpeg ninja-build
+      # psutil is an optional package used to detect the number of CPUs when compiling mmcv
+      - name: Install psutil
+        run: python -m pip install psutil
+      - name: Build and install
+        run: rm -rf .eggs && python -m pip install -e .
+      - name: Validate the installation
+        run: python -c "import mmcv"
+      - name: Run unittests and generate coverage report
+        run: |
+          python -m pip install -r requirements/test.txt
+          coverage run --branch --source=mmcv -m pytest tests/
+          coverage xml
+        if: ${{matrix.python-version != '3.10'}}
+      # special treatment for python3.10 because onnx and onnxruntime don't provide python3.10 pre-built packages
+      - name: Run unittests and generate coverage report for python3.10
+        run: |
+          python -m pip install -r requirements/test.txt
+          coverage run --branch --source=mmcv -m pytest tests/ --ignore=tests/test_ops/test_onnx.py --ignore=tests/test_ops/test_tensorrt.py --ignore=tests/test_ops/test_tensorrt_preprocess.py
+          coverage xml
+        if: ${{matrix.python-version == '3.10'}}
+  build_windows_without_ops:
+    runs-on: windows-latest
+    env:
+      MMCV_WITH_OPS: 0
+    strategy:
+      matrix:
+        torch: [1.7.1, 1.8.0, 1.9.0]
+        include:
+          - torch: 1.7.1
+            torchvision: 0.8.2
+          - torch: 1.8.0
+            torchvision: 0.9.0
+          - torch: 1.9.0
+            torchvision: 0.10.0
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python 3.7
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.7
+      - name: Install PyTorch
+        run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu --no-cache-dir -f https://download.pytorch.org/whl/torch_stable.html
+      - name: Build and install
+        run: pip install -e .
+      - name: Validate the installation
+        run: python -c "import mmcv"
+      - name: Run unittests
+        run: |
+          pip install -r requirements/test.txt
+          pytest tests/ --ignore=tests/test_ops --ignore tests/test_utils/test_progressbar.py --ignore tests/test_utils/test_timer.py --ignore tests/test_image/test_io.py
+  build_windows:
+    runs-on: windows-latest
+    strategy:
+      matrix:
+        torch: [1.7.1, 1.8.0, 1.9.0]
+        include:
+          - torch: 1.7.1
+            torchvision: 0.8.2
+          - torch: 1.8.0
+            torchvision: 0.9.0
+          - torch: 1.9.0
+            torchvision: 0.10.0
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python 3.7
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.7
+      - name: Install PyTorch
+        run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu --no-cache-dir -f https://download.pytorch.org/whl/torch_stable.html
+      - name: Build and install
+        run: pip install -e .
+      - name: Validate the installation
+        run: python -c "import mmcv"
+      - name: Run unittests
+        run: |
+          pip install -r requirements/test.txt
+          pytest tests/ --ignore tests/test_utils/test_progressbar.py --ignore tests/test_utils/test_timer.py --ignore tests/test_image/test_io.py
+  build_macos:
+    runs-on: macos-latest
+    strategy:
+      matrix:
+        torch: [1.5.1, 1.6.0, 1.7.0, 1.8.0, 1.9.0]
+        include:
+          - torch: 1.5.1
+            torchvision: 0.6.1
+          - torch: 1.6.0
+            torchvision: 0.7.0
+          - torch: 1.7.0
+            torchvision: 0.8.1
+          - torch: 1.8.0
+            torchvision: 0.9.0
+          - torch: 1.9.0
+            torchvision: 0.10.0
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python 3.7
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.7
+      - name: Install system dependencies
+        run: brew install ffmpeg jpeg-turbo
+      - name: Install utils
+        run: pip install psutil
+      - name: Install Pillow
+        run: pip install Pillow==6.2.2
+        if: ${{matrix.torchvision == '0.4.2'}}
+      - name: Install PyTorch
+        run: pip install torch==${{matrix.torch}} torchvision==${{matrix.torchvision}} --no-cache-dir
+      - name: Build and install
+        run: |
+          rm -rf .eggs
+          CC=clang CXX=clang++ CFLAGS='-stdlib=libc++' pip install -e .
+      - name: Validate the installation
+        run: python -c "import mmcv"
+      - name: Run unittests
+        run: |
+          pip install -r requirements/test.txt
+          # The timing on macOS VMs is not precise, so we skip the progressbar and timer tests
+          pytest tests/ --ignore tests/test_utils/test_progressbar.py --ignore tests/test_utils/test_timer.py
--- a/mmcv/ops/csrc/common/cuda/nms_cuda_kernel.cuh
+++ b/mmcv/ops/csrc/common/cuda/nms_cuda_kernel.cuh
@@ -27,9 +27,9 @@ __device__ inline bool devIoU(float const *const a, float const *const b,
  return interS > threshold * (Sa + Sb - interS);
 }
-__global__ void nms_cuda(const int n_boxes, const float iou_threshold,
+__global__ static void nms_cuda(const int n_boxes, const float iou_threshold,
-                         const int offset, const float *dev_boxes,
+                                const int offset, const float *dev_boxes,
-                         unsigned long long *dev_mask) {
+                                unsigned long long *dev_mask) {
  int blocks = (n_boxes + threadsPerBlock - 1) / threadsPerBlock;
  CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) {
    const int tid = threadIdx.x;
@@ -73,9 +73,9 @@ __global__ void nms_cuda(const int n_boxes, const float iou_threshold,
  }
 }
-__global__ void gather_keep_from_mask(bool *keep,
+__global__ static void gather_keep_from_mask(bool *keep,
-                                      const unsigned long long *dev_mask,
+                                             const unsigned long long *dev_mask,
-                                      const int n_boxes) {
+                                             const int n_boxes) {
  const int col_blocks = (n_boxes + threadsPerBlock - 1) / threadsPerBlock;
  const int tid = threadIdx.x;

--- a/mmcv/ops/csrc/parrots/cudabind.cpp
+++ b/mmcv/ops/csrc/parrots/cudabind.cpp
@@ -570,14 +570,12 @@ void IoU3DBoxesOverlapBevForwardCUDAKernelLauncher(const int num_a,
                                                   const Tensor boxes_b,
                                                   Tensor ans_overlap);
-void IoU3DNMS3DForwardCUDAKernelLauncher(const Tensor boxes,
+void IoU3DNMS3DForwardCUDAKernelLauncher(const Tensor boxes, Tensor& keep,
-                                         unsigned long long* mask,
+                                         Tensor& keep_num,
-                                         int boxes_num,
                                         float nms_overlap_thresh);
-void IoU3DNMS3DNormalForwardCUDAKernelLauncher(const Tensor boxes,
+void IoU3DNMS3DNormalForwardCUDAKernelLauncher(const Tensor boxes, Tensor& keep,
-                                               unsigned long long* mask,
+                                               Tensor& keep_num,
-                                               int boxes_num,
                                               float nms_overlap_thresh);
 void iou3d_boxes_overlap_bev_forward_cuda(const int num_a, const Tensor boxes_a,
@@ -587,16 +585,16 @@ void iou3d_boxes_overlap_bev_forward_cuda(const int num_a, const Tensor boxes_a,
                                                ans_overlap);
 };
-void iou3d_nms3d_forward_cuda(const Tensor boxes, unsigned long long* mask,
+void iou3d_nms3d_forward_cuda(const Tensor boxes, Tensor& keep,
-                              int boxes_num, float nms_overlap_thresh) {
+                              Tensor& keep_num, float nms_overlap_thresh) {
-  IoU3DNMS3DForwardCUDAKernelLauncher(boxes, mask, boxes_num,
+  IoU3DNMS3DForwardCUDAKernelLauncher(boxes, keep, keep_num,
                                      nms_overlap_thresh);
 };
-void iou3d_nms3d_normal_forward_cuda(const Tensor boxes,
+void iou3d_nms3d_normal_forward_cuda(const Tensor boxes, Tensor& keep,
-                                     unsigned long long* mask, int boxes_num,
+                                     Tensor& keep_num,
                                     float nms_overlap_thresh) {
-  IoU3DNMS3DNormalForwardCUDAKernelLauncher(boxes, mask, boxes_num,
+  IoU3DNMS3DNormalForwardCUDAKernelLauncher(boxes, keep, keep_num,
                                            nms_overlap_thresh);
 };
@@ -604,11 +602,11 @@ void iou3d_boxes_overlap_bev_forward_impl(const int num_a, const Tensor boxes_a,
                                          const int num_b, const Tensor boxes_b,
                                          Tensor ans_overlap);
-void iou3d_nms3d_forward_impl(const Tensor boxes, unsigned long long* mask,
+void iou3d_nms3d_forward_impl(const Tensor boxes, Tensor& keep,
-                              int boxes_num, float nms_overlap_thresh);
+                              Tensor& keep_num, float nms_overlap_thresh);
-void iou3d_nms3d_normal_forward_impl(const Tensor boxes,
+void iou3d_nms3d_normal_forward_impl(const Tensor boxes, Tensor& keep,
-                                     unsigned long long* mask, int boxes_num,
+                                     Tensor& keep_num,
                                     float nms_overlap_thresh);
 REGISTER_DEVICE_IMPL(iou3d_boxes_overlap_bev_forward_impl, CUDA,

--- a/mmcv/ops/csrc/parrots/iou3d.cpp
+++ b/mmcv/ops/csrc/parrots/iou3d.cpp
@@ -19,16 +19,16 @@ void iou3d_boxes_overlap_bev_forward_impl(const int num_a, const Tensor boxes_a,
                       num_b, boxes_b, ans_overlap);
 }
-void iou3d_nms3d_forward_impl(const Tensor boxes, unsigned long long *mask,
+void iou3d_nms3d_forward_impl(const Tensor boxes, Tensor &keep,
-                              int boxes_num, float nms_overlap_thresh) {
+                              Tensor &keep_num, float nms_overlap_thresh) {
-  DISPATCH_DEVICE_IMPL(iou3d_nms3d_forward_impl, boxes, mask, boxes_num,
+  DISPATCH_DEVICE_IMPL(iou3d_nms3d_forward_impl, boxes, keep, keep_num,
                       nms_overlap_thresh);
 }
-void iou3d_nms3d_normal_forward_impl(const Tensor boxes,
+void iou3d_nms3d_normal_forward_impl(const Tensor boxes, Tensor &keep,
-                                     unsigned long long *mask, int boxes_num,
+                                     Tensor &keep_num,
                                     float nms_overlap_thresh) {
-  DISPATCH_DEVICE_IMPL(iou3d_nms3d_normal_forward_impl, boxes, mask, boxes_num,
+  DISPATCH_DEVICE_IMPL(iou3d_nms3d_normal_forward_impl, boxes, keep, keep_num,
                       nms_overlap_thresh);
 }
@@ -51,41 +51,7 @@ void iou3d_nms3d_forward(Tensor boxes, Tensor keep, Tensor keep_num,
  CHECK_CONTIGUOUS(boxes);
  CHECK_CONTIGUOUS(keep);
-  int boxes_num = boxes.size(0);
+  iou3d_nms3d_forward_impl(boxes, keep, keep_num, nms_overlap_thresh);
-  int64_t *keep_data = keep.data_ptr<int64_t>();
-  int64_t *keep_num_data = keep_num.data_ptr<int64_t>();
-  const int col_blocks =
-      (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
-  Tensor mask =
-      at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
-  unsigned long long *mask_data =
-      (unsigned long long *)mask.data_ptr<int64_t>();
-  iou3d_nms3d_forward_impl(boxes, mask_data, boxes_num, nms_overlap_thresh);
-  at::Tensor mask_cpu = mask.to(at::kCPU);
-  unsigned long long *mask_host =
-      (unsigned long long *)mask_cpu.data_ptr<int64_t>();
-  std::vector<unsigned long long> remv_cpu(col_blocks);
-  memset(&remv_cpu[0], 0, sizeof(unsigned long long) * col_blocks);
-  int num_to_keep = 0;
-  for (int i = 0; i < boxes_num; i++) {
-    int nblock = i / THREADS_PER_BLOCK_NMS;
-    int inblock = i % THREADS_PER_BLOCK_NMS;
-    if (!(remv_cpu[nblock] & (1ULL << inblock))) {
-      keep_data[num_to_keep++] = i;
-      unsigned long long *p = &mask_host[0] + i * col_blocks;
-      for (int j = nblock; j < col_blocks; j++) {
-        remv_cpu[j] |= p[j];
-      }
-    }
-    *keep_num_data = num_to_keep;
-  }
 }
 void iou3d_nms3d_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
@@ -96,40 +62,5 @@ void iou3d_nms3d_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
  CHECK_CONTIGUOUS(boxes);
  CHECK_CONTIGUOUS(keep);
-  int boxes_num = boxes.size(0);
+  iou3d_nms3d_normal_forward_impl(boxes, keep, keep_num, nms_overlap_thresh);
-  int64_t *keep_data = keep.data_ptr<int64_t>();
-  int64_t *keep_num_data = keep_num.data_ptr<int64_t>();
-  const int col_blocks =
-      (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
-  Tensor mask =
-      at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
-  unsigned long long *mask_data =
-      (unsigned long long *)mask.data_ptr<int64_t>();
-  iou3d_nms3d_normal_forward_impl(boxes, mask_data, boxes_num,
-                                  nms_overlap_thresh);
-  at::Tensor mask_cpu = mask.to(at::kCPU);
-  unsigned long long *mask_host =
-      (unsigned long long *)mask_cpu.data_ptr<int64_t>();
-  std::vector<unsigned long long> remv_cpu(col_blocks);
-  memset(&remv_cpu[0], 0, sizeof(unsigned long long) * col_blocks);
-  int num_to_keep = 0;
-  for (int i = 0; i < boxes_num; i++) {
-    int nblock = i / THREADS_PER_BLOCK_NMS;
-    int inblock = i % THREADS_PER_BLOCK_NMS;
-    if (!(remv_cpu[nblock] & (1ULL << inblock))) {
-      keep_data[num_to_keep++] = i;
-      unsigned long long *p = &mask_host[0] + i * col_blocks;
-      for (int j = nblock; j < col_blocks; j++) {
-        remv_cpu[j] |= p[j];
-      }
-    }
-  }
-  *keep_num_data = num_to_keep;
 }
--- a/mmcv/ops/csrc/pytorch/cuda/cudabind.cpp
+++ b/mmcv/ops/csrc/pytorch/cuda/cudabind.cpp
@@ -577,14 +577,12 @@ void IoU3DBoxesOverlapBevForwardCUDAKernelLauncher(const int num_a,
                                                   const Tensor boxes_b,
                                                   Tensor ans_overlap);
-void IoU3DNMS3DForwardCUDAKernelLauncher(const Tensor boxes,
+void IoU3DNMS3DForwardCUDAKernelLauncher(const Tensor boxes, Tensor& keep,
-                                         unsigned long long* mask,
+                                         Tensor& keep_num,
-                                         int boxes_num,
                                         float nms_overlap_thresh);
-void IoU3DNMS3DNormalForwardCUDAKernelLauncher(const Tensor boxes,
+void IoU3DNMS3DNormalForwardCUDAKernelLauncher(const Tensor boxes, Tensor& keep,
-                                               unsigned long long* mask,
+                                               Tensor& keep_num,
-                                               int boxes_num,
                                               float nms_overlap_thresh);
 void iou3d_boxes_overlap_bev_forward_cuda(const int num_a, const Tensor boxes_a,
@@ -594,16 +592,16 @@ void iou3d_boxes_overlap_bev_forward_cuda(const int num_a, const Tensor boxes_a,
                                                ans_overlap);
 };
-void iou3d_nms3d_forward_cuda(const Tensor boxes, unsigned long long* mask,
+void iou3d_nms3d_forward_cuda(const Tensor boxes, Tensor& keep,
-                              int boxes_num, float nms_overlap_thresh) {
+                              Tensor& keep_num, float nms_overlap_thresh) {
-  IoU3DNMS3DForwardCUDAKernelLauncher(boxes, mask, boxes_num,
+  IoU3DNMS3DForwardCUDAKernelLauncher(boxes, keep, keep_num,
                                      nms_overlap_thresh);
 };
-void iou3d_nms3d_normal_forward_cuda(const Tensor boxes,
+void iou3d_nms3d_normal_forward_cuda(const Tensor boxes, Tensor& keep,
-                                     unsigned long long* mask, int boxes_num,
+                                     Tensor& keep_num,
                                     float nms_overlap_thresh) {
-  IoU3DNMS3DNormalForwardCUDAKernelLauncher(boxes, mask, boxes_num,
+  IoU3DNMS3DNormalForwardCUDAKernelLauncher(boxes, keep, keep_num,
                                            nms_overlap_thresh);
 };
@@ -611,11 +609,11 @@ void iou3d_boxes_overlap_bev_forward_impl(const int num_a, const Tensor boxes_a,
                                          const int num_b, const Tensor boxes_b,
                                          Tensor ans_overlap);
-void iou3d_nms3d_forward_impl(const Tensor boxes, unsigned long long* mask,
+void iou3d_nms3d_forward_impl(const Tensor boxes, Tensor& keep,
-                              int boxes_num, float nms_overlap_thresh);
+                              Tensor& keep_num, float nms_overlap_thresh);
-void iou3d_nms3d_normal_forward_impl(const Tensor boxes,
+void iou3d_nms3d_normal_forward_impl(const Tensor boxes, Tensor& keep,
-                                     unsigned long long* mask, int boxes_num,
+                                     Tensor& keep_num,
                                     float nms_overlap_thresh);
 REGISTER_DEVICE_IMPL(iou3d_boxes_overlap_bev_forward_impl, CUDA,

--- a/mmcv/ops/csrc/pytorch/cuda/iou3d_cuda.cu
+++ b/mmcv/ops/csrc/pytorch/cuda/iou3d_cuda.cu
@@ -10,6 +10,7 @@ All Rights Reserved 2019-2020.
 #include <stdio.h>
 #include "iou3d_cuda_kernel.cuh"
+#include "nms_cuda_kernel.cuh"
 #include "pytorch_cuda_helper.hpp"
 void IoU3DBoxesOverlapBevForwardCUDAKernelLauncher(const int num_a,
@@ -32,36 +33,72 @@ void IoU3DBoxesOverlapBevForwardCUDAKernelLauncher(const int num_a,
  AT_CUDA_CHECK(cudaGetLastError());
 }
-void IoU3DNMS3DForwardCUDAKernelLauncher(const Tensor boxes,
+void IoU3DNMS3DForwardCUDAKernelLauncher(const Tensor boxes, Tensor& keep,
-                                         unsigned long long *mask,
+                                         Tensor& keep_num,
-                                         int boxes_num,
                                         float nms_overlap_thresh) {
+  using namespace at::indexing;
  at::cuda::CUDAGuard device_guard(boxes.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  int boxes_num = boxes.size(0);
+  const int col_blocks =
+      (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
+  Tensor mask =
+      at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
  dim3 blocks(GET_BLOCKS(boxes_num, THREADS_PER_BLOCK_NMS),
              GET_BLOCKS(boxes_num, THREADS_PER_BLOCK_NMS));
  dim3 threads(THREADS_PER_BLOCK_NMS);
  iou3d_nms3d_forward_cuda_kernel<<<blocks, threads, 0, stream>>>(
-      boxes_num, nms_overlap_thresh, boxes.data_ptr<float>(), mask);
+      boxes_num, nms_overlap_thresh, boxes.data_ptr<float>(),
+      (unsigned long long*)mask.data_ptr<int64_t>());
+  at::Tensor keep_t = at::zeros(
+      {boxes_num}, boxes.options().dtype(at::kBool).device(at::kCUDA));
+  gather_keep_from_mask<<<1, min(col_blocks, THREADS_PER_BLOCK),
+                          col_blocks * sizeof(unsigned long long), stream>>>(
+      keep_t.data_ptr<bool>(), (unsigned long long*)mask.data_ptr<int64_t>(),
+      boxes_num);
+  auto keep_data = keep_t.nonzero().index({Slice(), 0});
+  keep_num.fill_(at::Scalar(keep_data.size(0)));
+  keep.index_put_({Slice(0, keep_data.size(0))}, keep_data);
  AT_CUDA_CHECK(cudaGetLastError());
 }
-void IoU3DNMS3DNormalForwardCUDAKernelLauncher(const Tensor boxes,
+void IoU3DNMS3DNormalForwardCUDAKernelLauncher(const Tensor boxes, Tensor& keep,
-                                               unsigned long long *mask,
+                                               Tensor& keep_num,
-                                               int boxes_num,
                                               float nms_overlap_thresh) {
+  using namespace at::indexing;
  at::cuda::CUDAGuard device_guard(boxes.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  int boxes_num = boxes.size(0);
+  const int col_blocks =
+      (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
+  Tensor mask =
+      at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
  dim3 blocks(GET_BLOCKS(boxes_num, THREADS_PER_BLOCK_NMS),
              GET_BLOCKS(boxes_num, THREADS_PER_BLOCK_NMS));
  dim3 threads(THREADS_PER_BLOCK_NMS);
  iou3d_nms3d_normal_forward_cuda_kernel<<<blocks, threads, 0, stream>>>(
-      boxes_num, nms_overlap_thresh, boxes.data_ptr<float>(), mask);
+      boxes_num, nms_overlap_thresh, boxes.data_ptr<float>(),
+      (unsigned long long*)mask.data_ptr<int64_t>());
+  at::Tensor keep_t = at::zeros(
+      {boxes_num}, boxes.options().dtype(at::kBool).device(at::kCUDA));
+  gather_keep_from_mask<<<1, min(col_blocks, THREADS_PER_BLOCK),
+                          col_blocks * sizeof(unsigned long long), stream>>>(
+      keep_t.data_ptr<bool>(), (unsigned long long*)mask.data_ptr<int64_t>(),
+      boxes_num);
+  auto keep_data = keep_t.nonzero().index({Slice(), 0});
+  keep_num.fill_(at::Scalar(keep_data.size(0)));
+  keep.index_put_({Slice(0, keep_data.size(0))}, keep_data);
  AT_CUDA_CHECK(cudaGetLastError());
 }
--- a/mmcv/ops/csrc/pytorch/iou3d.cpp
+++ b/mmcv/ops/csrc/pytorch/iou3d.cpp
@@ -19,16 +19,16 @@ void iou3d_boxes_overlap_bev_forward_impl(const int num_a, const Tensor boxes_a,
                       num_b, boxes_b, ans_overlap);
 }
-void iou3d_nms3d_forward_impl(const Tensor boxes, unsigned long long *mask,
+void iou3d_nms3d_forward_impl(const Tensor boxes, Tensor &keep,
-                              int boxes_num, float nms_overlap_thresh) {
+                              Tensor &keep_num, float nms_overlap_thresh) {
-  DISPATCH_DEVICE_IMPL(iou3d_nms3d_forward_impl, boxes, mask, boxes_num,
+  DISPATCH_DEVICE_IMPL(iou3d_nms3d_forward_impl, boxes, keep, keep_num,
                       nms_overlap_thresh);
 }
-void iou3d_nms3d_normal_forward_impl(const Tensor boxes,
+void iou3d_nms3d_normal_forward_impl(const Tensor boxes, Tensor &keep,
-                                     unsigned long long *mask, int boxes_num,
+                                     Tensor &keep_num,
                                     float nms_overlap_thresh) {
-  DISPATCH_DEVICE_IMPL(iou3d_nms3d_normal_forward_impl, boxes, mask, boxes_num,
+  DISPATCH_DEVICE_IMPL(iou3d_nms3d_normal_forward_impl, boxes, keep, keep_num,
                       nms_overlap_thresh);
 }
@@ -51,41 +51,7 @@ void iou3d_nms3d_forward(Tensor boxes, Tensor keep, Tensor keep_num,
  CHECK_CONTIGUOUS(boxes);
  CHECK_CONTIGUOUS(keep);
-  int boxes_num = boxes.size(0);
+  iou3d_nms3d_forward_impl(boxes, keep, keep_num, nms_overlap_thresh);
-  int64_t *keep_data = keep.data_ptr<int64_t>();
-  int64_t *keep_num_data = keep_num.data_ptr<int64_t>();
-  const int col_blocks =
-      (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
-  Tensor mask =
-      at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
-  unsigned long long *mask_data =
-      (unsigned long long *)mask.data_ptr<int64_t>();
-  iou3d_nms3d_forward_impl(boxes, mask_data, boxes_num, nms_overlap_thresh);
-  at::Tensor mask_cpu = mask.to(at::kCPU);
-  unsigned long long *mask_host =
-      (unsigned long long *)mask_cpu.data_ptr<int64_t>();
-  std::vector<unsigned long long> remv_cpu(col_blocks);
-  memset(&remv_cpu[0], 0, sizeof(unsigned long long) * col_blocks);
-  int num_to_keep = 0;
-  for (int i = 0; i < boxes_num; i++) {
-    int nblock = i / THREADS_PER_BLOCK_NMS;
-    int inblock = i % THREADS_PER_BLOCK_NMS;
-    if (!(remv_cpu[nblock] & (1ULL << inblock))) {
-      keep_data[num_to_keep++] = i;
-      unsigned long long *p = &mask_host[0] + i * col_blocks;
-      for (int j = nblock; j < col_blocks; j++) {
-        remv_cpu[j] |= p[j];
-      }
-    }
-    *keep_num_data = num_to_keep;
-  }
 }
 void iou3d_nms3d_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
@@ -96,40 +62,5 @@ void iou3d_nms3d_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
  CHECK_CONTIGUOUS(boxes);
  CHECK_CONTIGUOUS(keep);
-  int boxes_num = boxes.size(0);
+  iou3d_nms3d_normal_forward_impl(boxes, keep, keep_num, nms_overlap_thresh);
-  int64_t *keep_data = keep.data_ptr<int64_t>();
-  int64_t *keep_num_data = keep_num.data_ptr<int64_t>();
-  const int col_blocks =
-      (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
-  Tensor mask =
-      at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
-  unsigned long long *mask_data =
-      (unsigned long long *)mask.data_ptr<int64_t>();
-  iou3d_nms3d_normal_forward_impl(boxes, mask_data, boxes_num,
-                                  nms_overlap_thresh);
-  at::Tensor mask_cpu = mask.to(at::kCPU);
-  unsigned long long *mask_host =
-      (unsigned long long *)mask_cpu.data_ptr<int64_t>();
-  std::vector<unsigned long long> remv_cpu(col_blocks);
-  memset(&remv_cpu[0], 0, sizeof(unsigned long long) * col_blocks);
-  int num_to_keep = 0;
-  for (int i = 0; i < boxes_num; i++) {
-    int nblock = i / THREADS_PER_BLOCK_NMS;
-    int inblock = i % THREADS_PER_BLOCK_NMS;
-    if (!(remv_cpu[nblock] & (1ULL << inblock))) {
-      keep_data[num_to_keep++] = i;
-      unsigned long long *p = &mask_host[0] + i * col_blocks;
-      for (int j = nblock; j < col_blocks; j++) {
-        remv_cpu[j] |= p[j];
-      }
-    }
-  }
-  *keep_num_data = num_to_keep;
 }
--- a/mmcv/ops/iou3d.py
+++ b/mmcv/ops/iou3d.py
@@ -82,11 +82,11 @@ def nms3d(boxes: Tensor, scores: Tensor, iou_threshold: float) -> Tensor:
    order = scores.sort(0, descending=True)[1]
    boxes = boxes[order].contiguous()
-    keep = torch.zeros(boxes.size(0), dtype=torch.long)
+    keep = boxes.new_zeros(boxes.size(0), dtype=torch.long)
-    num_out = torch.zeros(size=(), dtype=torch.long)
+    num_out = boxes.new_zeros(size=(), dtype=torch.long)
    ext_module.iou3d_nms3d_forward(
        boxes, keep, num_out, nms_overlap_thresh=iou_threshold)
-    keep = order[keep[:num_out].cuda(boxes.device)].contiguous()
+    keep = order[keep[:num_out].to(boxes.device)].contiguous()
    return keep
@@ -109,11 +109,11 @@ def nms3d_normal(boxes: Tensor, scores: Tensor,
    order = scores.sort(0, descending=True)[1]
    boxes = boxes[order].contiguous()
-    keep = torch.zeros(boxes.size(0), dtype=torch.long)
+    keep = boxes.new_zeros(boxes.size(0), dtype=torch.long)
-    num_out = torch.zeros(size=(), dtype=torch.long)
+    num_out = boxes.new_zeros(size=(), dtype=torch.long)
    ext_module.iou3d_nms3d_normal_forward(
        boxes, keep, num_out, nms_overlap_thresh=iou_threshold)
-    return order[keep[:num_out].cuda(boxes.device)].contiguous()
+    return order[keep[:num_out].to(boxes.device)].contiguous()
 def _xyxyr2xywhr(boxes: Tensor) -> Tensor:

--- a/tests/test_ops/test_iou3d.py
+++ b/tests/test_ops/test_iou3d.py
@@ -4,11 +4,16 @@ import pytest
 import torch
 from mmcv.ops import boxes_iou3d, boxes_overlap_bev, nms3d, nms3d_normal
+from mmcv.utils import IS_CUDA_AVAILABLE
-@pytest.mark.skipif(
+@pytest.mark.parametrize('device', [
-    not torch.cuda.is_available(), reason='requires CUDA support')
+    pytest.param(
-def test_boxes_overlap_bev():
+        'cuda',
+        marks=pytest.mark.skipif(
+            not IS_CUDA_AVAILABLE, reason='requires CUDA support'))
+])
+def test_boxes_overlap_bev(device):
    np_boxes1 = np.asarray([[1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 0.0],
                            [2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 0.0],
                            [3.0, 3.0, 3.0, 3.0, 2.0, 2.0, 0.0]],
@@ -22,8 +27,8 @@ def test_boxes_overlap_bev():
          (3 + 2 * 2**0.5)], [1.0, 1.0, 1.0], [0.0, 0.0, 0.0]],
        dtype=np.float32)
-    boxes1 = torch.from_numpy(np_boxes1).cuda()
+    boxes1 = torch.from_numpy(np_boxes1).to(device)
-    boxes2 = torch.from_numpy(np_boxes2).cuda()
+    boxes2 = torch.from_numpy(np_boxes2).to(device)
    # test for 3 boxes
    overlaps = boxes_overlap_bev(boxes1, boxes2)
@@ -37,9 +42,13 @@ def test_boxes_overlap_bev():
        overlaps.cpu().numpy(), np_expect_overlaps.repeat(555, 1), atol=1e-4)
-@pytest.mark.skipif(
+@pytest.mark.parametrize('device', [
-    not torch.cuda.is_available(), reason='requires CUDA support')
+    pytest.param(
-def test_boxes_iou3d():
+        'cuda',
+        marks=pytest.mark.skipif(
+            not IS_CUDA_AVAILABLE, reason='requires CUDA support'))
+])
+def test_boxes_iou3d(device):
    np_boxes1 = np.asarray([[1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 0.0],
                            [2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 0.0],
                            [3.0, 3.0, 3.0, 3.0, 2.0, 2.0, 0.0]],
@@ -53,16 +62,20 @@ def test_boxes_iou3d():
         [0.0, 0.0, 0.0]],
        dtype=np.float32)
-    boxes1 = torch.from_numpy(np_boxes1).cuda()
+    boxes1 = torch.from_numpy(np_boxes1).to(device)
-    boxes2 = torch.from_numpy(np_boxes2).cuda()
+    boxes2 = torch.from_numpy(np_boxes2).to(device)
    ious = boxes_iou3d(boxes1, boxes2)
    assert np.allclose(ious.cpu().numpy(), np_expect_ious, atol=1e-4)
-@pytest.mark.skipif(
+@pytest.mark.parametrize('device', [
-    not torch.cuda.is_available(), reason='requires CUDA support')
+    pytest.param(
-def test_nms3d():
+        'cuda',
+        marks=pytest.mark.skipif(
+            not IS_CUDA_AVAILABLE, reason='requires CUDA support'))
+])
+def test_nms3d(device):
    # test for 5 boxes
    np_boxes = np.asarray([[1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 0.0],
                           [2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 0.0],
@@ -74,7 +87,7 @@ def test_nms3d():
    np_inds = np.array([1, 0, 3])
    boxes = torch.from_numpy(np_boxes)
    scores = torch.from_numpy(np_scores)
-    inds = nms3d(boxes.cuda(), scores.cuda(), iou_threshold=0.3)
+    inds = nms3d(boxes.to(device), scores.to(device), iou_threshold=0.3)
    assert np.allclose(inds.cpu().numpy(), np_inds)
@@ -84,14 +97,18 @@ def test_nms3d():
    np_scores = np.random.rand(555).astype(np.float32)
    boxes = torch.from_numpy(np_boxes)
    scores = torch.from_numpy(np_scores)
-    inds = nms3d(boxes.cuda(), scores.cuda(), iou_threshold=0.3)
+    inds = nms3d(boxes.to(device), scores.to(device), iou_threshold=0.3)
    assert len(inds.cpu().numpy()) == 176
-@pytest.mark.skipif(
+@pytest.mark.parametrize('device', [
-    not torch.cuda.is_available(), reason='requires CUDA support')
+    pytest.param(
-def test_nms3d_normal():
+        'cuda',
+        marks=pytest.mark.skipif(
+            not IS_CUDA_AVAILABLE, reason='requires CUDA support'))
+])
+def test_nms3d_normal(device):
    # test for 5 boxes
    np_boxes = np.asarray([[1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 0.0],
                           [2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 0.0],
@@ -103,7 +120,7 @@ def test_nms3d_normal():
    np_inds = np.array([1, 0, 3])
    boxes = torch.from_numpy(np_boxes)
    scores = torch.from_numpy(np_scores)
-    inds = nms3d_normal(boxes.cuda(), scores.cuda(), iou_threshold=0.3)
+    inds = nms3d_normal(boxes.to(device), scores.to(device), iou_threshold=0.3)
    assert np.allclose(inds.cpu().numpy(), np_inds)
@@ -113,6 +130,6 @@ def test_nms3d_normal():
    np_scores = np.random.rand(555).astype(np.float32)
    boxes = torch.from_numpy(np_boxes)
    scores = torch.from_numpy(np_scores)
-    inds = nms3d_normal(boxes.cuda(), scores.cuda(), iou_threshold=0.3)
+    inds = nms3d_normal(boxes.to(device), scores.to(device), iou_threshold=0.3)
    assert len(inds.cpu().numpy()) == 148