"vscode:/vscode.git/clone" did not exist on "5ecced399b0d78e32ff952d01d217a1098d17201"
Unverified Commit 2f88c124 authored by Wenhao Wu's avatar Wenhao Wu Committed by GitHub
Browse files

[Enhance] Replace mmdet3d ops with mmcv ops (#1240)

* import some ops from mmcv instead of mmdet3d

* use mmcv ops in primitive_head.py

* use mmcv ops in PAConv

* remove ops in mmdet3d & fix some bugs

* remove spconv & fix some bugs

* fix voxelization unittest

* remove spconv in ops/__init__.py

* refine ops/__init__.py

* recover sparse_block in ops/__init__

* fix parta2_bbox_head unittest

* remove remaining ops

* recover ops/__init__.py for bc breaking

* add source of ops from mmcv

* recover the unittest for voxelization

* remove unittest
parent 41d77dad
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/ATen.h>
#include <spconv/indice.cu.h>
#include <spconv/indice.h>
#include <spconv/mp_helper.h>
#include <tensorview/helper_launch.h>
#include <tensorview/tensorview.h>
#include <utility/timer.h>
#include <chrono>
#include <limits>
#include <type_traits>
namespace spconv {
namespace functor {
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP1<tv::GPU, Index, IndexGrid, NDim> {
Index operator()(const tv::GPU &d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose) {
Index batchSize = gridsOut.dim(0);
auto numActIn = indicesIn.dim(0);
if (numActIn == 0) return 0;
// auto timer = spconv::CudaContextTimer<>();
if (transpose)
prepareDeConvIndicePairsKernel<Index, IndexGrid, NDim, 4096>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,
indiceNum, indicePairUnique, kernelSize, stride,
padding, dilation, outSpatialShape);
else
prepareIndicePairsKernel<Index, IndexGrid, NDim, 4096>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,
indiceNum, indicePairUnique, kernelSize, stride,
padding, dilation, outSpatialShape);
TV_CHECK_CUDA_ERR();
// std::cout << "p1 gene time " << timer.report() / 1000.0 << std::endl;
return 1;
}
};
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP2<tv::GPU, Index, IndexGrid, NDim> {
Index operator()(const tv::GPU &d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid) {
Index batchSize = gridsOut.dim(0);
auto kernelVolume = indicePairs.dim(0);
auto numActIn = indicesIn.dim(0);
if (numActIn == 0) return 0;
Index numAct = indicePairUnique.dim(0) - 1;
assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesOut, gridsOut, numAct, indicePairs,
indicePairUnique, outSpatialShape, batchSize);
TV_CHECK_CUDA_ERR();
assignIndicePairsKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesOut, gridsOut, numActIn, indicePairs,
indicePairUnique, outSpatialShape);
TV_CHECK_CUDA_ERR();
if (resetGrid) {
resetGridKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicePairUnique.data(), gridsOut, numAct);
TV_CHECK_CUDA_ERR();
}
return numAct;
}
};
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateSubMIndicePairFunctor<tv::GPU, Index, IndexGrid, NDim> {
Index operator()(const tv::GPU &d, tv::TensorView<const Index> indicesIn,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid) {
auto numActIn = indicesIn.dim(0);
if (numActIn == 0) return 0;
// auto timer = spconv::CudaContextTimer<>();
prepareSubMGridKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, gridsOut, outSpatialShape);
TV_CHECK_CUDA_ERR();
getSubMIndicePairsKernel<Index, IndexGrid, NDim, 4096>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, gridsOut, indicePairs, indiceNum,
kernelSize, stride, padding, dilation,
outSpatialShape);
TV_CHECK_CUDA_ERR();
// std::cout << "subm gene time " << timer.report() / 1000.0 << std::endl;
if (resetGrid) {
resetGridSubMKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn.data(), gridsOut, outSpatialShape,
numActIn);
TV_CHECK_CUDA_ERR();
}
return numActIn;
}
};
} // namespace functor
#define DECLARE_GPU_SPECS_INDEX_NDIM(Index, NDIM) \
template struct functor::CreateConvIndicePairFunctor<tv::GPU, Index, int, \
NDIM>; \
template struct functor::CreateConvIndicePairFunctorP1<tv::GPU, Index, int, \
NDIM>; \
template struct functor::CreateConvIndicePairFunctorP2<tv::GPU, Index, int, \
NDIM>; \
template struct functor::CreateSubMIndicePairFunctor<tv::GPU, Index, int, \
NDIM>;
#define DECLARE_GPU_INDEX(Index) \
DECLARE_GPU_SPECS_INDEX_NDIM(Index, 1); \
DECLARE_GPU_SPECS_INDEX_NDIM(Index, 2); \
DECLARE_GPU_SPECS_INDEX_NDIM(Index, 3); \
DECLARE_GPU_SPECS_INDEX_NDIM(Index, 4);
DECLARE_GPU_INDEX(int);
#undef DECLARE_GPU_INDEX
#undef DECLARE_GPU_SPECS_INDEX_NDIM
} // namespace spconv
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <spconv/maxpool.h>
#include <torch/script.h>
namespace spconv {
namespace functor {
template <typename T, typename Index>
struct SparseMaxPoolForwardFunctor<tv::CPU, T, Index> {
void operator()(const tv::CPU &d, tv::TensorView<T> outFeatures,
tv::TensorView<const T> inFeatures,
tv::TensorView<const Index> indices, int size) {
int stride = outFeatures.dim(1);
auto outFeaturesData = outFeatures.data();
auto inFeaturesData = inFeatures.data();
auto indicesIn = indices.subview(0).data();
auto indicesOut = indices.subview(1).data();
Index idxi, idxo;
for (int row = 0; row < size; row++) {
idxi = indicesIn[row] * stride;
idxo = indicesOut[row] * stride;
for (int plane = 0; plane < stride; ++plane)
if (outFeaturesData[idxo + plane] < inFeaturesData[idxi + plane])
outFeaturesData[idxo + plane] = inFeaturesData[idxi + plane];
}
}
};
template <typename T, typename Index>
struct SparseMaxPoolBackwardFunctor<tv::CPU, T, Index> {
void operator()(const tv::CPU &d, tv::TensorView<const T> outFeatures,
tv::TensorView<const T> inFeatures,
tv::TensorView<const T> dout, tv::TensorView<T> din,
tv::TensorView<const Index> indices, int size) {
int stride = outFeatures.dim(1);
auto outFeaturesData = outFeatures.data();
auto inFeaturesData = inFeatures.data();
auto doutData = dout.data();
auto dinData = din.data();
auto indicesIn = indices.subview(0).data();
auto indicesOut = indices.subview(1).data();
Index idxi, idxo;
for (int row = 0; row < size; row++) {
idxi = indicesIn[row] * stride;
idxo = indicesOut[row] * stride;
for (int plane = 0; plane < stride; ++plane)
if (outFeaturesData[idxo + plane] == inFeaturesData[idxi + plane])
dinData[idxi + plane] += doutData[idxo + plane];
}
}
};
} // namespace functor
#define DECLARE_CPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseMaxPoolForwardFunctor<tv::CPU, T, Index>; \
template struct functor::SparseMaxPoolBackwardFunctor<tv::CPU, T, Index>;
#define DECLARE_CPU_SPECS(T) \
DECLARE_CPU_SPECS_T_INDEX(T, int); \
DECLARE_CPU_SPECS_T_INDEX(T, long);
DECLARE_CPU_SPECS(float);
DECLARE_CPU_SPECS(double);
DECLARE_CPU_SPECS(at::Half);
#undef DECLARE_CPU_SPECS
#undef DECLARE_CPU_SPECS_T_INDEX
} // namespace spconv
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/ATen.h>
#include <spconv/maxpool.h>
#include <spconv/mp_helper.h>
#include <tensorview/helper_kernel.cu.h>
#include <tensorview/helper_launch.h>
#include <tensorview/tensorview.h>
#include <chrono>
#include <limits>
#include <type_traits>
namespace spconv {
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolFwdBlockKernel(T *outFeatures, const T *inFeatures,
const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
T in, out;
int ILPStrideY[NumILP];
Index idxo, idxi;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
outFeatures += blockIdx.y * NumTLP;
inFeatures += blockIdx.y * NumTLP;
for (int ix = blockIdx.x * blockDim.x; ix < numHot;
ix += blockDim.x * gridDim.x) {
{
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
in = inFeatures[idxi];
out = outFeatures[idxo];
if (in > out) {
outFeatures[idxo] = in;
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolFwdGenericBlockKernel(T *outFeatures,
const T *inFeatures,
const Index *indicesIn,
const Index *indicesOut,
int numHot, int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideX[NumILP];
Index RI[NumILP];
Index RO[NumILP];
T in, out;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
in = inFeatures[RI[ilp] + iy];
out = outFeatures[RO[ilp] + iy];
if (in > out) {
outFeatures[RO[ilp] + iy] = in;
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
__global__ void maxPoolFwdVecBlockKernel(T *outFeatures, const T *inFeatures,
const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideY[NumILP];
constexpr int vecloadFactor = sizeof(VecType) / sizeof(T);
T bufi[vecloadFactor];
T bufo[vecloadFactor];
Index idxi, idxo;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
outFeatures += blockIdx.y * NumTLP;
inFeatures += blockIdx.y * NumTLP;
for (int ix = blockIdx.x * blockDim.x * vecloadFactor; ix < numHot;
ix += blockDim.x * gridDim.x * vecloadFactor) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
reinterpret_cast<VecType *>(bufo)[0] =
reinterpret_cast<VecType *>(outFeatures)[idxo];
reinterpret_cast<VecType *>(bufi)[0] =
reinterpret_cast<const VecType *>(inFeatures)[idxi];
#pragma unroll
for (int i = 0; i < vecloadFactor; i++) {
if (bufi[i] > bufo[i]) {
bufo[i] = bufi[i];
}
}
reinterpret_cast<VecType *>(outFeatures)[idxo] =
reinterpret_cast<VecType *>(bufo)[0];
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolFwdGenericKernel(T *outFeatures, const T *inFeatures,
const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideX[NumILP];
Index RI[NumILP];
Index RO[NumILP];
T in, out;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
if (ix + ILPStrideX[ilp] < numHot) {
RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
}
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
if (ix + ILPStrideX[ilp] < numHot) {
in = inFeatures[RI[ilp] + iy];
out = outFeatures[RO[ilp] + iy];
if (in > out) {
outFeatures[RO[ilp] + iy] = in;
}
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolBwdBlockKernel(const T *outFeatures, const T *inFeatures,
const T *dout, T *din,
const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
T in, out;
Index idxo, idxi;
int ILPStrideY[NumILP];
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
outFeatures += blockIdx.y * NumTLP;
inFeatures += blockIdx.y * NumTLP;
dout += blockIdx.y * NumTLP;
din += blockIdx.y * NumTLP;
for (int ix = blockIdx.x * blockDim.x; ix < numHot;
ix += blockDim.x * gridDim.x) {
{
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
in = inFeatures[idxi];
out = outFeatures[idxo];
if (in == out) {
din[idxi] += dout[idxo];
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolBwdGenericBlockKernel(const T *outFeatures,
const T *inFeatures, const T *dout,
T *din, const Index *indicesIn,
const Index *indicesOut,
int numHot, int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideX[NumILP];
Index RI[NumILP];
Index RO[NumILP];
T in, out;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
in = inFeatures[RI[ilp] + iy];
out = outFeatures[RO[ilp] + iy];
if (in == out) {
din[RI[ilp] + iy] += dout[RO[ilp] + iy];
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
__global__ void maxPoolBwdVecBlockKernel(const T *outFeatures,
const T *inFeatures, const T *dout,
T *din, const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideY[NumILP];
constexpr int vecloadFactor = sizeof(VecType) / sizeof(T);
T bufi[vecloadFactor];
T bufo[vecloadFactor];
T bufdi[vecloadFactor];
T bufdo[vecloadFactor];
Index idxi, idxo;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
outFeatures += blockIdx.y * NumTLP;
inFeatures += blockIdx.y * NumTLP;
for (int ix = blockIdx.x * blockDim.x * vecloadFactor; ix < numHot;
ix += blockDim.x * gridDim.x * vecloadFactor) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
reinterpret_cast<VecType *>(bufo)[0] =
reinterpret_cast<const VecType *>(outFeatures)[idxo];
reinterpret_cast<VecType *>(bufi)[0] =
reinterpret_cast<const VecType *>(inFeatures)[idxi];
reinterpret_cast<VecType *>(bufdo)[0] =
reinterpret_cast<const VecType *>(dout)[idxo];
reinterpret_cast<VecType *>(bufdi)[0] =
reinterpret_cast<VecType *>(din)[idxi];
#pragma unroll
for (int i = 0; i < vecloadFactor; i++) {
if (bufi[i] == bufo[i]) {
bufdi[i] += bufdo[i];
}
}
reinterpret_cast<VecType *>(din)[idxi] =
reinterpret_cast<VecType *>(bufdi)[0];
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolBwdGenericKernel(const T *outFeatures,
const T *inFeatures, const T *dout,
T *din, const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideX[NumILP];
Index RI[NumILP];
Index RO[NumILP];
T in, out;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
if (ix + ILPStrideX[ilp] < numHot) {
RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
}
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
if (ix + ILPStrideX[ilp] < numHot) {
in = inFeatures[RI[ilp] + iy];
out = outFeatures[RO[ilp] + iy];
if (in == out) {
din[RI[ilp] + iy] += dout[RO[ilp] + iy];
}
}
}
}
}
}
namespace functor {
template <typename T, typename Index>
struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
using vecload_type_t =
std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
using kernel_block_t = mp_list_c<int, 64, 32, 16>;
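  // kernel_block_t enumerates candidate tile widths (NumTLP). mp_for_each below
  // tries 64/32/16 in order and, for the first width that divides numPlanes,
  // launches the vectorized block kernel plus a generic kernel for the tail;
  // if none divides numPlanes, the NumTLP = 64 generic fallback is used.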
void operator()(const tv::GPU &d, tv::TensorView<T> outFeatures,
tv::TensorView<const T> inFeatures,
tv::TensorView<const Index> indices, int size) {
if (size <= 0) return;
int numPlanes = inFeatures.dim(1);
bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &indices,
&notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4;
int numHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (numHotBlock >= NumTLP) {
maxPoolFwdVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(outFeatures.data(), inFeatures.data(),
indices.subview(0).data(),
indices.subview(1).data(), numHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
if (size > numHotBlock) {
maxPoolFwdGenericKernel<T, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, d.getStream()>>>(outFeatures.data(), inFeatures.data(),
indices.subview(0).data() + numHotBlock,
indices.subview(1).data() + numHotBlock,
size - numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
notFound = false;
}
}
});
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
int numHotBlock = (size / NumTLP) * NumTLP;
if (numHotBlock >= NumTLP) {
maxPoolFwdGenericBlockKernel<T, Index, NumTLP, NumILP>
<<<dim3(size / NumTLP, tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), inFeatures.data(),
indices.subview(0).data(), indices.subview(1).data(),
numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
if (size > numHotBlock) {
maxPoolFwdGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(1, tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), inFeatures.data(),
indices.subview(0).data() + numHotBlock,
indices.subview(1).data() + numHotBlock, size - numHotBlock,
numPlanes);
TV_CHECK_CUDA_ERR();
}
}
}
};
template <typename T, typename Index>
struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
using vecload_type_t =
std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
using kernel_block_t = mp_list_c<int, 64, 32, 16>;
void operator()(const tv::GPU &d, tv::TensorView<const T> outFeatures,
tv::TensorView<const T> inFeatures,
tv::TensorView<const T> dout, tv::TensorView<T> din,
tv::TensorView<const Index> indices, int size) {
if (size <= 0) return;
int numPlanes = inFeatures.dim(1);
bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &dout, &din,
&indices, &notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4;
int numHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (numHotBlock >= NumTLP) {
maxPoolBwdVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(outFeatures.data(), inFeatures.data(),
dout.data(), din.data(),
indices.subview(0).data(),
indices.subview(1).data(), numHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
if (size > numHotBlock) {
maxPoolBwdGenericKernel<T, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, d.getStream()>>>(outFeatures.data(), inFeatures.data(),
dout.data(), din.data(),
indices.subview(0).data() + numHotBlock,
indices.subview(1).data() + numHotBlock,
size - numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
notFound = false;
}
}
});
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
int numHotBlock = (size / NumTLP) * NumTLP;
if (numHotBlock >= NumTLP) {
maxPoolBwdGenericBlockKernel<T, Index, NumTLP, NumILP>
<<<dim3(size / NumTLP, tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), inFeatures.data(), dout.data(), din.data(),
indices.subview(0).data(), indices.subview(1).data(),
numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
if (size > numHotBlock) {
maxPoolBwdGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(1, tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), inFeatures.data(), dout.data(), din.data(),
indices.subview(0).data() + numHotBlock,
indices.subview(1).data() + numHotBlock, size - numHotBlock,
numPlanes);
TV_CHECK_CUDA_ERR();
}
}
}
};
} // namespace functor
#define DECLARE_GPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseMaxPoolForwardFunctor<tv::GPU, T, Index>; \
template struct functor::SparseMaxPoolBackwardFunctor<tv::GPU, T, Index>;
#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPECS_T_INDEX(T, int);
DECLARE_GPU_SPECS(float);
DECLARE_GPU_SPECS(double);
DECLARE_GPU_SPECS(at::Half);
#undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_T_INDEX
} // namespace spconv
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <spconv/reordering.h>
#include <torch/script.h>
namespace spconv {
namespace functor {
template <typename T, typename Index>
struct SparseGatherFunctor<tv::CPU, T, Index> {
void operator()(const tv::CPU& d, tv::TensorView<T> buffer,
tv::TensorView<const T> features,
tv::TensorView<const Index> indices, int size) {
int numPlanes = features.dim(1);
for (int i = 0; i < size; ++i) {
std::memcpy(buffer.data() + i * numPlanes,
features.data() + indices[i] * numPlanes,
sizeof(T) * numPlanes);
}
}
};
template <typename T, typename Index>
struct SparseScatterAddFunctor<tv::CPU, T, Index> {
void operator()(const tv::CPU& d, tv::TensorView<T> outFeatures,
tv::TensorView<const T> buffer,
tv::TensorView<const Index> indices, int size, bool stable) {
int numPlanes = outFeatures.dim(1);
const T* buf = buffer.data();
T* out = outFeatures.data();
for (int i = 0; i < size; ++i) {
buf = buffer.data() + i * numPlanes;
out = outFeatures.data() + indices[i] * numPlanes;
for (int j = 0; j < numPlanes; ++j) {
out[j] += buf[j];
}
}
}
};
} // namespace functor
#define DECLARE_CPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseGatherFunctor<tv::CPU, T, Index>; \
template struct functor::SparseScatterAddFunctor<tv::CPU, T, Index>;
#define DECLARE_CPU_SPECS(T) \
DECLARE_CPU_SPECS_T_INDEX(T, int); \
DECLARE_CPU_SPECS_T_INDEX(T, long);
DECLARE_CPU_SPECS(float);
DECLARE_CPU_SPECS(double);
DECLARE_CPU_SPECS(at::Half);
#undef DECLARE_CPU_SPECS
#undef DECLARE_CPU_SPECS_T_INDEX
} // namespace spconv
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/ATen.h>
#include <spconv/mp_helper.h>
#include <spconv/reordering.cu.h>
#include <spconv/reordering.h>
#include <tensorview/helper_kernel.cu.h>
#include <tensorview/helper_launch.h>
#include <tensorview/tensorview.h>
#include <utility/timer.h>
#include <chrono>
#include <limits>
#include <type_traits>
namespace spconv {
namespace functor {
template <typename T, typename Index>
struct SparseGatherFunctor<tv::GPU, T, Index> {
using vecload_type_t =
std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
using kernel_block_t = mp_list_c<int, 64, 32, 16>;
void operator()(const tv::GPU &d, tv::TensorView<T> buffer,
tv::TensorView<const T> features,
tv::TensorView<const Index> indices, int size) {
if (size <= 0) return;
int numPlanes = features.dim(1);
bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
mp_for_each<kernel_block_t>([=, &buffer, &features, &indices,
&notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4;
// constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
int nHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (nHotBlock >= NumTLP) {
gatherVecBlockKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
<<<dim3(numPlanes / NumTLP, size / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(buffer.data(), features.data(),
indices.data(), nHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
if (size - nHotBlock > 0) {
gatherVecKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
<<<dim3(1, numPlanes / NumTLP),
dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
d.getStream()>>>(buffer.data() + nHotBlock * numPlanes,
features.data(), indices.data() + nHotBlock,
size - nHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
notFound = false;
}
}
});
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
gatherGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(tv::launch::DivUp(size, NumTLP),
tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
buffer.data(), features.data(), indices.data(), size, numPlanes);
TV_CHECK_CUDA_ERR();
}
}
};
template <typename T, typename Index>
struct SparseScatterAddFunctor<tv::GPU, T, Index> {
using vecload_type_t =
std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
using kernel_block_t = mp_list_c<int, 64, 32, 16>;
void operator()(const tv::GPU &d, tv::TensorView<T> outFeatures,
tv::TensorView<const T> buffer,
tv::TensorView<const Index> indices, int size, bool stable) {
if (size <= 0) return;
int numPlanes = outFeatures.dim(1);
bool notFound = true;
constexpr int vecloadFactor =
sizeof(vecload_type_t) / sizeof(T); // important for half.
mp_for_each<kernel_block_t>([=, &d, &outFeatures, &buffer, &indices,
&notFound](auto NumTLP) {
// constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
constexpr int NumILP = NumTLP / 4;
int nHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (nHotBlock >= NumTLP) {
scatterAddVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(numPlanes / NumTLP, size / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(outFeatures.data(), buffer.data(),
indices.data(), nHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
if (size - nHotBlock > 0) {
scatterAddGenericKernel<T, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, d.getStream()>>>(
outFeatures.data(), buffer.data() + nHotBlock * numPlanes,
indices.data() + nHotBlock, size - nHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
notFound = false;
}
}
});
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
scatterAddGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(tv::launch::DivUp(size, NumTLP),
tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), buffer.data(), indices.data(), size,
numPlanes);
TV_CHECK_CUDA_ERR();
}
}
};
} // namespace functor
#define DECLARE_GPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseGatherFunctor<tv::GPU, T, Index>; \
template struct functor::SparseScatterAddFunctor<tv::GPU, T, Index>;
#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPECS_T_INDEX(T, int);
DECLARE_GPU_SPECS(float);
DECLARE_GPU_SPECS(double);
DECLARE_GPU_SPECS(at::Half);
#undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_T_INDEX
} // namespace spconv
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import torch
def scatter_nd(indices, updates, shape):
"""pytorch edition of tensorflow scatter_nd.
this function don't contain except handle code. so use this carefully when
indice repeats, don't support repeat add which is supported in tensorflow.
"""
ret = torch.zeros(*shape, dtype=updates.dtype, device=updates.device)
ndim = indices.shape[-1]
output_shape = list(indices.shape[:-1]) + shape[indices.shape[-1]:]
flatted_indices = indices.view(-1, ndim)
slices = [flatted_indices[:, i] for i in range(ndim)]
slices += [Ellipsis]
ret[slices] = updates.view(*output_shape)
return ret
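# A minimal usage sketch of scatter_nd (the shapes below are illustrative only):
#
#   >>> indices = torch.tensor([[0, 0], [1, 1]])         # (N, ndim) target cells
#   >>> updates = torch.rand(2, 4)                        # (N, C) values to write
#   >>> dense = scatter_nd(indices, updates, [2, 2, 4])   # zeros everywhere else
#   >>> dense.shape
#   torch.Size([2, 2, 4])
#   >>> torch.equal(dense[0, 0], updates[0]) and torch.equal(dense[1, 1], updates[1])
#   True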
class SparseConvTensor(object):
def __init__(self,
features,
indices,
spatial_shape,
batch_size,
grid=None):
"""
Args:
grid: pre-allocated grid tensor.
should be used when the volume of spatial shape
is very large.
"""
self.features = features
self.indices = indices
        if self.indices.dtype != torch.int32:
            self.indices = self.indices.int()
self.spatial_shape = spatial_shape
self.batch_size = batch_size
self.indice_dict = {}
self.grid = grid
@property
def spatial_size(self):
return np.prod(self.spatial_shape)
def find_indice_pair(self, key):
if key is None:
return None
if key in self.indice_dict:
return self.indice_dict[key]
return None
def dense(self, channels_first=True):
output_shape = [self.batch_size] + list(
self.spatial_shape) + [self.features.shape[1]]
res = scatter_nd(self.indices.long(), self.features, output_shape)
if not channels_first:
return res
ndim = len(self.spatial_shape)
trans_params = list(range(0, ndim + 1))
trans_params.insert(1, ndim + 1)
return res.permute(*trans_params).contiguous()
@property
def sparity(self):
return (self.indices.shape[0] / np.prod(self.spatial_shape) /
self.batch_size)
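# A minimal usage sketch of SparseConvTensor (toy sizes, chosen only for
# illustration):
#
#   >>> features = torch.rand(3, 16)                      # (N, C) per-voxel features
#   >>> indices = torch.tensor([[0, 0, 0, 0],
#   ...                         [0, 1, 2, 3],
#   ...                         [1, 5, 5, 5]], dtype=torch.int32)  # (N, 1 + ndim): batch idx + spatial coords
#   >>> x = SparseConvTensor(features, indices, spatial_shape=[8, 8, 8], batch_size=2)
#   >>> x.dense().shape                                   # channels-first by default
#   torch.Size([2, 16, 8, 8, 8])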
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
class TestCase(unittest.TestCase):
def _GetNdArray(self, a):
if not isinstance(a, np.ndarray):
a = np.array(a)
return a
def assertAllEqual(self, a, b):
"""Asserts that two numpy arrays have the same values.
Args:
            a: the expected numpy ndarray or anything that can be converted to one.
            b: the actual numpy ndarray or anything that can be converted to one.
"""
a = self._GetNdArray(a)
b = self._GetNdArray(b)
self.assertEqual(
a.shape, b.shape,
'Shape mismatch: expected %s, got %s.' % (a.shape, b.shape))
same = (a == b)
if a.dtype == np.float32 or a.dtype == np.float64:
same = np.logical_or(same,
np.logical_and(np.isnan(a), np.isnan(b)))
if not np.all(same):
# Prints more details than np.testing.assert_array_equal.
diff = np.logical_not(same)
if a.ndim:
x = a[np.where(diff)]
y = b[np.where(diff)]
print('not equal where = ', np.where(diff))
else:
# np.where is broken for scalars
x, y = a, b
print('not equal lhs = ', x)
print('not equal rhs = ', y)
np.testing.assert_array_equal(a, b)
def assertAllClose(self, a, b, rtol=1e-6, atol=1e-6):
"""Asserts that two numpy arrays, or dicts of same, have near values.
This does not support nested dicts.
Args:
            a: The expected numpy ndarray (or anything that can be converted to
                one), or a dict of same. Must be a dict iff `b` is a dict.
            b: The actual numpy ndarray (or anything that can be converted to
                one), or a dict of same. Must be a dict iff `a` is a dict.
rtol: relative tolerance.
atol: absolute tolerance.
Raises:
ValueError: if only one of `a` and `b` is a dict.
"""
is_a_dict = isinstance(a, dict)
if is_a_dict != isinstance(b, dict):
raise ValueError("Can't compare dict to non-dict, %s vs %s." %
(a, b))
if is_a_dict:
self.assertCountEqual(
a.keys(),
b.keys(),
msg='mismatched keys, expected %s, got %s' %
(a.keys(), b.keys()))
for k in a:
self._assertArrayLikeAllClose(
a[k],
b[k],
rtol=rtol,
atol=atol,
msg='%s: expected %s, got %s.' % (k, a, b))
else:
self._assertArrayLikeAllClose(a, b, rtol=rtol, atol=atol)
def _assertArrayLikeAllClose(self, a, b, rtol=1e-6, atol=1e-6, msg=None):
a = self._GetNdArray(a)
b = self._GetNdArray(b)
self.assertEqual(
a.shape, b.shape,
'Shape mismatch: expected %s, got %s.' % (a.shape, b.shape))
if not np.allclose(a, b, rtol=rtol, atol=atol):
# Prints more details than np.testing.assert_allclose.
#
# NOTE: numpy.allclose (and numpy.testing.assert_allclose)
# checks whether two arrays are element-wise equal within a
# tolerance. The relative difference (rtol * abs(b)) and the
# absolute difference atol are added together to compare against
# the absolute difference between a and b. Here, we want to
# print out which elements violate such conditions.
cond = np.logical_or(
np.abs(a - b) > atol + rtol * np.abs(b),
np.isnan(a) != np.isnan(b))
if a.ndim:
x = a[np.where(cond)]
y = b[np.where(cond)]
print('not close where = ', np.where(cond))
else:
# np.where is broken for scalars
x, y = a, b
print('not close lhs = ', x)
print('not close rhs = ', y)
print('not close dif = ', np.abs(x - y))
print('not close tol = ', atol + rtol * np.abs(y))
print('dtype = %s, shape = %s' % (a.dtype, a.shape))
np.testing.assert_allclose(a, b, rtol=rtol, atol=atol, err_msg=msg)
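# A minimal usage sketch of the assertions above (a toy test case written only
# for illustration):
#
#   >>> class _ToyTest(TestCase):
#   ...     def test_values(self):
#   ...         self.assertAllEqual([1, 2, 3], np.array([1, 2, 3]))
#   ...         self.assertAllClose({'x': [1.0]}, {'x': [1.0 + 1e-8]})  # dicts compared key by key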
def params_grid(*params):
size = len(params)
length = 1
for p in params:
length *= len(p)
sizes = [len(p) for p in params]
counter = [0] * size
total = []
for i in range(length):
total.append([0] * size)
for i in range(length):
for j in range(size):
total[i][j] = params[j][counter[j]]
counter[size - 1] += 1
for c in range(size - 1, -1, -1):
if (counter[c] == sizes[c] and c > 0):
counter[c - 1] += 1
counter[c] = 0
return total
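# A minimal usage sketch of params_grid (it enumerates the cartesian product,
# with the last parameter varying fastest):
#
#   >>> params_grid([1, 2], ['a', 'b'])
#   [[1, 'a'], [1, 'b'], [2, 'a'], [2, 'b']]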
def generate_sparse_data(shape,
num_points,
num_channels,
integer=False,
data_range=(-1, 1),
with_dense=True,
dtype=np.float32):
dense_shape = shape
ndim = len(dense_shape)
# num_points = np.random.randint(10, 100, size=[batch_size, ndim])
num_points = np.array(num_points)
# num_points = np.array([3, 2])
batch_size = len(num_points)
batch_indices = []
coors_total = np.stack(
np.meshgrid(*[np.arange(0, s) for s in shape]), axis=-1)
coors_total = coors_total.reshape(-1, ndim)
for i in range(batch_size):
np.random.shuffle(coors_total)
inds_total = coors_total[:num_points[i]]
inds_total = np.pad(
inds_total, ((0, 0), (0, 1)), mode='constant', constant_values=i)
batch_indices.append(inds_total)
if integer:
sparse_data = np.random.randint(
data_range[0],
data_range[1],
size=[num_points.sum(), num_channels]).astype(dtype)
else:
sparse_data = np.random.uniform(
data_range[0],
data_range[1],
size=[num_points.sum(), num_channels]).astype(dtype)
res = {
'features': sparse_data.astype(dtype),
}
if with_dense:
dense_data = np.zeros([batch_size, num_channels, *dense_shape],
dtype=sparse_data.dtype)
start = 0
for i, inds in enumerate(batch_indices):
for j, ind in enumerate(inds):
dense_slice = (i, slice(None), *ind[:-1])
dense_data[dense_slice] = sparse_data[start + j]
start += len(inds)
res['features_dense'] = dense_data.astype(dtype)
batch_indices = np.concatenate(batch_indices, axis=0)
res['indices'] = batch_indices.astype(np.int32)
return res
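# A minimal usage sketch of generate_sparse_data (toy sizes for illustration):
#
#   >>> data = generate_sparse_data([8, 8, 8], num_points=[30, 20], num_channels=4)
#   >>> data['features'].shape          # all points of the two samples stacked
#   (50, 4)
#   >>> data['indices'].shape           # spatial coords plus batch index as last column
#   (50, 4)
#   >>> data['features_dense'].shape    # (batch_size, num_channels, *shape)
#   (2, 4, 8, 8, 8)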
# Copyright (c) OpenMMLab. All rights reserved.
from .scatter_points import DynamicScatter, dynamic_scatter
from .voxelize import Voxelization, voxelization
__all__ = ['Voxelization', 'voxelization', 'dynamic_scatter', 'DynamicScatter']
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from torch import nn
from torch.autograd import Function
from .voxel_layer import (dynamic_point_to_voxel_backward,
dynamic_point_to_voxel_forward)
class _dynamic_scatter(Function):
@staticmethod
def forward(ctx, feats, coors, reduce_type='max'):
"""convert kitti points(N, >=3) to voxels.
Args:
feats: [N, C] float tensor. points features to be reduced
into voxels.
coors: [N, ndim] int tensor. corresponding voxel coordinates
(specifically multi-dim voxel index) of each points.
reduce_type: str. reduce op. support 'max', 'sum' and 'mean'
Returns:
tuple
voxel_feats: [M, C] float tensor. reduced features. input features
that shares the same voxel coordinates are reduced to one row
coordinates: [M, ndim] int tensor, voxel coordinates.
"""
results = dynamic_point_to_voxel_forward(feats, coors, reduce_type)
(voxel_feats, voxel_coors, point2voxel_map,
voxel_points_count) = results
ctx.reduce_type = reduce_type
ctx.save_for_backward(feats, voxel_feats, point2voxel_map,
voxel_points_count)
ctx.mark_non_differentiable(voxel_coors)
return voxel_feats, voxel_coors
@staticmethod
def backward(ctx, grad_voxel_feats, grad_voxel_coors=None):
(feats, voxel_feats, point2voxel_map,
voxel_points_count) = ctx.saved_tensors
grad_feats = torch.zeros_like(feats)
# TODO: whether to use index put or use cuda_backward
# To use index put, need point to voxel index
dynamic_point_to_voxel_backward(grad_feats,
grad_voxel_feats.contiguous(), feats,
voxel_feats, point2voxel_map,
voxel_points_count, ctx.reduce_type)
return grad_feats, None, None
dynamic_scatter = _dynamic_scatter.apply
class DynamicScatter(nn.Module):
def __init__(self, voxel_size, point_cloud_range, average_points: bool):
super(DynamicScatter, self).__init__()
"""Scatters points into voxels, used in the voxel encoder with
dynamic voxelization
**Note**: The CPU and GPU implementation get the same output, but
have numerical difference after summation and division (e.g., 5e-7).
Args:
average_points (bool): whether to use avg pooling to scatter
points into voxel voxel_size (list): list [x, y, z] size
of three dimension
point_cloud_range (list):
[x_min, y_min, z_min, x_max, y_max, z_max]
"""
self.voxel_size = voxel_size
self.point_cloud_range = point_cloud_range
self.average_points = average_points
def forward_single(self, points, coors):
reduce = 'mean' if self.average_points else 'max'
return dynamic_scatter(points.contiguous(), coors.contiguous(), reduce)
def forward(self, points, coors):
"""
Args:
input: NC points
"""
if coors.size(-1) == 3:
return self.forward_single(points, coors)
else:
batch_size = coors[-1, 0] + 1
voxels, voxel_coors = [], []
for i in range(batch_size):
inds = torch.where(coors[:, 0] == i)
voxel, voxel_coor = self.forward_single(
points[inds], coors[inds][:, 1:])
coor_pad = nn.functional.pad(
voxel_coor, (1, 0), mode='constant', value=i)
voxel_coors.append(coor_pad)
voxels.append(voxel)
features = torch.cat(voxels, dim=0)
feature_coors = torch.cat(voxel_coors, dim=0)
return features, feature_coors
def __repr__(self):
tmpstr = self.__class__.__name__ + '('
tmpstr += 'voxel_size=' + str(self.voxel_size)
tmpstr += ', point_cloud_range=' + str(self.point_cloud_range)
tmpstr += ', average_points=' + str(self.average_points)
tmpstr += ')'
return tmpstr
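# A minimal usage sketch of DynamicScatter (assumes the compiled voxel_layer
# extension and a CUDA device, since dynamic_point_to_voxel has no CPU kernel;
# all sizes below are illustrative only):
#
#   >>> scatter = DynamicScatter(voxel_size=[0.1, 0.1, 0.1],
#   ...                          point_cloud_range=[0, -1, -1, 1, 1, 1],
#   ...                          average_points=True)
#   >>> feats = torch.rand(100, 4, device='cuda')                     # (N, C) point features
#   >>> coors = torch.randint(0, 10, (100, 3), device='cuda').int()   # (N, 3) voxel coords
#   >>> voxel_feats, voxel_coors = scatter(feats, coors)
#   >>> voxel_feats.shape[1], voxel_coors.shape[1]
#   (4, 3)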
#include <ATen/TensorUtils.h>
#include <torch/extension.h>
// #include "voxelization.h"
namespace {
template <typename T_int>
void determin_max_points_kernel(
torch::TensorAccessor<T_int, 2> coor,
torch::TensorAccessor<T_int, 1> point_to_voxelidx,
torch::TensorAccessor<T_int, 1> num_points_per_voxel,
torch::TensorAccessor<T_int, 3> coor_to_voxelidx, int& voxel_num,
int& max_points, const int num_points) {
int voxelidx, num;
for (int i = 0; i < num_points; ++i) {
if (coor[i][0] == -1) continue;
voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];
// record voxel
if (voxelidx == -1) {
voxelidx = voxel_num;
voxel_num += 1;
coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]] = voxelidx;
}
// put points into voxel
num = num_points_per_voxel[voxelidx];
point_to_voxelidx[i] = num;
num_points_per_voxel[voxelidx] += 1;
// update max points per voxel
max_points = std::max(max_points, num + 1);
}
return;
}
template <typename T, typename T_int>
void scatter_point_to_voxel_kernel(
const torch::TensorAccessor<T, 2> points,
torch::TensorAccessor<T_int, 2> coor,
torch::TensorAccessor<T_int, 1> point_to_voxelidx,
torch::TensorAccessor<T_int, 3> coor_to_voxelidx,
torch::TensorAccessor<T, 3> voxels,
torch::TensorAccessor<T_int, 2> voxel_coors, const int num_features,
const int num_points, const int NDim) {
for (int i = 0; i < num_points; ++i) {
int num = point_to_voxelidx[i];
int voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];
for (int k = 0; k < num_features; ++k) {
voxels[voxelidx][num][k] = points[i][k];
}
for (int k = 0; k < NDim; ++k) {
voxel_coors[voxelidx][k] = coor[i][k];
}
}
}
} // namespace
namespace voxelization {
std::vector<at::Tensor> dynamic_point_to_voxel_cpu(
const at::Tensor& points, const at::Tensor& voxel_mapping,
const std::vector<float> voxel_size, const std::vector<float> coors_range) {
  // the current version takes about 0.02s~0.03s per frame on CPU
// check device
AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");
const int NDim = voxel_mapping.size(1);
const int num_points = points.size(0);
const int num_features = points.size(1);
std::vector<int> grid_size(NDim);
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
at::Tensor num_points_per_voxel = at::zeros(
{
num_points,
},
voxel_mapping.options());
at::Tensor coor_to_voxelidx = -at::ones(
{grid_size[2], grid_size[1], grid_size[0]}, voxel_mapping.options());
at::Tensor point_to_voxelidx = -at::ones(
{
num_points,
},
voxel_mapping.options());
int voxel_num = 0;
int max_points = 0;
AT_DISPATCH_ALL_TYPES(voxel_mapping.scalar_type(), "determin_max_point", [&] {
determin_max_points_kernel<scalar_t>(
voxel_mapping.accessor<scalar_t, 2>(),
point_to_voxelidx.accessor<scalar_t, 1>(),
num_points_per_voxel.accessor<scalar_t, 1>(),
coor_to_voxelidx.accessor<scalar_t, 3>(), voxel_num, max_points,
num_points);
});
at::Tensor voxels =
at::zeros({voxel_num, max_points, num_features}, points.options());
at::Tensor voxel_coors =
at::zeros({voxel_num, NDim}, points.options().dtype(at::kInt));
AT_DISPATCH_ALL_TYPES(points.scalar_type(), "scatter_point_to_voxel", [&] {
scatter_point_to_voxel_kernel<scalar_t, int>(
points.accessor<scalar_t, 2>(), voxel_mapping.accessor<int, 2>(),
point_to_voxelidx.accessor<int, 1>(),
coor_to_voxelidx.accessor<int, 3>(), voxels.accessor<scalar_t, 3>(),
voxel_coors.accessor<int, 2>(), num_features, num_points, NDim);
});
at::Tensor num_points_per_voxel_out =
num_points_per_voxel.slice(/*dim=*/0, /*start=*/0, /*end=*/voxel_num);
return {voxels, voxel_coors, num_points_per_voxel_out};
}
} // namespace voxelization
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <torch/types.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
typedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t;
#define CHECK_CUDA(x) \
TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
namespace {
int const threadsPerBlock = 512;
int const maxGridDim = 50000;
} // namespace
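// reduceMax implements an atomic floating-point max on top of atomicCAS (CUDA
// has no native atomicMax for float/double): the destination word is
// reinterpreted as an integer and the compare-and-swap is retried until the
// stored value is at least `val`.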
__device__ __forceinline__ static void reduceMax(float *address, float val) {
int *address_as_i = reinterpret_cast<int *>(address);
int old = *address_as_i, assumed;
do {
assumed = old;
old = atomicCAS(address_as_i, assumed,
__float_as_int(fmaxf(val, __int_as_float(assumed))));
} while (assumed != old || __int_as_float(old) < val);
}
__device__ __forceinline__ static void reduceMax(double *address, double val) {
unsigned long long *address_as_ull =
reinterpret_cast<unsigned long long *>(address);
unsigned long long old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(
address_as_ull, assumed,
__double_as_longlong(fmax(val, __longlong_as_double(assumed))));
} while (assumed != old || __longlong_as_double(old) < val);
}
// get rid of meaningless warnings when compiling host code
#ifdef __CUDA_ARCH__
__device__ __forceinline__ static void reduceAdd(float *address, float val) {
#if (__CUDA_ARCH__ < 200)
#warning \
"compute capability lower than 2.x. fall back to use CAS version of atomicAdd for float32"
int *address_as_i = reinterpret_cast<int *>(address);
int old = *address_as_i, assumed;
do {
assumed = old;
old = atomicCAS(address_as_i, assumed,
__float_as_int(val + __int_as_float(assumed)));
} while (assumed != old);
#else
atomicAdd(address, val);
#endif
}
__device__ __forceinline__ static void reduceAdd(double *address, double val) {
#if (__CUDA_ARCH__ < 600)
#warning \
"compute capability lower than 6.x. fall back to use CAS version of atomicAdd for float64"
unsigned long long *address_as_ull =
reinterpret_cast<unsigned long long *>(address);
unsigned long long old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val + __longlong_as_double(assumed)));
} while (assumed != old);
#else
atomicAdd(address, val);
#endif
}
#endif
template <typename T>
__global__ void
feats_reduce_kernel(const T *feats, const int32_t *coors_map,
T *reduced_feats, // shall be 0 at initialization
const int num_input, const int num_feats,
const reduce_t reduce_type) {
for (int x = blockIdx.x * blockDim.x + threadIdx.x; x < num_input;
x += gridDim.x * blockDim.x) {
int32_t reduce_to = coors_map[x];
if (reduce_to == -1) continue;
const T *feats_offset = feats + x * num_feats;
T *reduced_feats_offset = reduced_feats + reduce_to * num_feats;
if (reduce_type == reduce_t::MAX) {
for (int i = 0; i < num_feats; i++) {
reduceMax(&reduced_feats_offset[i], feats_offset[i]);
}
} else {
for (int i = 0; i < num_feats; i++) {
reduceAdd(&reduced_feats_offset[i], feats_offset[i]);
}
}
}
}
template <typename T>
__global__ void add_reduce_traceback_grad_kernel(
T *grad_feats, const T *grad_reduced_feats, const int32_t *coors_map,
const int32_t *reduce_count, const int num_input, const int num_feats,
const reduce_t reduce_type) {
for (int x = blockIdx.x * blockDim.x + threadIdx.x; x < num_input;
x += gridDim.x * blockDim.x) {
int32_t reduce_to = coors_map[x];
if (reduce_to == -1) {
continue;
}
const int input_offset = x * num_feats;
T *grad_feats_offset = grad_feats + input_offset;
const int reduced_offset = reduce_to * num_feats;
const T *grad_reduced_feats_offset = grad_reduced_feats + reduced_offset;
if (reduce_type == reduce_t::SUM) {
for (int i = 0; i < num_feats; i++) {
grad_feats_offset[i] = grad_reduced_feats_offset[i];
}
} else if (reduce_type == reduce_t::MEAN) {
for (int i = 0; i < num_feats; i++) {
grad_feats_offset[i] = grad_reduced_feats_offset[i] /
static_cast<T>(reduce_count[reduce_to]);
}
}
}
}
template <typename T>
__global__ void max_reduce_traceback_scatter_idx_kernel(
const T *feats, const T *reduced_feats, int32_t *reduce_from,
const int32_t *coors_map, const int num_input, const int num_feats) {
for (int x = blockIdx.x * blockDim.x + threadIdx.x; x < num_input;
x += gridDim.x * blockDim.x) {
int32_t reduce_to = coors_map[x];
const int input_offset = x * num_feats;
const T *feats_offset = feats + input_offset;
if (reduce_to == -1) {
continue;
}
const int reduced_offset = reduce_to * num_feats;
const T *reduced_feats_offset = reduced_feats + reduced_offset;
int32_t *reduce_from_offset = reduce_from + reduced_offset;
for (int i = 0; i < num_feats; i++) {
if (feats_offset[i] == reduced_feats_offset[i]) {
atomicMin(&reduce_from_offset[i], static_cast<int32_t>(x));
}
}
}
}
template <typename T>
__global__ void max_reduce_scatter_grad_kernel(T *grad_feats,
const T *grad_reduced_feats,
const int32_t *reduce_from,
const int num_reduced,
const int num_feats) {
for (int x = blockIdx.x * blockDim.x + threadIdx.x; x < num_reduced;
x += gridDim.x * blockDim.x) {
const int reduced_offset = x * num_feats;
const int32_t *scatter_to_offset = reduce_from + reduced_offset;
const T *grad_reduced_feats_offset = grad_reduced_feats + reduced_offset;
for (int i = 0; i < num_feats; i++) {
grad_feats[scatter_to_offset[i] * num_feats + i] =
grad_reduced_feats_offset[i];
}
}
}
namespace voxelization {
std::vector<at::Tensor> dynamic_point_to_voxel_forward_gpu(
const at::Tensor &feats, const at::Tensor &coors,
const reduce_t reduce_type) {
CHECK_INPUT(feats);
CHECK_INPUT(coors);
const int num_input = feats.size(0);
const int num_feats = feats.size(1);
if (num_input == 0)
return {feats.clone().detach(),
coors.clone().detach(),
coors.new_empty({0}, torch::kInt32),
coors.new_empty({0}, torch::kInt32)};
at::Tensor out_coors;
at::Tensor coors_map;
at::Tensor reduce_count;
auto coors_clean = coors.masked_fill(coors.lt(0).any(-1, true), -1);
std::tie(out_coors, coors_map, reduce_count) =
at::unique_dim(coors_clean, 0, true, true, true);
if (out_coors.index({0, 0}).lt(0).item<bool>()) {
    // the first element of out_coors is (-1, -1, -1) and should be removed
out_coors = out_coors.slice(0, 1);
reduce_count = reduce_count.slice(0, 1);
coors_map = coors_map - 1;
}
coors_map = coors_map.to(torch::kInt32);
reduce_count = reduce_count.to(torch::kInt32);
auto reduced_feats =
at::empty({out_coors.size(0), num_feats}, feats.options());
AT_DISPATCH_FLOATING_TYPES(
feats.scalar_type(), "feats_reduce_kernel", ([&] {
if (reduce_type == reduce_t::MAX)
reduced_feats.fill_(-std::numeric_limits<scalar_t>::infinity());
else
reduced_feats.fill_(static_cast<scalar_t>(0));
dim3 blocks(std::min(at::cuda::ATenCeilDiv(num_input, threadsPerBlock),
maxGridDim));
dim3 threads(threadsPerBlock);
feats_reduce_kernel<<<blocks, threads>>>(
feats.data_ptr<scalar_t>(), coors_map.data_ptr<int32_t>(),
reduced_feats.data_ptr<scalar_t>(), num_input, num_feats, reduce_type);
if (reduce_type == reduce_t::MEAN)
reduced_feats /= reduce_count.unsqueeze(-1).to(reduced_feats.dtype());
}));
AT_CUDA_CHECK(cudaGetLastError());
return {reduced_feats, out_coors, coors_map, reduce_count};
}
void dynamic_point_to_voxel_backward_gpu(at::Tensor &grad_feats,
const at::Tensor &grad_reduced_feats,
const at::Tensor &feats,
const at::Tensor &reduced_feats,
const at::Tensor &coors_map,
const at::Tensor &reduce_count,
const reduce_t reduce_type) {
CHECK_INPUT(grad_feats);
CHECK_INPUT(grad_reduced_feats);
CHECK_INPUT(feats);
CHECK_INPUT(reduced_feats);
CHECK_INPUT(coors_map);
CHECK_INPUT(reduce_count);
const int num_input = feats.size(0);
const int num_reduced = reduced_feats.size(0);
const int num_feats = feats.size(1);
grad_feats.fill_(0);
// copy voxel grad to points
if (num_input == 0 || num_reduced == 0) return;
if (reduce_type == reduce_t::MEAN || reduce_type == reduce_t::SUM) {
AT_DISPATCH_FLOATING_TYPES(
grad_reduced_feats.scalar_type(), "add_reduce_traceback_grad_kernel",
([&] {
dim3 blocks(std::min(
at::cuda::ATenCeilDiv(num_input, threadsPerBlock), maxGridDim));
dim3 threads(threadsPerBlock);
add_reduce_traceback_grad_kernel<<<blocks, threads>>>(
grad_feats.data_ptr<scalar_t>(),
grad_reduced_feats.data_ptr<scalar_t>(),
coors_map.data_ptr<int32_t>(), reduce_count.data_ptr<int32_t>(),
num_input, num_feats, reduce_type);
}));
AT_CUDA_CHECK(cudaGetLastError());
} else {
auto reduce_from = at::full({num_reduced, num_feats}, num_input,
coors_map.options().dtype(torch::kInt32));
AT_DISPATCH_FLOATING_TYPES(
grad_reduced_feats.scalar_type(),
"max_reduce_traceback_scatter_idx_kernel", ([&] {
dim3 blocks(std::min(
at::cuda::ATenCeilDiv(num_input, threadsPerBlock), maxGridDim));
dim3 threads(threadsPerBlock);
max_reduce_traceback_scatter_idx_kernel<<<blocks, threads>>>(
feats.data_ptr<scalar_t>(), reduced_feats.data_ptr<scalar_t>(),
reduce_from.data_ptr<int32_t>(), coors_map.data_ptr<int32_t>(),
num_input, num_feats);
}));
AT_CUDA_CHECK(cudaGetLastError());
AT_DISPATCH_FLOATING_TYPES(
grad_reduced_feats.scalar_type(),
"max_reduce_traceback_scatter_idx_kernel", ([&] {
dim3 blocks(std::min(
at::cuda::ATenCeilDiv(num_reduced, threadsPerBlock), maxGridDim));
dim3 threads(threadsPerBlock);
max_reduce_scatter_grad_kernel<<<blocks, threads>>>(
grad_feats.data_ptr<scalar_t>(),
grad_reduced_feats.data_ptr<scalar_t>(),
reduce_from.data_ptr<int32_t>(), num_reduced, num_feats);
}));
AT_CUDA_CHECK(cudaGetLastError());
}
return;
}
} // namespace voxelization
#include <torch/extension.h>
#include "voxelization.h"
namespace voxelization {
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("hard_voxelize", &hard_voxelize, "hard voxelize");
m.def("dynamic_voxelize", &dynamic_voxelize, "dynamic voxelization");
m.def("dynamic_point_to_voxel_forward", &dynamic_point_to_voxel_forward, "dynamic point to voxel forward");
m.def("dynamic_point_to_voxel_backward", &dynamic_point_to_voxel_backward, "dynamic point to voxel backward");
}
} // namespace voxelization
#pragma once
#include <torch/extension.h>
typedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t;
namespace voxelization {
int hard_voxelize_cpu(const at::Tensor &points, at::Tensor &voxels,
at::Tensor &coors, at::Tensor &num_points_per_voxel,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int max_points, const int max_voxels,
const int NDim = 3);
void dynamic_voxelize_cpu(const at::Tensor &points, at::Tensor &coors,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int NDim = 3);
std::vector<at::Tensor> dynamic_point_to_voxel_cpu(
const at::Tensor &points, const at::Tensor &voxel_mapping,
const std::vector<float> voxel_size, const std::vector<float> coors_range);
#ifdef WITH_CUDA
int hard_voxelize_gpu(const at::Tensor &points, at::Tensor &voxels,
at::Tensor &coors, at::Tensor &num_points_per_voxel,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int max_points, const int max_voxels,
const int NDim = 3);
int nondisterministic_hard_voxelize_gpu(const at::Tensor &points, at::Tensor &voxels,
at::Tensor &coors, at::Tensor &num_points_per_voxel,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int max_points, const int max_voxels,
const int NDim = 3);
void dynamic_voxelize_gpu(const at::Tensor &points, at::Tensor &coors,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int NDim = 3);
std::vector<torch::Tensor> dynamic_point_to_voxel_forward_gpu(const torch::Tensor &feats,
const torch::Tensor &coors,
const reduce_t reduce_type);
void dynamic_point_to_voxel_backward_gpu(torch::Tensor &grad_feats,
const torch::Tensor &grad_reduced_feats,
const torch::Tensor &feats,
const torch::Tensor &reduced_feats,
const torch::Tensor &coors_idx,
const torch::Tensor &reduce_count,
const reduce_t reduce_type);
#endif
// Interface for Python
inline int hard_voxelize(const at::Tensor &points, at::Tensor &voxels,
at::Tensor &coors, at::Tensor &num_points_per_voxel,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int max_points, const int max_voxels,
const int NDim = 3, const bool deterministic = true) {
if (points.device().is_cuda()) {
#ifdef WITH_CUDA
if (deterministic) {
return hard_voxelize_gpu(points, voxels, coors, num_points_per_voxel,
voxel_size, coors_range, max_points, max_voxels,
NDim);
}
return nondisterministic_hard_voxelize_gpu(points, voxels, coors, num_points_per_voxel,
voxel_size, coors_range, max_points, max_voxels,
NDim);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
return hard_voxelize_cpu(points, voxels, coors, num_points_per_voxel,
voxel_size, coors_range, max_points, max_voxels,
NDim);
}
inline void dynamic_voxelize(const at::Tensor &points, at::Tensor &coors,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int NDim = 3) {
if (points.device().is_cuda()) {
#ifdef WITH_CUDA
return dynamic_voxelize_gpu(points, coors, voxel_size, coors_range, NDim);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
return dynamic_voxelize_cpu(points, coors, voxel_size, coors_range, NDim);
}
inline reduce_t convert_reduce_type(const std::string &reduce_type) {
if (reduce_type == "max")
return reduce_t::MAX;
else if (reduce_type == "sum")
return reduce_t::SUM;
else if (reduce_type == "mean")
return reduce_t::MEAN;
else TORCH_CHECK(false, "unsupported reduce type: " + reduce_type);
return reduce_t::SUM;
}
inline std::vector<torch::Tensor> dynamic_point_to_voxel_forward(const torch::Tensor &feats,
const torch::Tensor &coors,
const std::string &reduce_type) {
if (feats.device().is_cuda()) {
#ifdef WITH_CUDA
return dynamic_point_to_voxel_forward_gpu(feats, coors, convert_reduce_type(reduce_type));
#else
TORCH_CHECK(false, "Not compiled with GPU support");
#endif
}
TORCH_CHECK(false, "do not support cpu yet");
return std::vector<torch::Tensor>();
}
inline void dynamic_point_to_voxel_backward(torch::Tensor &grad_feats,
const torch::Tensor &grad_reduced_feats,
const torch::Tensor &feats,
const torch::Tensor &reduced_feats,
const torch::Tensor &coors_idx,
const torch::Tensor &reduce_count,
const std::string &reduce_type) {
if (grad_feats.device().is_cuda()) {
#ifdef WITH_CUDA
dynamic_point_to_voxel_backward_gpu(
grad_feats, grad_reduced_feats, feats, reduced_feats, coors_idx, reduce_count,
convert_reduce_type(reduce_type));
return;
#else
TORCH_CHECK(false, "Not compiled with GPU support");
#endif
}
TORCH_CHECK(false, "do not support cpu yet");
}
} // namespace voxelization
#include <ATen/TensorUtils.h>
#include <torch/extension.h>
// #include "voxelization.h"
namespace {
template <typename T, typename T_int>
void dynamic_voxelize_kernel(const torch::TensorAccessor<T, 2> points,
torch::TensorAccessor<T_int, 2> coors,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const std::vector<int> grid_size,
const int num_points, const int num_features,
const int NDim) {
const int ndim_minus_1 = NDim - 1;
bool failed = false;
// int coor[NDim];
int* coor = new int[NDim]();
int c;
for (int i = 0; i < num_points; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points[i][j] - coors_range[j]) / voxel_size[j]);
// necessary to remove points that are out of range
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
for (int k = 0; k < NDim; ++k) {
if (failed)
coors[i][k] = -1;
else
coors[i][k] = coor[k];
}
}
delete[] coor;
return;
}
template <typename T, typename T_int>
void hard_voxelize_kernel(const torch::TensorAccessor<T, 2> points,
torch::TensorAccessor<T, 3> voxels,
torch::TensorAccessor<T_int, 2> coors,
torch::TensorAccessor<T_int, 1> num_points_per_voxel,
torch::TensorAccessor<T_int, 3> coor_to_voxelidx,
int& voxel_num, const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const std::vector<int> grid_size,
const int max_points, const int max_voxels,
const int num_points, const int num_features,
const int NDim) {
// declare a temp coors
at::Tensor temp_coors = at::zeros(
{num_points, NDim}, at::TensorOptions().dtype(at::kInt).device(at::kCPU));
// First use dynamic voxelization to get coors,
// then check max points/voxels constraints
dynamic_voxelize_kernel<T, int>(points, temp_coors.accessor<int, 2>(),
voxel_size, coors_range, grid_size,
num_points, num_features, NDim);
int voxelidx, num;
auto coor = temp_coors.accessor<int, 2>();
for (int i = 0; i < num_points; ++i) {
// T_int* coor = temp_coors.data_ptr<int>() + i * NDim;
if (coor[i][0] == -1) continue;
voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];
// record voxel
if (voxelidx == -1) {
voxelidx = voxel_num;
if (max_voxels != -1 && voxel_num >= max_voxels) continue;
voxel_num += 1;
coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]] = voxelidx;
for (int k = 0; k < NDim; ++k) {
coors[voxelidx][k] = coor[i][k];
}
}
// put points into voxel
num = num_points_per_voxel[voxelidx];
if (max_points == -1 || num < max_points) {
for (int k = 0; k < num_features; ++k) {
voxels[voxelidx][num][k] = points[i][k];
}
num_points_per_voxel[voxelidx] += 1;
}
}
return;
}
} // namespace
namespace voxelization {
int hard_voxelize_cpu(const at::Tensor& points, at::Tensor& voxels,
at::Tensor& coors, at::Tensor& num_points_per_voxel,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int max_points, const int max_voxels,
const int NDim = 3) {
// the current version takes about 0.02s~0.03s per frame on cpu
// check device
AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");
std::vector<int> grid_size(NDim);
const int num_points = points.size(0);
const int num_features = points.size(1);
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
// coors, num_points_per_voxel, coor_to_voxelidx are int Tensor
// printf("cpu coor_to_voxelidx size: [%d, %d, %d]\n", grid_size[2],
// grid_size[1], grid_size[0]);
at::Tensor coor_to_voxelidx =
-at::ones({grid_size[2], grid_size[1], grid_size[0]}, coors.options());
int voxel_num = 0;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
points.scalar_type(), "hard_voxelize_forward", [&] {
hard_voxelize_kernel<scalar_t, int>(
points.accessor<scalar_t, 2>(), voxels.accessor<scalar_t, 3>(),
coors.accessor<int, 2>(), num_points_per_voxel.accessor<int, 1>(),
coor_to_voxelidx.accessor<int, 3>(), voxel_num, voxel_size,
coors_range, grid_size, max_points, max_voxels, num_points,
num_features, NDim);
});
return voxel_num;
}
void dynamic_voxelize_cpu(const at::Tensor& points, at::Tensor& coors,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int NDim = 3) {
// check device
AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");
std::vector<int> grid_size(NDim);
const int num_points = points.size(0);
const int num_features = points.size(1);
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
// coors, num_points_per_voxel, coor_to_voxelidx are int Tensor
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
points.scalar_type(), "hard_voxelize_forward", [&] {
dynamic_voxelize_kernel<scalar_t, int>(
points.accessor<scalar_t, 2>(), coors.accessor<int, 2>(),
voxel_size, coors_range, grid_size, num_points, num_features, NDim);
});
return;
}
} // namespace voxelization
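# Reference sketch (illustration only): the coordinate computation performed by
# dynamic_voxelize_kernel above, re-expressed with torch ops. It mirrors the
# CPU kernel's behaviour of storing coordinates in (z, y, x) order and writing
# -1 for every dimension of an out-of-range point. The inputs below are
# assumed example values.
import torch

def dynamic_voxelize_reference(points, voxel_size, coors_range):
    vsize = torch.tensor(voxel_size, dtype=torch.float32)
    vmin = torch.tensor(coors_range[:3], dtype=torch.float32)
    vmax = torch.tensor(coors_range[3:], dtype=torch.float32)
    grid_size = torch.round((vmax - vmin) / vsize).long()
    c = torch.floor((points[:, :3] - vmin) / vsize).long()  # (N, 3) in x, y, z
    in_range = ((c >= 0) & (c < grid_size)).all(dim=1)
    coors = torch.flip(c, dims=[1])                          # reorder to z, y, x
    coors[~in_range] = -1                                    # drop out-of-range points
    return coors

pts = torch.rand(8, 4) * torch.tensor([70.4, 80.0, 4.0, 1.0]) + \
    torch.tensor([0.0, -40.0, -3.0, 0.0])
print(dynamic_voxelize_reference(pts, [0.05, 0.05, 0.1], [0, -40, -3, 70.4, 40, 1]))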
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <torch/types.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
#define CHECK_CUDA(x) \
TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
namespace {
int const threadsPerBlock = sizeof(unsigned long long) * 8;
}
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
i += blockDim.x * gridDim.x)
template <typename T, typename T_int>
__global__ void dynamic_voxelize_kernel(
const T* points, T_int* coors, const float voxel_x, const float voxel_y,
const float voxel_z, const float coors_x_min, const float coors_y_min,
const float coors_z_min, const float coors_x_max, const float coors_y_max,
const float coors_z_max, const int grid_x, const int grid_y,
const int grid_z, const int num_points, const int num_features,
const int NDim) {
// const int index = blockIdx.x * threadsPerBlock + threadIdx.x;
CUDA_1D_KERNEL_LOOP(index, num_points) {
// To save some computation
auto points_offset = points + index * num_features;
auto coors_offset = coors + index * NDim;
int c_x = floor((points_offset[0] - coors_x_min) / voxel_x);
if (c_x < 0 || c_x >= grid_x) {
coors_offset[0] = -1;
return;
}
int c_y = floor((points_offset[1] - coors_y_min) / voxel_y);
if (c_y < 0 || c_y >= grid_y) {
coors_offset[0] = -1;
coors_offset[1] = -1;
return;
}
int c_z = floor((points_offset[2] - coors_z_min) / voxel_z);
if (c_z < 0 || c_z >= grid_z) {
coors_offset[0] = -1;
coors_offset[1] = -1;
coors_offset[2] = -1;
} else {
coors_offset[0] = c_z;
coors_offset[1] = c_y;
coors_offset[2] = c_x;
}
}
}
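// assign_point_to_voxel: one thread per (point, feature) element; copies the
// features of every point that was assigned a slot (point_to_voxelidx) and a
// voxel (coor_to_voxelidx) into voxels[voxelidx][slot][feature].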
template <typename T, typename T_int>
__global__ void assign_point_to_voxel(const int nthreads, const T* points,
T_int* point_to_voxelidx,
T_int* coor_to_voxelidx, T* voxels,
const int max_points,
const int num_features,
const int num_points, const int NDim) {
CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
// const int index = blockIdx.x * threadsPerBlock + threadIdx.x;
int index = thread_idx / num_features;
int num = point_to_voxelidx[index];
int voxelidx = coor_to_voxelidx[index];
if (num > -1 && voxelidx > -1) {
auto voxels_offset =
voxels + voxelidx * max_points * num_features + num * num_features;
int k = thread_idx % num_features;
voxels_offset[k] = points[thread_idx];
}
}
}
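// assign_voxel_coors: one thread per (point, NDim) element; the first point of
// each voxel (slot 0) writes the voxel's integer coordinates into voxel_coors.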
template <typename T, typename T_int>
__global__ void assign_voxel_coors(const int nthreads, T_int* coor,
T_int* point_to_voxelidx,
T_int* coor_to_voxelidx, T_int* voxel_coors,
const int num_points, const int NDim) {
CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
// const int index = blockIdx.x * threadsPerBlock + threadIdx.x;
// if (index >= num_points) return;
int index = thread_idx / NDim;
int num = point_to_voxelidx[index];
int voxelidx = coor_to_voxelidx[index];
if (num == 0 && voxelidx > -1) {
auto coors_offset = voxel_coors + voxelidx * NDim;
int k = thread_idx % NDim;
coors_offset[k] = coor[thread_idx];
}
}
}
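// point_to_voxelidx_kernel: for every point, scan all earlier points and count
// how many share its voxel coordinate. The count becomes the point's slot
// inside its voxel (point_to_voxelidx, capped at max_points), and the index of
// the first point with the same coordinate is stored in point_to_pointidx.
// One thread per point.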
template <typename T_int>
__global__ void point_to_voxelidx_kernel(const T_int* coor,
T_int* point_to_voxelidx,
T_int* point_to_pointidx,
const int max_points,
const int max_voxels,
const int num_points, const int NDim) {
CUDA_1D_KERNEL_LOOP(index, num_points) {
auto coor_offset = coor + index * NDim;
// skip invalid points
if ((index >= num_points) || (coor_offset[0] == -1)) return;
int num = 0;
int coor_x = coor_offset[0];
int coor_y = coor_offset[1];
int coor_z = coor_offset[2];
// only calculate the coors before this coor[index]
for (int i = 0; i < index; ++i) {
auto prev_coor = coor + i * NDim;
if (prev_coor[0] == -1) continue;
// Find all previous points that have the same coor;
// if the same coor is found, record it
if ((prev_coor[0] == coor_x) && (prev_coor[1] == coor_y) &&
(prev_coor[2] == coor_z)) {
num++;
if (num == 1) {
// point to the first point that has the same coor
point_to_pointidx[index] = i;
} else if (num >= max_points) {
// out of boundary
return;
}
}
}
if (num == 0) {
point_to_pointidx[index] = index;
}
if (num < max_points) {
point_to_voxelidx[index] = num;
}
}
}
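// determin_voxel_num: launched with a single thread (<<<1, 1>>>); walks over
// the points in order, allocates a new voxel index whenever a point is the
// first one of its voxel (slot 0) until max_voxels is reached, and accumulates
// num_points_per_voxel through the first-point index recorded above.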
template <typename T_int>
__global__ void determin_voxel_num(
// const T_int* coor,
T_int* num_points_per_voxel, T_int* point_to_voxelidx,
T_int* point_to_pointidx, T_int* coor_to_voxelidx, T_int* voxel_num,
const int max_points, const int max_voxels, const int num_points) {
// walk over all points serially and assign voxel indices in order
for (int i = 0; i < num_points; ++i) {
// if (coor[i][0] == -1)
// continue;
int point_pos_in_voxel = point_to_voxelidx[i];
// record voxel
if (point_pos_in_voxel == -1) {
// out of max_points or invalid point
continue;
} else if (point_pos_in_voxel == 0) {
// record new voxel
int voxelidx = voxel_num[0];
if (voxel_num[0] >= max_voxels) continue;
voxel_num[0] += 1;
coor_to_voxelidx[i] = voxelidx;
num_points_per_voxel[voxelidx] = 1;
} else {
int point_idx = point_to_pointidx[i];
int voxelidx = coor_to_voxelidx[point_idx];
if (voxelidx != -1) {
coor_to_voxelidx[i] = voxelidx;
num_points_per_voxel[voxelidx] += 1;
}
}
}
}
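// Non-deterministic path, step 1: atomically count how many points fall into
// each unique voxel (reduce_count), record every point's arrival position
// inside its voxel (pts_id), and give each voxel an output slot (coors_order)
// in first-come order via atomicAdd on coors_count.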
__global__ void nondisterministic_get_assign_pos(
const int nthreads, const int32_t *coors_map, int32_t *pts_id,
int32_t *coors_count, int32_t *reduce_count, int32_t *coors_order) {
CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
int coors_idx = coors_map[thread_idx];
if (coors_idx > -1) {
int32_t coors_pts_pos = atomicAdd(&reduce_count[coors_idx], 1);
pts_id[thread_idx] = coors_pts_pos;
if (coors_pts_pos == 0) {
coors_order[coors_idx] = atomicAdd(coors_count, 1);
}
}
}
}
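// Non-deterministic path, step 2: one thread per point; copies the point's
// features into the (voxel slot, point slot) chosen above when both fit within
// max_voxels / max_points. The point that arrived first in a voxel also writes
// the voxel's coordinates and its clamped point count.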
template<typename T>
__global__ void nondisterministic_assign_point_voxel(
const int nthreads, const T *points, const int32_t *coors_map,
const int32_t *pts_id, const int32_t *coors_in,
const int32_t *reduce_count, const int32_t *coors_order,
T *voxels, int32_t *coors, int32_t *pts_count, const int max_voxels,
const int max_points, const int num_features, const int NDim) {
CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
int coors_idx = coors_map[thread_idx];
int coors_pts_pos = pts_id[thread_idx];
if (coors_idx > -1) {
int coors_pos = coors_order[coors_idx];
if (coors_pos < max_voxels && coors_pts_pos < max_points) {
auto voxels_offset =
voxels + (coors_pos * max_points + coors_pts_pos) * num_features;
auto points_offset = points + thread_idx * num_features;
for (int k = 0; k < num_features; k++) {
voxels_offset[k] = points_offset[k];
}
if (coors_pts_pos == 0) {
pts_count[coors_pos] = min(reduce_count[coors_idx], max_points);
auto coors_offset = coors + coors_pos * NDim;
auto coors_in_offset = coors_in + coors_idx * NDim;
for (int k = 0; k < NDim; k++) {
coors_offset[k] = coors_in_offset[k];
}
}
}
}
}
}
namespace voxelization {
int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels,
at::Tensor& coors, at::Tensor& num_points_per_voxel,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int max_points, const int max_voxels,
const int NDim = 3) {
// the current version takes about 0.04s per frame
// check device
CHECK_INPUT(points);
at::cuda::CUDAGuard device_guard(points.device());
const int num_points = points.size(0);
const int num_features = points.size(1);
const float voxel_x = voxel_size[0];
const float voxel_y = voxel_size[1];
const float voxel_z = voxel_size[2];
const float coors_x_min = coors_range[0];
const float coors_y_min = coors_range[1];
const float coors_z_min = coors_range[2];
const float coors_x_max = coors_range[3];
const float coors_y_max = coors_range[4];
const float coors_z_max = coors_range[5];
const int grid_x = round((coors_x_max - coors_x_min) / voxel_x);
const int grid_y = round((coors_y_max - coors_y_min) / voxel_y);
const int grid_z = round((coors_z_max - coors_z_min) / voxel_z);
// map points to voxel coors
at::Tensor temp_coors =
at::zeros({num_points, NDim}, points.options().dtype(at::kInt));
dim3 grid(std::min(at::cuda::ATenCeilDiv(num_points, 512), 4096));
dim3 block(512);
// 1. link point to corresponding voxel coors
AT_DISPATCH_ALL_TYPES(
points.scalar_type(), "hard_voxelize_kernel", ([&] {
dynamic_voxelize_kernel<scalar_t, int>
<<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(
points.contiguous().data_ptr<scalar_t>(),
temp_coors.contiguous().data_ptr<int>(), voxel_x, voxel_y,
voxel_z, coors_x_min, coors_y_min, coors_z_min, coors_x_max,
coors_y_max, coors_z_max, grid_x, grid_y, grid_z, num_points,
num_features, NDim);
}));
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
// 2. map point to the idx of the corresponding voxel, find duplicate coor
// create some temporary variables
auto point_to_pointidx = -at::ones(
{
num_points,
},
points.options().dtype(at::kInt));
auto point_to_voxelidx = -at::ones(
{
num_points,
},
points.options().dtype(at::kInt));
dim3 map_grid(std::min(at::cuda::ATenCeilDiv(num_points, 512), 4096));
dim3 map_block(512);
AT_DISPATCH_ALL_TYPES(
temp_coors.scalar_type(), "determin_duplicate", ([&] {
point_to_voxelidx_kernel<int>
<<<map_grid, map_block, 0, at::cuda::getCurrentCUDAStream()>>>(
temp_coors.contiguous().data_ptr<int>(),
point_to_voxelidx.contiguous().data_ptr<int>(),
point_to_pointidx.contiguous().data_ptr<int>(), max_points,
max_voxels, num_points, NDim);
}));
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
// 3. determine voxel num and voxel's coor index
// running this logic on the CUDA device accelerates it by roughly 10x
auto coor_to_voxelidx = -at::ones(
{
num_points,
},
points.options().dtype(at::kInt));
auto voxel_num = at::zeros(
{
1,
},
points.options().dtype(at::kInt)); // must be zero from the beginning
AT_DISPATCH_ALL_TYPES(
temp_coors.scalar_type(), "determin_duplicate", ([&] {
determin_voxel_num<int><<<1, 1, 0, at::cuda::getCurrentCUDAStream()>>>(
num_points_per_voxel.contiguous().data_ptr<int>(),
point_to_voxelidx.contiguous().data_ptr<int>(),
point_to_pointidx.contiguous().data_ptr<int>(),
coor_to_voxelidx.contiguous().data_ptr<int>(),
voxel_num.contiguous().data_ptr<int>(), max_points, max_voxels,
num_points);
}));
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
// 4. copy point features to voxels
// Step 4 & 5 could be parallel
auto pts_output_size = num_points * num_features;
dim3 cp_grid(std::min(at::cuda::ATenCeilDiv(pts_output_size, 512), 4096));
dim3 cp_block(512);
AT_DISPATCH_ALL_TYPES(
points.scalar_type(), "assign_point_to_voxel", ([&] {
assign_point_to_voxel<float, int>
<<<cp_grid, cp_block, 0, at::cuda::getCurrentCUDAStream()>>>(
pts_output_size, points.contiguous().data_ptr<float>(),
point_to_voxelidx.contiguous().data_ptr<int>(),
coor_to_voxelidx.contiguous().data_ptr<int>(),
voxels.contiguous().data_ptr<float>(), max_points, num_features,
num_points, NDim);
}));
// cudaDeviceSynchronize();
// AT_CUDA_CHECK(cudaGetLastError());
// 5. copy coors of each voxels
auto coors_output_size = num_points * NDim;
dim3 coors_cp_grid(
std::min(at::cuda::ATenCeilDiv(coors_output_size, 512), 4096));
dim3 coors_cp_block(512);
AT_DISPATCH_ALL_TYPES(
points.scalar_type(), "assign_point_to_voxel", ([&] {
assign_voxel_coors<float, int><<<coors_cp_grid, coors_cp_block, 0,
at::cuda::getCurrentCUDAStream()>>>(
coors_output_size, temp_coors.contiguous().data_ptr<int>(),
point_to_voxelidx.contiguous().data_ptr<int>(),
coor_to_voxelidx.contiguous().data_ptr<int>(),
coors.contiguous().data_ptr<int>(), num_points, NDim);
}));
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
auto voxel_num_cpu = voxel_num.to(at::kCPU);
int voxel_num_int = voxel_num_cpu.data_ptr<int>()[0];
return voxel_num_int;
}
int nondisterministic_hard_voxelize_gpu(
const at::Tensor &points, at::Tensor &voxels,
at::Tensor &coors, at::Tensor &num_points_per_voxel,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int max_points, const int max_voxels,
const int NDim = 3) {
CHECK_INPUT(points);
at::cuda::CUDAGuard device_guard(points.device());
const int num_points = points.size(0);
const int num_features = points.size(1);
if (num_points == 0)
return 0;
const float voxel_x = voxel_size[0];
const float voxel_y = voxel_size[1];
const float voxel_z = voxel_size[2];
const float coors_x_min = coors_range[0];
const float coors_y_min = coors_range[1];
const float coors_z_min = coors_range[2];
const float coors_x_max = coors_range[3];
const float coors_y_max = coors_range[4];
const float coors_z_max = coors_range[5];
const int grid_x = round((coors_x_max - coors_x_min) / voxel_x);
const int grid_y = round((coors_y_max - coors_y_min) / voxel_y);
const int grid_z = round((coors_z_max - coors_z_min) / voxel_z);
// map points to voxel coors
at::Tensor temp_coors =
at::zeros({num_points, NDim}, points.options().dtype(torch::kInt32));
dim3 grid(std::min(at::cuda::ATenCeilDiv(num_points, 512), 4096));
dim3 block(512);
// 1. link point to corresponding voxel coors
AT_DISPATCH_ALL_TYPES(
points.scalar_type(), "hard_voxelize_kernel", ([&] {
dynamic_voxelize_kernel<scalar_t, int>
<<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(
points.contiguous().data_ptr<scalar_t>(),
temp_coors.contiguous().data_ptr<int>(), voxel_x, voxel_y,
voxel_z, coors_x_min, coors_y_min, coors_z_min, coors_x_max,
coors_y_max, coors_z_max, grid_x, grid_y, grid_z, num_points,
num_features, NDim);
}));
at::Tensor coors_map;
at::Tensor coors_count;
at::Tensor coors_order;
at::Tensor reduce_count;
at::Tensor pts_id;
auto coors_clean = temp_coors.masked_fill(temp_coors.lt(0).any(-1, true), -1);
std::tie(temp_coors, coors_map, reduce_count) =
at::unique_dim(coors_clean, 0, true, true, false);
if (temp_coors.index({0, 0}).lt(0).item<bool>()) {
// the first element of temp_coors is (-1,-1,-1) and should be removed
temp_coors = temp_coors.slice(0, 1);
coors_map = coors_map - 1;
}
int num_coors = temp_coors.size(0);
temp_coors = temp_coors.to(torch::kInt32);
coors_map = coors_map.to(torch::kInt32);
coors_count = coors_map.new_zeros(1);
coors_order = coors_map.new_empty(num_coors);
reduce_count = coors_map.new_zeros(num_coors);
pts_id = coors_map.new_zeros(num_points);
dim3 cp_grid(std::min(at::cuda::ATenCeilDiv(num_points, 512), 4096));
dim3 cp_block(512);
AT_DISPATCH_ALL_TYPES(points.scalar_type(), "get_assign_pos", ([&] {
nondisterministic_get_assign_pos<<<cp_grid, cp_block, 0,
at::cuda::getCurrentCUDAStream()>>>(
num_points,
coors_map.contiguous().data_ptr<int32_t>(),
pts_id.contiguous().data_ptr<int32_t>(),
coors_count.contiguous().data_ptr<int32_t>(),
reduce_count.contiguous().data_ptr<int32_t>(),
coors_order.contiguous().data_ptr<int32_t>());
}));
AT_DISPATCH_ALL_TYPES(
points.scalar_type(), "assign_point_to_voxel", ([&] {
nondisterministic_assign_point_voxel<scalar_t>
<<<cp_grid, cp_block, 0, at::cuda::getCurrentCUDAStream()>>>(
num_points, points.contiguous().data_ptr<scalar_t>(),
coors_map.contiguous().data_ptr<int32_t>(),
pts_id.contiguous().data_ptr<int32_t>(),
temp_coors.contiguous().data_ptr<int32_t>(),
reduce_count.contiguous().data_ptr<int32_t>(),
coors_order.contiguous().data_ptr<int32_t>(),
voxels.contiguous().data_ptr<scalar_t>(),
coors.contiguous().data_ptr<int32_t>(),
num_points_per_voxel.contiguous().data_ptr<int32_t>(),
max_voxels, max_points,
num_features, NDim);
}));
AT_CUDA_CHECK(cudaGetLastError());
return max_voxels < num_coors ? max_voxels : num_coors;
}
void dynamic_voxelize_gpu(const at::Tensor& points, at::Tensor& coors,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int NDim = 3) {
// the current version takes about 0.04s per frame
// check device
CHECK_INPUT(points);
at::cuda::CUDAGuard device_guard(points.device());
const int num_points = points.size(0);
const int num_features = points.size(1);
const float voxel_x = voxel_size[0];
const float voxel_y = voxel_size[1];
const float voxel_z = voxel_size[2];
const float coors_x_min = coors_range[0];
const float coors_y_min = coors_range[1];
const float coors_z_min = coors_range[2];
const float coors_x_max = coors_range[3];
const float coors_y_max = coors_range[4];
const float coors_z_max = coors_range[5];
const int grid_x = round((coors_x_max - coors_x_min) / voxel_x);
const int grid_y = round((coors_y_max - coors_y_min) / voxel_y);
const int grid_z = round((coors_z_max - coors_z_min) / voxel_z);
const int col_blocks = at::cuda::ATenCeilDiv(num_points, threadsPerBlock);
dim3 blocks(col_blocks);
dim3 threads(threadsPerBlock);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_ALL_TYPES(points.scalar_type(), "dynamic_voxelize_kernel", [&] {
dynamic_voxelize_kernel<scalar_t, int><<<blocks, threads, 0, stream>>>(
points.contiguous().data_ptr<scalar_t>(),
coors.contiguous().data_ptr<int>(), voxel_x, voxel_y, voxel_z,
coors_x_min, coors_y_min, coors_z_min, coors_x_max, coors_y_max,
coors_z_max, grid_x, grid_y, grid_z, num_points, num_features, NDim);
});
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
return;
}
} // namespace voxelization
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import torch
from torch import nn
from torch.autograd import Function
from torch.nn.modules.utils import _pair
from .voxel_layer import dynamic_voxelize, hard_voxelize
class _Voxelization(Function):
@staticmethod
def forward(ctx,
points,
voxel_size,
coors_range,
max_points=35,
max_voxels=20000,
deterministic=True):
"""convert kitti points(N, >=3) to voxels.
Args:
points: [N, ndim] float tensor. points[:, :3] contain xyz points
and points[:, 3:] contain other information like reflectivity
voxel_size: [3] list/tuple or array, float. xyz, indicate voxel
size
coors_range: [6] list/tuple or array, float. indicate voxel
range. format: xyzxyz, minmax
max_points: int. indicate the maximum number of points a voxel can
hold. if max_points=-1, it means using dynamic_voxelize
max_voxels: int. indicate the maximum number of voxels this
function creates. for SECOND, 20000 is a good choice. Users
should shuffle points before calling this function because
max_voxels may drop points.
deterministic: bool. whether to use the deterministic version of
hard voxelization. the non-deterministic version is
considerably faster but is not deterministic. only
affects hard voxelization. default True. for more information
of this argument and the implementation insights, please refer
to the following links:
https://github.com/open-mmlab/mmdetection3d/issues/894
https://github.com/open-mmlab/mmdetection3d/pull/904
it is an experimental feature and we will appreciate it if
you could share with us the failing cases.
Returns:
voxels: [M, max_points, ndim] float tensor. only contain points
and returned when max_points != -1.
coordinates: [M, 3] int32 tensor, always returned.
num_points_per_voxel: [M] int32 tensor. Only returned when
max_points != -1.
"""
if max_points == -1 or max_voxels == -1:
coors = points.new_zeros(size=(points.size(0), 3), dtype=torch.int)
dynamic_voxelize(points, coors, voxel_size, coors_range, 3)
return coors
else:
voxels = points.new_zeros(
size=(max_voxels, max_points, points.size(1)))
coors = points.new_zeros(size=(max_voxels, 3), dtype=torch.int)
num_points_per_voxel = points.new_zeros(
size=(max_voxels, ), dtype=torch.int)
voxel_num = hard_voxelize(points, voxels, coors,
num_points_per_voxel, voxel_size,
coors_range, max_points, max_voxels, 3,
deterministic)
# select the valid voxels
voxels_out = voxels[:voxel_num]
coors_out = coors[:voxel_num]
num_points_per_voxel_out = num_points_per_voxel[:voxel_num]
return voxels_out, coors_out, num_points_per_voxel_out
voxelization = _Voxelization.apply
class Voxelization(nn.Module):
def __init__(self,
voxel_size,
point_cloud_range,
max_num_points,
max_voxels=20000,
deterministic=True):
super(Voxelization, self).__init__()
"""
Args:
voxel_size (list): list [x, y, z] size of three dimension
point_cloud_range (list):
[x_min, y_min, z_min, x_max, y_max, z_max]
max_num_points (int): max number of points per voxel
max_voxels (tuple or int): max number of voxels in
(training, testing) time
deterministic: bool. whether to use the deterministic version of
hard voxelization. the non-deterministic version is
considerably faster but is not deterministic. only
affects hard voxelization. default True. for more information
of this argument and the implementation insights, please refer
to the following links:
https://github.com/open-mmlab/mmdetection3d/issues/894
https://github.com/open-mmlab/mmdetection3d/pull/904
it is an experimental feature and we will appreciate it if
you could share with us the failing cases.
"""
self.voxel_size = voxel_size
self.point_cloud_range = point_cloud_range
self.max_num_points = max_num_points
if isinstance(max_voxels, tuple):
self.max_voxels = max_voxels
else:
self.max_voxels = _pair(max_voxels)
self.deterministic = deterministic
point_cloud_range = torch.tensor(
point_cloud_range, dtype=torch.float32)
# [0, -40, -3, 70.4, 40, 1]
voxel_size = torch.tensor(voxel_size, dtype=torch.float32)
grid_size = (point_cloud_range[3:] -
point_cloud_range[:3]) / voxel_size
grid_size = torch.round(grid_size).long()
input_feat_shape = grid_size[:2]
self.grid_size = grid_size
# the original shape is [x-len, y-len, z-len]
# [w, h, d] -> [d, h, w]
self.pcd_shape = [*input_feat_shape, 1][::-1]
def forward(self, input):
"""
Args:
input: (N, C) float tensor of points
"""
if self.training:
max_voxels = self.max_voxels[0]
else:
max_voxels = self.max_voxels[1]
return voxelization(input, self.voxel_size, self.point_cloud_range,
self.max_num_points, max_voxels,
self.deterministic)
def __repr__(self):
tmpstr = self.__class__.__name__ + '('
tmpstr += 'voxel_size=' + str(self.voxel_size)
tmpstr += ', point_cloud_range=' + str(self.point_cloud_range)
tmpstr += ', max_num_points=' + str(self.max_num_points)
tmpstr += ', max_voxels=' + str(self.max_voxels)
tmpstr += ', deterministic=' + str(self.deterministic)
tmpstr += ')'
return tmpstr
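# ---------------------------------------------------------------------------
# Usage sketch (illustration only, not part of the module above). The voxel
# size, point-cloud range and the random point tensor are assumed values.
# ---------------------------------------------------------------------------
if torch.cuda.is_available():
    points = torch.rand(1000, 4).cuda()  # (N, C) points: xyz + reflectivity
    voxel_generator = Voxelization(
        voxel_size=[0.05, 0.05, 0.1],
        point_cloud_range=[0, -40, -3, 70.4, 40, 1],  # xyzxyz, min-max
        max_num_points=35,
        max_voxels=(16000, 40000))
    voxels, coors, num_points_per_voxel = voxel_generator(points)
    # voxels: (M, 35, C) padded point features per voxel
    # coors:  (M, 3) integer voxel coordinates in (z, y, x) order
    # num_points_per_voxel: (M,) number of valid points in each voxel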
...@@ -224,97 +224,5 @@ if __name__ == '__main__':
'build': parse_requirements('requirements/build.txt'),
'optional': parse_requirements('requirements/optional.txt'),
},
ext_modules=[
make_cuda_ext(
name='sparse_conv_ext',
module='mmdet3d.ops.spconv',
extra_include_path=[
# PyTorch 1.5 uses ninjia, which requires absolute path
# of included files, relative path will cause failure.
os.path.abspath(
os.path.join(*'mmdet3d.ops.spconv'.split('.'),
'include/'))
],
sources=[
'src/all.cc',
'src/reordering.cc',
'src/reordering_cuda.cu',
'src/indice.cc',
'src/indice_cuda.cu',
'src/maxpool.cc',
'src/maxpool_cuda.cu',
],
extra_args=['-w', '-std=c++14']),
make_cuda_ext(
name='iou3d_cuda',
module='mmdet3d.ops.iou3d',
sources=[
'src/iou3d.cpp',
'src/iou3d_kernel.cu',
]),
make_cuda_ext(
name='voxel_layer',
module='mmdet3d.ops.voxel',
sources=[
'src/voxelization.cpp',
'src/scatter_points_cpu.cpp',
'src/scatter_points_cuda.cu',
'src/voxelization_cpu.cpp',
'src/voxelization_cuda.cu',
]),
make_cuda_ext(
name='roiaware_pool3d_ext',
module='mmdet3d.ops.roiaware_pool3d',
sources=[
'src/roiaware_pool3d.cpp',
'src/points_in_boxes_cpu.cpp',
],
sources_cuda=[
'src/roiaware_pool3d_kernel.cu',
'src/points_in_boxes_cuda.cu',
]),
make_cuda_ext(
name='roipoint_pool3d_ext',
module='mmdet3d.ops.roipoint_pool3d',
sources=['src/roipoint_pool3d.cpp'],
sources_cuda=['src/roipoint_pool3d_kernel.cu']),
make_cuda_ext(
name='ball_query_ext',
module='mmdet3d.ops.ball_query',
sources=['src/ball_query.cpp'],
sources_cuda=['src/ball_query_cuda.cu']),
make_cuda_ext(
name='knn_ext',
module='mmdet3d.ops.knn',
sources=['src/knn.cpp'],
sources_cuda=['src/knn_cuda.cu']),
make_cuda_ext(
name='assign_score_withk_ext',
module='mmdet3d.ops.paconv',
sources=['src/assign_score_withk.cpp'],
sources_cuda=['src/assign_score_withk_cuda.cu']),
make_cuda_ext(
name='group_points_ext',
module='mmdet3d.ops.group_points',
sources=['src/group_points.cpp'],
sources_cuda=['src/group_points_cuda.cu']),
make_cuda_ext(
name='interpolate_ext',
module='mmdet3d.ops.interpolate',
sources=['src/interpolate.cpp'],
sources_cuda=[
'src/three_interpolate_cuda.cu', 'src/three_nn_cuda.cu'
]),
make_cuda_ext(
name='furthest_point_sample_ext',
module='mmdet3d.ops.furthest_point_sample',
sources=['src/furthest_point_sample.cpp'],
sources_cuda=['src/furthest_point_sample_cuda.cu']),
make_cuda_ext(
name='gather_points_ext',
module='mmdet3d.ops.gather_points',
sources=['src/gather_points.cpp'],
sources_cuda=['src/gather_points_cuda.cu'])
],
cmdclass={'build_ext': BuildExtension},
zip_safe=False)
...@@ -2,190 +2,7 @@
import pytest
import torch
from mmdet3d.ops import PAConv, PAConvCUDA, assign_score_withk
from mmdet3d.ops import PAConv, PAConvCUDA
def test_paconv_assign_scores():
if not torch.cuda.is_available():
pytest.skip()
scores = torch.tensor([[[[0.06947571, 0.6065746], [0.28462553, 0.8378516],
[0.7595994, 0.97220325], [0.519155, 0.766185]],
[[0.15348864, 0.6051019], [0.21510637, 0.31916398],
[0.00236845, 0.5842595], [0.6783676, 0.5216348]]],
[[[0.23089725, 0.5568468], [0.7405102, 0.06438422],
[0.6887394, 0.22089851], [0.0502342, 0.79228795]],
[[0.44883424, 0.15427643],
[0.13817799, 0.34856772], [0.7989621, 0.33788306],
[0.15699774, 0.7693662]]]]).float().cuda()
scores.requires_grad_()
points = torch.tensor([[[[0.06001121, 0.92963666, 0.5753327, 0.7251477],
[0.53563064, 0.23129565, 0.92366195, 0.44261628]],
[[0.5770022, 0.56625944, 0.23560429, 0.11178821],
[0.7735967, 0.95678777, 0.25468266, 0.02895975]],
[[0.0589869, 0.09017515, 0.5977862, 0.02797985],
[0.603862, 0.35991007, 0.85761684, 0.3096559]],
[[0.22359002, 0.13983732, 0.5544243, 0.68863827],
[0.85646236, 0.75651926, 0.8638947, 0.83600986]],
[[0.45424145, 0.27458847, 0.6456112, 0.47162914],
[0.15773582, 0.47645122, 0.79964715, 0.3323908]],
[[0.8351399, 0.84696376, 0.9431732, 0.29418713],
[0.77168906, 0.6996871, 0.19354361, 0.03392768]],
[[0.30976456, 0.7074133, 0.581795, 0.976677],
[0.69656056, 0.07199162, 0.4708506, 0.29117996]],
[[0.5829035, 0.30201727, 0.76556486, 0.0935446],
[0.88030535, 0.16129416, 0.9242525, 0.49545723]]],
[[[0.50899494, 0.06482804, 0.44939405, 0.37704808],
[0.47028124, 0.11969638, 0.62823206, 0.28560323]],
[[0.40690207, 0.689753, 0.51636654, 0.23040164],
[0.06935787, 0.00488842, 0.22462702, 0.09182382]],
[[0.26611632, 0.00184339, 0.7730655, 0.5228131],
[0.87776035, 0.77895886, 0.2787183, 0.16620636]],
[[0.502574, 0.04039001, 0.5368497, 0.98379374],
[0.40973026, 0.3238272, 0.9733018, 0.13988364]],
[[0.04586202, 0.20983845, 0.20662665, 0.22270602],
[0.60387236, 0.5155574, 0.51237285, 0.6528438]],
[[0.45735973, 0.86821306, 0.61054605, 0.8370336],
[0.45193362, 0.3734138, 0.7825672, 0.5699416]],
[[0.44591594, 0.12447512, 0.09282011, 0.7055254],
[0.25223452, 0.46696228, 0.7051136, 0.892151]],
[[0.49615085, 0.47321403, 0.93138885, 0.7652197],
[0.38766378, 0.30332977, 0.23131835,
0.02863514]]]]).float().cuda()
points.requires_grad_()
centers = torch.tensor([[[[0.83878064, 0.96658987, 0.8033424, 0.9598312],
[0.45035273, 0.8768925, 0.977736, 0.54547966]],
[[0.01041394, 0.597893, 0.36212963, 0.4410367],
[0.94879234, 0.8372817, 0.21237361, 0.67945415]],
[[0.5096087, 0.26401454, 0.60034937, 0.5417416],
[0.87591463, 0.546456, 0.4096033, 0.16373193]],
[[0.79547447, 0.1482386, 0.12840575, 0.45384115],
[0.5640288, 0.944541, 0.5745328, 0.73229736]],
[[0.93011934, 0.7406011, 0.62621707, 0.8677915],
[0.91563636, 0.3595413, 0.6678378, 0.6085383]],
[[0.22431666, 0.65617776, 0.7483924, 0.6263364],
[0.30968404, 0.78204364, 0.14899081,
0.09628749]],
[[0.73675203, 0.72104895, 0.4648038, 0.6101647],
[0.7817645, 0.16572917, 0.3311919, 0.43407398]],
[[0.8193154, 0.09559608, 0.05978829, 0.90262103],
[0.4256065, 0.8165596, 0.8206446, 0.6604721]]],
[[[0.7159653, 0.18600845, 0.21433902, 0.3159626],
[0.3921569, 0.33221376, 0.5061177, 0.7961841]],
[[0.95338356, 0.04785997, 0.67185795, 0.6538394],
[0.4729132, 0.33404195, 0.17750603, 0.8445621]],
[[0.6755793, 0.16193843, 0.75943846, 0.92123103],
[0.2781859, 0.03114432, 0.710638, 0.52729136]],
[[0.8376105, 0.10858494, 0.13208169, 0.365772],
[0.5930795, 0.27390373, 0.14036089, 0.170403]],
[[0.3479789, 0.89855295, 0.04844379, 0.9871029],
[0.29781651, 0.0244137, 0.9179047, 0.8081611]],
[[0.12460887, 0.44991326, 0.19382608, 0.35037738],
[0.2773472, 0.4362057, 0.36757517, 0.5993509]],
[[0.29630446, 0.90046406, 0.5417113, 0.13510644],
[0.09623539, 0.04226565, 0.32001644,
0.44358212]],
[[0.5274848, 0.82096446, 0.9415489, 0.7123748],
[0.7537517, 0.8086482, 0.85345286,
0.7472754]]]]).float().cuda()
centers.requires_grad_()
knn_idx = torch.tensor([[[6, 7, 4, 6], [2, 4, 2, 4]],
[[7, 1, 3, 2], [6, 0, 2, 6]]]).long().cuda()
aggregate = 'sum'
expected_output = torch.tensor(
[[[[-0.08134781, 0.03877336, -0.8212776, -0.2869547],
[-0.23378491, -0.24112664, -0.1600166, -0.4121864]],
[[-0.05780616, -0.12298299, -0.0370461, -0.07889931],
[-0.13956165, -0.02006848, -0.10940295, -0.0293439]],
[[0.09284145, 0.58250105, 0.5927749, 0.16774094],
[0.27070042, 0.13422406, 0.2617501, 0.23416464]],
[[-0.06121218, -0.09561322, -0.20408826, 0.08079343],
[0.00944228, 0.03874819, 0.08404065, 0.04041629]]],
[[[-0.2110898, -0.13335688, -0.09315082, 0.08512095],
[0.09121774, 0.15976946, 0.23994486, 0.14350912]],
[[-0.36167958, -0.14891288, -0.64470863, -0.0646704],
[-0.28276974, -0.08847666, -0.46904767, 0.20491874]],
[[-0.34877953, -0.35533834, -0.25225785, -0.4638189],
[-0.1420663, 0.09467781, 0.17088932, 0.22580585]],
[[-0.3879708, -0.3991068, 0.05276498, -0.46989647],
[0.32522714, -0.02163534, 0.21604237, 0.4346682]]]]).float()
# test forward
output = assign_score_withk(scores, points, centers, knn_idx, aggregate)
assert torch.allclose(output.detach().cpu(), expected_output, atol=1e-6)
# test backward
loss = output.sum()
loss.backward()
expected_scores_grad = torch.tensor([[[[0.04288036, -0.18217683],
[-0.78873926, 0.7485497],
[-0.6866992, 0.05346543],
[0.04288036, -0.18217683]],
[[-1.1407862, 0.13533896],
[-0.06964391, -0.22948086],
[-1.1407862, 0.13533896],
[-0.06964391, -0.22948086]]],
[[[-0.3363995, -2.212181],
[-1.1589496, -2.7724311],
[-0.9387654, -1.3163853],
[-1.4385346, -1.0614843]],
[[-0.5048497, 1.4143617],
[-0.47332114, 0.6017133],
[-0.30974793, 1.1995442],
[-0.5048497, 1.4143617]]]]).float()
expected_points_grad = torch.tensor(
[[[[0., 0., 0., 0.], [0., 0., 0., 0.]],
[[0., 0., 0., 0.], [0., 0., 0., 0.]],
[[0.15585709, 0.15585709, 0.15585709, 0.15585709],
[1.1893613, 1.1893613, 1.1893613, 1.1893613]],
[[0., 0., 0., 0.], [0., 0., 0., 0.]],
[[1.6530733, 1.6530733, 1.6530733, 1.6530733],
[1.8130021, 1.8130021, 1.8130021, 1.8130021]],
[[0., 0., 0., 0.], [0., 0., 0., 0.]],
[[0.58863074, 0.58863074, 0.58863074, 0.58863074],
[1.3727596, 1.3727596, 1.3727596, 1.3727596]],
[[0.28462553, 0.28462553, 0.28462553, 0.28462553],
[0.8378516, 0.8378516, 0.8378516, 0.8378516]]],
[[[0.13817799, 0.13817799, 0.13817799, 0.13817799],
[0.34856772, 0.34856772, 0.34856772, 0.34856772]],
[[0.7405102, 0.7405102, 0.7405102, 0.7405102],
[0.06438422, 0.06438422, 0.06438422, 0.06438422]],
[[0.8491963, 0.8491963, 0.8491963, 0.8491963],
[1.1301711, 1.1301711, 1.1301711, 1.1301711]],
[[0.6887394, 0.6887394, 0.6887394, 0.6887394],
[0.22089851, 0.22089851, 0.22089851, 0.22089851]],
[[0., 0., 0., 0.], [0., 0., 0., 0.]],
[[0., 0., 0., 0.], [0., 0., 0., 0.]],
[[0.605832, 0.605832, 0.605832, 0.605832],
[0.92364264, 0.92364264, 0.92364264, 0.92364264]],
[[0.23089725, 0.23089725, 0.23089725, 0.23089725],
[0.5568468, 0.5568468, 0.5568468, 0.5568468]]]]).float()
expected_centers_grad = torch.tensor(
[[[[0., 0., 0., 0.], [0., 0., 0., 0.]],
[[0., 0., 0., 0.], [0., 0., 0., 0.]],
[[-1.0493311, -1.0493311, -1.0493311, -1.0493311],
[-2.0301602, -2.0301602, -2.0301602, -2.0301602]],
[[0., 0., 0., 0.], [0., 0., 0., 0.]],
[[0., 0., 0., 0.], [0., 0., 0., 0.]],
[[0., 0., 0., 0.], [0., 0., 0., 0.]],
[[-1.6328557, -1.6328557, -1.6328557, -1.6328557],
[-3.1828144, -3.1828144, -3.1828144, -3.1828144]],
[[0., 0., 0., 0.], [0., 0., 0., 0.]]],
[[[0., 0., 0., 0.], [0., 0., 0., 0.]],
[[0., 0., 0., 0.], [0., 0., 0., 0.]],
[[0., 0., 0., 0.], [0., 0., 0., 0.]],
[[0., 0., 0., 0.], [0., 0., 0., 0.]],
[[0., 0., 0., 0.], [0., 0., 0., 0.]],
[[0., 0., 0., 0.], [0., 0., 0., 0.]],
[[-1.5429721, -1.5429721, -1.5429721, -1.5429721],
[-1.6100934, -1.6100934, -1.6100934, -1.6100934]],
[[-1.7103812, -1.7103812, -1.7103812, -1.7103812],
[-1.6344175, -1.6344175, -1.6344175, -1.6344175]]]]).float()
assert torch.allclose(
scores.grad.detach().cpu(), expected_scores_grad, atol=1e-6)
assert torch.allclose(
points.grad.detach().cpu(), expected_points_grad, atol=1e-6)
assert torch.allclose(
centers.grad.detach().cpu(), expected_centers_grad, atol=1e-6)
def test_paconv():
...
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch
from mmdet3d.ops import (ball_query, furthest_point_sample,
furthest_point_sample_with_dist, gather_points,
grouping_operation, knn, three_interpolate, three_nn)
def test_fps():
if not torch.cuda.is_available():
pytest.skip()
xyz = torch.tensor([[[-0.2748, 1.0020, -1.1674], [0.1015, 1.3952, -1.2681],
[-0.8070, 2.4137,
-0.5845], [-1.0001, 2.1982, -0.5859],
[0.3841, 1.8983, -0.7431]],
[[-1.0696, 3.0758,
-0.1899], [-0.2559, 3.5521, -0.1402],
[0.8164, 4.0081, -0.1839], [-1.1000, 3.0213, -0.8205],
[-0.0518, 3.7251, -0.3950]]]).cuda()
idx = furthest_point_sample(xyz, 3)
expected_idx = torch.tensor([[0, 2, 4], [0, 2, 1]]).cuda()
assert torch.all(idx == expected_idx)
def test_ball_query():
if not torch.cuda.is_available():
pytest.skip()
new_xyz = torch.tensor([[[-0.0740, 1.3147, -1.3625],
[-2.2769, 2.7817, -0.2334],
[-0.4003, 2.4666, -0.5116],
[-0.0740, 1.3147, -1.3625],
[-0.0740, 1.3147, -1.3625]],
[[-2.0289, 2.4952, -0.1708],
[-2.0668, 6.0278, -0.4875],
[0.4066, 1.4211, -0.2947],
[-2.0289, 2.4952, -0.1708],
[-2.0289, 2.4952, -0.1708]]]).cuda()
xyz = torch.tensor([[[-0.0740, 1.3147, -1.3625], [0.5555, 1.0399, -1.3634],
[-0.4003, 2.4666,
-0.5116], [-0.5251, 2.4379, -0.8466],
[-0.9691, 1.1418,
-1.3733], [-0.2232, 0.9561, -1.3626],
[-2.2769, 2.7817, -0.2334],
[-0.2822, 1.3192, -1.3645], [0.1533, 1.5024, -1.0432],
[0.4917, 1.1529, -1.3496]],
[[-2.0289, 2.4952,
-0.1708], [-0.7188, 0.9956, -0.5096],
[-2.0668, 6.0278, -0.4875], [-1.9304, 3.3092, 0.6610],
[0.0949, 1.4332, 0.3140], [-1.2879, 2.0008, -0.7791],
[-0.7252, 0.9611, -0.6371], [0.4066, 1.4211, -0.2947],
[0.3220, 1.4447, 0.3548], [-0.9744, 2.3856,
-1.2000]]]).cuda()
idx = ball_query(0, 0.2, 5, xyz, new_xyz)
expected_idx = torch.tensor([[[0, 0, 0, 0, 0], [6, 6, 6, 6, 6],
[2, 2, 2, 2, 2], [0, 0, 0, 0, 0],
[0, 0, 0, 0, 0]],
[[0, 0, 0, 0, 0], [2, 2, 2, 2, 2],
[7, 7, 7, 7, 7], [0, 0, 0, 0, 0],
[0, 0, 0, 0, 0]]]).cuda()
assert torch.all(idx == expected_idx)
# test dilated ball query
idx = ball_query(0.2, 0.4, 5, xyz, new_xyz)
expected_idx = torch.tensor([[[0, 5, 7, 0, 0], [6, 6, 6, 6, 6],
[2, 3, 2, 2, 2], [0, 5, 7, 0, 0],
[0, 5, 7, 0, 0]],
[[0, 0, 0, 0, 0], [2, 2, 2, 2, 2],
[7, 7, 7, 7, 7], [0, 0, 0, 0, 0],
[0, 0, 0, 0, 0]]]).cuda()
assert torch.all(idx == expected_idx)
def test_knn():
if not torch.cuda.is_available():
pytest.skip()
new_xyz = torch.tensor([[[-0.0740, 1.3147, -1.3625],
[-2.2769, 2.7817, -0.2334],
[-0.4003, 2.4666, -0.5116],
[-0.0740, 1.3147, -1.3625],
[-0.0740, 1.3147, -1.3625]],
[[-2.0289, 2.4952, -0.1708],
[-2.0668, 6.0278, -0.4875],
[0.4066, 1.4211, -0.2947],
[-2.0289, 2.4952, -0.1708],
[-2.0289, 2.4952, -0.1708]]]).cuda()
xyz = torch.tensor([[[-0.0740, 1.3147, -1.3625], [0.5555, 1.0399, -1.3634],
[-0.4003, 2.4666,
-0.5116], [-0.5251, 2.4379, -0.8466],
[-0.9691, 1.1418,
-1.3733], [-0.2232, 0.9561, -1.3626],
[-2.2769, 2.7817, -0.2334],
[-0.2822, 1.3192, -1.3645], [0.1533, 1.5024, -1.0432],
[0.4917, 1.1529, -1.3496]],
[[-2.0289, 2.4952,
-0.1708], [-0.7188, 0.9956, -0.5096],
[-2.0668, 6.0278, -0.4875], [-1.9304, 3.3092, 0.6610],
[0.0949, 1.4332, 0.3140], [-1.2879, 2.0008, -0.7791],
[-0.7252, 0.9611, -0.6371], [0.4066, 1.4211, -0.2947],
[0.3220, 1.4447, 0.3548], [-0.9744, 2.3856,
-1.2000]]]).cuda()
idx = knn(5, xyz, new_xyz)
new_xyz_ = new_xyz.unsqueeze(2).repeat(1, 1, xyz.shape[1], 1)
xyz_ = xyz.unsqueeze(1).repeat(1, new_xyz.shape[1], 1, 1)
dist = ((new_xyz_ - xyz_) * (new_xyz_ - xyz_)).sum(-1)
expected_idx = dist.topk(k=5, dim=2, largest=False)[1].transpose(2, 1)
assert torch.all(idx == expected_idx)
idx = knn(5,
xyz.transpose(1, 2).contiguous(),
new_xyz.transpose(1, 2).contiguous(), True)
assert torch.all(idx == expected_idx)
idx = knn(5, xyz, xyz)
xyz_ = xyz.unsqueeze(2).repeat(1, 1, xyz.shape[1], 1)
xyz__ = xyz.unsqueeze(1).repeat(1, xyz.shape[1], 1, 1)
dist = ((xyz_ - xyz__) * (xyz_ - xyz__)).sum(-1)
expected_idx = dist.topk(k=5, dim=2, largest=False)[1].transpose(2, 1)
assert torch.all(idx == expected_idx)
def test_grouping_points():
if not torch.cuda.is_available():
pytest.skip()
idx = torch.tensor([[[0, 0, 0], [3, 3, 3], [8, 8, 8], [0, 0, 0], [0, 0, 0],
[0, 0, 0]],
[[0, 0, 0], [6, 6, 6], [9, 9, 9], [0, 0, 0], [0, 0, 0],
[0, 0, 0]]]).int().cuda()
features = torch.tensor([[[
0.5798, -0.7981, -0.9280, -1.3311, 1.3687, 0.9277, -0.4164, -1.8274,
0.9268, 0.8414
],
[
5.4247, 1.5113, 2.3944, 1.4740, 5.0300,
5.1030, 1.9360, 2.1939, 2.1581, 3.4666
],
[
-1.6266, -1.0281, -1.0393, -1.6931, -1.3982,
-0.5732, -1.0830, -1.7561, -1.6786, -1.6967
]],
[[
-0.0380, -0.1880, -1.5724, 0.6905, -0.3190,
0.7798, -0.3693, -0.9457, -0.2942, -1.8527
],
[
1.1773, 1.5009, 2.6399, 5.9242, 1.0962,
2.7346, 6.0865, 1.5555, 4.3303, 2.8229
],
[
-0.6646, -0.6870, -0.1125, -0.2224, -0.3445,
-1.4049, 0.4990, -0.7037, -0.9924, 0.0386
]]]).cuda()
output = grouping_operation(features, idx)
expected_output = torch.tensor([[[[0.5798, 0.5798, 0.5798],
[-1.3311, -1.3311, -1.3311],
[0.9268, 0.9268, 0.9268],
[0.5798, 0.5798, 0.5798],
[0.5798, 0.5798, 0.5798],
[0.5798, 0.5798, 0.5798]],
[[5.4247, 5.4247, 5.4247],
[1.4740, 1.4740, 1.4740],
[2.1581, 2.1581, 2.1581],
[5.4247, 5.4247, 5.4247],
[5.4247, 5.4247, 5.4247],
[5.4247, 5.4247, 5.4247]],
[[-1.6266, -1.6266, -1.6266],
[-1.6931, -1.6931, -1.6931],
[-1.6786, -1.6786, -1.6786],
[-1.6266, -1.6266, -1.6266],
[-1.6266, -1.6266, -1.6266],
[-1.6266, -1.6266, -1.6266]]],
[[[-0.0380, -0.0380, -0.0380],
[-0.3693, -0.3693, -0.3693],
[-1.8527, -1.8527, -1.8527],
[-0.0380, -0.0380, -0.0380],
[-0.0380, -0.0380, -0.0380],
[-0.0380, -0.0380, -0.0380]],
[[1.1773, 1.1773, 1.1773],
[6.0865, 6.0865, 6.0865],
[2.8229, 2.8229, 2.8229],
[1.1773, 1.1773, 1.1773],
[1.1773, 1.1773, 1.1773],
[1.1773, 1.1773, 1.1773]],
[[-0.6646, -0.6646, -0.6646],
[0.4990, 0.4990, 0.4990],
[0.0386, 0.0386, 0.0386],
[-0.6646, -0.6646, -0.6646],
[-0.6646, -0.6646, -0.6646],
[-0.6646, -0.6646, -0.6646]]]]).cuda()
assert torch.allclose(output, expected_output)
def test_gather_points():
if not torch.cuda.is_available():
pytest.skip()
features = torch.tensor([[[
-1.6095, -0.1029, -0.8876, -1.2447, -2.4031, 0.3708, -1.1586, -1.4967,
-0.4800, 0.2252
],
[
1.9138, 3.4979, 1.6854, 1.5631, 3.6776,
3.1154, 2.1705, 2.5221, 2.0411, 3.1446
],
[
-1.4173, 0.3073, -1.4339, -1.4340, -1.2770,
-0.2867, -1.4162, -1.4044, -1.4245, -1.4074
]],
[[
0.2160, 0.0842, 0.3661, -0.2749, -0.4909,
-0.6066, -0.8773, -0.0745, -0.9496, 0.1434
],
[
1.3644, 1.8087, 1.6855, 1.9563, 1.2746,
1.9662, 0.9566, 1.8778, 1.1437, 1.3639
],
[
-0.7172, 0.1692, 0.2241, 0.0721, -0.7540,
0.0462, -0.6227, 0.3223, -0.6944, -0.5294
]]]).cuda()
idx = torch.tensor([[0, 1, 4, 0, 0, 0], [0, 5, 6, 0, 0, 0]]).int().cuda()
output = gather_points(features, idx)
expected_output = torch.tensor(
[[[-1.6095, -0.1029, -2.4031, -1.6095, -1.6095, -1.6095],
[1.9138, 3.4979, 3.6776, 1.9138, 1.9138, 1.9138],
[-1.4173, 0.3073, -1.2770, -1.4173, -1.4173, -1.4173]],
[[0.2160, -0.6066, -0.8773, 0.2160, 0.2160, 0.2160],
[1.3644, 1.9662, 0.9566, 1.3644, 1.3644, 1.3644],
[-0.7172, 0.0462, -0.6227, -0.7172, -0.7172, -0.7172]]]).cuda()
assert torch.allclose(output, expected_output)
output_half = gather_points(features.half(), idx)
assert torch.allclose(output_half, expected_output.half())
def test_three_interpolate():
if not torch.cuda.is_available():
pytest.skip()
features = torch.tensor([[[2.4350, 4.7516, 4.4995, 2.4350, 2.4350, 2.4350],
[3.1236, 2.6278, 3.0447, 3.1236, 3.1236, 3.1236],
[2.6732, 2.8677, 2.6436, 2.6732, 2.6732, 2.6732],
[0.0124, 7.0150, 7.0199, 0.0124, 0.0124, 0.0124],
[0.3207, 0.0000, 0.3411, 0.3207, 0.3207,
0.3207]],
[[0.0000, 0.9544, 2.4532, 0.0000, 0.0000, 0.0000],
[0.5346, 1.9176, 1.4715, 0.5346, 0.5346, 0.5346],
[0.0000, 0.2744, 2.0842, 0.0000, 0.0000, 0.0000],
[0.3414, 1.5063, 1.6209, 0.3414, 0.3414, 0.3414],
[0.5814, 0.0103, 0.0000, 0.5814, 0.5814,
0.5814]]]).cuda()
idx = torch.tensor([[[0, 1, 2], [2, 3, 4], [2, 3, 4], [0, 1, 2], [0, 1, 2],
[0, 1, 3]],
[[0, 2, 3], [1, 3, 4], [2, 1, 4], [0, 2, 4], [0, 2, 4],
[0, 1, 2]]]).int().cuda()
weight = torch.tensor([[[3.3333e-01, 3.3333e-01, 3.3333e-01],
[1.0000e+00, 5.8155e-08, 2.2373e-08],
[1.0000e+00, 1.7737e-08, 1.7356e-08],
[3.3333e-01, 3.3333e-01, 3.3333e-01],
[3.3333e-01, 3.3333e-01, 3.3333e-01],
[3.3333e-01, 3.3333e-01, 3.3333e-01]],
[[3.3333e-01, 3.3333e-01, 3.3333e-01],
[1.0000e+00, 1.3651e-08, 7.7312e-09],
[1.0000e+00, 1.7148e-08, 1.4070e-08],
[3.3333e-01, 3.3333e-01, 3.3333e-01],
[3.3333e-01, 3.3333e-01, 3.3333e-01],
[3.3333e-01, 3.3333e-01, 3.3333e-01]]]).cuda()
output = three_interpolate(features, idx, weight)
expected_output = torch.tensor([[[
3.8953e+00, 4.4995e+00, 4.4995e+00, 3.8953e+00, 3.8953e+00, 3.2072e+00
], [
2.9320e+00, 3.0447e+00, 3.0447e+00, 2.9320e+00, 2.9320e+00, 2.9583e+00
], [
2.7281e+00, 2.6436e+00, 2.6436e+00, 2.7281e+00, 2.7281e+00, 2.7380e+00
], [
4.6824e+00, 7.0199e+00, 7.0199e+00, 4.6824e+00, 4.6824e+00, 2.3466e+00
], [
2.2060e-01, 3.4110e-01, 3.4110e-01, 2.2060e-01, 2.2060e-01, 2.1380e-01
]],
[[
8.1773e-01, 9.5440e-01, 2.4532e+00,
8.1773e-01, 8.1773e-01, 1.1359e+00
],
[
8.4689e-01, 1.9176e+00, 1.4715e+00,
8.4689e-01, 8.4689e-01, 1.3079e+00
],
[
6.9473e-01, 2.7440e-01, 2.0842e+00,
6.9473e-01, 6.9473e-01, 7.8619e-01
],
[
7.6789e-01, 1.5063e+00, 1.6209e+00,
7.6789e-01, 7.6789e-01, 1.1562e+00
],
[
3.8760e-01, 1.0300e-02, 8.3569e-09,
3.8760e-01, 3.8760e-01, 1.9723e-01
]]]).cuda()
assert torch.allclose(output, expected_output, 1e-4)
def test_three_nn():
if not torch.cuda.is_available():
pytest.skip()
known = torch.tensor([[[-1.8373, 3.5605,
-0.7867], [0.7615, 2.9420, 0.2314],
[-0.6503, 3.6637, -1.0622],
[-1.8373, 3.5605, -0.7867],
[-1.8373, 3.5605, -0.7867]],
[[-1.3399, 1.9991, -0.3698],
[-0.0799, 0.9698,
-0.8457], [0.0858, 2.4721, -0.1928],
[-1.3399, 1.9991, -0.3698],
[-1.3399, 1.9991, -0.3698]]]).cuda()
unknown = torch.tensor([[[-1.8373, 3.5605, -0.7867],
[0.7615, 2.9420, 0.2314],
[-0.6503, 3.6637, -1.0622],
[-1.5237, 2.3976, -0.8097],
[-0.0722, 3.4017, -0.2880],
[0.5198, 3.0661, -0.4605],
[-2.0185, 3.5019, -0.3236],
[0.5098, 3.1020, 0.5799],
[-1.6137, 3.8443, -0.5269],
[0.7341, 2.9626, -0.3189]],
[[-1.3399, 1.9991, -0.3698],
[-0.0799, 0.9698, -0.8457],
[0.0858, 2.4721, -0.1928],
[-0.9022, 1.6560, -1.3090],
[0.1156, 1.6901, -0.4366],
[-0.6477, 2.3576, -0.1563],
[-0.8482, 1.1466, -1.2704],
[-0.8753, 2.0845, -0.3460],
[-0.5621, 1.4233, -1.2858],
[-0.5883, 1.3114, -1.2899]]]).cuda()
dist, idx = three_nn(unknown, known)
expected_dist = torch.tensor([[[0.0000, 0.0000, 0.0000],
[0.0000, 2.0463, 2.8588],
[0.0000, 1.2229, 1.2229],
[1.2047, 1.2047, 1.2047],
[1.0011, 1.0845, 1.8411],
[0.7433, 1.4451, 2.4304],
[0.5007, 0.5007, 0.5007],
[0.4587, 2.0875, 2.7544],
[0.4450, 0.4450, 0.4450],
[0.5514, 1.7206, 2.6811]],
[[0.0000, 0.0000, 0.0000],
[0.0000, 1.6464, 1.6952],
[0.0000, 1.5125, 1.5125],
[1.0915, 1.0915, 1.0915],
[0.8197, 0.8511, 1.4894],
[0.7433, 0.8082, 0.8082],
[0.8955, 1.3340, 1.3340],
[0.4730, 0.4730, 0.4730],
[0.7949, 1.3325, 1.3325],
[0.7566, 1.3727, 1.3727]]]).cuda()
expected_idx = torch.tensor([[[0, 3, 4], [1, 2, 0], [2, 0, 3], [0, 3, 4],
[2, 1, 0], [1, 2, 0], [0, 3, 4], [1, 2, 0],
[0, 3, 4], [1, 2, 0]],
[[0, 3, 4], [1, 2, 0], [2, 0, 3], [0, 3, 4],
[2, 1, 0], [2, 0, 3], [1, 0, 3], [0, 3, 4],
[1, 0, 3], [1, 0, 3]]]).cuda()
assert torch.allclose(dist, expected_dist, 1e-4)
assert torch.all(idx == expected_idx)
def test_fps_with_dist():
if not torch.cuda.is_available():
pytest.skip()
xyz = torch.tensor([[[-0.2748, 1.0020, -1.1674], [0.1015, 1.3952, -1.2681],
[-0.8070, 2.4137,
-0.5845], [-1.0001, 2.1982, -0.5859],
[0.3841, 1.8983, -0.7431]],
[[-1.0696, 3.0758,
-0.1899], [-0.2559, 3.5521, -0.1402],
[0.8164, 4.0081, -0.1839], [-1.1000, 3.0213, -0.8205],
[-0.0518, 3.7251, -0.3950]]]).cuda()
expected_idx = torch.tensor([[0, 2, 4], [0, 2, 1]]).cuda()
xyz_square_dist = ((xyz.unsqueeze(dim=1) -
xyz.unsqueeze(dim=2))**2).sum(-1)
idx = furthest_point_sample_with_dist(xyz_square_dist, 3)
assert torch.all(idx == expected_idx)
import numpy as np
fps_idx = np.load('tests/data/ops/fps_idx.npy')
features_for_fps_distance = np.load(
'tests/data/ops/features_for_fps_distance.npy')
expected_idx = torch.from_numpy(fps_idx).cuda()
features_for_fps_distance = torch.from_numpy(
features_for_fps_distance).cuda()
idx = furthest_point_sample_with_dist(features_for_fps_distance, 16)
assert torch.all(idx == expected_idx)
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import pytest
import torch
from mmdet3d.ops.roiaware_pool3d import (RoIAwarePool3d, points_in_boxes_all,
points_in_boxes_cpu,
points_in_boxes_part)
def test_RoIAwarePool3d():
# RoIAwarePool3d only support gpu version currently.
if not torch.cuda.is_available():
pytest.skip('test requires GPU and torch+cuda')
roiaware_pool3d_max = RoIAwarePool3d(
out_size=4, max_pts_per_voxel=128, mode='max')
roiaware_pool3d_avg = RoIAwarePool3d(
out_size=4, max_pts_per_voxel=128, mode='avg')
rois = torch.tensor(
[[1.0, 2.0, 3.0, 5.0, 4.0, 6.0, -0.3 - np.pi / 2],
[-10.0, 23.0, 16.0, 20.0, 10.0, 20.0, -0.5 - np.pi / 2]],
dtype=torch.float32).cuda(
) # boxes (m, 7) with bottom center in lidar coordinate
pts = torch.tensor(
[[1, 2, 3.3], [1.2, 2.5, 3.0], [0.8, 2.1, 3.5], [1.6, 2.6, 3.6],
[0.8, 1.2, 3.9], [-9.2, 21.0, 18.2], [3.8, 7.9, 6.3],
[4.7, 3.5, -12.2], [3.8, 7.6, -2], [-10.6, -12.9, -20], [-16, -18, 9],
[-21.3, -52, -5], [0, 0, 0], [6, 7, 8], [-2, -3, -4]],
dtype=torch.float32).cuda() # points (n, 3) in lidar coordinate
pts_feature = pts.clone()
pooled_features_max = roiaware_pool3d_max(
rois=rois, pts=pts, pts_feature=pts_feature)
assert pooled_features_max.shape == torch.Size([2, 4, 4, 4, 3])
assert torch.allclose(pooled_features_max.sum(),
torch.tensor(51.100).cuda(), 1e-3)
pooled_features_avg = roiaware_pool3d_avg(
rois=rois, pts=pts, pts_feature=pts_feature)
assert pooled_features_avg.shape == torch.Size([2, 4, 4, 4, 3])
assert torch.allclose(pooled_features_avg.sum(),
torch.tensor(49.750).cuda(), 1e-3)
def test_points_in_boxes_part():
if not torch.cuda.is_available():
pytest.skip('test requires GPU and torch+cuda')
boxes = torch.tensor(
[[[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 0.3]],
[[-10.0, 23.0, 16.0, 10, 20, 20, 0.5]]],
dtype=torch.float32).cuda(
) # boxes (b, t, 7) with bottom center in lidar coordinate
pts = torch.tensor(
[[[1, 2, 3.3], [1.2, 2.5, 3.0], [0.8, 2.1, 3.5], [1.6, 2.6, 3.6],
[0.8, 1.2, 3.9], [-9.2, 21.0, 18.2], [3.8, 7.9, 6.3],
[4.7, 3.5, -12.2]],
[[3.8, 7.6, -2], [-10.6, -12.9, -20], [-16, -18, 9], [-21.3, -52, -5],
[0, 0, 0], [6, 7, 8], [-2, -3, -4], [6, 4, 9]]],
dtype=torch.float32).cuda() # points (b, m, 3) in lidar coordinate
point_indices = points_in_boxes_part(points=pts, boxes=boxes)
expected_point_indices = torch.tensor(
[[0, 0, 0, 0, 0, -1, -1, -1], [-1, -1, -1, -1, -1, -1, -1, -1]],
dtype=torch.int32).cuda()
assert point_indices.shape == torch.Size([2, 8])
assert (point_indices == expected_point_indices).all()
boxes = torch.tensor([[[0.0, 0.0, 0.0, 1.0, 20.0, 1.0, 0.523598]]],
dtype=torch.float32).cuda() # 30 degrees
pts = torch.tensor(
[[[4, 6.928, 0], [6.928, 4, 0], [4, -6.928, 0], [6.928, -4, 0],
[-4, 6.928, 0], [-6.928, 4, 0], [-4, -6.928, 0], [-6.928, -4, 0]]],
dtype=torch.float32).cuda()
point_indices = points_in_boxes_part(points=pts, boxes=boxes)
expected_point_indices = torch.tensor([[-1, -1, 0, -1, 0, -1, -1, -1]],
dtype=torch.int32).cuda()
assert (point_indices == expected_point_indices).all()
if torch.cuda.device_count() > 1:
pts = pts.to('cuda:1')
boxes = boxes.to('cuda:1')
expected_point_indices = expected_point_indices.to('cuda:1')
point_indices = points_in_boxes_part(points=pts, boxes=boxes)
assert point_indices.shape == torch.Size([2, 8])
assert (point_indices == expected_point_indices).all()
def test_points_in_boxes_cpu():
boxes = torch.tensor(
[[[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 0.3],
[-10.0, 23.0, 16.0, 10, 20, 20, 0.5]]],
dtype=torch.float32
) # boxes (m, 7) with bottom center in lidar coordinate
pts = torch.tensor(
[[[1, 2, 3.3], [1.2, 2.5, 3.0], [0.8, 2.1, 3.5], [1.6, 2.6, 3.6],
[0.8, 1.2, 3.9], [-9.2, 21.0, 18.2], [3.8, 7.9, 6.3],
[4.7, 3.5, -12.2], [3.8, 7.6, -2], [-10.6, -12.9, -20], [
-16, -18, 9
], [-21.3, -52, -5], [0, 0, 0], [6, 7, 8], [-2, -3, -4]]],
dtype=torch.float32) # points (n, 3) in lidar coordinate
point_indices = points_in_boxes_cpu(points=pts, boxes=boxes)
expected_point_indices = torch.tensor(
[[[1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [0, 1], [0, 0], [0, 0],
[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]],
dtype=torch.int32)
assert point_indices.shape == torch.Size([1, 15, 2])
assert (point_indices == expected_point_indices).all()
boxes = torch.tensor([[[0.0, 0.0, 0.0, 1.0, 20.0, 1.0, 0.523598]]],
dtype=torch.float32) # 30 degrees
pts = torch.tensor(
[[[4, 6.928, 0], [6.928, 4, 0], [4, -6.928, 0], [6.928, -4, 0],
[-4, 6.928, 0], [-6.928, 4, 0], [-4, -6.928, 0], [-6.928, -4, 0]]],
dtype=torch.float32)
point_indices = points_in_boxes_cpu(points=pts, boxes=boxes)
expected_point_indices = torch.tensor(
[[[0], [0], [1], [0], [1], [0], [0], [0]]], dtype=torch.int32)
assert (point_indices == expected_point_indices).all()
def test_points_in_boxes_all():
if not torch.cuda.is_available():
pytest.skip('test requires GPU and torch+cuda')
boxes = torch.tensor(
[[[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 0.3],
[-10.0, 23.0, 16.0, 10, 20, 20, 0.5]]],
dtype=torch.float32).cuda(
) # boxes (m, 7) with bottom center in lidar coordinate
pts = torch.tensor(
[[[1, 2, 3.3], [1.2, 2.5, 3.0], [0.8, 2.1, 3.5], [1.6, 2.6, 3.6],
[0.8, 1.2, 3.9], [-9.2, 21.0, 18.2], [3.8, 7.9, 6.3],
[4.7, 3.5, -12.2], [3.8, 7.6, -2], [-10.6, -12.9, -20], [
-16, -18, 9
], [-21.3, -52, -5], [0, 0, 0], [6, 7, 8], [-2, -3, -4]]],
dtype=torch.float32).cuda() # points (n, 3) in lidar coordinate
point_indices = points_in_boxes_all(points=pts, boxes=boxes)
expected_point_indices = torch.tensor(
[[[1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [0, 1], [0, 0], [0, 0],
[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]],
dtype=torch.int32).cuda()
assert point_indices.shape == torch.Size([1, 15, 2])
assert (point_indices == expected_point_indices).all()
if torch.cuda.device_count() > 1:
pts = pts.to('cuda:1')
boxes = boxes.to('cuda:1')
expected_point_indices = expected_point_indices.to('cuda:1')
point_indices = points_in_boxes_all(points=pts, boxes=boxes)
assert point_indices.shape == torch.Size([1, 15, 2])
assert (point_indices == expected_point_indices).all()