add voxel filtering

d6310a27 · traveller59 · 73427720 · d6310a27 · d6310a27 · d6310a27
Commit d6310a27 authored May 07, 2019 by traveller59
16 changed files
--- a/include/spconv/fused_spconv_ops.h
+++ b/include/spconv/fused_spconv_ops.h
+// Copyright 2019 Yan Yan
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// 
+//     http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef FUSED_SPARSE_CONV_OP_H_
+#define FUSED_SPARSE_CONV_OP_H_
+#include <cuda_runtime_api.h>
+#include <spconv/indice.h>
+#include <spconv/reordering.h>
+#include <torch/script.h>
+#include <torch_utils.h>
+#include <utility/timer.h>
+namespace spconv {
+// torch.jit's doc says only support int64, so we need to convert to int32.
+template <typename T>
+torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
+                       torch::Tensor indicePairs, torch::Tensor indiceNum,
+                       int64_t numActOut, int64_t _inverse, int64_t _subM) {
+  bool subM = _subM != 0;
+  bool inverse = _inverse != 0;
+  auto device = features.device().type();
+  auto ndim = filters.dim() - 2;
+  auto kernelVolume = indicePairs.size(0);
+  auto numInPlanes = features.size(1);
+  auto numOutPlanes = filters.size(ndim + 1);
+  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
+  auto indicePairMaxSizeIter = std::max_element(
+      indicePairNumCpu.data<int>(), indicePairNumCpu.data<int>() + kernelVolume);
+  int indicePairMaxOffset = indicePairMaxSizeIter - indicePairNumCpu.data<int>();
+  int indicePairMaxSize = *indicePairMaxSizeIter;
+  /*if (_subM){
+    std::vector<int> indicePairNumVec(indicePairNumCpu.data<int>(), indicePairNumCpu.data<int>() + kernelVolume);
+    indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
+    auto indicePairVecMaxSizeIter = std::max_element(
+        indicePairNumVec.begin(), indicePairNumVec.end());
+    indicePairMaxSize = *indicePairVecMaxSizeIter;
+  }*/
+  auto options =
+      torch::TensorOptions().dtype(features.dtype()).device(features.device());
+  // auto indicePairOptions =
+  //     torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
+  torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options).copy_(bias);
+  torch::Tensor inputBuffer = torch::zeros({indicePairMaxSize, numInPlanes}, options);
+  torch::Tensor outputBuffer =
+      torch::zeros({indicePairMaxSize, numOutPlanes}, options);
+  filters = filters.view({-1, numInPlanes, numOutPlanes});
+  if (subM) { // the center index of subm conv don't need gather and scatter
+              // add.
+    torch::mm_out(output, features, filters[indicePairMaxOffset]);
+  }
+  double totalGatherTime = 0;
+  double totalGEMMTime = 0;
+  double totalSAddTime = 0;
+  for (int i = 0; i < kernelVolume; ++i) {
+    auto nHot = indicePairNumCpu.data<int>()[i];
+    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
+      continue;
+    }
+    // auto timer = spconv::CudaContextTimer<>();
+    auto outputBufferBlob =
+        torch::from_blob(outputBuffer.data<T>(), {nHot, numOutPlanes}, options);
+    auto inputBufferBlob =
+        torch::from_blob(inputBuffer.data<T>(), {nHot, numInPlanes}, options);
+    if (device == torch::kCPU) {
+      functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
+      gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
+                 tv::torch2tv<const T>(features),
+                 tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
+    } else {
+      functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
+      gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
+                 tv::torch2tv<const T>(features),
+                 tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
+      TV_CHECK_CUDA_ERR();
+      /* slower than SparseGatherFunctor, may due to int->long conversion
+      auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
+      auto indicePairBlob = torch::from_blob(indicePairLong.data<long>(), {nHot},
+      indicePairOptions); 
+      torch::index_select_out(inputBufferBlob, features, 0,
+      indicePairBlob);*/
+    }
+    // totalGatherTime += timer.report() / 1000.0;
+    torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
+    // totalGEMMTime += timer.report() / 1000.0;
+    if (device == torch::kCPU) {
+      functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
+      scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
+                  tv::torch2tv<const T>(outputBuffer),
+                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
+                  true);
+    } else {
+      functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
+      scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
+                  tv::torch2tv<const T>(outputBuffer),
+                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
+                  true);
+      TV_CHECK_CUDA_ERR();
+    }
+    // totalSAddTime += timer.report() / 1000.0;
+  }
+  // std::cout << "gather time " << totalGatherTime << std::endl;
+  // std::cout << "gemm time " << totalGEMMTime << std::endl;
+  // std::cout << "scatteradd time " << totalSAddTime << std::endl;
+  return output;
+}
+} // namespace spconv
+#endif
\ No newline at end of file
--- a/include/spconv/nms_functor.h
+++ b/include/spconv/nms_functor.h
+// Copyright 2019 Yan Yan
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// 
+//     http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef NMS_FUNCTOR_H_
+#define NMS_FUNCTOR_H_
+#include <tensorview/tensorview.h>
+namespace spconv
+{
+namespace functor
+{
+template <typename Device, typename T, typename Index>
+struct NonMaxSupressionFunctor
+{
+    Index operator()(const Device& d, tv::TensorView<Index> keep,
+                  tv::TensorView<const T> boxes,
+                  T threshold, T eps);
+};
+template <typename Device, typename T, typename Index>
+struct rotateNonMaxSupressionFunctor
+{
+    Index operator()(const Device& d, tv::TensorView<Index> keep,
+                  tv::TensorView<const T> boxCorners,
+                  tv::TensorView<const T> standupIoU, T threshold);
+};
+} // namespace functor
+} // namespace spconv
+#endif
\ No newline at end of file
--- a/include/spconv/nms_ops.h
+++ b/include/spconv/nms_ops.h
+// Copyright 2019 Yan Yan
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// 
+//     http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef NMS_TORCH_OP_H_
+#define NMS_TORCH_OP_H_
+#include <cuda_runtime_api.h>
+#include <spconv/indice.h>
+#include <spconv/reordering.h>
+#include <torch/script.h>
+#include <torch_utils.h>
+#include <utility/timer.h>
+#include <spconv/nms_functor.h>
+namespace spconv {
+// torch.jit's doc says only support int64, so we need to convert to int32.
+template <typename T>
+torch::Tensor
+nonMaxSuppression(torch::Tensor boxes, torch::Tensor scores, int64_t preMaxSize,
+        int64_t postMaxSize, double thresh, double eps) {
+  // auto timer = spconv::CudaContextTimer<>();
+  tv::check_torch_dtype<T>(boxes);
+  auto resOptions =
+      torch::TensorOptions().dtype(torch::kInt64).device(boxes.device());
+  if (boxes.size(0) == 0){
+      return torch::zeros({0}, resOptions);
+  }
+  torch::Tensor indices;
+  if (preMaxSize > 0){
+      auto numKeepedScores = scores.size(0);
+      preMaxSize = std::min(numKeepedScores, preMaxSize);
+      auto res = torch::topk(scores, preMaxSize);
+      indices = std::get<1>(res);
+      boxes = torch::index_select(boxes, 0, indices);
+  }else{
+      indices = std::get<1>(torch::sort(scores));
+      boxes = torch::index_select(boxes, 0, indices);
+  }
+  if (boxes.size(0) == 0)
+    return torch::zeros({0}, resOptions);
+  auto keep = torch::zeros({boxes.size(0)}, resOptions);
+  int64_t keepNum = 0;
+  if (boxes.device().type() == torch::kCPU) {
+    auto nmsFunctor = functor::NonMaxSupressionFunctor<tv::CPU, T, int64_t>();
+    keepNum = nmsFunctor(tv::CPU(), tv::torch2tv<int64_t>(keep),
+    tv::torch2tv<const T>(boxes), T(thresh), T(eps));
+  }else{
+    TV_ASSERT_RT_ERR(false, "not implemented");
+  }
+  if (postMaxSize <= 0){
+    postMaxSize = keepNum;
+  }
+  // std::cout << keep << std::endl;
+  keep = keep.slice(0, 0, std::min(keepNum, postMaxSize));
+  if (preMaxSize > 0){
+    return torch::index_select(indices, 0, keep);
+  }
+  return keep;
+}
+} // namespace spconv
+#endif
\ No newline at end of file
--- a/include/spconv/point2voxel.h
+++ b/include/spconv/point2voxel.h
@@ -87,8 +87,328 @@ int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels,
  }
  for (int i = 0; i < voxel_num; ++i) {
    coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
+  }
+  return voxel_num;
+}
+template <typename DType, int NDim>
+int points_to_voxel_3d_np_mean(py::array_t<DType> points, py::array_t<DType> voxels,
+                          py::array_t<DType> means,
+                          py::array_t<int> coors,
+                          py::array_t<int> num_points_per_voxel,
+                          py::array_t<int> coor_to_voxelidx,
+                          std::vector<DType> voxel_size,
+                          std::vector<DType> coors_range, int max_points,
+                          int max_voxels) {
+  auto points_rw = points.template mutable_unchecked<2>();
+  auto means_rw = means.template mutable_unchecked<2>();
+  auto voxels_rw = voxels.template mutable_unchecked<3>();
+  auto coors_rw = coors.mutable_unchecked<2>();
+  auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
+  auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
+  auto N = points_rw.shape(0);
+  auto num_features = points_rw.shape(1);
+  // auto ndim = points_rw.shape(1) - 1;
+  constexpr int ndim_minus_1 = NDim - 1;
+  int voxel_num = 0;
+  bool failed = false;
+  int coor[NDim];
+  int c;
+  int grid_size[NDim];
+  for (int i = 0; i < NDim; ++i) {
+    grid_size[i] =
+        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
+  }
+  int voxelidx, num;
+  for (int i = 0; i < N; ++i) {
+    failed = false;
+    for (int j = 0; j < NDim; ++j) {
+      c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
+      if ((c < 0 || c >= grid_size[j])) {
+        failed = true;
+        break;
+      }
+      coor[ndim_minus_1 - j] = c;
+    }
+    if (failed)
+      continue;
+    voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
+    if (voxelidx == -1) {
+      voxelidx = voxel_num;
+      if (voxel_num >= max_voxels)
+        break;
+      voxel_num += 1;
+      coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
+      for (int k = 0; k < NDim; ++k) {
+        coors_rw(voxelidx, k) = coor[k];
+      }
+    }
+    num = num_points_per_voxel_rw(voxelidx);
+    if (num < max_points) {
+      for (int k = 0; k < num_features; ++k) {
+        voxels_rw(voxelidx, num, k) = points_rw(i, k);
+      }
+      num_points_per_voxel_rw(voxelidx) += 1;
+      for (int k = 0; k < num_features; ++k) {
+        means_rw(voxelidx, k) += (points_rw(i, k) - means_rw(voxelidx, k)) / DType(num + 1);
+      }
+    }
+  }
+  for (int i = 0; i < voxel_num; ++i) {
+    coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
+    num = num_points_per_voxel_rw(i);
+    for (int j = num; j < max_points; ++j){
+      for (int k = 0; k < num_features; ++k) {
+        voxels_rw(i, j, k) = means_rw(i, k);
+      }
+    }
+  }
+  return voxel_num;
+}
+template <typename DType, int NDim>
+int points_to_voxel_3d_np_height(py::array_t<DType> points, py::array_t<DType> voxels,
+                          py::array_t<DType> height,
+                          py::array_t<DType> maxs,
+                          py::array_t<int> coors,
+                          py::array_t<int> num_points_per_voxel,
+                          py::array_t<int> coor_to_voxelidx,
+                          std::vector<DType> voxel_size,
+                          std::vector<DType> coors_range, int max_points,
+                          int max_voxels) {
+  auto points_rw = points.template mutable_unchecked<2>();
+  auto height_rw = height.template mutable_unchecked<2>();
+  auto maxs_rw = maxs.template mutable_unchecked<2>();
+  auto voxels_rw = voxels.template mutable_unchecked<3>();
+  auto coors_rw = coors.mutable_unchecked<2>();
+  auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
+  auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
+  auto N = points_rw.shape(0);
+  auto num_features = points_rw.shape(1);
+  // auto ndim = points_rw.shape(1) - 1;
+  constexpr int ndim_minus_1 = NDim - 1;
+  int voxel_num = 0;
+  bool failed = false;
+  int coor[NDim];
+  int c;
+  int grid_size[NDim];
+  for (int i = 0; i < NDim; ++i) {
+    grid_size[i] =
+        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
+  }
+  int voxelidx, num;
+  for (int i = 0; i < N; ++i) {
+    failed = false;
+    for (int j = 0; j < NDim; ++j) {
+      c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
+      if ((c < 0 || c >= grid_size[j])) {
+        failed = true;
+        break;
+      }
+      coor[ndim_minus_1 - j] = c;
+    }
+    if (failed)
+      continue;
+    voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
+    if (voxelidx == -1) {
+      voxelidx = voxel_num;
+      if (voxel_num >= max_voxels)
+        break;
+      voxel_num += 1;
+      coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
+      for (int k = 0; k < NDim; ++k) {
+        coors_rw(voxelidx, k) = coor[k];
+      }
+    }
+    num = num_points_per_voxel_rw(voxelidx);
+    if (num < max_points) {
+      for (int k = 0; k < num_features; ++k) {
+        voxels_rw(voxelidx, num, k) = points_rw(i, k);
+        height_rw(voxelidx, k) = std::min(points_rw(i, k), height_rw(voxelidx, k));
+        maxs_rw(voxelidx, k) = std::max(points_rw(i, k), maxs_rw(voxelidx, k));
+      }
+      num_points_per_voxel_rw(voxelidx) += 1;
+    }
+  }
+  for (int i = 0; i < voxel_num; ++i) {
+    coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
+    for (int k = 0; k < num_features; ++k) {
+      height_rw(i, k) = maxs_rw(i, k) - height_rw(i, k);
+    }
+  }
+  return voxel_num;
+}
+template <typename DType, int NDim>
+int block_filtering(py::array_t<DType> points, 
+                          py::array_t<int> mask,
+                          py::array_t<DType> height,
+                          py::array_t<DType> maxs,
+                          py::array_t<int> coor_to_voxelidx,
+                          std::vector<DType> voxel_size,
+                          std::vector<DType> coors_range,
+                          int max_voxels,
+                          DType eps) {
+  auto points_rw = points.template mutable_unchecked<2>();
+  auto height_rw = height.template mutable_unchecked<1>();
+  auto maxs_rw = maxs.template mutable_unchecked<1>();
+  auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
+  auto N = points_rw.shape(0);
+  auto num_features = points_rw.shape(1);
+  // auto ndim = points_rw.shape(1) - 1;
+  constexpr int ndim_minus_1 = NDim - 1;
+  int voxel_num = 0;
+  bool failed = false;
+  int coor[NDim];
+  int c;
+  int grid_size[NDim];
+  for (int i = 0; i < NDim; ++i) {
+    grid_size[i] =
+        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
+  }
+  int voxelidx, num;
+  for (int i = 0; i < N; ++i) {
+    failed = false;
+    for (int j = 0; j < NDim; ++j) {
+      c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
+      if ((c < 0 || c >= grid_size[j])) {
+        failed = true;
+        break;
+      }
+      coor[ndim_minus_1 - j] = c;
+    }
+    if (failed)
+      continue;
+    voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
+    if (voxelidx == -1) {
+      voxelidx = voxel_num;
+      voxel_num += 1;
+      coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
+    }
+    height_rw(voxelidx) = std::min(points_rw(i, 2), height_rw(voxelidx));
+    maxs_rw(voxelidx) = std::max(points_rw(i, 2), maxs_rw(voxelidx));
+  }
+  for (int i = 0; i < N; ++i) {
+    failed = false;
+    for (int j = 0; j < NDim; ++j) {
+      c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
+      if ((c < 0 || c >= grid_size[j])) {
+        failed = true;
+        break;
+      }
+      coor[ndim_minus_1 - j] = c;
+    }
+    if (failed)
+      continue;
+    voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
+    if ((maxs_rw(voxelidx) - height_rw(voxelidx, 2)) < eps){
+      mask(i) = 0;
+    }
+  }
+}
+template <typename DType, int NDim>
+int points_to_voxel_3d_with_filtering(py::array_t<DType> points, py::array_t<DType> voxels,
+                          py::array_t<int> voxel_mask,
+                          py::array_t<DType> mins,
+                          py::array_t<DType> maxs,
+                          py::array_t<int> coors,
+                          py::array_t<int> num_points_per_voxel,
+                          py::array_t<int> coor_to_voxelidx,
+                          std::vector<DType> voxel_size,
+                          std::vector<DType> coors_range, 
+                          int max_points,
+                          int max_voxels,
+                          int block_factor,
+                          int block_size,
+                          DType height_threshold) {
+  auto points_rw = points.template mutable_unchecked<2>();
+  auto mins_rw = mins.template mutable_unchecked<2>();
+  auto maxs_rw = maxs.template mutable_unchecked<2>();
+  auto voxels_rw = voxels.template mutable_unchecked<3>();
+  auto voxel_mask_rw = voxel_mask.template mutable_unchecked<1>();
+  auto coors_rw = coors.mutable_unchecked<2>();
+  auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
+  auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
+  auto N = points_rw.shape(0);
+  auto num_features = points_rw.shape(1);
+  // auto ndim = points_rw.shape(1) - 1;
+  constexpr int ndim_minus_1 = NDim - 1;
+  int voxel_num = 0;
+  bool failed = false;
+  int coor[NDim];
+  int c;
+  int grid_size[NDim];
+  DType max_value, min_value;
+  for (int i = 0; i < NDim; ++i) {
+    grid_size[i] =
+        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
+  }
+  int block_shape_H = grid_size[1] / block_factor;
+  int block_shape_W = grid_size[0] / block_factor;
+  int voxelidx, num;
+  int block_coor[2];
+  int startx, stopx, starty, stopy;
+  for (int i = 0; i < N; ++i) {
+    failed = false;
+    for (int j = 0; j < NDim; ++j) {
+      c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
+      if ((c < 0 || c >= grid_size[j])) {
+        failed = true;
+        break;
+      }
+      coor[ndim_minus_1 - j] = c;
+    }
+    if (failed)
+      continue;
+    voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
+    if (voxelidx == -1) {
+      voxelidx = voxel_num;
+      if (voxel_num >= max_voxels)
+        break;
+      voxel_num += 1;
+      coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
+      for (int k = 0; k < NDim; ++k) {
+        coors_rw(voxelidx, k) = coor[k];
+      }
+    }
+    num = num_points_per_voxel_rw(voxelidx);
+    if (num < max_points) {
+      for (int k = 0; k < num_features; ++k) {
+        voxels_rw(voxelidx, num, k) = points_rw(i, k);
+      }
+      block_coor[0] = coor[1] / block_factor;
+      block_coor[1] = coor[2] / block_factor;
+      mins_rw(block_coor[0], block_coor[1]) = std::min(points_rw(i, 2), mins_rw(block_coor[0], block_coor[1]));
+      maxs_rw(block_coor[0], block_coor[1]) = std::max(points_rw(i, 2), maxs_rw(block_coor[0], block_coor[1]));
+      num_points_per_voxel_rw(voxelidx) += 1;
+    }
+  }
+  for (int i = 0; i < voxel_num; ++i) {
+    coor[1] = coors_rw(i, 1);
+    coor[2] = coors_rw(i, 2);
+    coor_to_voxelidx_rw(coors_rw(i, 0), coor[1], coor[2]) = -1;
+    block_coor[0] = coor[1] / block_factor;
+    block_coor[1] = coor[2] / block_factor;
+    min_value = mins_rw(block_coor[0], block_coor[1]);
+    max_value = maxs_rw(block_coor[0], block_coor[1]);
+    startx = std::max(0, block_coor[0]-block_size/2);
+    stopx = std::min(block_shape_H, block_coor[0]+block_size-block_size/2);
+    starty = std::max(0, block_coor[1]-block_size/2);
+    stopy = std::min(block_shape_W, block_coor[1]+block_size-block_size/2);
+    for (int j = startx; j < stopx; ++j){
+      for (int k = starty; k < stopy; ++k){
+        min_value = std::min(min_value, mins_rw(j, k));
+        max_value = std::max(max_value, maxs_rw(j, k));
+      }
+    }
+    voxel_mask_rw(i) = (max_value - min_value) > height_threshold;
  }
  return voxel_num;
 }
 } // namespace spconv
\ No newline at end of file
--- a/spconv/__init__.py
+++ b/spconv/__init__.py
@@ -24,6 +24,8 @@ from spconv.conv import SparseInverseConv2d, SparseInverseConv3d
 from spconv.modules import SparseModule, SparseSequential
 from spconv.pool import SparseMaxPool2d, SparseMaxPool3d
+from spconv import ops
 _LIB_FILE_NAME = "libspconv.so"
 if platform.system() == "Windows":
    _LIB_FILE_NAME = "spconv.dll"

--- a/spconv/conv.py
+++ b/spconv/conv.py
 # Copyright 2019 Yan Yan
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -63,7 +63,8 @@ class SparseConvolution(SparseModule):
                 output_padding=0,
                 transposed=False,
                 inverse=False,
-                 indice_key=None):
+                 indice_key=None,
+                 fused_bn=False):
        super(SparseConvolution, self).__init__()
        assert groups == 1
        if not isinstance(kernel_size, (list, tuple)):
@@ -94,6 +95,7 @@ class SparseConvolution(SparseModule):
        self.groups = groups
        self.subm = subm
        self.indice_key = indice_key
+        self.fused_bn = fused_bn
        self.weight = Parameter(
            torch.Tensor(*kernel_size, in_channels, out_channels))
@@ -121,10 +123,12 @@ class SparseConvolution(SparseModule):
        if not self.subm:
            if self.transposed:
                out_spatial_shape = ops.get_deconv_output_size(
-                    spatial_shape, self.kernel_size, self.stride, self.padding, self.dilation, self.output_padding)
+                    spatial_shape, self.kernel_size, self.stride, self.padding,
+                    self.dilation, self.output_padding)
            else:
                out_spatial_shape = ops.get_conv_output_size(
-                    spatial_shape, self.kernel_size, self.stride, self.padding, self.dilation)
+                    spatial_shape, self.kernel_size, self.stride, self.padding,
+                    self.dilation)
        else:
            out_spatial_shape = spatial_shape
@@ -136,8 +140,8 @@ class SparseConvolution(SparseModule):
                self.weight.view(self.in_channels, self.out_channels))
            if self.bias is not None:
                features += self.bias
-            out_tensor = spconv.SparseConvTensor(features, input.indices,
+            out_tensor = spconv.SparseConvTensor(
-                                                input.spatial_shape, input.batch_size)
+                features, input.indices, input.spatial_shape, input.batch_size)
            out_tensor.indice_dict = input.indice_dict
            out_tensor.grid = input.grid
            return out_tensor
@@ -145,32 +149,54 @@ class SparseConvolution(SparseModule):
        if self.inverse:
            assert datas is not None and self.indice_key is not None
            _, outids, indice_pairs, indice_pair_num, out_spatial_shape = datas
-            assert indice_pairs.shape[0] == np.prod(self.kernel_size), "inverse conv must have same kernel size as its couple conv"
+            assert indice_pairs.shape[0] == np.prod(
+                self.kernel_size
+            ), "inverse conv must have same kernel size as its couple conv"
        else:
            if self.indice_key is not None and datas is not None:
                outids, _, indice_pairs, indice_pair_num, _ = datas
            else:
                outids, indice_pairs, indice_pair_num = ops.get_indice_pairs(
-                    indices, batch_size, spatial_shape, self.kernel_size,
+                    indices,
-                    self.stride, self.padding, self.dilation, self.output_padding, self.subm, self.transposed, grid=input.grid)
+                    batch_size,
-                input.indice_dict[self.indice_key] = (outids, indices, indice_pairs, indice_pair_num, spatial_shape)
+                    spatial_shape,
-        if self.subm:
+                    self.kernel_size,
-            out_features = Fsp.indice_subm_conv(features, self.weight,
+                    self.stride,
-                                              indice_pairs.to(device),
+                    self.padding,
-                                              indice_pair_num,
+                    self.dilation,
-                                              outids.shape[0])
+                    self.output_padding,
+                    self.subm,
+                    self.transposed,
+                    grid=input.grid)
+                input.indice_dict[self.indice_key] = (outids, indices,
+                                                      indice_pairs,
+                                                      indice_pair_num,
+                                                      spatial_shape)
+        if self.fused_bn:
+            assert self.bias is not None
+            out_features = ops.fused_indice_conv(features, self.weight, self.bias, indice_pairs.to(device),
+                                                indice_pair_num,
+                                                outids.shape[0], self.inverse, self.subm)
        else:
-            if self.inverse:
+            if self.subm:
-                out_features = Fsp.indice_inverse_conv(features,
+                out_features = Fsp.indice_subm_conv(features, self.weight,
-                                            self.weight, indice_pairs.to(device),
+                                                    indice_pairs.to(device),
-                                            indice_pair_num, outids.shape[0])
+                                                    indice_pair_num,
+                                                    outids.shape[0])
            else:
-                out_features = Fsp.indice_conv(features,
+                if self.inverse:
-                                            self.weight, indice_pairs.to(device),
+                    out_features = Fsp.indice_inverse_conv(features, self.weight,
-                                            indice_pair_num, outids.shape[0])
+                                                        indice_pairs.to(device),
+                                                        indice_pair_num,
+                                                        outids.shape[0])
+                else:
+                    out_features = Fsp.indice_conv(features, self.weight,
+                                                indice_pairs.to(device),
+                                                indice_pair_num,
+                                                outids.shape[0])
-        if self.bias is not None:
+            if self.bias is not None:
-            out_features += self.bias
+                out_features += self.bias
        out_tensor = spconv.SparseConvTensor(out_features, outids,
                                             out_spatial_shape, batch_size)
        out_tensor.indice_dict = input.indice_dict
@@ -225,6 +251,30 @@ class SparseConv3d(SparseConvolution):
            bias,
            indice_key=indice_key)
+class SparseConv4d(SparseConvolution):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 groups=1,
+                 bias=True,
+                 indice_key=None):
+        super(SparseConv4d, self).__init__(
+            4,
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            groups,
+            bias,
+            indice_key=indice_key)
 class SparseConvTranspose2d(SparseConvolution):
    def __init__(self,
                 in_channels,
@@ -260,7 +310,6 @@ class SparseConvTranspose3d(SparseConvolution):
                 dilation=1,
                 groups=1,
                 bias=True,
                 indice_key=None):
        super(SparseConvTranspose3d, self).__init__(
            3,
@@ -275,6 +324,7 @@ class SparseConvTranspose3d(SparseConvolution):
            transposed=True,
            indice_key=indice_key)
 class SparseInverseConv2d(SparseConvolution):
    def __init__(self,
                 in_channels,
@@ -357,3 +407,27 @@ class SubMConv3d(SparseConvolution):
            bias,
            True,
            indice_key=indice_key)
+class SubMConv4d(SparseConvolution):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 groups=1,
+                 bias=True,
+                 indice_key=None):
+        super(SubMConv4d, self).__init__(
+            4,
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            groups,
+            bias,
+            True,
+            indice_key=indice_key)
--- a/spconv/functional.py
+++ b/spconv/functional.py
@@ -111,7 +111,6 @@ class SparseMaxPoolFunction(Function):
        input_bp = ops.indice_maxpool_backward(features, out, grad_output, indice_pairs, indice_pair_num)
        return input_bp, None, None, None
 indice_conv = SparseConvFunction.apply
 indice_inverse_conv = SparseInverseConvFunction.apply
 indice_subm_conv = SubMConvFunction.apply

--- a/spconv/modules.py
+++ b/spconv/modules.py
 # Copyright 2019 Yan Yan
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -17,12 +17,19 @@ from collections import OrderedDict
 import spconv
 import torch
 from torch import nn
-import time 
+import time
 def is_spconv_module(module):
-    spconv_modules = (SparseModule,)
+    spconv_modules = (SparseModule, )
    return isinstance(module, spconv_modules)
+def is_sparse_conv(module):
+    from spconv.conv import SparseConvolution
+    return isinstance(module, SparseConvolution)
 def _mean_update(vals, m_vals, t):
    outputs = []
    if not isinstance(vals, list):
@@ -104,7 +111,7 @@ class SparseSequential(SparseModule):
    def __len__(self):
        return len(self._modules)
-    @property 
+    @property
    def sparity_dict(self):
        return self._sparity_dict
@@ -117,7 +124,7 @@ class SparseSequential(SparseModule):
    def forward(self, input):
        for k, module in self._modules.items():
-            if is_spconv_module(module): # use SpConvTensor as input
+            if is_spconv_module(module):  # use SpConvTensor as input
                assert isinstance(input, spconv.SparseConvTensor)
                self._sparity_dict[k] = input.sparity
                input = module(input)
@@ -128,3 +135,50 @@ class SparseSequential(SparseModule):
                else:
                    input = module(input)
        return input
+    def fused(self):
+        """don't use this. no effect.
+        """
+        from spconv.conv import SparseConvolution
+        mods = [v for k, v in self._modules.items()]
+        fused_mods = []
+        idx = 0
+        while idx < len(mods):
+            if is_sparse_conv(mods[idx]):
+                if idx < len(mods) - 1 and isinstance(mods[idx + 1], nn.BatchNorm1d):
+                    new_module = SparseConvolution(
+                        ndim=mods[idx].ndim,
+                        in_channels=mods[idx].in_channels,
+                        out_channels=mods[idx].out_channels,
+                        kernel_size=mods[idx].kernel_size,
+                        stride=mods[idx].stride,
+                        padding=mods[idx].padding,
+                        dilation=mods[idx].dilation,
+                        groups=mods[idx].groups,
+                        bias=True,
+                        subm=mods[idx].subm,
+                        output_padding=mods[idx].output_padding,
+                        transposed=mods[idx].transposed,
+                        inverse=mods[idx].inverse,
+                        indice_key=mods[idx].indice_key,
+                        fused_bn=True,
+                    )
+                    new_module.load_state_dict(mods[idx].state_dict(), False)
+                    new_module.to(mods[idx].weight.device)
+                    conv = new_module
+                    bn = mods[idx + 1]
+                    conv.bias.data.zero_()
+                    conv.weight.data[:] = conv.weight.data * bn.weight.data / (
+                        torch.sqrt(bn.running_var) + bn.eps)
+                    conv.bias.data[:] = (
+                        conv.bias.data - bn.running_mean) * bn.weight.data / (
+                            torch.sqrt(bn.running_var) + bn.eps) + bn.bias.data
+                    fused_mods.append(conv)
+                    idx += 2
+                else:
+                    fused_mods.append(mods[idx])
+                    idx += 1
+            else:
+                fused_mods.append(mods[idx])
+                idx += 1
+        return SparseSequential(*fused_mods)
--- a/spconv/ops.py
+++ b/spconv/ops.py
@@ -83,6 +83,8 @@ def get_indice_pairs(indices,
            get_indice_pairs_func = torch.ops.spconv.get_indice_pairs_2d
        elif ndim == 3:
            get_indice_pairs_func = torch.ops.spconv.get_indice_pairs_3d
+        elif ndim == 4:
+            get_indice_pairs_func = torch.ops.spconv.get_indice_pairs_4d
        else:
            raise NotImplementedError
        return get_indice_pairs_func(indices, batch_size, out_shape, spatial_shape, ksize,
@@ -117,6 +119,21 @@ def indice_conv(features,
    else:
        raise NotImplementedError
+def fused_indice_conv(features, filters, bias,
+            indice_pairs,
+            indice_pair_num,
+            num_activate_out, inverse, subm):
+    if features.dtype == torch.half:
+        func = torch.ops.spconv.fused_indice_conv_half
+    elif filters.dtype == torch.float32:
+        func = torch.ops.spconv.fused_indice_conv_fp32
+    else:
+        raise NotImplementedError
+    return func(features, filters, bias, indice_pairs,
+                indice_pair_num, num_activate_out,
+                int(inverse), int(subm))
 def indice_conv_backward(features,
                       filters,
@@ -155,3 +172,8 @@ def indice_maxpool_backward(features, out_features, out_bp, indice_pairs, indice
            features, out_features, out_bp, indice_pairs, indice_pair_num)
    else:
        raise NotImplementedError
+def nms(boxes, scores, pre_max_size, post_max_size, thresh, eps):
+    res = torch.ops.spconv.nms(
+        boxes, scores, pre_max_size, post_max_size, thresh, eps)
+    return res
--- a/spconv/utils/__init__.py
+++ b/spconv/utils/__init__.py
 # Copyright 2019 Yan Yan
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -14,16 +14,26 @@
 import numpy as np
 from spconv import spconv_utils
-from spconv.spconv_utils import (non_max_suppression, non_max_suppression_cpu,
+from spconv.spconv_utils import (
-                                 points_to_voxel_3d_np, rbbox_iou,
+    non_max_suppression, non_max_suppression_cpu, points_to_voxel_3d_np,
-                                 rotate_non_max_suppression_cpu, rbbox_intersection)
+    rbbox_iou, points_to_voxel_3d_np_mean, points_to_voxel_3d_np_height,
+    points_to_voxel_3d_with_filtering, rotate_non_max_suppression_cpu,
+    rbbox_intersection)
 def points_to_voxel(points,
-                     voxel_size,
+                    voxel_size,
-                     coors_range,
+                    coors_range,
-                     coor_to_voxelidx,
+                    coor_to_voxelidx,
-                     max_points=35,
+                    max_points=35,
-                     max_voxels=20000):
+                    max_voxels=20000,
+                    full_mean=False,
+                    with_height=False,
+                    block_filtering=True,
+                    block_factor=1,
+                    block_size=8,
+                    height_threshold=0.2,
+                    pad_output=False):
    """convert 3d points(N, >=3) to voxels. This version calculate
    everything in one loop. now it takes only 0.8ms(~6k voxels) 
    with c++ and 3.2ghz cpu.
@@ -39,12 +49,18 @@ def points_to_voxel(points,
        max_voxels: int. indicate maximum voxels this function create.
            for voxelnet, 20000 is a good choice. you should shuffle points
            before call this function because max_voxels may drop some points.
+        full_mean: bool. if true, all empty points in voxel will be filled with mean
+            of exist points.
+        with_height: bool. don't use this.
+        block_filtering: filter voxels by height. used for lidar point cloud.
+            use some visualization tool to see filtered result.
    Returns:
        voxels: [M, max_points, ndim] float tensor. only contain points.
        coordinates: [M, 3] int32 tensor. zyx format.
        num_points_per_voxel: [M] int32 tensor.
    """
+    if full_mean:
+        assert block_filtering is False
    if not isinstance(voxel_size, np.ndarray):
        voxel_size = np.array(voxel_size, dtype=points.dtype)
    if not isinstance(coors_range, np.ndarray):
@@ -56,20 +72,76 @@ def points_to_voxel(points,
    voxels = np.zeros(
        shape=(max_voxels, max_points, points.shape[-1]), dtype=points.dtype)
    coors = np.zeros(shape=(max_voxels, 3), dtype=np.int32)
-    voxel_num = points_to_voxel_3d_np(
+    res = {
-        points, voxels, coors, num_points_per_voxel, coor_to_voxelidx,
+        "voxels": voxels,
-        voxel_size.tolist(), coors_range.tolist(), max_points, max_voxels)
+        "coordinates": coors,
-    # coors = coors[:voxel_num]
+        "num_points_per_voxel": num_points_per_voxel,
-    # voxels = voxels[:voxel_num]
+    }
-    # num_points_per_voxel = num_points_per_voxel[:voxel_num]
+    if full_mean:
-    return voxels, coors, num_points_per_voxel, voxel_num
+        means = np.zeros(
+            shape=(max_voxels, points.shape[-1]), dtype=points.dtype)
+        voxel_num = points_to_voxel_3d_np_mean(
+            points, voxels, means, coors,
+            num_points_per_voxel, coor_to_voxelidx, voxel_size.tolist(),
+            coors_range.tolist(), max_points, max_voxels)
+    else:
+        if with_height:
+            heights = np.zeros(
+                shape=(max_voxels, points.shape[-1]), dtype=points.dtype)
+            maxs = np.zeros(
+                shape=(max_voxels, points.shape[-1]), dtype=points.dtype)
+            res["heights"] = heights
+            voxel_num = points_to_voxel_3d_np_height(
+                points, voxels, heights, maxs, coors,
+                num_points_per_voxel, coor_to_voxelidx, voxel_size.tolist(),
+                coors_range.tolist(), max_points, max_voxels)
+        else:
+            if block_filtering:
+                block_shape = [*voxelmap_shape[1:]]
+                block_shape = [b // block_factor for b in block_shape]
+                mins = np.full(block_shape, 99999999, dtype=points.dtype)
+                maxs = np.full(block_shape, -99999999, dtype=points.dtype)
+                voxel_mask = np.zeros((max_voxels, ), dtype=np.int32)
+                voxel_num = points_to_voxel_3d_with_filtering(
+                    points, voxels, voxel_mask, mins, maxs,
+                    coors, num_points_per_voxel, coor_to_voxelidx,
+                    voxel_size.tolist(), coors_range.tolist(), max_points,
+                    max_voxels, block_factor, block_size, height_threshold)
+                voxel_mask = voxel_mask.astype(np.bool_)
+                coors_ = coors[voxel_mask]
+                if pad_output:
+                    res["coordinates"][:voxel_num] = coors_
+                    res["voxels"][:voxel_num] = voxels[voxel_mask]
+                    res["num_points_per_voxel"][:
+                                                voxel_num] = num_points_per_voxel[
+                                                    voxel_mask]
+                    res["coordinates"][voxel_num:] = 0
+                    res["voxels"][voxel_num:] = 0
+                    res["num_points_per_voxel"][voxel_num:] = 0
+                else:
+                    res["coordinates"] = coors_
+                    res["voxels"] = voxels[voxel_mask]
+                    res["num_points_per_voxel"] = num_points_per_voxel[
+                        voxel_mask]
+                voxel_num = coors_.shape[0]
+            else:
+                voxel_num = points_to_voxel_3d_np(points, voxels, coors,
+                                                  num_points_per_voxel,
+                                                  coor_to_voxelidx,
+                                                  voxel_size.tolist(),
+                                                  coors_range.tolist(),
+                                                  max_points, max_voxels)
+    res["voxel_num"] = voxel_num
+    return res
 class VoxelGenerator:
    def __init__(self,
                 voxel_size,
                 point_cloud_range,
                 max_num_points,
-                 max_voxels=20000):
+                 max_voxels=20000,
+                 full_mean=True):
        point_cloud_range = np.array(point_cloud_range, dtype=np.float32)
        # [0, -40, -3, 70.4, 40, 1]
        voxel_size = np.array(voxel_size, dtype=np.float32)
@@ -85,12 +157,17 @@ class VoxelGenerator:
        self._max_num_points = max_num_points
        self._max_voxels = max_voxels
        self._grid_size = grid_size
+        self._full_mean = full_mean
    def generate(self, points, max_voxels=None):
-        res = points_to_voxel(
+        res = points_to_voxel(points, self._voxel_size,
-            points, self._voxel_size, self._point_cloud_range, self._coor_to_voxelidx,
+                              self._point_cloud_range, self._coor_to_voxelidx,
-            self._max_num_points, max_voxels or self._max_voxels)
+                              self._max_num_points, max_voxels
-        voxels, coors, num_points_per_voxel, voxel_num = res
+                              or self._max_voxels, self._full_mean)
+        voxels = res["voxels"]
+        coors = res["coordinates"]
+        num_points_per_voxel = res["num_points_per_voxel"]
+        voxel_num = res["voxel_num"]
        coors = coors[:voxel_num]
        voxels = voxels[:voxel_num]
        num_points_per_voxel = num_points_per_voxel[:voxel_num]
@@ -98,11 +175,99 @@ class VoxelGenerator:
        return (voxels, coors, num_points_per_voxel)
    def generate_multi_gpu(self, points, max_voxels=None):
+        res = points_to_voxel(points, self._voxel_size,
+                              self._point_cloud_range, self._coor_to_voxelidx,
+                              self._max_num_points, max_voxels
+                              or self._max_voxels, self._full_mean)
+        voxels = res["voxels"]
+        coors = res["coordinates"]
+        num_points_per_voxel = res["num_points_per_voxel"]
+        voxel_num = res["voxel_num"]
+        return (voxels, coors, num_points_per_voxel)
+    @property
+    def voxel_size(self):
+        return self._voxel_size
+    @property
+    def max_num_points_per_voxel(self):
+        return self._max_num_points
+    @property
+    def point_cloud_range(self):
+        return self._point_cloud_range
+    @property
+    def grid_size(self):
+        return self._grid_size
+class VoxelGeneratorV2:
+    def __init__(self,
+                 voxel_size,
+                 point_cloud_range,
+                 max_num_points,
+                 max_voxels=20000,
+                 full_mean=False,
+                 with_height=False,
+                 block_filtering=False,
+                 block_factor=8,
+                 block_size=3,
+                 height_threshold=0.1):
+        assert with_height is False, "don't use this."
+        assert full_mean is False, "don't use this."
+        point_cloud_range = np.array(point_cloud_range, dtype=np.float32)
+        # [0, -40, -3, 70.4, 40, 1]
+        voxel_size = np.array(voxel_size, dtype=np.float32)
+        grid_size = (
+            point_cloud_range[3:] - point_cloud_range[:3]) / voxel_size
+        grid_size = np.round(grid_size).astype(np.int64)
+        voxelmap_shape = tuple(np.round(grid_size).astype(np.int32).tolist())
+        voxelmap_shape = voxelmap_shape[::-1]
+        assert grid_size[0] % block_factor == 0
+        assert grid_size[1] % block_factor == 0
+        self._coor_to_voxelidx = np.full(voxelmap_shape, -1, dtype=np.int32)
+        self._voxel_size = voxel_size
+        self._point_cloud_range = point_cloud_range
+        self._max_num_points = max_num_points
+        self._max_voxels = max_voxels
+        self._grid_size = grid_size
+        self._full_mean = full_mean
+        self._with_height = with_height
+        self._block_filtering = block_filtering
+        self._block_factor = block_factor
+        self._height_threshold = height_threshold
+        self._block_size = block_size
+        assert block_size > 0
+    def generate(self, points, max_voxels=None):
        res = points_to_voxel(
-            points, self._voxel_size, self._point_cloud_range, self._coor_to_voxelidx,
+            points, self._voxel_size, self._point_cloud_range,
-            self._max_num_points, max_voxels or self._max_voxels)
+            self._coor_to_voxelidx, self._max_num_points, max_voxels
+            or self._max_voxels, self._full_mean, self._with_height,
+            self._block_filtering, self._block_factor, self._block_size,
+            self._height_threshold)
+        for k, v in res.items():
+            if k != "voxel_num":
+                res[k] = v[:res["voxel_num"]]
        return res
+    def generate_multi_gpu(self, points, max_voxels=None):
+        res = points_to_voxel(
+            points,
+            self._voxel_size,
+            self._point_cloud_range,
+            self._coor_to_voxelidx,
+            self._max_num_points,
+            max_voxels or self._max_voxels,
+            self._full_mean,
+            self._with_height,
+            self._block_filtering,
+            self._block_factor,
+            self._block_size,
+            self._height_threshold,
+            pad_output=True)
+        return res
    @property
    def voxel_size(self):
@@ -112,7 +277,6 @@ class VoxelGenerator:
    def max_num_points_per_voxel(self):
        return self._max_num_points
    @property
    def point_cloud_range(self):
        return self._point_cloud_range

--- a/src/spconv/CMakeLists.txt
+++ b/src/spconv/CMakeLists.txt
 add_library(spconv SHARED all.cc indice.cc indice.cu 
-            reordering.cc reordering.cu maxpool.cc maxpool.cu)
+            reordering.cc reordering.cu maxpool.cc maxpool.cu nms.cc)
 target_include_directories(spconv PRIVATE ${ALL_INCLUDE} )
 set_property(TARGET spconv PROPERTY CUDA_STANDARD 14)

--- a/src/spconv/all.cc
+++ b/src/spconv/all.cc
@@ -15,10 +15,13 @@
 #include <cuda_runtime_api.h>
 #include <spconv/pool_ops.h>
 #include <spconv/spconv_ops.h>
+#include <spconv/fused_spconv_ops.h>
+#include <spconv/nms_ops.h>
 static auto registry =
    torch::jit::RegisterOperators("spconv::get_indice_pairs_2d", &spconv::getIndicePair<2>)
        .op("spconv::get_indice_pairs_3d", &spconv::getIndicePair<3>)
+        .op("spconv::get_indice_pairs_4d", &spconv::getIndicePair<4>)
        .op("spconv::get_indice_pairs_grid_2d", &spconv::getIndicePairPreGrid<2>)
        .op("spconv::get_indice_pairs_grid_3d", &spconv::getIndicePairPreGrid<3>)
        .op("spconv::indice_conv_fp32", &spconv::indiceConv<float>)
@@ -26,9 +29,12 @@ static auto registry =
        .op("spconv::indice_conv_half", &spconv::indiceConv<at::Half>)
        .op("spconv::indice_conv_backward_half",
            &spconv::indiceConvBackward<at::Half>)
+        .op("spconv::fused_indice_conv_fp32", &spconv::fusedIndiceConvBatchNorm<float>)
+        .op("spconv::fused_indice_conv_half", &spconv::fusedIndiceConvBatchNorm<at::Half>)
        .op("spconv::indice_maxpool_fp32", &spconv::indiceMaxPool<float>)
        .op("spconv::indice_maxpool_backward_fp32",
            &spconv::indiceMaxPoolBackward<float>)
        .op("spconv::indice_maxpool_half", &spconv::indiceMaxPool<at::Half>)
        .op("spconv::indice_maxpool_backward_half",
-            &spconv::indiceMaxPoolBackward<at::Half>);
+            &spconv::indiceMaxPoolBackward<at::Half>)
\ No newline at end of file
+        .op("spconv::nms", &spconv::nonMaxSuppression<float>);
\ No newline at end of file
--- a/src/spconv/nms.cc
+++ b/src/spconv/nms.cc
+// Copyright 2019 Yan Yan
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <boost/geometry.hpp>
+#include <spconv/nms_functor.h>
+#include <torch/script.h>
+#include <vector>
+namespace spconv {
+namespace functor {
+template <typename T, typename Index>
+struct NonMaxSupressionFunctor<tv::CPU, T, Index>  {
+  Index operator()(const tv::CPU &d, tv::TensorView<Index> keep,
+                   tv::TensorView<const T> boxes, T threshold, T eps) {
+    auto ndets = boxes.dim(0);
+    auto suppressed = std::vector<Index>(ndets);
+    auto area = std::vector<T>(ndets);
+    for (int i = 0; i < ndets; ++i) {
+      area[i] =
+          (boxes(i, 2) - boxes(i, 0) + eps) * (boxes(i, 3) - boxes(i, 1) + eps);
+    }
+    int i, j;
+    T xx1, xx2, w, h, inter, ovr;
+    int keepNum = 0;
+    for (int _i = 0; _i < ndets; ++_i) {
+      i = _i;
+      if (suppressed[i] == 1)
+        continue;
+      keep[keepNum] = i;
+      keepNum += 1;
+      for (int _j = _i + 1; _j < ndets; ++_j) {
+        j = _j;
+        if (suppressed[j] == 1)
+          continue;
+        xx2 = std::min(boxes(i, 2), boxes(j, 2));
+        xx1 = std::max(boxes(i, 0), boxes(j, 0));
+        w = xx2 - xx1 + eps;
+        if (w > 0) {
+          xx2 = std::min(boxes(i, 3), boxes(j, 3));
+          xx1 = std::max(boxes(i, 1), boxes(j, 1));
+          h = xx2 - xx1 + eps;
+          if (h > 0) {
+            inter = w * h;
+            ovr = inter / (area[i] + area[j] - inter);
+            if (ovr >= threshold)
+              suppressed[j] = 1;
+          }
+        }
+      }
+    }
+    return keepNum;
+  }
+};
+template <typename T, typename Index>
+struct rotateNonMaxSupressionFunctor<tv::CPU, T, Index> {
+  Index operator()(const tv::CPU &d, tv::TensorView<Index> keep,
+                   tv::TensorView<const T> boxCorners,
+                   tv::TensorView<const T> standupIoU, T threshold) {
+    auto ndets = boxCorners.dim(0);
+    auto suppressed = std::vector<Index>(ndets);
+    int i, j;
+    namespace bg = boost::geometry;
+    typedef bg::model::point<T, 2, bg::cs::cartesian> point_t;
+    typedef bg::model::polygon<point_t> polygon_t;
+    polygon_t poly, qpoly;
+    std::vector<polygon_t> poly_inter, poly_union;
+    T inter_area, union_area, overlap;
+    int keepNum = 0;
+    for (int _i = 0; _i < ndets; ++_i) {
+      i = _i;
+      if (suppressed[i] == 1)
+        continue;
+      keep[keepNum] = i;
+      keepNum += 1;
+      for (int _j = _i + 1; _j < ndets; ++_j) {
+        j = _j;
+        if (suppressed[j] == 1)
+          continue;
+        if (standupIoU(i, j) <= 0.0)
+          continue;
+        bg::append(poly, point_t(boxCorners(i, 0, 0), boxCorners(i, 0, 1)));
+        bg::append(poly, point_t(boxCorners(i, 1, 0), boxCorners(i, 1, 1)));
+        bg::append(poly, point_t(boxCorners(i, 2, 0), boxCorners(i, 2, 1)));
+        bg::append(poly, point_t(boxCorners(i, 3, 0), boxCorners(i, 3, 1)));
+        bg::append(poly, point_t(boxCorners(i, 0, 0), boxCorners(i, 0, 1)));
+        bg::append(qpoly, point_t(boxCorners(j, 0, 0), boxCorners(j, 0, 1)));
+        bg::append(qpoly, point_t(boxCorners(j, 1, 0), boxCorners(j, 1, 1)));
+        bg::append(qpoly, point_t(boxCorners(j, 2, 0), boxCorners(j, 2, 1)));
+        bg::append(qpoly, point_t(boxCorners(j, 3, 0), boxCorners(j, 3, 1)));
+        bg::append(qpoly, point_t(boxCorners(j, 0, 0), boxCorners(j, 0, 1)));
+        bg::intersection(poly, qpoly, poly_inter);
+        if (!poly_inter.empty()) {
+          inter_area = bg::area(poly_inter.front());
+          bg::union_(poly, qpoly, poly_union);
+          if (!poly_union.empty()) { // ignore invalid box
+            union_area = bg::area(poly_union.front());
+            overlap = inter_area / union_area;
+            if (overlap >= threshold)
+              suppressed[j] = 1;
+            poly_union.clear();
+          }
+        }
+        poly.clear();
+        qpoly.clear();
+        poly_inter.clear();
+      }
+    }
+    return keepNum;
+  }
+};
+} // namespace functor
+#define DECLARE_CPU_T_INDEX(T, Index)                                          \
+  template struct functor::NonMaxSupressionFunctor<tv::CPU, T, Index>;         \
+  template struct functor::rotateNonMaxSupressionFunctor<tv::CPU, T, Index>;
+#define DECLARE_CPU_INDEX(Index)                                               \
+  DECLARE_CPU_T_INDEX(float, Index);                                           \
+  DECLARE_CPU_T_INDEX(double, Index);                                          
+DECLARE_CPU_INDEX(int);
+DECLARE_CPU_INDEX(long);
+#undef DECLARE_CPU_INDEX
+#undef DECLARE_CPU_T_INDEX
+} // namespace spconv
--- a/src/spconv/nms.cu
+++ b/src/spconv/nms.cu
+// ------------------------------------------------------------------
+// Deformable Convolutional Networks
+// Copyright (c) 2015 Microsoft
+// Licensed under The MIT License
+// Modified from MATLAB Faster R-CNN (https://github.com/shaoqingren/faster_rcnn)
+// ------------------------------------------------------------------
+#include <ATen/ATen.h>
+#include <chrono>
+#include <limits>
+#include <spconv/mp_helper.h>
+#include <spconv/reordering.h>
+#include <spconv/reordering.cu.h>
+#include <tensorview/helper_kernel.cu.h>
+#include <tensorview/helper_launch.h>
+#include <tensorview/tensorview.h>
+#include <type_traits>
+#include <utility/timer.h>
+#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
+int const threadsPerBlock = sizeof(unsigned long long) * 8;
+template <typename DType>
+__device__ inline DType devIoU(DType const *const a, DType const *const b)
+{
+  DType left = max(a[0], b[0]), right = min(a[2], b[2]);
+  DType top = max(a[1], b[1]), bottom = min(a[3], b[3]);
+  DType width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
+  DType interS = width * height;
+  DType Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
+  DType Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
+  return interS / (Sa + Sb - interS);
+}
+template <typename DType, int BLOCK_THREADS>
+__global__ void nms_kernel(const int n_boxes, const DType nms_overlap_thresh,
+                           const DType *dev_boxes, unsigned long long *dev_mask)
+{
+  const int row_start = blockIdx.y;
+  const int col_start = blockIdx.x;
+  // if (row_start > col_start) return;
+  const int row_size =
+      min(n_boxes - row_start * BLOCK_THREADS, BLOCK_THREADS);
+  const int col_size =
+      min(n_boxes - col_start * BLOCK_THREADS, BLOCK_THREADS);
+  __shared__ DType block_boxes[BLOCK_THREADS * 5];
+  if (threadIdx.x < col_size)
+  {
+#pragma unroll
+    for (int i = 0; i < 5; ++i)
+    {
+      block_boxes[threadIdx.x * 5 + i] =
+          dev_boxes[(BLOCK_THREADS * col_start + threadIdx.x) * 5 + i];
+    }
+  }
+  __syncthreads();
+  if (threadIdx.x < row_size)
+  {
+    const int cur_box_idx = BLOCK_THREADS * row_start + threadIdx.x;
+    const DType *cur_box = dev_boxes + cur_box_idx * 5;
+    unsigned long long t = 0;
+    int start = 0;
+    if (row_start == col_start)
+    {
+      start = threadIdx.x + 1;
+    }
+    for (int i = start; i < col_size; i++)
+    {
+      if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh)
+      {
+        t |= 1ULL << i;
+      }
+    }
+    const int col_blocks = DIVUP(n_boxes, BLOCK_THREADS);
+    dev_mask[cur_box_idx * col_blocks + col_start] = t;
+  }
+}
--- a/src/utils/all.cc
+++ b/src/utils/all.cc
@@ -59,4 +59,29 @@ PYBIND11_MODULE(spconv_utils, m)
          "num_points_per_voxel"_a = 4, "coor_to_voxelidx"_a = 5,
          "voxel_size"_a = 6, "coors_range"_a = 7, "max_points"_a = 8,
          "max_voxels"_a = 9);
+    m.def("points_to_voxel_3d_np_mean", &spconv::points_to_voxel_3d_np_mean<float, 3>,
+          "matrix tensor_square", "points"_a = 1, "voxels"_a = 2, "means"_a = 3, "coors"_a = 4,
+          "num_points_per_voxel"_a = 5, "coor_to_voxelidx"_a = 6,
+          "voxel_size"_a = 7, "coors_range"_a = 8, "max_points"_a = 9,
+          "max_voxels"_a = 10);
+    m.def("points_to_voxel_3d_np_mean", &spconv::points_to_voxel_3d_np_mean<double, 3>,
+          "matrix tensor_square", "points"_a = 1, "voxels"_a = 2, "means"_a = 3, "coors"_a = 4,
+          "num_points_per_voxel"_a = 5, "coor_to_voxelidx"_a = 6,
+          "voxel_size"_a = 7, "coors_range"_a = 8, "max_points"_a = 9,
+          "max_voxels"_a = 10);
+    m.def("points_to_voxel_3d_np_height", &spconv::points_to_voxel_3d_np_height<double, 3>,
+          "matrix tensor_square", "points"_a = 1, "voxels"_a = 2, "heights"_a = 3,
+          "maxs"_a = 4, "coors"_a = 5, "num_points_per_voxel"_a = 6, "coor_to_voxelidx"_a = 7,
+          "voxel_size"_a = 8, "coors_range"_a = 9, "max_points"_a = 10,
+          "max_voxels"_a = 11);
+    m.def("points_to_voxel_3d_with_filtering", &spconv::points_to_voxel_3d_with_filtering<float, 3>,
+          "matrix tensor_square", "points"_a = 1, "voxels"_a = 2, "voxel_mask"_a = 3, "mins"_a = 4,
+          "maxs"_a = 5, "coors"_a = 6, "num_points_per_voxel"_a = 7, "coor_to_voxelidx"_a = 8,
+          "voxel_size"_a = 9, "coors_range"_a = 10, "max_points"_a = 11,
+          "max_voxels"_a = 12, "block_factor"_a = 13, "block_size"_a = 14, "height_threshold"_a = 15);
+    m.def("points_to_voxel_3d_with_filtering", &spconv::points_to_voxel_3d_with_filtering<double, 3>,
+          "matrix tensor_square", "points"_a = 1, "voxels"_a = 2, "voxel_mask"_a = 3, "mins"_a = 4,
+          "maxs"_a = 5, "coors"_a = 6, "num_points_per_voxel"_a = 7, "coor_to_voxelidx"_a = 8,
+          "voxel_size"_a = 9, "coors_range"_a = 10, "max_points"_a = 11,
+          "max_voxels"_a = 12, "block_factor"_a = 13, "block_size"_a = 14, "height_threshold"_a = 15);
 }
\ No newline at end of file
--- a/test/test_conv.py
+++ b/test/test_conv.py
@@ -20,7 +20,7 @@ import numpy as np
 import time 
 from spconv.test_utils import params_grid, generate_sparse_data, TestCase
 import unittest
-import sparseconvnet as scn 
+# import sparseconvnet as scn 
 class SparseConv3dTestTorch(nn.Module):
    def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size,
@@ -612,7 +612,7 @@ def main():
 if __name__ == '__main__':
-    # main()
+    main()
-    unittest.main()
+    # unittest.main()