Commit c27fee37 authored by dengjb's avatar dengjb
Browse files

update

parent 420f8331
Pipeline #2788 canceled with stages
# This file is modified from https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/ops/voxel/voxelize.py
import torch
import torch.nn as nn
from pointpillars.ops.voxel_op import hard_voxelize
class _Voxelization(torch.autograd.Function):

    @staticmethod
    def forward(ctx,
                points,
                voxel_size,
                coors_range,
                max_points=35,
                max_voxels=20000,
                deterministic=True):
        """Convert KITTI points (N, >=3) to voxels.

        Args:
            points: [N, ndim] float tensor; points[:, :3] are xyz, the rest
                extra features such as reflectivity.
            voxel_size: [3] list/tuple/array of float, xyz voxel extent.
            coors_range: [6] list/tuple/array of float, xyzxyz min/max range.
            max_points: int, max points kept per voxel; -1 means dynamic
                voxelization.
            max_voxels: int, max voxels produced (20000 works well for
                SECOND). Shuffle points beforehand since the budget may drop
                points.
            deterministic: bool, use the deterministic hard-voxelization
                kernel (default True). The non-deterministic one is much
                faster; see mmdetection3d issues #894 and PR #904.

        Returns:
            voxels: [M, max_points, ndim] float tensor (when max_points != -1).
            coordinates: [M, 3] int32 tensor, always returned.
            num_points_per_voxel: [M] int32 tensor (when max_points != -1).
        """
        out_voxels = points.new_zeros(
            size=(max_voxels, max_points, points.size(1)))
        out_coors = points.new_zeros(size=(max_voxels, 3), dtype=torch.int)
        out_counts = points.new_zeros(size=(max_voxels, ), dtype=torch.int)
        n_valid = hard_voxelize(points, out_voxels, out_coors,
                                out_counts, voxel_size,
                                coors_range, max_points, max_voxels, 3,
                                deterministic)
        # Keep only the populated voxels; the op emits coords as (z, y, x),
        # flip the last axis to get (x, y, z).
        return (out_voxels[:n_valid],
                out_coors[:n_valid].flip(-1),
                out_counts[:n_valid])
class Voxelization(nn.Module):
    """Module wrapper around the hard-voxelization op."""

    def __init__(self,
                 voxel_size,
                 point_cloud_range,
                 max_num_points,
                 max_voxels,
                 deterministic=True):
        """
        Args:
            voxel_size (list): [x, y, z] extent of one voxel.
            point_cloud_range (list): [x_min, y_min, z_min, x_max, y_max, z_max].
            max_num_points (int): max number of points per voxel.
            max_voxels (tuple): voxel budgets at (training, testing) time.
            deterministic (bool): pick the deterministic hard-voxelization
                kernel (default True); the non-deterministic variant is
                faster. See mmdetection3d issues #894 / PR #904.
        """
        super(Voxelization, self).__init__()
        self.voxel_size = voxel_size
        self.point_cloud_range = point_cloud_range
        self.max_num_points = max_num_points
        self.max_voxels = max_voxels
        self.deterministic = deterministic

        pc_range = torch.tensor(point_cloud_range, dtype=torch.float32)
        vox_size = torch.tensor(voxel_size, dtype=torch.float32)
        grid_size = torch.round((pc_range[3:] - pc_range[:3]) / vox_size).long()
        self.grid_size = grid_size
        # the origin shape is [x-len, y-len, z-len]; [w, h, d] -> [d, h, w]
        self.pcd_shape = [*grid_size[:2], 1][::-1]

    def forward(self, input):
        """
        input: shape=(N, c)
        """
        budget = self.max_voxels[0] if self.training else self.max_voxels[1]
        return _Voxelization.apply(input, self.voxel_size,
                                   self.point_cloud_range,
                                   self.max_num_points, budget,
                                   self.deterministic)

    def __repr__(self):
        return (self.__class__.__name__ + '('
                + 'voxel_size=' + str(self.voxel_size)
                + ', point_cloud_range=' + str(self.point_cloud_range)
                + ', max_num_points=' + str(self.max_num_points)
                + ', max_voxels=' + str(self.max_voxels)
                + ', deterministic=' + str(self.deterministic)
                + ')')
// Python binding for the voxelization ops (built as a torch extension).
#include <torch/extension.h>
#include "voxelization.h"

namespace voxelization {

// Expose only the hard-voxelization entry point; the CPU/GPU and
// deterministic/non-deterministic dispatch happens inside hard_voxelize
// (see voxelization.h).
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("hard_voxelize", &hard_voxelize, "hard voxelize");
}

}  // namespace voxelization
#pragma once
#include <torch/extension.h>

// Reduction modes; here only consumed by convert_reduce_type below.
typedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t;

namespace voxelization {

// CPU hard voxelization. Fills the pre-allocated `voxels`, `coors` and
// `num_points_per_voxel` in place and returns the number of voxels produced.
int hard_voxelize_cpu(const at::Tensor &points, at::Tensor &voxels,
                      at::Tensor &coors, at::Tensor &num_points_per_voxel,
                      const std::vector<float> voxel_size,
                      const std::vector<float> coors_range,
                      const int max_points, const int max_voxels,
                      const int NDim = 3);

#ifdef WITH_CUDA
// Deterministic GPU hard voxelization (same contract as the CPU version).
int hard_voxelize_gpu(const at::Tensor &points, at::Tensor &voxels,
                      at::Tensor &coors, at::Tensor &num_points_per_voxel,
                      const std::vector<float> voxel_size,
                      const std::vector<float> coors_range,
                      const int max_points, const int max_voxels,
                      const int NDim = 3);

// Faster, non-deterministic GPU variant. (The "nondisterministic" spelling is
// kept as-is: Python only binds through hard_voxelize.)
int nondisterministic_hard_voxelize_gpu(const at::Tensor &points, at::Tensor &voxels,
                                        at::Tensor &coors, at::Tensor &num_points_per_voxel,
                                        const std::vector<float> voxel_size,
                                        const std::vector<float> coors_range,
                                        const int max_points, const int max_voxels,
                                        const int NDim = 3);
#endif
// Interface for Python
// Dispatch hard voxelization to the CPU, deterministic-GPU or
// non-deterministic-GPU implementation based on the points' device and the
// `deterministic` flag. Outputs are written into the pre-allocated tensors;
// the return value is the number of voxels produced.
inline int hard_voxelize(const at::Tensor &points, at::Tensor &voxels,
                         at::Tensor &coors, at::Tensor &num_points_per_voxel,
                         const std::vector<float> voxel_size,
                         const std::vector<float> coors_range,
                         const int max_points, const int max_voxels,
                         const int NDim = 3, const bool deterministic = true) {
  if (points.device().is_cuda()) {
#ifdef WITH_CUDA
    if (deterministic) {
      return hard_voxelize_gpu(points, voxels, coors, num_points_per_voxel,
                               voxel_size, coors_range, max_points, max_voxels,
                               NDim);
    }
    return nondisterministic_hard_voxelize_gpu(points, voxels, coors, num_points_per_voxel,
                                               voxel_size, coors_range, max_points, max_voxels,
                                               NDim);
#else
    // Built without WITH_CUDA: CUDA tensors cannot be voxelized.
    AT_ERROR("Not compiled with GPU support");
#endif
  }
  return hard_voxelize_cpu(points, voxels, coors, num_points_per_voxel,
                           voxel_size, coors_range, max_points, max_voxels,
                           NDim);
}
// Map a reduce-mode name ("sum" / "mean" / "max") onto the reduce_t enum.
// Any other name fails the TORCH_CHECK below.
inline reduce_t convert_reduce_type(const std::string &reduce_type) {
  if (reduce_type == "sum") return reduce_t::SUM;
  if (reduce_type == "mean") return reduce_t::MEAN;
  if (reduce_type == "max") return reduce_t::MAX;
  TORCH_CHECK(false, "do not support reduce type " + reduce_type)
  return reduce_t::SUM;  // unreachable; keeps the compiler satisfied
}
} // namespace voxelization
#include <ATen/TensorUtils.h>
#include <torch/extension.h>
// #include "voxelization.h"
namespace {
// Map every point to its voxel index along each axis. Indices are written
// reversed (coor[NDim-1-j]) so the output order is (z, y, x). Points falling
// outside the range get all coordinates set to -1 so downstream code can
// drop them.
template <typename T, typename T_int>
void dynamic_voxelize_kernel(const torch::TensorAccessor<T, 2> points,
                             torch::TensorAccessor<T_int, 2> coors,
                             const std::vector<float> voxel_size,
                             const std::vector<float> coors_range,
                             const std::vector<int> grid_size,
                             const int num_points, const int num_features,
                             const int NDim) {
  const int ndim_minus_1 = NDim - 1;
  // std::vector instead of a raw new[]/delete[] pair: zero-initialized and
  // exception-safe (no leak if an accessor throws), identical results.
  std::vector<int> coor(NDim, 0);

  for (int i = 0; i < num_points; ++i) {
    bool failed = false;
    for (int j = 0; j < NDim; ++j) {
      int c = floor((points[i][j] - coors_range[j]) / voxel_size[j]);
      // necessary to rm points out of range
      if ((c < 0 || c >= grid_size[j])) {
        failed = true;
        break;
      }
      coor[ndim_minus_1 - j] = c;
    }
    for (int k = 0; k < NDim; ++k) {
      // out-of-range points are fully marked with -1
      coors[i][k] = failed ? -1 : coor[k];
    }
  }
}
// CPU hard voxelization:
//   1) run dynamic voxelization to get per-point (z, y, x) voxel coords;
//   2) walk the points in order, opening a new voxel the first time a
//      coordinate is seen (via the dense coor_to_voxelidx lookup) and
//      appending points until max_points / max_voxels are reached.
// voxel_num is accumulated in place; it ends as the number of voxels created.
template <typename T, typename T_int>
void hard_voxelize_kernel(const torch::TensorAccessor<T, 2> points,
                          torch::TensorAccessor<T, 3> voxels,
                          torch::TensorAccessor<T_int, 2> coors,
                          torch::TensorAccessor<T_int, 1> num_points_per_voxel,
                          torch::TensorAccessor<T_int, 3> coor_to_voxelidx,
                          int& voxel_num, const std::vector<float> voxel_size,
                          const std::vector<float> coors_range,
                          const std::vector<int> grid_size,
                          const int max_points, const int max_voxels,
                          const int num_points, const int num_features,
                          const int NDim) {
  // declare a temp coors
  at::Tensor temp_coors = at::zeros(
      {num_points, NDim}, at::TensorOptions().dtype(at::kInt).device(at::kCPU));

  // First use dynamic voxelization to get coors,
  // then check max points/voxels constraints
  dynamic_voxelize_kernel<T, int>(points, temp_coors.accessor<int, 2>(),
                                  voxel_size, coors_range, grid_size,
                                  num_points, num_features, NDim);

  int voxelidx, num;
  auto coor = temp_coors.accessor<int, 2>();

  for (int i = 0; i < num_points; ++i) {
    // coor[i][0] == -1 marks an out-of-range point; skip it
    if (coor[i][0] == -1) continue;

    voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];

    // record voxel
    if (voxelidx == -1) {
      voxelidx = voxel_num;
      // once the voxel budget is exhausted the point is dropped, matching
      // the documented "max_voxels may drop points" contract
      if (max_voxels != -1 && voxel_num >= max_voxels) continue;
      voxel_num += 1;

      coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]] = voxelidx;
      for (int k = 0; k < NDim; ++k) {
        coors[voxelidx][k] = coor[i][k];
      }
    }

    // put points into voxel (silently dropped once the voxel is full)
    num = num_points_per_voxel[voxelidx];
    if (max_points == -1 || num < max_points) {
      for (int k = 0; k < num_features; ++k) {
        voxels[voxelidx][num][k] = points[i][k];
      }
      num_points_per_voxel[voxelidx] += 1;
    }
  }

  return;
}
} // namespace
namespace voxelization {
// CPU entry point for hard voxelization; returns the number of voxels
// produced. NOTE(review): the default argument `NDim = 3` is repeated here
// from the header declaration — legal only because voxelization.h is not
// included by this translation unit (its #include is commented out above).
int hard_voxelize_cpu(const at::Tensor& points, at::Tensor& voxels,
                      at::Tensor& coors, at::Tensor& num_points_per_voxel,
                      const std::vector<float> voxel_size,
                      const std::vector<float> coors_range,
                      const int max_points, const int max_voxels,
                      const int NDim = 3) {
  // current version tooks about 0.02s_0.03s for one frame on cpu
  // check device
  AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");

  std::vector<int> grid_size(NDim);
  const int num_points = points.size(0);
  const int num_features = points.size(1);

  // cells per axis, rounded from range / voxel-size
  for (int i = 0; i < NDim; ++i) {
    grid_size[i] =
        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
  }

  // coors, num_points_per_voxel, coor_to_voxelidx are int Tensor
  // Dense (z, y, x) -> voxel-index lookup, initialised to -1 ("no voxel yet").
  at::Tensor coor_to_voxelidx =
      -at::ones({grid_size[2], grid_size[1], grid_size[0]}, coors.options());

  int voxel_num = 0;
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      points.scalar_type(), "hard_voxelize_forward", [&] {
        hard_voxelize_kernel<scalar_t, int>(
            points.accessor<scalar_t, 2>(), voxels.accessor<scalar_t, 3>(),
            coors.accessor<int, 2>(), num_points_per_voxel.accessor<int, 1>(),
            coor_to_voxelidx.accessor<int, 3>(), voxel_num, voxel_size,
            coors_range, grid_size, max_points, max_voxels, num_points,
            num_features, NDim);
      });

  return voxel_num;
}
// CPU dynamic voxelization: writes each point's (z, y, x) voxel coordinate
// into `coors` (-1 triplets for out-of-range points); no voxel/point budget.
void dynamic_voxelize_cpu(const at::Tensor& points, at::Tensor& coors,
                          const std::vector<float> voxel_size,
                          const std::vector<float> coors_range,
                          const int NDim = 3) {
  // check device
  AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");

  std::vector<int> grid_size(NDim);
  const int num_points = points.size(0);
  const int num_features = points.size(1);

  for (int i = 0; i < NDim; ++i) {
    grid_size[i] =
        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
  }

  // coors, num_points_per_voxel, coor_to_voxelidx are int Tensor
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      points.scalar_type(), "hard_voxelize_forward", [&] {
        dynamic_voxelize_kernel<scalar_t, int>(
            points.accessor<scalar_t, 2>(), coors.accessor<int, 2>(),
            voxel_size, coors_range, grid_size, num_points, num_features, NDim);
      });

  return;
}
} // namespace voxelization
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <torch/types.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
// Input-validation helpers for the CUDA entry points below.
#define CHECK_CUDA(x) \
  TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
  TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)

namespace {
// 64 (= bit width of unsigned long long); used by dynamic_voxelize_gpu's
// launch configuration.
int const threadsPerBlock = sizeof(unsigned long long) * 8;
}

// Grid-stride loop: each thread handles indices i, i + stride, ... so a
// bounded grid can cover arbitrarily large n.
#define CUDA_1D_KERNEL_LOOP(i, n)                            \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
       i += blockDim.x * gridDim.x)
// Map each point to its voxel coordinate, stored reversed as (z, y, x).
// An out-of-range axis marks the already-computed prefix of the output
// triplet with -1 (downstream code only tests coors[...][0]).
template <typename T, typename T_int>
__global__ void dynamic_voxelize_kernel(
    const T* points, T_int* coors, const float voxel_x, const float voxel_y,
    const float voxel_z, const float coors_x_min, const float coors_y_min,
    const float coors_z_min, const float coors_x_max, const float coors_y_max,
    const float coors_z_max, const int grid_x, const int grid_y,
    const int grid_z, const int num_points, const int num_features,
    const int NDim) {
  CUDA_1D_KERNEL_LOOP(index, num_points) {
    auto points_offset = points + index * num_features;
    auto coors_offset = coors + index * NDim;
    int c_x = floor((points_offset[0] - coors_x_min) / voxel_x);
    if (c_x < 0 || c_x >= grid_x) {
      coors_offset[0] = -1;
      // Bug fix: `continue`, not `return`. Returning exits the whole
      // grid-stride loop, so when num_points exceeds the grid's coverage
      // (gridDim * blockDim), every later point owned by this thread keeps
      // its zero-initialized — hence seemingly valid — coordinate.
      continue;
    }
    int c_y = floor((points_offset[1] - coors_y_min) / voxel_y);
    if (c_y < 0 || c_y >= grid_y) {
      coors_offset[0] = -1;
      coors_offset[1] = -1;
      continue;  // see note above
    }
    int c_z = floor((points_offset[2] - coors_z_min) / voxel_z);
    if (c_z < 0 || c_z >= grid_z) {
      coors_offset[0] = -1;
      coors_offset[1] = -1;
      coors_offset[2] = -1;
    } else {
      coors_offset[0] = c_z;
      coors_offset[1] = c_y;
      coors_offset[2] = c_x;
    }
  }
}
// Copy point features into their voxel slots. Launched with
// nthreads = num_points * num_features: each thread copies ONE feature of one
// point, so `index` is the point id and `k` the feature id.
template <typename T, typename T_int>
__global__ void assign_point_to_voxel(const int nthreads, const T* points,
                                      T_int* point_to_voxelidx,
                                      T_int* coor_to_voxelidx, T* voxels,
                                      const int max_points,
                                      const int num_features,
                                      const int num_points, const int NDim) {
  CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
    int index = thread_idx / num_features;
    // num: slot of the point inside its voxel; voxelidx: destination voxel.
    // Either being -1 means the point was dropped (out of range / full voxel).
    int num = point_to_voxelidx[index];
    int voxelidx = coor_to_voxelidx[index];
    if (num > -1 && voxelidx > -1) {
      auto voxels_offset =
          voxels + voxelidx * max_points * num_features + num * num_features;
      int k = thread_idx % num_features;
      voxels_offset[k] = points[thread_idx];
    }
  }
}
// Copy each voxel's coordinate into the output, written by the voxel's FIRST
// point (num == 0). Launched with nthreads = num_points * NDim: one
// coordinate component per thread.
template <typename T, typename T_int>
__global__ void assign_voxel_coors(const int nthreads, T_int* coor,
                                   T_int* point_to_voxelidx,
                                   T_int* coor_to_voxelidx, T_int* voxel_coors,
                                   const int num_points, const int NDim) {
  CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
    int index = thread_idx / NDim;
    int num = point_to_voxelidx[index];
    int voxelidx = coor_to_voxelidx[index];
    if (num == 0 && voxelidx > -1) {
      auto coors_offset = voxel_coors + voxelidx * NDim;
      int k = thread_idx % NDim;
      coors_offset[k] = coor[thread_idx];
    }
  }
}
// For each point, count how many EARLIER points share its voxel coordinate.
// Produces:
//   point_to_voxelidx[index] = slot of the point inside its voxel (0 for the
//       voxel's first point; left at -1 when the voxel is already full);
//   point_to_pointidx[index] = index of the voxel's first point.
// O(num_points) work per thread, but deterministic (no atomics).
template <typename T_int>
__global__ void point_to_voxelidx_kernel(const T_int* coor,
                                         T_int* point_to_voxelidx,
                                         T_int* point_to_pointidx,
                                         const int max_points,
                                         const int max_voxels,
                                         const int num_points, const int NDim) {
  CUDA_1D_KERNEL_LOOP(index, num_points) {
    auto coor_offset = coor + index * NDim;
    // skip invalid points
    if ((index >= num_points) || (coor_offset[0] == -1)) return;

    int num = 0;
    int coor_x = coor_offset[0];
    int coor_y = coor_offset[1];
    int coor_z = coor_offset[2];
    // only calculate the coors before this coor[index]
    for (int i = 0; i < index; ++i) {
      auto prev_coor = coor + i * NDim;
      if (prev_coor[0] == -1) continue;

      // Find all previous points that have the same coors
      // if find the same coor, record it
      if ((prev_coor[0] == coor_x) && (prev_coor[1] == coor_y) &&
          (prev_coor[2] == coor_z)) {
        num++;
        if (num == 1) {
          // point to the same coor that first show up
          point_to_pointidx[index] = i;
        } else if (num >= max_points) {
          // out of boundary: voxel already full, leave slot at -1
          return;
        }
      }
    }
    if (num == 0) {
      // this point opens a new voxel
      point_to_pointidx[index] = index;
    }
    if (num < max_points) {
      point_to_voxelidx[index] = num;
    }
  }
}
// Sequential scan that assigns final voxel ids and counts points per voxel.
// Launched <<<1, 1>>> on purpose: the loop carries a dependency through
// voxel_num[0], and a single thread keeps the result deterministic.
template <typename T_int>
__global__ void determin_voxel_num(
    T_int* num_points_per_voxel, T_int* point_to_voxelidx,
    T_int* point_to_pointidx, T_int* coor_to_voxelidx, T_int* voxel_num,
    const int max_points, const int max_voxels, const int num_points) {
  for (int i = 0; i < num_points; ++i) {
    int point_pos_in_voxel = point_to_voxelidx[i];
    // record voxel
    if (point_pos_in_voxel == -1) {
      // out of max_points or invalid point
      continue;
    } else if (point_pos_in_voxel == 0) {
      // record new voxel (dropped silently once max_voxels is reached)
      int voxelidx = voxel_num[0];
      if (voxel_num[0] >= max_voxels) continue;
      voxel_num[0] += 1;
      coor_to_voxelidx[i] = voxelidx;
      num_points_per_voxel[voxelidx] = 1;
    } else {
      // follower point: inherit the voxel id of the voxel's first point
      int point_idx = point_to_pointidx[i];
      int voxelidx = coor_to_voxelidx[point_idx];
      if (voxelidx != -1) {
        coor_to_voxelidx[i] = voxelidx;
        num_points_per_voxel[voxelidx] += 1;
      }
    }
  }
}
// Bookkeeping pass for the non-deterministic path:
//   pts_id[i]       - arrival order of point i within its voxel (atomicAdd);
//   reduce_count[v] - number of points that map to voxel v;
//   coors_order[v]  - output slot of voxel v, claimed by its first arrival;
//   coors_count[0]  - running count of distinct voxels.
// The atomics make the resulting ordering depend on thread scheduling.
__global__ void nondisterministic_get_assign_pos(
    const int nthreads, const int32_t *coors_map, int32_t *pts_id,
    int32_t *coors_count, int32_t *reduce_count, int32_t *coors_order) {
  CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
    int coors_idx = coors_map[thread_idx];
    if (coors_idx > -1) {
      int32_t coors_pts_pos = atomicAdd(&reduce_count[coors_idx], 1);
      pts_id[thread_idx] = coors_pts_pos;
      if (coors_pts_pos == 0) {
        coors_order[coors_idx] = atomicAdd(coors_count, 1);
      }
    }
  }
}
// Scatter pass for the non-deterministic path: one thread per point copies
// the point's features into its assigned voxel slot; each voxel's first
// point additionally writes the voxel coordinate and clamped point count.
template<typename T>
__global__ void nondisterministic_assign_point_voxel(
    const int nthreads, const T *points, const int32_t *coors_map,
    const int32_t *pts_id, const int32_t *coors_in,
    const int32_t *reduce_count, const int32_t *coors_order,
    T *voxels, int32_t *coors, int32_t *pts_count, const int max_voxels,
    const int max_points, const int num_features, const int NDim) {
  CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
    int coors_idx = coors_map[thread_idx];
    int coors_pts_pos = pts_id[thread_idx];
    if (coors_idx > -1) {
      int coors_pos = coors_order[coors_idx];
      // points beyond the voxel budget or past max_points are dropped
      if (coors_pos < max_voxels && coors_pts_pos < max_points) {
        auto voxels_offset =
            voxels + (coors_pos * max_points + coors_pts_pos) * num_features;
        auto points_offset = points + thread_idx * num_features;
        for (int k = 0; k < num_features; k++) {
          voxels_offset[k] = points_offset[k];
        }
        if (coors_pts_pos == 0) {
          // clamp: reduce_count may exceed max_points for crowded voxels
          pts_count[coors_pos] = min(reduce_count[coors_idx], max_points);
          auto coors_offset = coors + coors_pos * NDim;
          auto coors_in_offset = coors_in + coors_idx * NDim;
          for (int k = 0; k < NDim; k++) {
            coors_offset[k] = coors_in_offset[k];
          }
        }
      }
    }
  }
}
namespace voxelization {
// Deterministic GPU hard voxelization. Pipeline:
//   1. dynamic_voxelize_kernel: point -> (z, y, x) voxel coordinate;
//   2. point_to_voxelidx_kernel: per-point slot within its voxel;
//   3. determin_voxel_num (single thread): final voxel ids + counts;
//   4. assign_point_to_voxel: scatter features into `voxels`;
//   5. assign_voxel_coors: scatter coordinates into `coors`.
// Outputs are written into the pre-allocated tensors; returns the number of
// voxels produced.
int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels,
                      at::Tensor& coors, at::Tensor& num_points_per_voxel,
                      const std::vector<float> voxel_size,
                      const std::vector<float> coors_range,
                      const int max_points, const int max_voxels,
                      const int NDim = 3) {
  // check device
  CHECK_INPUT(points);
  at::cuda::CUDAGuard device_guard(points.device());

  const int num_points = points.size(0);
  const int num_features = points.size(1);

  const float voxel_x = voxel_size[0];
  const float voxel_y = voxel_size[1];
  const float voxel_z = voxel_size[2];
  const float coors_x_min = coors_range[0];
  const float coors_y_min = coors_range[1];
  const float coors_z_min = coors_range[2];
  const float coors_x_max = coors_range[3];
  const float coors_y_max = coors_range[4];
  const float coors_z_max = coors_range[5];

  const int grid_x = round((coors_x_max - coors_x_min) / voxel_x);
  const int grid_y = round((coors_y_max - coors_y_min) / voxel_y);
  const int grid_z = round((coors_z_max - coors_z_min) / voxel_z);

  // map points to voxel coors
  at::Tensor temp_coors =
      at::zeros({num_points, NDim}, points.options().dtype(at::kInt));

  dim3 grid(std::min(at::cuda::ATenCeilDiv(num_points, 512), 4096));
  dim3 block(512);

  // 1. link point to corresponding voxel coors
  AT_DISPATCH_ALL_TYPES(
      points.scalar_type(), "hard_voxelize_kernel", ([&] {
        dynamic_voxelize_kernel<scalar_t, int>
            <<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(
                points.contiguous().data_ptr<scalar_t>(),
                temp_coors.contiguous().data_ptr<int>(), voxel_x, voxel_y,
                voxel_z, coors_x_min, coors_y_min, coors_z_min, coors_x_max,
                coors_y_max, coors_z_max, grid_x, grid_y, grid_z, num_points,
                num_features, NDim);
      }));
  cudaDeviceSynchronize();
  AT_CUDA_CHECK(cudaGetLastError());

  // 2. map point to the idx of the corresponding voxel, find duplicate coor
  // create some temporary variables
  auto point_to_pointidx = -at::ones(
      {
          num_points,
      },
      points.options().dtype(at::kInt));
  auto point_to_voxelidx = -at::ones(
      {
          num_points,
      },
      points.options().dtype(at::kInt));

  dim3 map_grid(std::min(at::cuda::ATenCeilDiv(num_points, 512), 4096));
  dim3 map_block(512);
  AT_DISPATCH_ALL_TYPES(
      temp_coors.scalar_type(), "determin_duplicate", ([&] {
        point_to_voxelidx_kernel<int>
            <<<map_grid, map_block, 0, at::cuda::getCurrentCUDAStream()>>>(
                temp_coors.contiguous().data_ptr<int>(),
                point_to_voxelidx.contiguous().data_ptr<int>(),
                point_to_pointidx.contiguous().data_ptr<int>(), max_points,
                max_voxels, num_points, NDim);
      }));
  cudaDeviceSynchronize();
  AT_CUDA_CHECK(cudaGetLastError());

  // 3. determine voxel num and each voxel's coor index; runs on a single
  // thread because the scan is order-dependent (keeps the result deterministic)
  auto coor_to_voxelidx = -at::ones(
      {
          num_points,
      },
      points.options().dtype(at::kInt));
  auto voxel_num = at::zeros(
      {
          1,
      },
      points.options().dtype(at::kInt));  // must be zero from the begining

  AT_DISPATCH_ALL_TYPES(
      temp_coors.scalar_type(), "determin_duplicate", ([&] {
        determin_voxel_num<int><<<1, 1, 0, at::cuda::getCurrentCUDAStream()>>>(
            num_points_per_voxel.contiguous().data_ptr<int>(),
            point_to_voxelidx.contiguous().data_ptr<int>(),
            point_to_pointidx.contiguous().data_ptr<int>(),
            coor_to_voxelidx.contiguous().data_ptr<int>(),
            voxel_num.contiguous().data_ptr<int>(), max_points, max_voxels,
            num_points);
      }));
  cudaDeviceSynchronize();
  AT_CUDA_CHECK(cudaGetLastError());

  // 4. copy point features to voxels (steps 4 & 5 could run in parallel).
  // Bug fix: use scalar_t instead of the hard-coded float — the dispatch
  // covers all types, so non-float point tensors used to fail inside
  // data_ptr<float>() at runtime.
  auto pts_output_size = num_points * num_features;
  dim3 cp_grid(std::min(at::cuda::ATenCeilDiv(pts_output_size, 512), 4096));
  dim3 cp_block(512);
  AT_DISPATCH_ALL_TYPES(
      points.scalar_type(), "assign_point_to_voxel", ([&] {
        assign_point_to_voxel<scalar_t, int>
            <<<cp_grid, cp_block, 0, at::cuda::getCurrentCUDAStream()>>>(
                pts_output_size, points.contiguous().data_ptr<scalar_t>(),
                point_to_voxelidx.contiguous().data_ptr<int>(),
                coor_to_voxelidx.contiguous().data_ptr<int>(),
                voxels.contiguous().data_ptr<scalar_t>(), max_points,
                num_features, num_points, NDim);
      }));

  // 5. copy coors of each voxel (only int buffers are touched; the first
  // template argument of assign_voxel_coors is unused)
  auto coors_output_size = num_points * NDim;
  dim3 coors_cp_grid(
      std::min(at::cuda::ATenCeilDiv(coors_output_size, 512), 4096));
  dim3 coors_cp_block(512);
  AT_DISPATCH_ALL_TYPES(
      points.scalar_type(), "assign_point_to_voxel", ([&] {
        assign_voxel_coors<float, int><<<coors_cp_grid, coors_cp_block, 0,
                                         at::cuda::getCurrentCUDAStream()>>>(
            coors_output_size, temp_coors.contiguous().data_ptr<int>(),
            point_to_voxelidx.contiguous().data_ptr<int>(),
            coor_to_voxelidx.contiguous().data_ptr<int>(),
            coors.contiguous().data_ptr<int>(), num_points, NDim);
      }));
  cudaDeviceSynchronize();
  AT_CUDA_CHECK(cudaGetLastError());

  // bring the voxel counter back to the host for the return value
  auto voxel_num_cpu = voxel_num.to(at::kCPU);
  int voxel_num_int = voxel_num_cpu.data_ptr<int>()[0];
  return voxel_num_int;
}
// Fast, non-deterministic GPU hard voxelization:
//   1) map every point to a (z, y, x) voxel coordinate;
//   2) deduplicate coordinates with at::unique_dim to get the voxel set;
//   3) assign output slots with atomics (ordering depends on scheduling);
//   4) scatter features / coordinates / counts into the outputs.
// Returns min(max_voxels, number of distinct voxels).
int nondisterministic_hard_voxelize_gpu(
    const at::Tensor &points, at::Tensor &voxels,
    at::Tensor &coors, at::Tensor &num_points_per_voxel,
    const std::vector<float> voxel_size,
    const std::vector<float> coors_range,
    const int max_points, const int max_voxels,
    const int NDim = 3) {
  CHECK_INPUT(points);
  at::cuda::CUDAGuard device_guard(points.device());

  const int num_points = points.size(0);
  const int num_features = points.size(1);

  if (num_points == 0)
    return 0;

  const float voxel_x = voxel_size[0];
  const float voxel_y = voxel_size[1];
  const float voxel_z = voxel_size[2];
  const float coors_x_min = coors_range[0];
  const float coors_y_min = coors_range[1];
  const float coors_z_min = coors_range[2];
  const float coors_x_max = coors_range[3];
  const float coors_y_max = coors_range[4];
  const float coors_z_max = coors_range[5];

  const int grid_x = round((coors_x_max - coors_x_min) / voxel_x);
  const int grid_y = round((coors_y_max - coors_y_min) / voxel_y);
  const int grid_z = round((coors_z_max - coors_z_min) / voxel_z);

  // map points to voxel coors
  at::Tensor temp_coors =
      at::zeros({num_points, NDim}, points.options().dtype(torch::kInt32));

  dim3 grid(std::min(at::cuda::ATenCeilDiv(num_points, 512), 4096));
  dim3 block(512);

  // 1. link point to corresponding voxel coors
  AT_DISPATCH_ALL_TYPES(
      points.scalar_type(), "hard_voxelize_kernel", ([&] {
        dynamic_voxelize_kernel<scalar_t, int>
            <<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(
                points.contiguous().data_ptr<scalar_t>(),
                temp_coors.contiguous().data_ptr<int>(), voxel_x, voxel_y,
                voxel_z, coors_x_min, coors_y_min, coors_z_min, coors_x_max,
                coors_y_max, coors_z_max, grid_x, grid_y, grid_z, num_points,
                num_features, NDim);
      }));

  at::Tensor coors_map;
  at::Tensor coors_count;
  at::Tensor coors_order;
  at::Tensor reduce_count;
  at::Tensor pts_id;

  // any axis at -1 marks the whole triplet invalid before deduplication
  auto coors_clean = temp_coors.masked_fill(temp_coors.lt(0).any(-1, true), -1);

  // sorted unique coordinates; coors_map sends each point to its unique row
  std::tie(temp_coors, coors_map, reduce_count) =
      at::unique_dim(coors_clean, 0, true, true, false);

  if (temp_coors.index({0, 0}).lt(0).item<bool>()) {
    // the first element of temp_coors is (-1,-1,-1) and should be removed
    temp_coors = temp_coors.slice(0, 1);
    // shifts dropped points to index -1, which the kernels skip
    coors_map = coors_map - 1;
  }

  int num_coors = temp_coors.size(0);
  temp_coors = temp_coors.to(torch::kInt32);
  coors_map = coors_map.to(torch::kInt32);

  coors_count = coors_map.new_zeros(1);
  coors_order = coors_map.new_empty(num_coors);
  reduce_count = coors_map.new_zeros(num_coors);
  pts_id = coors_map.new_zeros(num_points);

  dim3 cp_grid(std::min(at::cuda::ATenCeilDiv(num_points, 512), 4096));
  dim3 cp_block(512);

  AT_DISPATCH_ALL_TYPES(points.scalar_type(), "get_assign_pos", ([&] {
                          nondisterministic_get_assign_pos<<<cp_grid, cp_block, 0,
                                                             at::cuda::getCurrentCUDAStream()>>>(
                              num_points,
                              coors_map.contiguous().data_ptr<int32_t>(),
                              pts_id.contiguous().data_ptr<int32_t>(),
                              coors_count.contiguous().data_ptr<int32_t>(),
                              reduce_count.contiguous().data_ptr<int32_t>(),
                              coors_order.contiguous().data_ptr<int32_t>());
                        }));

  AT_DISPATCH_ALL_TYPES(
      points.scalar_type(), "assign_point_to_voxel", ([&] {
        nondisterministic_assign_point_voxel<scalar_t>
            <<<cp_grid, cp_block, 0, at::cuda::getCurrentCUDAStream()>>>(
                num_points, points.contiguous().data_ptr<scalar_t>(),
                coors_map.contiguous().data_ptr<int32_t>(),
                pts_id.contiguous().data_ptr<int32_t>(),
                temp_coors.contiguous().data_ptr<int32_t>(),
                reduce_count.contiguous().data_ptr<int32_t>(),
                coors_order.contiguous().data_ptr<int32_t>(),
                voxels.contiguous().data_ptr<scalar_t>(),
                coors.contiguous().data_ptr<int32_t>(),
                num_points_per_voxel.contiguous().data_ptr<int32_t>(),
                max_voxels, max_points,
                num_features, NDim);
      }));
  AT_CUDA_CHECK(cudaGetLastError());

  return max_voxels < num_coors ? max_voxels : num_coors;
}
// GPU dynamic voxelization: writes each point's (z, y, x) voxel coordinate
// into `coors` (-1 markers for out-of-range points); no voxel/point budget.
void dynamic_voxelize_gpu(const at::Tensor& points, at::Tensor& coors,
                          const std::vector<float> voxel_size,
                          const std::vector<float> coors_range,
                          const int NDim = 3) {
  // check device
  CHECK_INPUT(points);
  at::cuda::CUDAGuard device_guard(points.device());

  const int num_points = points.size(0);
  const int num_features = points.size(1);

  const float voxel_x = voxel_size[0];
  const float voxel_y = voxel_size[1];
  const float voxel_z = voxel_size[2];
  const float coors_x_min = coors_range[0];
  const float coors_y_min = coors_range[1];
  const float coors_z_min = coors_range[2];
  const float coors_x_max = coors_range[3];
  const float coors_y_max = coors_range[4];
  const float coors_z_max = coors_range[5];

  const int grid_x = round((coors_x_max - coors_x_min) / voxel_x);
  const int grid_y = round((coors_y_max - coors_y_min) / voxel_y);
  const int grid_z = round((coors_z_max - coors_z_min) / voxel_z);

  // one thread per point; threadsPerBlock (64) defined at the top of the file
  const int col_blocks = at::cuda::ATenCeilDiv(num_points, threadsPerBlock);
  dim3 blocks(col_blocks);
  dim3 threads(threadsPerBlock);
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  AT_DISPATCH_ALL_TYPES(points.scalar_type(), "dynamic_voxelize_kernel", [&] {
    dynamic_voxelize_kernel<scalar_t, int><<<blocks, threads, 0, stream>>>(
        points.contiguous().data_ptr<scalar_t>(),
        coors.contiguous().data_ptr<int>(), voxel_x, voxel_y, voxel_z,
        coors_x_min, coors_y_min, coors_z_min, coors_x_max, coors_y_max,
        coors_z_max, grid_x, grid_y, grid_z, num_points, num_features, NDim);
  });
  cudaDeviceSynchronize();
  AT_CUDA_CHECK(cudaGetLastError());

  return;
}
} // namespace voxelization
from .io import read_pickle, write_pickle, read_points, write_points, read_calib, \
read_label, write_label
from .process import bbox_camera2lidar, bbox3d2bevcorners, box_collision_test, \
remove_pts_in_bboxes, limit_period, bbox3d2corners, points_lidar2image, \
keep_bbox_from_image_range, keep_bbox_from_lidar_range, \
points_camera2lidar, setup_seed, remove_outside_points, points_in_bboxes_v2, \
get_points_num_in_bbox, iou2d_nearest, iou2d, iou3d, iou3d_camera, iou_bev, \
bbox3d2corners_camera, points_camera2image
from .vis_o3d import vis_pc, vis_img_3d
import numpy as np
import os
import pickle
def read_pickle(file_path, suffix='.pkl'):
    """Load and return a pickled object; the extension must equal `suffix`."""
    assert os.path.splitext(file_path)[1] == suffix
    with open(file_path, 'rb') as fh:
        return pickle.load(fh)
def write_pickle(results, file_path):
    """Serialize `results` to `file_path` with pickle."""
    with open(file_path, 'wb') as fh:
        pickle.dump(results, fh)
def read_points(file_path, dim=4):
    """Read a KITTI-style .bin point cloud as an (N, dim) float32 array.

    Only '.bin' is implemented; '.ply' is accepted by the assert but raises
    NotImplementedError.
    """
    suffix = os.path.splitext(file_path)[1]
    assert suffix in ['.bin', '.ply']
    if suffix != '.bin':
        raise NotImplementedError
    return np.fromfile(file_path, dtype=np.float32).reshape(-1, dim)
def write_points(lidar_points, file_path):
    """Write a point array to a KITTI-style .bin file.

    Args:
        lidar_points (np.ndarray): points; dumped raw via ndarray.tofile.
        file_path (str): destination; only '.bin' is supported ('.ply' raises
            NotImplementedError).

    Bug fix: open the file in binary mode ('wb'). The previous text mode
    ('w') corrupts the raw dump on platforms that translate newlines.
    """
    suffix = os.path.splitext(file_path)[1]
    assert suffix in ['.bin', '.ply']
    if suffix == '.bin':
        with open(file_path, 'wb') as f:
            lidar_points.tofile(f)
    else:
        raise NotImplementedError
def read_calib(file_path, extend_matrix=True):
    """Parse a KITTI calib .txt into a dict of projection/transform matrices.

    Line order is fixed: P0..P3, R0_rect, Tr_velo_to_cam, Tr_imu_to_velo.
    With extend_matrix=True every matrix is padded to homogeneous 4x4 form.
    """
    with open(file_path, 'r') as f:
        rows = [ln.strip() for ln in f.readlines()]

    def _mat(idx, shape):
        # the numeric values follow the 'NAME:' token on each line
        return np.array(rows[idx].split(' ')[1:], dtype=np.float32).reshape(shape)

    P0 = _mat(0, (3, 4))
    P1 = _mat(1, (3, 4))
    P2 = _mat(2, (3, 4))
    P3 = _mat(3, (3, 4))
    R0_rect = _mat(4, (3, 3))
    Tr_velo_to_cam = _mat(5, (3, 4))
    Tr_imu_to_velo = _mat(6, (3, 4))

    if extend_matrix:
        bottom = np.array([[0, 0, 0, 1]])
        P0 = np.concatenate([P0, bottom], axis=0)
        P1 = np.concatenate([P1, bottom], axis=0)
        P2 = np.concatenate([P2, bottom], axis=0)
        P3 = np.concatenate([P3, bottom], axis=0)
        rect4 = np.eye(4, dtype=R0_rect.dtype)
        rect4[:3, :3] = R0_rect
        R0_rect = rect4
        Tr_velo_to_cam = np.concatenate([Tr_velo_to_cam, bottom], axis=0)
        Tr_imu_to_velo = np.concatenate([Tr_imu_to_velo, bottom], axis=0)

    return dict(P0=P0,
                P1=P1,
                P2=P2,
                P3=P3,
                R0_rect=R0_rect,
                Tr_velo_to_cam=Tr_velo_to_cam,
                Tr_imu_to_velo=Tr_imu_to_velo)
def read_label(file_path):
    """Parse a KITTI label .txt into a dict of per-object annotation arrays."""
    with open(file_path, 'r') as f:
        fields = [ln.strip().split(' ') for ln in f.readlines()]
    annotation = {
        'name': np.array([fd[0] for fd in fields]),
        'truncated': np.array([fd[1] for fd in fields], dtype=np.float32),
        'occluded': np.array([fd[2] for fd in fields], dtype=np.int32),
        'alpha': np.array([fd[3] for fd in fields], dtype=np.float32),
        'bbox': np.array([fd[4:8] for fd in fields], dtype=np.float32),
        # stored as h, w, l -> reorder to camera-coordinate (l, h, w)
        'dimensions': np.array([fd[8:11] for fd in fields],
                               dtype=np.float32)[:, [2, 0, 1]],
        'location': np.array([fd[11:14] for fd in fields], dtype=np.float32),
        'rotation_y': np.array([fd[14] for fd in fields], dtype=np.float32),
    }
    return annotation
def write_label(result, file_path, suffix='.txt'):
    '''
    Write detection results to a KITTI-format label file, one object per line.
    result: dict with keys name, truncated, occluded, alpha, bbox, dimensions,
            location, rotation_y, score — each indexable with the same length
    file_path: str, output path; its extension must equal `suffix`
    suffix: str, expected file extension (default '.txt')
    '''
    assert os.path.splitext(file_path)[1] == suffix
    name, truncated, occluded, alpha, bbox, dimensions, location, rotation_y, score = \
        result['name'], result['truncated'], result['occluded'], result['alpha'], \
        result['bbox'], result['dimensions'], result['location'], result['rotation_y'], \
        result['score']
    with open(file_path, 'w') as f:
        for i in range(len(name)):
            bbox_str = ' '.join(map(str, bbox[i]))
            hwl = ' '.join(map(str, dimensions[i]))
            xyz = ' '.join(map(str, location[i]))
            line = f'{name[i]} {truncated[i]} {occluded[i]} {alpha[i]} {bbox_str} {hwl} {xyz} {rotation_y[i]} {score[i]}\n'
            # f.write, not f.writelines: `line` is a single string, and
            # writelines(str) iterates it character-by-character.
            f.write(line)
import copy
import numba
import numpy as np
import random
import torch
import pdb
from pointpillars.ops.iou3d_module import boxes_overlap_bev, boxes_iou_bev
def setup_seed(seed=0, deterministic=True):
    '''
    Seed the python, numpy and torch RNGs for reproducible runs.
    seed: int, shared seed value for all generators
    deterministic: bool, if True also force deterministic cuDNN behavior
    '''
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)
    if deterministic:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
def bbox_camera2lidar(bboxes, tr_velo_to_cam, r0_rect):
    '''
    Convert 3d boxes from camera coordinates to lidar coordinates.
    bboxes: shape=(N, 7), (x, y, z, x_size, y_size, z_size, angle) in camera coords
    tr_velo_to_cam: shape=(4, 4)
    r0_rect: shape=(4, 4)
    return: shape=(N, 7), float32
    '''
    # homogeneous centers, transformed by the inverse of (rect @ velo->cam)
    centers_h = np.pad(bboxes[:, :3], ((0, 0), (0, 1)), 'constant', constant_values=1.0)
    cam2lidar = np.linalg.inv(r0_rect @ tr_velo_to_cam)
    centers_lidar = centers_h @ cam2lidar.T
    # size axes permute: camera (x, y, z) sizes -> lidar order (z, x, y)
    sizes_lidar = bboxes[:, [5, 3, 4]]
    bboxes_lidar = np.concatenate([centers_lidar[:, :3], sizes_lidar, bboxes[:, 6:]], axis=1)
    return np.array(bboxes_lidar, dtype=np.float32)
def bbox_lidar2camera(bboxes, tr_velo_to_cam, r0_rect):
    '''
    Convert 3d boxes from lidar coordinates to camera coordinates.
    bboxes: shape=(N, 7), (x, y, z, x_size, y_size, z_size, angle) in lidar coords
    tr_velo_to_cam: shape=(4, 4)
    r0_rect: shape=(4, 4)
    return: shape=(N, 7)
    '''
    centers_h = np.pad(bboxes[:, :3], ((0, 0), (0, 1)), 'constant', constant_values=1.0)
    lidar2cam = r0_rect @ tr_velo_to_cam
    centers_cam = centers_h @ lidar2cam.T
    # size axes permute: lidar (x, y, z) sizes -> camera order (y, z, x)
    sizes_cam = bboxes[:, [4, 5, 3]]
    return np.concatenate([centers_cam[:, :3], sizes_cam, bboxes[:, 6:]], axis=1)
def points_camera2image(points, P2):
    '''
    Project camera-frame corner points onto the image plane.
    points: shape=(N, 8, 3)
    P2: shape=(4, 4)
    return: shape=(N, 8, 2)
    '''
    ones = np.ones((*points.shape[:2], 1), dtype=points.dtype)
    homo = np.concatenate([points, ones], axis=-1)  # (N, 8, 4)
    projected = homo @ P2.T                         # (N, 8, 4)
    # perspective divide by depth
    return projected[:, :, :2] / projected[:, :, 2:3]
def points_lidar2image(points, tr_velo_to_cam, r0_rect, P2):
    '''
    Project lidar-frame corner points onto the image plane.
    points: shape=(N, 8, 3)
    tr_velo_to_cam: shape=(4, 4)
    r0_rect: shape=(4, 4)
    P2: shape=(4, 4)
    return: shape=(N, 8, 2)
    '''
    ones = np.ones((*points.shape[:2], 1), dtype=points.dtype)
    homo = np.concatenate([points, ones], axis=-1)    # (N, 8, 4)
    cam_pts = homo @ (r0_rect @ tr_velo_to_cam).T     # (N, 8, 4)
    img_pts = cam_pts @ P2.T                          # (N, 8, 4)
    # perspective divide by depth
    return img_pts[:, :, :2] / img_pts[:, :, 2:3]
def points_camera2lidar(points, tr_velo_to_cam, r0_rect):
    '''
    Transform corner points from camera coordinates to lidar coordinates.
    points: shape=(N, 8, 3)
    tr_velo_to_cam: shape=(4, 4)
    r0_rect: shape=(4, 4)
    return: shape=(N, 8, 3)
    '''
    ones = np.ones((*points.shape[:-1], 1), dtype=points.dtype)
    homo = np.concatenate([points, ones], axis=-1)
    cam2lidar = np.linalg.inv(r0_rect @ tr_velo_to_cam)
    lidar_pts = homo @ cam2lidar.T
    return lidar_pts[..., :3]
def bbox3d2bevcorners(bboxes):
    '''
    Compute the 4 bird's-eye-view (x, y) corners of each 3d box.
    bboxes: shape=(n, 7), (x, y, z, x_size, y_size, z_size, angle)
    return: shape=(n, 4, 2), float32; corners ordered clockwise starting
            from the minimal corner of the unrotated box
    '''
    centers, dims, angles = bboxes[:, :2], bboxes[:, 3:5], bboxes[:, 6]
    # unit-square corner template, scaled per box
    template = np.array([[-0.5, -0.5], [-0.5, 0.5], [0.5, 0.5], [0.5, -0.5]], dtype=np.float32)
    corners = template[None, ...] * dims[:, None, :]  # (n, 4, 2)
    # per-box 2x2 rotation matrices; with row-vector corners this applies -angle
    rot_sin, rot_cos = np.sin(angles), np.cos(angles)
    rot = np.transpose(np.array([[rot_cos, rot_sin],
                                 [-rot_sin, rot_cos]]), (2, 1, 0))  # (n, 2, 2)
    corners = corners @ rot + centers[:, None, :]
    return corners.astype(np.float32)
def bbox3d2corners(bboxes):
    '''
    Compute the 8 corner coordinates of each 3d box in the lidar frame.
    bboxes: shape=(n, 7), (x, y, z, x_size, y_size, z_size, angle);
            (x, y, z) is the bottom center of the box
    return: shape=(n, 8, 3)
           ^ z   x            6 ------ 5
           |   /             / |     / |
           | /              2 -|---- 1 |
    y      |/               |  |     | |
    <------|o               | 7 -----| 4
                            |/   o   |/
                            3 ------ 0
    x: front, y: left, z: top
    '''
    centers, dims, angles = bboxes[:, :3], bboxes[:, 3:6], bboxes[:, 6]
    # unit-box corner template; z in [0, 1] because boxes sit on their bottom face
    template = np.array([[-0.5, -0.5, 0], [-0.5, -0.5, 1.0], [-0.5, 0.5, 1.0], [-0.5, 0.5, 0.0],
                         [0.5, -0.5, 0], [0.5, -0.5, 1.0], [0.5, 0.5, 1.0], [0.5, 0.5, 0.0]],
                        dtype=np.float32)
    corners = template[None, :, :] * dims[:, None, :]  # (n, 8, 3)
    # per-box rotation about the z axis
    rot_sin, rot_cos = np.sin(angles), np.cos(angles)
    zeros, ones = np.zeros_like(rot_cos), np.ones_like(rot_cos)
    rot = np.array([[rot_cos, rot_sin, zeros],
                    [-rot_sin, rot_cos, zeros],
                    [zeros, zeros, ones]], dtype=np.float32)  # (3, 3, n)
    rot = np.transpose(rot, (2, 1, 0))  # (n, 3, 3)
    corners = corners @ rot + centers[:, None, :]
    return corners
def bbox3d2corners_camera(bboxes):
    '''
    Compute the 8 corner coordinates of each 3d box in the camera frame.
    bboxes: shape=(n, 7), (x, y, z, sizes..., ry); (x, y, z) is the bottom
            center and the y axis points down
    return: shape=(n, 8, 3)
        z (front)            6 ------ 5
       /                    / |     / |
      /                    2 -|---- 1 |
     /                     |  |     | |
    |o ------> x(right)    | 7 -----| 4
    |                      |/   o   |/
    |                      3 ------ 0
    |
    v y(down)
    '''
    centers, dims, angles = bboxes[:, :3], bboxes[:, 3:6], bboxes[:, 6]
    # unit-box corner template; y in [-1, 0] because y points down and
    # boxes sit on the y=0 plane of their bottom center
    template = np.array([[0.5, 0.0, -0.5], [0.5, -1.0, -0.5], [-0.5, -1.0, -0.5], [-0.5, 0.0, -0.5],
                         [0.5, 0.0, 0.5], [0.5, -1.0, 0.5], [-0.5, -1.0, 0.5], [-0.5, 0.0, 0.5]],
                        dtype=np.float32)
    corners = template[None, :, :] * dims[:, None, :]  # (n, 8, 3)
    # per-box rotation about the y axis
    rot_sin, rot_cos = np.sin(angles), np.cos(angles)
    zeros, ones = np.zeros_like(rot_cos), np.ones_like(rot_cos)
    rot = np.array([[rot_cos, zeros, rot_sin],
                    [zeros, ones, zeros],
                    [-rot_sin, zeros, rot_cos]], dtype=np.float32)  # (3, 3, n)
    rot = np.transpose(rot, (2, 1, 0))  # (n, 3, 3)
    corners = corners @ rot + centers[:, None, :]
    return corners
def group_rectangle_vertexs(bboxes_corners):
    '''
    Group the 8 corners of each box into its 6 rectangular faces.
    bboxes_corners: shape=(n, 8, 3), corner ordering as produced by bbox3d2corners
    return: shape=(n, 6, 4, 3)
    '''
    # corner indices of the 6 faces: bottom, top, then the 4 side faces;
    # same face/vertex order as the original hand-written np.stack calls
    face_indices = ((0, 1, 3, 2), (4, 7, 6, 5), (0, 4, 5, 1),
                    (2, 6, 7, 3), (1, 5, 6, 2), (0, 3, 7, 4))
    # result variable renamed: the original shadowed the function's own name
    faces = [np.stack([bboxes_corners[:, i] for i in face], axis=1)  # (n, 4, 3)
             for face in face_indices]
    return np.stack(faces, axis=1)
@numba.jit(nopython=True)
def bevcorner2alignedbbox(bev_corners):
    '''
    Convert rotated BEV corner boxes to axis-aligned boxes (xmin, ymin, xmax, ymax).
    bev_corners: shape=(N, 4, 2)
    return: shape=(N, 4), float32
    '''
    # xmin, xmax = np.min(bev_corners[:, :, 0], axis=-1), np.max(bev_corners[:, :, 0], axis=-1)
    # ymin, ymax = np.min(bev_corners[:, :, 1], axis=-1), np.max(bev_corners[:, :, 1], axis=-1)
    # why we don't implement like the above ? please see
    # https://numba.pydata.org/numba-doc/latest/reference/numpysupported.html#calculation
    # (numba's nopython mode does not support the `axis` argument of np.min/np.max)
    n = len(bev_corners)
    alignedbbox = np.zeros((n, 4), dtype=np.float32)
    for i in range(n):
        cur_bev = bev_corners[i]
        alignedbbox[i, 0] = np.min(cur_bev[:, 0])  # xmin
        alignedbbox[i, 2] = np.max(cur_bev[:, 0])  # xmax
        alignedbbox[i, 1] = np.min(cur_bev[:, 1])  # ymin
        alignedbbox[i, 3] = np.max(cur_bev[:, 1])  # ymax
    return alignedbbox
# modified from https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/pipelines/data_augment_utils.py#L31
@numba.jit(nopython=True)
def box_collision_test(boxes, qboxes, clockwise=True):
    """Box collision test.
    Two boxes collide when any pair of their edges intersects, or when one
    box completely contains the other.
    Args:
        boxes (np.ndarray): Corners of current boxes. # (n1, 4, 2)
        qboxes (np.ndarray): Boxes to be avoid colliding. # (n2, 4, 2)
        clockwise (bool, optional): Whether the corners are in
            clockwise order. Default: True.
    return: shape=(n1, n2), bool; True means boxes[i] collides with qboxes[j]
    """
    N = boxes.shape[0]
    K = qboxes.shape[0]
    ret = np.zeros((N, K), dtype=np.bool_)
    slices = np.array([1, 2, 3, 0])
    # pair each corner with its successor to form the 4 edges of every box
    lines_boxes = np.stack((boxes, boxes[:, slices, :]),
                           axis=2)  # [N, 4, 2(line), 2(xy)]
    lines_qboxes = np.stack((qboxes, qboxes[:, slices, :]), axis=2)
    # vec = np.zeros((2,), dtype=boxes.dtype)
    # cheap axis-aligned bounding boxes used for an early reject below
    boxes_standup = bevcorner2alignedbbox(boxes)
    qboxes_standup = bevcorner2alignedbbox(qboxes)
    for i in range(N):
        for j in range(K):
            # calculate standup first
            iw = (
                min(boxes_standup[i, 2], qboxes_standup[j, 2]) -
                max(boxes_standup[i, 0], qboxes_standup[j, 0]))
            if iw > 0:
                ih = (
                    min(boxes_standup[i, 3], qboxes_standup[j, 3]) -
                    max(boxes_standup[i, 1], qboxes_standup[j, 1]))
                if ih > 0:
                    # segment-intersection test via orientation (CCW) predicates:
                    # segments AB and CD intersect iff C, D lie on opposite sides
                    # of AB and A, B lie on opposite sides of CD
                    for k in range(4):
                        for box_l in range(4):
                            A = lines_boxes[i, k, 0]
                            B = lines_boxes[i, k, 1]
                            C = lines_qboxes[j, box_l, 0]
                            D = lines_qboxes[j, box_l, 1]
                            acd = (D[1] - A[1]) * (C[0] -
                                                   A[0]) > (C[1] - A[1]) * (
                                                       D[0] - A[0])
                            bcd = (D[1] - B[1]) * (C[0] -
                                                   B[0]) > (C[1] - B[1]) * (
                                                       D[0] - B[0])
                            if acd != bcd:
                                abc = (C[1] - A[1]) * (B[0] - A[0]) > (
                                    B[1] - A[1]) * (
                                        C[0] - A[0])
                                abd = (D[1] - A[1]) * (B[0] - A[0]) > (
                                    B[1] - A[1]) * (
                                        D[0] - A[0])
                                if abc != abd:
                                    ret[i, j] = True  # collision.
                                    break
                        # NOTE(review): `is True` / `is False` on a numpy bool
                        # element would not behave like `==` in plain Python;
                        # this is upstream mmdet3d code that relies on how
                        # numba's nopython mode compiles these comparisons —
                        # verify before running this function uncompiled.
                        if ret[i, j] is True:
                            break
                    if ret[i, j] is False:
                        # now check complete overlap.
                        # box overlap qbox:
                        box_overlap_qbox = True
                        for box_l in range(4):  # point l in qboxes
                            for k in range(4):  # corner k in boxes
                                vec = boxes[i, k] - boxes[i, (k + 1) % 4]
                                if clockwise:
                                    vec = -vec
                                # cross product sign tells which side of the
                                # edge the query point lies on
                                cross = vec[1] * (
                                    boxes[i, k, 0] - qboxes[j, box_l, 0])
                                cross -= vec[0] * (
                                    boxes[i, k, 1] - qboxes[j, box_l, 1])
                                if cross >= 0:
                                    box_overlap_qbox = False
                                    break
                            if box_overlap_qbox is False:
                                break
                        if box_overlap_qbox is False:
                            qbox_overlap_box = True
                            for box_l in range(4):  # point box_l in boxes
                                for k in range(4):  # corner k in qboxes
                                    vec = qboxes[j, k] - qboxes[j, (k + 1) % 4]
                                    if clockwise:
                                        vec = -vec
                                    cross = vec[1] * (
                                        qboxes[j, k, 0] - boxes[i, box_l, 0])
                                    cross -= vec[0] * (
                                        qboxes[j, k, 1] - boxes[i, box_l, 1])
                                    if cross >= 0:  #
                                        qbox_overlap_box = False
                                        break
                                if qbox_overlap_box is False:
                                    break
                            if qbox_overlap_box:
                                ret[i, j] = True  # collision.
                        else:
                            ret[i, j] = True  # collision.
    return ret
def group_plane_equation(bbox_group_rectangle_vertexs):
    '''
    Compute the plane equation (a, b, c, d), with ax + by + cz + d = 0, for
    each of the 6 faces of each box.
    bbox_group_rectangle_vertexs: shape=(n, 6, 4, 3)
    return: shape=(n, 6, 4)
    '''
    verts = bbox_group_rectangle_vertexs
    # two edge vectors per face: (v0 - v1) and (v1 - v2); their cross product
    # is the face normal (a, b, c)
    edge_a = verts[:, :, 0] - verts[:, :, 1]
    edge_b = verts[:, :, 1] - verts[:, :, 2]
    normals = np.cross(edge_a, edge_b)  # (n, 6, 3)
    # d = -(normal . point_on_plane), using the first vertex of each face
    d = -np.einsum('ijk,ijk->ij', verts[:, :, 0], normals)  # (n, 6)
    return np.concatenate([normals, d[:, :, None]], axis=-1)
@numba.jit(nopython=True)
def points_in_bboxes(points, plane_equation_params):
    '''
    Test whether each point lies inside each box, given the boxes' face planes.
    points: shape=(N, 3)
    plane_equation_params: shape=(n, 6, 4), (a, b, c, d) per face
    return: shape=(N, n), bool; True means point i is inside box j
    '''
    num_points, num_bboxes = len(points), len(plane_equation_params)
    num_faces = plane_equation_params.shape[1]
    masks = np.ones((num_points, num_bboxes), dtype=np.bool_)
    for pi in range(num_points):
        x, y, z = points[pi, :3]
        for bi in range(num_bboxes):
            planes = plane_equation_params[bi]
            for fi in range(num_faces):
                a, b, c, d = planes[fi]
                # a point on the outward-normal side of any face is outside the box
                if a * x + b * y + c * z + d >= 0:
                    masks[pi][bi] = False
                    break
    return masks
def remove_pts_in_bboxes(points, bboxes, rm=True):
    '''
    Remove (or just mask) the points falling inside any of the given boxes.
    points: shape=(N, 3)
    bboxes: shape=(n, 7)
    rm: bool; if False, return the per-box membership mask instead of filtering
    return: shape=(N, n) bool mask when rm is False, otherwise the kept points
    '''
    # build the 6 face planes of every box: corners -> faces -> plane params
    corners = bbox3d2corners(bboxes)              # (n, 8, 3)
    faces = group_rectangle_vertexs(corners)      # (n, 6, 4, 3)
    plane_params = group_plane_equation(faces)
    # a point (x0, y0, z0) on the normal side of a face gives
    # ax0 + by0 + cz0 + d > 0, so inside-points fail that test on all faces
    masks = points_in_bboxes(points, plane_params)  # (N, n)
    if not rm:
        return masks
    inside_any = np.any(masks, axis=-1)
    return points[~inside_any]
# modified from https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/core/bbox/structures/utils.py#L11
def limit_period(val, offset=0.5, period=np.pi):
    """
    Wrap `val` into the range [-offset * period, (1 - offset) * period].
    val: array or float
    offset: float
    period: float
    return: same shape as `val`
    """
    return val - np.floor(val / period + offset) * period
def nearest_bev(bboxes):
    '''
    Convert rotated 3d boxes to their nearest axis-aligned BEV boxes.
    bboxes: (n, 7), (x, y, z, w, l, h, theta)
    return: (n, 4), (x1, y1, x2, y2)
    '''
    bev = bboxes[:, [0, 1, 3, 4]].clone()
    # wrap angles into [-pi/2, pi/2); a box rotated by more than 45 degrees is
    # closer to the axis-aligned box with w and l swapped
    angles = limit_period(bboxes[:, 6].cpu(), offset=0.5, period=np.pi).to(bev)
    swapped = bev[:, [0, 1, 3, 2]]
    bev = torch.where(torch.abs(angles[:, None]) > np.pi / 4, swapped, bev)
    half_wl = bev[:, 2:] / 2
    return torch.cat([bev[:, :2] - half_wl, bev[:, :2] + half_wl], dim=-1)
def iou2d(bboxes1, bboxes2, metric=0):
    '''
    Pairwise 2d overlap between two sets of axis-aligned boxes.
    bboxes1: (n, 4), (x1, y1, x2, y2)
    bboxes2: (m, 4), (x1, y1, x2, y2)
    metric: 0 for IoU (intersection over union),
            1 for intersection over the area of bboxes1
    return: (n, m)
    raises: ValueError when metric is not 0 or 1 (previously this path left
            `iou` unbound and crashed with a confusing NameError)
    '''
    # pairwise intersection rectangle
    inter_x1 = torch.maximum(bboxes1[:, 0][:, None], bboxes2[:, 0][None, :])  # (n, m)
    inter_y1 = torch.maximum(bboxes1[:, 1][:, None], bboxes2[:, 1][None, :])  # (n, m)
    inter_x2 = torch.minimum(bboxes1[:, 2][:, None], bboxes2[:, 2][None, :])
    inter_y2 = torch.minimum(bboxes1[:, 3][:, None], bboxes2[:, 3][None, :])
    inter_w = torch.clamp(inter_x2 - inter_x1, min=0)
    inter_h = torch.clamp(inter_y2 - inter_y1, min=0)
    inter_area = inter_w * inter_h  # (n, m)
    wh1 = bboxes1[:, 2:] - bboxes1[:, :2]
    area1 = wh1[:, 0] * wh1[:, 1]  # (n, )
    wh2 = bboxes2[:, 2:] - bboxes2[:, :2]
    area2 = wh2[:, 0] * wh2[:, 1]  # (m, )
    # 1e-8 guards against division by zero for degenerate boxes
    if metric == 0:
        return inter_area / (area1[:, None] + area2[None, :] - inter_area + 1e-8)
    elif metric == 1:
        return inter_area / (area1[:, None] + 1e-8)
    raise ValueError(f'metric must be 0 or 1, got {metric}')
def iou2d_nearest(bboxes1, bboxes2):
    '''
    2d IoU between rotated boxes, computed on their nearest axis-aligned
    BEV approximations.
    bboxes1: (n, 7), (x, y, z, w, l, h, theta)
    bboxes2: (m, 7)
    return: (n, m)
    '''
    return iou2d(nearest_bev(bboxes1), nearest_bev(bboxes2))
def iou3d(bboxes1, bboxes2):
    '''
    Pairwise 3d IoU between lidar-frame boxes.
    bboxes1: (n, 7), (x, y, z, w, l, h, theta); z is the bottom of the box,
             so the vertical extent is [z, z + h]
    bboxes2: (m, 7)
    return: (n, m)
    '''
    # 1. height overlap
    bottom1, bottom2 = bboxes1[:, 2], bboxes2[:, 2]  # (n, ), (m, )
    top1, top2 = bottom1 + bboxes1[:, 5], bottom2 + bboxes2[:, 5]
    overlap_bottom = torch.maximum(bottom1[:, None], bottom2[None, :])  # (n, m)
    overlap_top = torch.minimum(top1[:, None], top2[None, :])
    height_overlap = torch.clamp(overlap_top - overlap_bottom, min=0)
    # 2. rotated bev overlap via the CUDA op (expects x1, y1, x2, y2, angle)
    half_wl1, half_wl2 = bboxes1[:, 3:5] / 2, bboxes2[:, 3:5] / 2
    bev1 = torch.cat([bboxes1[:, :2] - half_wl1, bboxes1[:, :2] + half_wl1, bboxes1[:, 6:]], dim=-1)
    bev2 = torch.cat([bboxes2[:, :2] - half_wl2, bboxes2[:, :2] + half_wl2, bboxes2[:, 6:]], dim=-1)
    bev_overlap = boxes_overlap_bev(bev1, bev2)  # (n, m)
    # 3. intersection volume and per-box volumes
    overlap = height_overlap * bev_overlap
    vol1 = bboxes1[:, 3] * bboxes1[:, 4] * bboxes1[:, 5]
    vol2 = bboxes2[:, 3] * bboxes2[:, 4] * bboxes2[:, 5]
    # 4. IoU; epsilon guards against division by zero
    return overlap / (vol1[:, None] + vol2[None, :] - overlap + 1e-8)
def iou3d_camera(bboxes1, bboxes2):
    '''
    Pairwise 3d IoU between camera-frame boxes.
    bboxes1: (n, 7), (x, y, z, l, h, w, ry); the camera y axis points down
             and (x, y, z) is the bottom center, so the vertical extent is
             [y - h, y]
    bboxes2: (m, 7)
    return: (n, m)
    '''
    # 1. vertical (y axis) overlap
    low1, low2 = bboxes1[:, 1] - bboxes1[:, 4], bboxes2[:, 1] - bboxes2[:, 4]  # (n, ), (m, )
    high1, high2 = bboxes1[:, 1], bboxes2[:, 1]
    overlap_low = torch.maximum(low1[:, None], low2[None, :])  # (n, m)
    overlap_high = torch.minimum(high1[:, None], high2[None, :])
    height_overlap = torch.clamp(overlap_high - overlap_low, min=0)
    # 2. rotated bev overlap in the x-z plane via the CUDA op
    half1, half2 = bboxes1[:, [3, 5]] / 2, bboxes2[:, [3, 5]] / 2
    bev1 = torch.cat([bboxes1[:, [0, 2]] - half1, bboxes1[:, [0, 2]] + half1, bboxes1[:, 6:]], dim=-1)
    bev2 = torch.cat([bboxes2[:, [0, 2]] - half2, bboxes2[:, [0, 2]] + half2, bboxes2[:, 6:]], dim=-1)
    bev_overlap = boxes_overlap_bev(bev1, bev2)  # (n, m)
    # 3. intersection volume and per-box volumes
    overlap = height_overlap * bev_overlap
    vol1 = bboxes1[:, 3] * bboxes1[:, 4] * bboxes1[:, 5]
    vol2 = bboxes2[:, 3] * bboxes2[:, 4] * bboxes2[:, 5]
    # 4. IoU; epsilon guards against division by zero
    return overlap / (vol1[:, None] + vol2[None, :] - overlap + 1e-8)
def iou_bev(bboxes1, bboxes2):
    '''
    Pairwise rotated bird's-eye-view IoU.
    bboxes1: (n, 5), (x, z, w, h, theta)
    bboxes2: (m, 5)
    return: (n, m)
    '''
    half1, half2 = bboxes1[:, 2:4] / 2, bboxes2[:, 2:4] / 2
    # the CUDA op consumes (x1, y1, x2, y2, angle) boxes
    bev1 = torch.cat([bboxes1[:, :2] - half1, bboxes1[:, :2] + half1, bboxes1[:, 4:]], dim=-1)
    bev2 = torch.cat([bboxes2[:, :2] - half2, bboxes2[:, :2] + half2, bboxes2[:, 4:]], dim=-1)
    return boxes_iou_bev(bev1, bev2)  # (n, m)
def keep_bbox_from_image_range(result, tr_velo_to_cam, r0_rect, P2, image_shape):
    '''
    Keep only detections whose projected 3d box intersects the image.
    result: dict(lidar_bboxes, labels, scores)
    tr_velo_to_cam: shape=(4, 4)
    r0_rect: shape=(4, 4)
    P2: shape=(4, 4)
    image_shape: (h, w)
    return: dict(lidar_bboxes, labels, scores, bboxes2d, camera_bboxes)
    '''
    h, w = image_shape
    lidar_bboxes, labels, scores = result['lidar_bboxes'], result['labels'], result['scores']
    camera_bboxes = bbox_lidar2camera(lidar_bboxes, tr_velo_to_cam, r0_rect)  # (n, 7)
    corners = bbox3d2corners_camera(camera_bboxes)   # (n, 8, 3)
    image_points = points_camera2image(corners, P2)  # (n, 8, 2)
    # clip the projected 2d boxes to the image and keep those with a
    # non-empty visible part
    image_x1y1 = np.maximum(np.min(image_points, axis=1), 0)       # (n, 2)
    image_x2y2 = np.minimum(np.max(image_points, axis=1), [w, h])  # (n, 2)
    bboxes2d = np.concatenate([image_x1y1, image_x2y2], axis=-1)
    keep = (image_x1y1[:, 0] < w) & (image_x1y1[:, 1] < h) & \
           (image_x2y2[:, 0] > 0) & (image_x2y2[:, 1] > 0)
    return {
        'lidar_bboxes': lidar_bboxes[keep],
        'labels': labels[keep],
        'scores': scores[keep],
        'bboxes2d': bboxes2d[keep],
        'camera_bboxes': camera_bboxes[keep]
    }
def keep_bbox_from_lidar_range(result, pcd_limit_range):
    '''
    Keep only detections whose lidar box center lies inside the point-cloud range.
    result: dict(lidar_bboxes, labels, scores[, bboxes2d, camera_bboxes])
    pcd_limit_range: array (6, ), (xmin, ymin, zmin, xmax, ymax, zmax)
    return: dict(lidar_bboxes, labels, scores, bboxes2d, camera_bboxes)
    '''
    lidar_bboxes, labels, scores = result['lidar_bboxes'], result['labels'], result['scores']
    # fill in placeholders (also stored back into `result`, matching the
    # original in-place behavior) so the returned dict always has all keys
    bboxes2d = result.setdefault('bboxes2d', np.zeros_like(lidar_bboxes[:, :4]))
    camera_bboxes = result.setdefault('camera_bboxes', np.zeros_like(lidar_bboxes))
    centers = lidar_bboxes[:, :3]
    above_min = np.all(centers > pcd_limit_range[:3][None, :], axis=-1)  # (n, )
    below_max = np.all(centers < pcd_limit_range[3:][None, :], axis=-1)  # (n, )
    keep = above_min & below_max
    return {
        'lidar_bboxes': lidar_bboxes[keep],
        'labels': labels[keep],
        'scores': scores[keep],
        'bboxes2d': bboxes2d[keep],
        'camera_bboxes': camera_bboxes[keep]
    }
def points_in_bboxes_v2(points, r0_rect, tr_velo_to_cam, dimensions, location, rotation_y, name):
    '''
    Compute point-in-box membership for the valid (non-'DontCare') annotations.
    points: shape=(N, 4)
    r0_rect: shape=(4, 4)
    tr_velo_to_cam: shape=(4, 4)
    dimensions: shape=(n, 3)
    location: shape=(n, 3)
    rotation_y: shape=(n, )
    name: shape=(n, )
    return:
        indices: shape=(N, n_valid_bbox); indices[i, j] is True when point i is in box j
        n_total_bbox: int, number of annotations including 'DontCare'
        n_valid_bbox: int, number of annotations excluding 'DontCare'
        bboxes_lidar: shape=(n_valid_bbox, 7)
        name: shape=(n_valid_bbox, )
    '''
    n_total_bbox = len(dimensions)
    n_valid_bbox = sum(1 for item in name if item != 'DontCare')
    # NOTE(review): the prefix slice assumes 'DontCare' entries are sorted to
    # the end of the annotation arrays — verify against the label files
    location, dimensions = location[:n_valid_bbox], dimensions[:n_valid_bbox]
    rotation_y, name = rotation_y[:n_valid_bbox], name[:n_valid_bbox]
    bboxes_camera = np.concatenate([location, dimensions, rotation_y[:, None]], axis=1)
    bboxes_lidar = bbox_camera2lidar(bboxes_camera, tr_velo_to_cam, r0_rect)
    faces = group_rectangle_vertexs(bbox3d2corners(bboxes_lidar))
    surfaces = group_plane_equation(faces)
    indices = points_in_bboxes(points[:, :3], surfaces)  # (N, n_valid_bbox)
    return indices, n_total_bbox, n_valid_bbox, bboxes_lidar, name
def get_points_num_in_bbox(points, r0_rect, tr_velo_to_cam, dimensions, location, rotation_y, name):
    '''
    Count how many lidar points fall inside each annotated box.
    points: shape=(N, 4)
    r0_rect: shape=(4, 4)
    tr_velo_to_cam: shape=(4, 4)
    dimensions: shape=(n, 3)
    location: shape=(n, 3)
    rotation_y: shape=(n, )
    name: shape=(n, )
    return: shape=(n, ), int32; 'DontCare' annotations get the sentinel -1
    '''
    indices, n_total_bbox, n_valid_bbox, _, _ = points_in_bboxes_v2(
        points=points,
        r0_rect=r0_rect,
        tr_velo_to_cam=tr_velo_to_cam,
        dimensions=dimensions,
        location=location,
        rotation_y=rotation_y,
        name=name)
    valid_counts = np.sum(indices, axis=0)
    # pad the tail with -1 for the 'DontCare' annotations
    invalid_counts = [-1] * (n_total_bbox - n_valid_bbox)
    return np.array(np.concatenate([valid_counts, invalid_counts], axis=0), dtype=np.int32)
# Modified from https://github.com/open-mmlab/mmdetection3d/blob/f45977008a52baaf97640a0e9b2bbe5ea1c4be34/mmdet3d/core/bbox/box_np_ops.py#L609
def remove_outside_points(points, r0_rect, tr_velo_to_cam, P2, image_shape):
    """Remove points which are outside of image.
    Args:
        points (np.ndarray, shape=[N, 3+dims]): Total points.
        r0_rect (np.ndarray, shape=[4, 4]): Rectification matrix.
        tr_velo_to_cam (np.ndarray, shape=[4, 4]): Matrix to project points in
            lidar coordinates to camera coordinates.
        P2 (np.ndarray, shape=[4, 4]): Intrinsics of Camera2.
        image_shape (list[int]): Shape of image, (h, w).
    Returns:
        np.ndarray, shape=[N, 3+dims]: Filtered points.
    """
    # 5x faster than remove_outside_points_v1(2ms vs 10ms)
    C, R, T = projection_matrix_to_CRT_kitti(P2)
    # build the camera frustum of the full image, then undo [R|T] to obtain
    # the corners in (unrectified) camera coordinates
    image_bbox = [0, 0, image_shape[1], image_shape[0]]
    frustum = get_frustum(image_bbox, C)
    frustum -= T
    frustum = np.linalg.inv(R) @ frustum.T
    # move the frustum corners into lidar coordinates and treat them as one "box"
    frustum = points_camera2lidar(frustum.T[None, ...], tr_velo_to_cam, r0_rect)  # (1, 8, 3)
    surfaces = group_plane_equation(group_rectangle_vertexs(frustum))
    inside = points_in_bboxes(points[:, :3], surfaces)  # (N, 1)
    return points[inside.reshape([-1])]
# Copied from https://github.com/open-mmlab/mmdetection3d/blob/f45977008a52baaf97640a0e9b2bbe5ea1c4be34/mmdet3d/core/bbox/box_np_ops.py#L609
def projection_matrix_to_CRT_kitti(proj):
    """Split a kitti projection matrix into intrinsics and extrinsics.
    P = C @ [R|T], with C upper triangular. Inverting C @ R and applying a QR
    decomposition recovers both factors stably for all kitti camera matrices.
    Args:
        proj (np.ndarray, shape=[4, 4]): Projection matrix of a camera.
    Returns:
        tuple[np.ndarray]: C (3, 3), R (3, 3) and T (3, ).
    """
    CR, CT = proj[0:3, 0:3], proj[0:3, 3]
    # (C @ R)^-1 = R^-1 @ C^-1: QR yields the orthogonal factor R^-1 and
    # the triangular factor C^-1
    Rinv, Cinv = np.linalg.qr(np.linalg.inv(CR))
    C = np.linalg.inv(Cinv)
    R = np.linalg.inv(Rinv)
    T = Cinv @ CT
    return C, R, T
# Copied from https://github.com/open-mmlab/mmdetection3d/blob/f45977008a52baaf97640a0e9b2bbe5ea1c4be34/mmdet3d/core/bbox/box_np_ops.py#L661
def get_frustum(bbox_image, C, near_clip=0.001, far_clip=100):
    """Get frustum corners in camera coordinates.
    Args:
        bbox_image (list[int]): box in image coordinates.
        C (np.ndarray): Intrinsics.
        near_clip (float, optional): Nearest distance of frustum.
            Defaults to 0.001.
        far_clip (float, optional): Farthest distance of frustum.
            Defaults to 100.
    Returns:
        np.ndarray, shape=[8, 3]: coordinates of frustum corners
            (4 on the near plane, then 4 on the far plane).
    """
    fku, fkv = C[0, 0], -C[1, 1]
    u0v0 = C[0:2, 2]
    x1, y1, x2, y2 = bbox_image
    box_corners = np.array([[x1, y1], [x1, y2], [x2, y2], [x2, y1]],
                           dtype=C.dtype)
    # unproject the image-box corners onto the near and far planes
    near_corners = (box_corners - u0v0) / np.array(
        [fku / near_clip, -fkv / near_clip], dtype=C.dtype)
    far_corners = (box_corners - u0v0) / np.array(
        [fku / far_clip, -fkv / far_clip], dtype=C.dtype)
    depths = np.array([near_clip] * 4 + [far_clip] * 4, dtype=C.dtype)[:, np.newaxis]
    corners_xy = np.concatenate([near_corners, far_corners], axis=0)  # [8, 2]
    return np.concatenate([corners_xy, depths], axis=1)
{
"class_name" : "PinholeCameraParameters",
"extrinsic" :
[
0.013862749108655318,
-0.99931910700250171,
0.034192785304405081,
0,
-0.99751226840734264,
-0.011457761025003947,
0.069555690558944339,
0,
-0.069116557813509449,
-0.035071955919460274,
-0.99699190535530191,
0,
7.0392596000563916,
14.768969974020909,
101.08753655485596,
1
],
"intrinsic" :
{
"height" : 1056,
"intrinsic_matrix" : [ 914.52282639636724, 0, 0, 0, 914.52282639636724, 0, 927, 527.5, 1 ],
"width" : 1855
},
"version_major" : 1,
"version_minor" : 0
}
\ No newline at end of file
import cv2
import numpy as np
import open3d as o3d
import os
from pointpillars.utils import bbox3d2corners
# per-class colors (0-1 floats) used for open3d line sets; the last entry is
# the fallback for out-of-range labels
COLORS = [[1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 1, 0]]
# per-class colors (0-255 ints) used for cv2 drawing — presumably BGR order
# per opencv convention, mirroring COLORS; verify against the rendering
COLORS_IMG = [[0, 0, 255], [0, 255, 0], [255, 0, 0], [0, 255, 255]]
# pairs of corner indices forming the 12 wireframe edges of a 3d box,
# matching the corner ordering produced by bbox3d2corners
LINES = [
    [0, 1],
    [1, 2],
    [2, 3],
    [3, 0],
    [4, 5],
    [5, 6],
    [6, 7],
    [7, 4],
    [2, 6],
    [7, 3],
    [1, 5],
    [4, 0]
]
def npy2ply(npy):
    '''
    Build an open3d point cloud from an (N, >=4) array of xyz + intensity.
    The 4th channel is mapped to a grayscale color per point.
    '''
    cloud = o3d.geometry.PointCloud()
    cloud.points = o3d.utility.Vector3dVector(npy[:, :3])
    intensity = npy[:, 3]
    cloud.colors = o3d.utility.Vector3dVector([[v, v, v] for v in intensity])
    return cloud
def ply2npy(ply):
    '''Return the point coordinates of an open3d point cloud as an (N, 3) array.'''
    pts = ply.points
    return np.array(pts)
def bbox_obj(points, color=(1, 0, 0)):
    '''
    Build an open3d LineSet wireframe for one 3d box.
    points: (8, 3) corner coordinates, ordered to match LINES
    color: RGB triple in [0, 1] applied to every edge
    return: o3d.geometry.LineSet
    '''
    # immutable tuple default instead of the original mutable list default,
    # which would be shared across calls
    edge_colors = [list(color) for _ in range(len(LINES))]
    line_set = o3d.geometry.LineSet(
        points=o3d.utility.Vector3dVector(points),
        lines=o3d.utility.Vector2iVector(LINES),
    )
    line_set.colors = o3d.utility.Vector3dVector(edge_colors)
    return line_set
def vis_core(plys):
    '''
    Open an open3d window, add all geometries and block until it is closed.
    The camera viewpoint is restored from viewpoint.json next to this file.
    '''
    visualizer = o3d.visualization.Visualizer()
    visualizer.create_window()
    here = os.path.dirname(os.path.abspath(__file__))
    view_ctl = visualizer.get_view_control()
    cam_param = o3d.io.read_pinhole_camera_parameters(os.path.join(here, 'viewpoint.json'))
    for geometry in plys:
        visualizer.add_geometry(geometry)
    view_ctl.convert_from_pinhole_camera_parameters(cam_param)
    visualizer.run()
    visualizer.destroy_window()
def vis_pc(pc, bboxes=None, labels=None):
    '''
    Visualize a point cloud with optional 3d boxes.
    pc: open3d point cloud or np.ndarray (N, 4)
    bboxes: np.ndarray, (n, 7) or (n, 8, 3)
    labels: (n, ); selects the box color, None draws every box yellow
    '''
    if isinstance(pc, np.ndarray):
        pc = npy2ply(pc)
    axes = o3d.geometry.TriangleMesh.create_coordinate_frame(
        size=10, origin=[0, 0, 0])
    if bboxes is None:
        vis_core([pc, axes])
        return
    if len(bboxes.shape) == 2:
        # (n, 7) boxes -> (n, 8, 3) corner representation
        bboxes = bbox3d2corners(bboxes)
    geometries = [pc, axes]
    for i in range(len(bboxes)):
        if labels is None:
            color = [1, 1, 0]
        elif 0 <= labels[i] < 3:
            color = COLORS[labels[i]]
        else:
            # out-of-range label falls back to the last color
            color = COLORS[-1]
        geometries.append(bbox_obj(bboxes[i], color=color))
    vis_core(geometries)
def vis_img_3d(img, image_points, labels, rt=True):
    '''
    Draw projected 3d box wireframes onto an image.
    img: (h, w, 3)
    image_points: (n, 8, 2), projected box corners
    labels: (n, ); selects the drawing color
    rt: if True return the drawn image, otherwise show it in a blocking window
    '''
    for i in range(len(image_points)):
        corners = image_points[i]  # (8, 2)
        label = labels[i]
        # out-of-range label falls back to the last color
        color = COLORS_IMG[label] if 0 <= label < 3 else COLORS_IMG[-1]
        for start, end in LINES:
            p1 = (int(corners[start][0]), int(corners[start][1]))
            p2 = (int(corners[end][0]), int(corners[end][1]))
            cv2.line(img, p1, p2, color, 1)
    if rt:
        return img
    cv2.imshow('bbox', img)
    cv2.waitKey(0)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment