Commit d1aac35d authored by zhangwenwei

Initial commit
import numpy as np
import torch
def scatter_nd(indices, updates, shape):
"""pytorch edition of tensorflow scatter_nd.
this function don't contain except handle code. so use this carefully
when indice repeats, don't support repeat add which is supported
in tensorflow.
"""
ret = torch.zeros(*shape, dtype=updates.dtype, device=updates.device)
ndim = indices.shape[-1]
output_shape = list(indices.shape[:-1]) + shape[indices.shape[-1]:]
flatted_indices = indices.view(-1, ndim)
slices = [flatted_indices[:, i] for i in range(ndim)]
slices += [Ellipsis]
ret[slices] = updates.view(*output_shape)
return ret
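# Usage sketch (illustrative only, not part of the original file; shapes and
# values below are hypothetical):
#
#   indices = torch.tensor([[0, 1], [2, 3]], dtype=torch.long)  # two (row, col) sites
#   updates = torch.randn(2, 8)                                 # one feature vector per site
#   dense = scatter_nd(indices, updates, [4, 5, 8])             # zeros everywhere else
#   assert dense.shape == (4, 5, 8) and torch.equal(dense[0, 1], updates[0])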
class SparseConvTensor(object):
def __init__(self,
features,
indices,
spatial_shape,
batch_size,
grid=None):
"""
Args:
grid: pre-allocated grid tensor. Should be used when the volume
of the spatial shape is very large.
"""
self.features = features
self.indices = indices
if self.indices.dtype != torch.int32:
self.indices = self.indices.int()
self.spatial_shape = spatial_shape
self.batch_size = batch_size
self.indice_dict = {}
self.grid = grid
@property
def spatial_size(self):
return np.prod(self.spatial_shape)
def find_indice_pair(self, key):
if key is None:
return None
if key in self.indice_dict:
return self.indice_dict[key]
return None
def dense(self, channels_first=True):
output_shape = [self.batch_size] + list(
self.spatial_shape) + [self.features.shape[1]]
res = scatter_nd(self.indices.long(), self.features, output_shape)
if not channels_first:
return res
ndim = len(self.spatial_shape)
trans_params = list(range(0, ndim + 1))
trans_params.insert(1, ndim + 1)
return res.permute(*trans_params).contiguous()
@property
def sparity(self):
return (self.indices.shape[0] / np.prod(self.spatial_shape) /
self.batch_size)
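# Usage sketch (illustrative only, not part of the original file). The indices
# follow spconv's [batch_idx, z, y, x] layout, so dense() can scatter the
# features back into a dense [batch, C, *spatial_shape] tensor:
#
#   features = torch.randn(2, 4)                          # two active sites, 4 channels
#   indices = torch.tensor([[0, 1, 2, 3],
#                           [1, 0, 0, 0]], dtype=torch.int32)
#   x = SparseConvTensor(features, indices, spatial_shape=[8, 8, 8], batch_size=2)
#   dense = x.dense()                                      # shape [2, 4, 8, 8, 8]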
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
class TestCase(unittest.TestCase):
def _GetNdArray(self, a):
if not isinstance(a, np.ndarray):
a = np.array(a)
return a
def assertAllEqual(self, a, b):
"""Asserts that two numpy arrays have the same values.
Args:
a: the expected numpy ndarray or anything that can be converted to one.
b: the actual numpy ndarray or anything that can be converted to one.
"""
a = self._GetNdArray(a)
b = self._GetNdArray(b)
self.assertEqual(
a.shape, b.shape,
'Shape mismatch: expected %s, got %s.' % (a.shape, b.shape))
same = (a == b)
if a.dtype == np.float32 or a.dtype == np.float64:
same = np.logical_or(same,
np.logical_and(np.isnan(a), np.isnan(b)))
if not np.all(same):
# Prints more details than np.testing.assert_array_equal.
diff = np.logical_not(same)
if a.ndim:
x = a[np.where(diff)]
y = b[np.where(diff)]
print('not equal where = ', np.where(diff))
else:
# np.where is broken for scalars
x, y = a, b
print('not equal lhs = ', x)
print('not equal rhs = ', y)
np.testing.assert_array_equal(a, b)
def assertAllClose(self, a, b, rtol=1e-6, atol=1e-6):
"""Asserts that two numpy arrays, or dicts of same, have near values.
This does not support nested dicts.
Args:
a: The expected numpy ndarray (or anything that can be converted to
one), or dict of same. Must be a dict iff `b` is a dict.
b: The actual numpy ndarray (or anything that can be converted to
one), or dict of same. Must be a dict iff `a` is a dict.
rtol: relative tolerance.
atol: absolute tolerance.
Raises:
ValueError: if only one of `a` and `b` is a dict.
"""
is_a_dict = isinstance(a, dict)
if is_a_dict != isinstance(b, dict):
raise ValueError("Can't compare dict to non-dict, %s vs %s." %
(a, b))
if is_a_dict:
self.assertCountEqual(
a.keys(),
b.keys(),
msg='mismatched keys, expected %s, got %s' %
(a.keys(), b.keys()))
for k in a:
self._assertArrayLikeAllClose(
a[k],
b[k],
rtol=rtol,
atol=atol,
msg='%s: expected %s, got %s.' % (k, a, b))
else:
self._assertArrayLikeAllClose(a, b, rtol=rtol, atol=atol)
def _assertArrayLikeAllClose(self, a, b, rtol=1e-6, atol=1e-6, msg=None):
a = self._GetNdArray(a)
b = self._GetNdArray(b)
self.assertEqual(
a.shape, b.shape,
'Shape mismatch: expected %s, got %s.' % (a.shape, b.shape))
if not np.allclose(a, b, rtol=rtol, atol=atol):
# Prints more details than np.testing.assert_allclose.
#
# NOTE: numpy.allclose (and numpy.testing.assert_allclose)
# checks whether two arrays are element-wise equal within a
# tolerance. The relative difference (rtol * abs(b)) and the
# absolute difference atol are added together to compare against
# the absolute difference between a and b. Here, we want to
# print out which elements violate such conditions.
cond = np.logical_or(
np.abs(a - b) > atol + rtol * np.abs(b),
np.isnan(a) != np.isnan(b))
if a.ndim:
x = a[np.where(cond)]
y = b[np.where(cond)]
print('not close where = ', np.where(cond))
else:
# np.where is broken for scalars
x, y = a, b
print('not close lhs = ', x)
print('not close rhs = ', y)
print('not close dif = ', np.abs(x - y))
print('not close tol = ', atol + rtol * np.abs(y))
print('dtype = %s, shape = %s' % (a.dtype, a.shape))
np.testing.assert_allclose(a, b, rtol=rtol, atol=atol, err_msg=msg)
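# Usage sketch (illustrative only, not part of the original file): the helpers
# above are meant to be used from a subclass, e.g.
#
#   class TestScatterND(TestCase):
#       def test_close(self):
#           a = np.arange(6, dtype=np.float32).reshape(2, 3)
#           self.assertAllEqual(a, a.copy())
#           self.assertAllClose(a, a + 1e-8)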
def params_grid(*params):
size = len(params)
length = 1
for p in params:
length *= len(p)
sizes = [len(p) for p in params]
counter = [0] * size
total = []
for i in range(length):
total.append([0] * size)
for i in range(length):
for j in range(size):
total[i][j] = params[j][counter[j]]
counter[size - 1] += 1
for c in range(size - 1, -1, -1):
if (counter[c] == sizes[c] and c > 0):
counter[c - 1] += 1
counter[c] = 0
return total
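# Usage sketch (illustrative only, not part of the original file): params_grid
# builds the Cartesian product of the given parameter lists, e.g.
#
#   params_grid([1, 2], ['a', 'b'])
#   # -> [[1, 'a'], [1, 'b'], [2, 'a'], [2, 'b']]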
def generate_sparse_data(shape,
num_points,
num_channels,
integer=False,
data_range=(-1, 1),
with_dense=True,
dtype=np.float32):
dense_shape = shape
ndim = len(dense_shape)
# num_points = np.random.randint(10, 100, size=[batch_size, ndim])
num_points = np.array(num_points)
# num_points = np.array([3, 2])
batch_size = len(num_points)
batch_indices = []
coors_total = np.stack(
np.meshgrid(*[np.arange(0, s) for s in shape]), axis=-1)
coors_total = coors_total.reshape(-1, ndim)
for i in range(batch_size):
np.random.shuffle(coors_total)
inds_total = coors_total[:num_points[i]]
inds_total = np.pad(
inds_total, ((0, 0), (0, 1)), mode='constant', constant_values=i)
batch_indices.append(inds_total)
if integer:
sparse_data = np.random.randint(
data_range[0],
data_range[1],
size=[num_points.sum(), num_channels]).astype(dtype)
else:
sparse_data = np.random.uniform(
data_range[0],
data_range[1],
size=[num_points.sum(), num_channels]).astype(dtype)
res = {
'features': sparse_data.astype(dtype),
}
if with_dense:
dense_data = np.zeros([batch_size, num_channels, *dense_shape],
dtype=sparse_data.dtype)
start = 0
for i, inds in enumerate(batch_indices):
for j, ind in enumerate(inds):
dense_slice = (i, slice(None), *ind[:-1])
dense_data[dense_slice] = sparse_data[start + j]
start += len(inds)
res['features_dense'] = dense_data.astype(dtype)
batch_indices = np.concatenate(batch_indices, axis=0)
res['indices'] = batch_indices.astype(np.int32)
return res
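# Usage sketch (illustrative only, not part of the original file; shapes below
# are hypothetical):
#
#   data = generate_sparse_data([10, 10, 10], num_points=[5, 7], num_channels=4)
#   data['features']        # [12, 4] float32 features of all points
#   data['indices']         # [12, 4] int32 coordinates, last column is the batch index
#   data['features_dense']  # [2, 4, 10, 10, 10] dense tensor holding the same values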
import torch
import torch.distributed as dist
import torch.nn as nn
from torch.autograd.function import Function
class AllReduce(Function):
@staticmethod
def forward(ctx, input):
input_list = [
torch.zeros_like(input) for k in range(dist.get_world_size())
]
# Use allgather instead of allreduce: in-place operations are unreliable
dist.all_gather(input_list, input, async_op=False)
inputs = torch.stack(input_list, dim=0)
return torch.sum(inputs, dim=0)
@staticmethod
def backward(ctx, grad_output):
dist.all_reduce(grad_output, async_op=False)
return grad_output
class NaiveSyncBatchNorm1d(nn.BatchNorm1d):
"""Syncronized Batch Normalization for 3D Tensors
Note:
This implementation is modified from
https://github.com/facebookresearch/detectron2/
`torch.nn.SyncBatchNorm` has known unknown bugs.
It produces significantly worse AP (and sometimes goes NaN)
when the batch size on each worker is quite different
(e.g., when scale augmentation is used).
In 3D detection, different workers has points of different shapes,
whish also cause instability.
Use this implementation before `nn.SyncBatchNorm` is fixed.
It is slower than `nn.SyncBatchNorm`.
"""
def forward(self, input):
if dist.get_world_size() == 1 or not self.training:
return super().forward(input)
assert input.shape[0] > 0, 'SyncBN does not support empty inputs'
C = input.shape[1]
mean = torch.mean(input, dim=[0, 2])
meansqr = torch.mean(input * input, dim=[0, 2])
vec = torch.cat([mean, meansqr], dim=0)
vec = AllReduce.apply(vec) * (1.0 / dist.get_world_size())
mean, meansqr = torch.split(vec, C)
var = meansqr - mean * mean
self.running_mean += self.momentum * (
mean.detach() - self.running_mean)
self.running_var += self.momentum * (var.detach() - self.running_var)
invstd = torch.rsqrt(var + self.eps)
scale = self.weight * invstd
bias = self.bias - mean * scale
scale = scale.reshape(1, -1, 1)
bias = bias.reshape(1, -1, 1)
return input * scale + bias
class NaiveSyncBatchNorm2d(nn.BatchNorm2d):
"""Syncronized Batch Normalization for 4D Tensors
Note:
This implementation is modified from
https://github.com/facebookresearch/detectron2/
`torch.nn.SyncBatchNorm` has known unknown bugs.
It produces significantly worse AP (and sometimes goes NaN)
when the batch size on each worker is quite different
(e.g., when scale augmentation is used).
This phenomenon also occurs when the multi-modality feature fusion
modules of multi-modality detectors use SyncBN.
Use this implementation before `nn.SyncBatchNorm` is fixed.
It is slower than `nn.SyncBatchNorm`.
"""
def forward(self, input):
if dist.get_world_size() == 1 or not self.training:
return super().forward(input)
assert input.shape[0] > 0, 'SyncBN does not support empty inputs'
C = input.shape[1]
mean = torch.mean(input, dim=[0, 2, 3])
meansqr = torch.mean(input * input, dim=[0, 2, 3])
vec = torch.cat([mean, meansqr], dim=0)
vec = AllReduce.apply(vec) * (1.0 / dist.get_world_size())
mean, meansqr = torch.split(vec, C)
var = meansqr - mean * mean
self.running_mean += self.momentum * (
mean.detach() - self.running_mean)
self.running_var += self.momentum * (var.detach() - self.running_var)
invstd = torch.rsqrt(var + self.eps)
scale = self.weight * invstd
bias = self.bias - mean * scale
scale = scale.reshape(1, -1, 1, 1)
bias = bias.reshape(1, -1, 1, 1)
return input * scale + bias
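# Usage sketch (illustrative only, not part of the original file): both layers are
# drop-in replacements for torch.nn.BatchNorm1d / BatchNorm2d. Note that forward()
# calls dist.get_world_size(), so torch.distributed is assumed to be initialized:
#
#   bn = NaiveSyncBatchNorm2d(64)      # same constructor arguments as nn.BatchNorm2d
#   x = torch.randn(8, 64, 32, 32)
#   y = bn(x)                          # [8, 64, 32, 32], statistics synced across workers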
from .scatter_points import DynamicScatter, dynamic_scatter
from .voxelize import Voxelization, voxelization
__all__ = ['Voxelization', 'voxelization', 'dynamic_scatter', 'DynamicScatter']
import torch
from torch import nn
from torch.autograd import Function
from .voxel_layer import (dynamic_point_to_voxel_backward,
dynamic_point_to_voxel_forward)
class _dynamic_scatter(Function):
@staticmethod
def forward(ctx, points, coors, voxel_size, coors_range):
"""convert kitti points(N, >=3) to voxels.
Args:
points: [N, ndim] float tensor. points[:, :3] contain xyz
points and points[:, 3:] contain other information
such as reflectivity.
voxel_size: [3] list/tuple or array, float. xyz, indicate
voxel size
coors_range: [6] list/tuple or array, float. indicate voxel range.
format: xyzxyz, minmax
max_points: int. indicate maximum points contained in a voxel.
if max_points=-1, it means using dynamic_voxelize
max_voxels: int. indicate maximum voxels this function create.
for second, 20000 is a good choice. you should shuffle
points before call this function because max_voxels may
drop some points.
Returns:
tuple
voxels: [M, max_points, ndim] float tensor. only contain points
and returned when max_points != -1.
coordinates: [M, 3] int32 tensor, always returned.
num_points_per_voxel: [M] int32 tensor. Only returned when
max_points != -1.
"""
results = dynamic_point_to_voxel_forward(points, coors, voxel_size,
coors_range)
(voxels, voxel_coors, num_points_per_voxel, point_to_voxelidx,
coor_to_voxelidx) = results
ctx.save_for_backward(num_points_per_voxel, point_to_voxelidx,
coor_to_voxelidx)
return voxels, voxel_coors, num_points_per_voxel.float()
@staticmethod
def backward(ctx,
grad_output_voxel,
grad_output_voxel_coors=None,
grad_output_num_points=None):
(num_points_per_voxel, point_to_voxelidx,
coor_to_voxelidx) = ctx.saved_tensors
# grad_output_voxel shape: NxMxC
num_points = point_to_voxelidx.size(0)
num_features = grad_output_voxel.size(-1)
grad_points = grad_output_voxel.new_zeros(
size=(num_points, num_features))
# TODO: whether to use index put or use cuda_backward
# To use index put, need point to voxel index
dynamic_point_to_voxel_backward(grad_points,
grad_output_voxel.contiguous(),
point_to_voxelidx, coor_to_voxelidx)
return grad_points, None, None, None
dynamic_scatter = _dynamic_scatter.apply
class DynamicScatter(nn.Module):
def __init__(self, voxel_size, point_cloud_range, average_points: bool):
super(DynamicScatter, self).__init__()
"""Scatters points into voxels, used in the voxel encoder with
dynamic voxelization
**Note**: The CPU and GPU implementation get the same output, but
have numerical difference after summation and division (e.g., 5e-7).
Args:
average_points (bool): whether to use avg pooling to scatter
points into voxel voxel_size (list): list [x, y, z] size
of three dimension
point_cloud_range (list):
[x_min, y_min, z_min, x_max, y_max, z_max]
"""
self.voxel_size = voxel_size
self.point_cloud_range = point_cloud_range
self.average_points = average_points
def forward_single(self, points, coors):
voxels, voxel_coors, num_points = dynamic_scatter(
points.contiguous(), coors.contiguous(), self.voxel_size,
self.point_cloud_range)
if not self.average_points:
voxels = torch.max(voxels, dim=1)[0] # voxels: NxMxC -> NxC
else:
voxels = (
voxels.sum(dim=1, keepdim=False).div(num_points.view(-1, 1)))
return voxels, voxel_coors
def forward(self, points, coors):
"""
Args:
input: NC points
"""
if coors.size(-1) == 3:
return self.forward_single(points, coors)
else:
batch_size = coors[-1, 0] + 1
voxels, voxel_coors = [], []
for i in range(batch_size):
inds = torch.where(coors[:, 0] == i)
voxel, voxel_coor = self.forward_single(
points[inds], coors[inds][:, 1:])
coor_pad = nn.functional.pad(
voxel_coor, (1, 0), mode='constant', value=i)
voxel_coors.append(coor_pad)
voxels.append(voxel)
features = torch.cat(voxels, dim=0)
feature_coors = torch.cat(voxel_coors, dim=0)
return features, feature_coors
def __repr__(self):
tmpstr = self.__class__.__name__ + '('
tmpstr += 'voxel_size=' + str(self.voxel_size)
tmpstr += ', point_cloud_range=' + str(self.point_cloud_range)
tmpstr += ', average_points=' + str(self.average_points)
tmpstr += ')'
return tmpstr
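# Usage sketch (illustrative only, not part of the original file; the numbers are
# hypothetical). `coors` is typically produced by the dynamic mode of the
# Voxelization layer defined in voxelize.py:
#
#   scatter = DynamicScatter(voxel_size=[0.2, 0.2, 4],
#                            point_cloud_range=[0, -40, -3, 70.4, 40, 1],
#                            average_points=True)
#   # points: [N, C] float tensor; coors: [N, 3] (single sample) or [N, 4] (batched)
#   voxel_feats, voxel_coors = scatter(points, coors)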
#include <torch/extension.h>
#include <ATen/TensorUtils.h>
// #include "voxelization.h"
namespace {
template <typename T_int>
void determin_max_points_kernel(torch::TensorAccessor<T_int,2> coor,
torch::TensorAccessor<T_int,1> point_to_voxelidx,
torch::TensorAccessor<T_int,1> num_points_per_voxel,
torch::TensorAccessor<T_int,3> coor_to_voxelidx,
int& voxel_num,
int& max_points,
const int num_points
) {
int voxelidx, num;
for (int i = 0; i < num_points; ++i) {
if (coor[i][0] == -1)
continue;
voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];
// record voxel
if (voxelidx == -1) {
voxelidx = voxel_num;
voxel_num += 1;
coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]] = voxelidx;
}
// put points into voxel
num = num_points_per_voxel[voxelidx];
point_to_voxelidx[i] = num;
num_points_per_voxel[voxelidx] += 1;
// update max points per voxel
max_points = std::max(max_points, num+1);
}
return;
}
template <typename T, typename T_int>
void scatter_point_to_voxel_kernel(
const torch::TensorAccessor<T,2> points,
torch::TensorAccessor<T_int,2> coor,
torch::TensorAccessor<T_int,1> point_to_voxelidx,
torch::TensorAccessor<T_int,3> coor_to_voxelidx,
torch::TensorAccessor<T,3> voxels,
torch::TensorAccessor<T_int,2> voxel_coors,
const int num_features,
const int num_points,
const int NDim
){
for (int i = 0; i < num_points; ++i) {
int num = point_to_voxelidx[i];
int voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];
for (int k = 0; k < num_features; ++k) {
voxels[voxelidx][num][k] = points[i][k];
}
for (int k = 0; k < NDim; ++k) {
voxel_coors[voxelidx][k] = coor[i][k];
}
}
}
} // namespace
namespace voxelization {
std::vector<at::Tensor> dynamic_point_to_voxel_cpu(
const at::Tensor& points,
const at::Tensor& voxel_mapping,
const std::vector<float> voxel_size,
const std::vector<float> coors_range) {
// current version takes about 0.02s~0.03s for one frame on cpu
// check device
AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");
const int NDim = voxel_mapping.size(1);
const int num_points = points.size(0);
const int num_features = points.size(1);
std::vector<int> grid_size(NDim);
for (int i = 0; i < NDim; ++i) {
grid_size[i] = round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
at::Tensor num_points_per_voxel = at::zeros({num_points,}, voxel_mapping.options());
at::Tensor coor_to_voxelidx = -at::ones({grid_size[2], grid_size[1], grid_size[0]}, voxel_mapping.options());
at::Tensor point_to_voxelidx = -at::ones({num_points,}, voxel_mapping.options());
int voxel_num = 0;
int max_points = 0;
AT_DISPATCH_ALL_TYPES(voxel_mapping.type(), "determin_max_point", [&] {
determin_max_points_kernel<scalar_t>(
voxel_mapping.accessor<scalar_t,2>(),
point_to_voxelidx.accessor<scalar_t,1>(),
num_points_per_voxel.accessor<scalar_t,1>(),
coor_to_voxelidx.accessor<scalar_t,3>(),
voxel_num,
max_points,
num_points
);
});
at::Tensor voxels = at::zeros({voxel_num, max_points, num_features}, points.options());
at::Tensor voxel_coors = at::zeros({voxel_num, NDim}, points.options().dtype(at::kInt));
AT_DISPATCH_ALL_TYPES(points.type(), "scatter_point_to_voxel", [&] {
scatter_point_to_voxel_kernel<scalar_t, int>(
points.accessor<scalar_t,2>(),
voxel_mapping.accessor<int,2>(),
point_to_voxelidx.accessor<int,1>(),
coor_to_voxelidx.accessor<int,3>(),
voxels.accessor<scalar_t,3>(),
voxel_coors.accessor<int,2>(),
num_features,
num_points,
NDim
);
});
at::Tensor num_points_per_voxel_out = num_points_per_voxel.slice(/*dim=*/0, /*start=*/0, /*end=*/voxel_num);
return {voxels, voxel_coors, num_points_per_voxel_out};
}
}
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <torch/types.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
#define CHECK_CUDA(x) \
TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
namespace {
int const threadsPerBlock = sizeof(unsigned long long) * 8;
}
template <typename T, typename T_int>
__global__ void scatter_point_to_voxel_kernel(
const T* points, T_int* coor, T_int* point_to_voxelidx,
T_int* coor_to_voxelidx, T* voxels, T_int* coors, const int num_features,
const int num_points, const int max_points, const int NDim) {
const int index = blockIdx.x * threadsPerBlock + threadIdx.x;
if (index >= num_points) return;
int num = point_to_voxelidx[index];
int voxelidx = coor_to_voxelidx[index];
if (num > -1 && voxelidx > -1) {
const int feature_per_thread = num_features / 4;
int start = threadIdx.y * feature_per_thread;
auto voxels_offset =
voxels + voxelidx * max_points * num_features + num * num_features;
auto points_offset = points + index * num_features;
for (int k = start; k < start + feature_per_thread; k++) {
voxels_offset[k] = points_offset[k];
}
if (num == 0 && start < NDim) {
auto coors_offset = coors + voxelidx * NDim;
auto coor_offset = coor + index * NDim;
for (int k = start; k < NDim; k++) {
coors_offset[k] = coor_offset[k];
}
}
}
}
template <typename T, typename T_int>
__global__ void map_voxel_to_point_kernel(
T* points, T* voxels, T_int* point_to_voxelidx, T_int* coor_to_voxelidx,
const int num_features, const int num_points, const int max_points) {
const int index = blockIdx.x * threadsPerBlock + threadIdx.x;
if (index >= num_points) return;
auto num = point_to_voxelidx[index];
if (num > -1) {
const int feature_per_thread = num_features / 4;
auto voxelidx = coor_to_voxelidx[index];
int start = threadIdx.y * feature_per_thread;
auto voxels_offset =
voxels + voxelidx * max_points * num_features + num * num_features;
auto points_offset = points + index * num_features;
for (int k = start; k < start + feature_per_thread; k++) {
points_offset[k] = voxels_offset[k];
}
}
}
template <typename T_int>
__global__ void point_to_voxelidx_kernel(const T_int* coor,
T_int* point_to_voxelidx,
T_int* point_to_pointidx,
const int num_points, const int NDim) {
const int index = blockIdx.x * threadsPerBlock + threadIdx.x;
auto coor_offset = coor + index * NDim;
// skip invalid points
if ((index >= num_points) || (coor_offset[0] == -1)) return;
int num = 0;
int coor_x = coor_offset[0];
int coor_y = coor_offset[1];
int coor_z = coor_offset[2];
// only calculate the coors before this coor[index]
for (int i = 0; i < index; ++i) {
auto prev_coor = coor + i * NDim;
if (prev_coor[0] == -1) continue;
// record voxel
if ((prev_coor[0] == coor_x) && (prev_coor[1] == coor_y) &&
(prev_coor[2] == coor_z)) {
num++;
if (num == 1) {
point_to_pointidx[index] = i;
}
}
}
if (num == 0) {
point_to_pointidx[index] = index;
}
point_to_voxelidx[index] = num;
}
template <typename T_int>
__global__ void determin_voxel_num(
const T_int* coor, T_int* num_points_per_voxel, T_int* point_to_voxelidx,
T_int* point_to_pointidx, T_int* coor_to_voxelidx, T_int* voxel_num,
T_int* max_points, const int num_points, const int NDim) {
// only calculate the coors before this coor[index]
for (int i = 0; i < num_points; ++i) {
auto coor_offset = coor + i * NDim;
if (coor_offset[0] == -1) continue;
int point_pos_in_voxel = point_to_voxelidx[i];
// record voxel
if (point_pos_in_voxel == -1) {
// out of max_points or invalid point
printf("point_pos_in_voxel == -1, point:%d", i);
continue;
} else if (point_pos_in_voxel == 0) {
// record new voxel
int voxelidx = voxel_num[0];
voxel_num[0] += 1;
coor_to_voxelidx[i] = voxelidx;
num_points_per_voxel[voxelidx] = 1;
} else {
int point_idx = point_to_pointidx[i];
int voxelidx = coor_to_voxelidx[point_idx];
if (voxelidx != -1) {
num_points_per_voxel[voxelidx] += 1;
coor_to_voxelidx[i] = voxelidx;
max_points[0] = max(max_points[0], point_pos_in_voxel + 1);
} else {
printf("voxelidx = -1, point:%d", i);
}
}
}
}
namespace voxelization {
std::vector<at::Tensor> dynamic_point_to_voxel_forward_gpu(
const at::Tensor& points, const at::Tensor& voxel_mapping,
const std::vector<float> voxel_size, const std::vector<float> coors_range) {
CHECK_INPUT(points);
at::cuda::CUDAGuard device_guard(points.device());
const int NDim = voxel_mapping.size(1);
const int num_points = points.size(0);
const int num_features = points.size(1);
std::vector<int> grid_size(NDim);
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
// assume the mapping is already given
auto point_to_pointidx = -at::ones(
{
num_points,
},
voxel_mapping.options());
auto point_to_voxelidx = -at::ones(
{
num_points,
},
voxel_mapping.options());
auto max_points = at::zeros(
{
1,
},
voxel_mapping.options()); // must be zero from the beginning
int col_blocks = at::cuda::ATenCeilDiv(num_points, threadsPerBlock);
dim3 blocks(col_blocks);
dim3 threads(threadsPerBlock);
cudaStream_t map_stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_ALL_TYPES(
voxel_mapping.type(), "determin_duplicate", ([&] {
point_to_voxelidx_kernel<int><<<blocks, threads, 0, map_stream>>>(
voxel_mapping.data_ptr<int>(), point_to_voxelidx.data_ptr<int>(),
point_to_pointidx.data_ptr<int>(), num_points, NDim);
}));
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
// running this logic on the CUDA device accelerates it by about 10x
auto num_points_per_voxel = at::zeros(
{
num_points,
},
voxel_mapping.options());
auto coor_to_voxelidx = -at::ones(
{
num_points,
},
voxel_mapping.options());
auto voxel_num = at::zeros(
{
1,
},
voxel_mapping.options()); // must be zero from the beginning
cudaStream_t logic_stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_ALL_TYPES(
voxel_mapping.type(), "determin_duplicate", ([&] {
determin_voxel_num<int><<<1, 1, 0, logic_stream>>>(
voxel_mapping.data_ptr<int>(), num_points_per_voxel.data_ptr<int>(),
point_to_voxelidx.data_ptr<int>(),
point_to_pointidx.data_ptr<int>(), coor_to_voxelidx.data_ptr<int>(),
voxel_num.data_ptr<int>(), max_points.data_ptr<int>(), num_points,
NDim);
}));
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
// some temporary data
auto max_points_cpu = max_points.to(at::kCPU);
int max_points_int = max_points_cpu.data_ptr<int>()[0];
auto voxel_num_cpu = voxel_num.to(at::kCPU);
int voxel_num_int = voxel_num_cpu.data_ptr<int>()[0];
at::Tensor coors =
at::zeros({voxel_num_int, NDim}, points.options().dtype(at::kInt));
at::Tensor voxels = at::zeros({voxel_num_int, max_points_int, num_features},
points.options());
// copy point features to voxels
dim3 cp_threads(threadsPerBlock, 4);
cudaStream_t cp_stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_ALL_TYPES(
points.type(), "scatter_point_to_voxel", ([&] {
scatter_point_to_voxel_kernel<float, int>
<<<blocks, cp_threads, 0, cp_stream>>>(
points.data_ptr<float>(), voxel_mapping.data_ptr<int>(),
point_to_voxelidx.data_ptr<int>(),
coor_to_voxelidx.data_ptr<int>(), voxels.data_ptr<float>(),
coors.data_ptr<int>(), num_features, num_points, max_points_int,
NDim);
}));
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
at::Tensor num_points_per_voxel_out =
num_points_per_voxel.slice(/*dim=*/0, /*start=*/0, /*end=*/voxel_num_int);
return {voxels, coors, num_points_per_voxel_out, point_to_voxelidx,
coor_to_voxelidx};
}
void dynamic_point_to_voxel_backward_gpu(at::Tensor& grad_input_points,
const at::Tensor& grad_output_voxels,
const at::Tensor& point_to_voxelidx,
const at::Tensor& coor_to_voxelidx) {
CHECK_INPUT(grad_input_points);
CHECK_INPUT(grad_output_voxels);
CHECK_INPUT(point_to_voxelidx);
CHECK_INPUT(coor_to_voxelidx);
at::cuda::CUDAGuard device_guard(grad_input_points.device());
const int num_points = grad_input_points.size(0);
const int num_features = grad_input_points.size(1);
const int max_points = grad_output_voxels.size(1);
// copy voxel grad to points
int col_blocks = at::cuda::ATenCeilDiv(num_points, threadsPerBlock);
dim3 blocks(col_blocks);
dim3 cp_threads(threadsPerBlock, 4);
cudaStream_t cp_stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_ALL_TYPES(grad_input_points.type(), "scatter_point_to_voxel",
([&] {
map_voxel_to_point_kernel<float, int>
<<<blocks, cp_threads, 0, cp_stream>>>(
grad_input_points.data_ptr<float>(),
grad_output_voxels.data_ptr<float>(),
point_to_voxelidx.data_ptr<int>(),
coor_to_voxelidx.data_ptr<int>(),
num_features, num_points, max_points);
}));
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
return;
}
} // namespace voxelization
#include <torch/extension.h>
#include "voxelization.h"
namespace voxelization {
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("hard_voxelize", &hard_voxelize, "hard voxelize");
m.def("dynamic_voxelize", &dynamic_voxelize, "dynamic voxelization");
m.def("dynamic_point_to_voxel_forward", &dynamic_point_to_voxel_forward, "dynamic point to voxel forward");
m.def("dynamic_point_to_voxel_backward", &dynamic_point_to_voxel_backward, "dynamic point to voxel backward");
}
} // namespace voxelization
#pragma once
#include <torch/extension.h>
namespace voxelization {
int hard_voxelize_cpu(const at::Tensor& points, at::Tensor& voxels,
at::Tensor& coors, at::Tensor& num_points_per_voxel,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int max_points, const int max_voxels,
const int NDim = 3);
void dynamic_voxelize_cpu(const at::Tensor& points, at::Tensor& coors,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int NDim = 3);
std::vector<at::Tensor> dynamic_point_to_voxel_cpu(
const at::Tensor& points, const at::Tensor& voxel_mapping,
const std::vector<float> voxel_size, const std::vector<float> coors_range);
#ifdef WITH_CUDA
int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels,
at::Tensor& coors, at::Tensor& num_points_per_voxel,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int max_points, const int max_voxels,
const int NDim = 3);
void dynamic_voxelize_gpu(const at::Tensor& points, at::Tensor& coors,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int NDim = 3);
std::vector<at::Tensor> dynamic_point_to_voxel_forward_gpu(
const at::Tensor& points, const at::Tensor& voxel_mapping,
const std::vector<float> voxel_size, const std::vector<float> coors_range);
void dynamic_point_to_voxel_backward_gpu(at::Tensor& grad_input_points,
const at::Tensor& grad_output_voxels,
const at::Tensor& point_to_voxelidx,
const at::Tensor& coor_to_voxelidx);
#endif
// Interface for Python
inline int hard_voxelize(const at::Tensor& points, at::Tensor& voxels,
at::Tensor& coors, at::Tensor& num_points_per_voxel,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int max_points, const int max_voxels,
const int NDim = 3) {
if (points.type().is_cuda()) {
#ifdef WITH_CUDA
return hard_voxelize_gpu(points, voxels, coors, num_points_per_voxel,
voxel_size, coors_range, max_points, max_voxels,
NDim);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
return hard_voxelize_cpu(points, voxels, coors, num_points_per_voxel,
voxel_size, coors_range, max_points, max_voxels,
NDim);
}
inline void dynamic_voxelize(const at::Tensor& points, at::Tensor& coors,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int NDim = 3) {
if (points.type().is_cuda()) {
#ifdef WITH_CUDA
return dynamic_voxelize_gpu(points, coors, voxel_size, coors_range, NDim);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
return dynamic_voxelize_cpu(points, coors, voxel_size, coors_range, NDim);
}
inline std::vector<torch::Tensor> dynamic_point_to_voxel_forward(
const at::Tensor& points, const at::Tensor& voxel_mapping,
const std::vector<float> voxel_size, const std::vector<float> coors_range) {
if (points.type().is_cuda()) {
#ifdef WITH_CUDA
return dynamic_point_to_voxel_forward_gpu(points, voxel_mapping, voxel_size,
coors_range);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
return dynamic_point_to_voxel_cpu(points, voxel_mapping, voxel_size,
coors_range);
}
inline void dynamic_point_to_voxel_backward(
at::Tensor& grad_input_points, const at::Tensor& grad_output_voxels,
const at::Tensor& point_to_voxelidx, const at::Tensor& coor_to_voxelidx) {
if (grad_input_points.type().is_cuda()) {
#ifdef WITH_CUDA
return dynamic_point_to_voxel_backward_gpu(
grad_input_points, grad_output_voxels, point_to_voxelidx,
coor_to_voxelidx);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
// return dynamic_point_to_voxel_cpu(points,
// voxel_mapping,
// voxel_size,
// coors_range);
}
} // namespace voxelization
#include <torch/extension.h>
#include <ATen/TensorUtils.h>
// #include "voxelization.h"
namespace {
template <typename T, typename T_int>
void dynamic_voxelize_kernel(const torch::TensorAccessor<T,2> points,
torch::TensorAccessor<T_int, 2> coors,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const std::vector<int> grid_size,
const int num_points,
const int num_features,
const int NDim
) {
const int ndim_minus_1 = NDim - 1;
bool failed = false;
int coor[NDim];
int c;
for (int i = 0; i < num_points; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points[i][j] - coors_range[j]) / voxel_size[j]);
// necessary to rm points out of range
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
for (int k = 0; k < NDim; ++k) {
if (failed)
coors[i][k] = -1;
else
coors[i][k] = coor[k];
}
}
return;
}
template <typename T, typename T_int>
void hard_voxelize_kernel(const torch::TensorAccessor<T,2> points,
torch::TensorAccessor<T,3> voxels,
torch::TensorAccessor<T_int,2> coors,
torch::TensorAccessor<T_int,1> num_points_per_voxel,
torch::TensorAccessor<T_int,3> coor_to_voxelidx,
int& voxel_num,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const std::vector<int> grid_size,
const int max_points,
const int max_voxels,
const int num_points,
const int num_features,
const int NDim
) {
// declare a temp coors
at::Tensor temp_coors = at::zeros({num_points, NDim}, at::TensorOptions().dtype(at::kInt).device(at::kCPU));
// First use dynamic voxelization to get coors,
// then check max points/voxels constraints
dynamic_voxelize_kernel<T, int>(
points,
temp_coors.accessor<int,2>(),
voxel_size,
coors_range,
grid_size,
num_points,
num_features,
NDim
);
int voxelidx, num;
auto coor = temp_coors.accessor<int,2>();
for (int i = 0; i < num_points; ++i) {
// T_int* coor = temp_coors.data_ptr<int>() + i * NDim;
if (coor[i][0] == -1)
continue;
voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];
// record voxel
if (voxelidx == -1) {
voxelidx = voxel_num;
if (max_voxels != -1 && voxel_num >= max_voxels)
break;
voxel_num += 1;
coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]] = voxelidx;
for (int k = 0; k < NDim; ++k) {
coors[voxelidx][k] = coor[i][k];
}
}
// put points into voxel
num = num_points_per_voxel[voxelidx];
if (max_points == -1 || num < max_points) {
for (int k = 0; k < num_features; ++k) {
voxels[voxelidx][num][k] = points[i][k];
}
num_points_per_voxel[voxelidx] += 1;
}
}
return;
}
} // namespace
namespace voxelization {
int hard_voxelize_cpu(
const at::Tensor& points,
at::Tensor& voxels,
at::Tensor& coors,
at::Tensor& num_points_per_voxel,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int max_points,
const int max_voxels,
const int NDim=3) {
// current version takes about 0.02s~0.03s for one frame on cpu
// check device
AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");
std::vector<int> grid_size(NDim);
const int num_points = points.size(0);
const int num_features = points.size(1);
for (int i = 0; i < NDim; ++i) {
grid_size[i] = round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
// coors, num_points_per_voxel, coor_to_voxelidx are int Tensor
//printf("cpu coor_to_voxelidx size: [%d, %d, %d]\n", grid_size[2], grid_size[1], grid_size[0]);
at::Tensor coor_to_voxelidx = -at::ones({grid_size[2], grid_size[1], grid_size[0]}, coors.options());
int voxel_num = 0;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(points.type(), "hard_voxelize_forward", [&] {
hard_voxelize_kernel<scalar_t, int>(
points.accessor<scalar_t,2>(),
voxels.accessor<scalar_t,3>(),
coors.accessor<int,2>(),
num_points_per_voxel.accessor<int,1>(),
coor_to_voxelidx.accessor<int,3>(),
voxel_num,
voxel_size,
coors_range,
grid_size,
max_points,
max_voxels,
num_points,
num_features,
NDim
);
});
return voxel_num;
}
void dynamic_voxelize_cpu(
const at::Tensor& points,
at::Tensor& coors,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int NDim=3) {
// check device
AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");
std::vector<int> grid_size(NDim);
const int num_points = points.size(0);
const int num_features = points.size(1);
for (int i = 0; i < NDim; ++i) {
grid_size[i] = round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
// coors, num_points_per_voxel, coor_to_voxelidx are int Tensor
AT_DISPATCH_FLOATING_TYPES_AND_HALF(points.type(), "hard_voxelize_forward", [&] {
dynamic_voxelize_kernel<scalar_t, int>(
points.accessor<scalar_t,2>(),
coors.accessor<int,2>(),
voxel_size,
coors_range,
grid_size,
num_points,
num_features,
NDim
);
});
return;
}
}
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <torch/types.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
#define CHECK_CUDA(x) \
TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
namespace {
int const threadsPerBlock = sizeof(unsigned long long) * 8;
}
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
i += blockDim.x * gridDim.x)
template <typename T, typename T_int>
__global__ void dynamic_voxelize_kernel(
const T* points, T_int* coors, const float voxel_x, const float voxel_y,
const float voxel_z, const float coors_x_min, const float coors_y_min,
const float coors_z_min, const float coors_x_max, const float coors_y_max,
const float coors_z_max, const int grid_x, const int grid_y,
const int grid_z, const int num_points, const int num_features,
const int NDim) {
// const int index = blockIdx.x * threadsPerBlock + threadIdx.x;
CUDA_1D_KERNEL_LOOP(index, num_points) {
// To save some computation
auto points_offset = points + index * num_features;
auto coors_offset = coors + index * NDim;
int c_x = floor((points_offset[0] - coors_x_min) / voxel_x);
if (c_x < 0 || c_x >= grid_x) {
coors_offset[0] = -1;
return;
}
int c_y = floor((points_offset[1] - coors_y_min) / voxel_y);
if (c_y < 0 || c_y >= grid_y) {
coors_offset[0] = -1;
coors_offset[1] = -1;
return;
}
int c_z = floor((points_offset[2] - coors_z_min) / voxel_z);
if (c_z < 0 || c_z >= grid_z) {
coors_offset[0] = -1;
coors_offset[1] = -1;
coors_offset[2] = -1;
} else {
coors_offset[0] = c_z;
coors_offset[1] = c_y;
coors_offset[2] = c_x;
}
}
}
template <typename T, typename T_int>
__global__ void assign_point_to_voxel(const int nthreads, const T* points,
T_int* point_to_voxelidx,
T_int* coor_to_voxelidx, T* voxels,
const int max_points,
const int num_features,
const int num_points, const int NDim) {
CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
// const int index = blockIdx.x * threadsPerBlock + threadIdx.x;
int index = thread_idx / num_features;
int num = point_to_voxelidx[index];
int voxelidx = coor_to_voxelidx[index];
if (num > -1 && voxelidx > -1) {
auto voxels_offset =
voxels + voxelidx * max_points * num_features + num * num_features;
int k = thread_idx % num_features;
voxels_offset[k] = points[thread_idx];
}
}
}
template <typename T, typename T_int>
__global__ void assign_voxel_coors(const int nthreads, T_int* coor,
T_int* point_to_voxelidx,
T_int* coor_to_voxelidx, T_int* voxel_coors,
const int num_points, const int NDim) {
CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
// const int index = blockIdx.x * threadsPerBlock + threadIdx.x;
// if (index >= num_points) return;
int index = thread_idx / NDim;
int num = point_to_voxelidx[index];
int voxelidx = coor_to_voxelidx[index];
if (num == 0 && voxelidx > -1) {
auto coors_offset = voxel_coors + voxelidx * NDim;
int k = thread_idx % NDim;
coors_offset[k] = coor[thread_idx];
}
}
}
template <typename T_int>
__global__ void point_to_voxelidx_kernel(const T_int* coor,
T_int* point_to_voxelidx,
T_int* point_to_pointidx,
const int max_points,
const int max_voxels,
const int num_points, const int NDim) {
CUDA_1D_KERNEL_LOOP(index, num_points) {
auto coor_offset = coor + index * NDim;
// skip invalid points
if ((index >= num_points) || (coor_offset[0] == -1)) return;
int num = 0;
int coor_x = coor_offset[0];
int coor_y = coor_offset[1];
int coor_z = coor_offset[2];
// only calculate the coors before this coor[index]
for (int i = 0; i < index; ++i) {
auto prev_coor = coor + i * NDim;
if (prev_coor[0] == -1) continue;
// Find all previous points that have the same coors;
// if the same coor is found, record it
if ((prev_coor[0] == coor_x) && (prev_coor[1] == coor_y) &&
(prev_coor[2] == coor_z)) {
num++;
if (num == 1) {
// point to the first point that has the same coor
point_to_pointidx[index] = i;
} else if (num >= max_points) {
// out of boundary
return;
}
}
}
if (num == 0) {
point_to_pointidx[index] = index;
}
if (num < max_points) {
point_to_voxelidx[index] = num;
}
}
}
template <typename T_int>
__global__ void determin_voxel_num(
// const T_int* coor,
T_int* num_points_per_voxel, T_int* point_to_voxelidx,
T_int* point_to_pointidx, T_int* coor_to_voxelidx, T_int* voxel_num,
const int max_points, const int max_voxels, const int num_points) {
// only calculate the coors before this coor[index]
for (int i = 0; i < num_points; ++i) {
// if (coor[i][0] == -1)
// continue;
int point_pos_in_voxel = point_to_voxelidx[i];
// record voxel
if (point_pos_in_voxel == -1) {
// out of max_points or invalid point
continue;
} else if (point_pos_in_voxel == 0) {
// record new voxel
int voxelidx = voxel_num[0];
if (voxel_num[0] >= max_voxels) break;
voxel_num[0] += 1;
coor_to_voxelidx[i] = voxelidx;
num_points_per_voxel[voxelidx] = 1;
} else {
int point_idx = point_to_pointidx[i];
int voxelidx = coor_to_voxelidx[point_idx];
if (voxelidx != -1) {
coor_to_voxelidx[i] = voxelidx;
num_points_per_voxel[voxelidx] += 1;
}
}
}
}
namespace voxelization {
int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels,
at::Tensor& coors, at::Tensor& num_points_per_voxel,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int max_points, const int max_voxels,
const int NDim = 3) {
// current version takes about 0.04s for one frame
// check device
CHECK_INPUT(points);
at::cuda::CUDAGuard device_guard(points.device());
const int num_points = points.size(0);
const int num_features = points.size(1);
const float voxel_x = voxel_size[0];
const float voxel_y = voxel_size[1];
const float voxel_z = voxel_size[2];
const float coors_x_min = coors_range[0];
const float coors_y_min = coors_range[1];
const float coors_z_min = coors_range[2];
const float coors_x_max = coors_range[3];
const float coors_y_max = coors_range[4];
const float coors_z_max = coors_range[5];
const int grid_x = round((coors_x_max - coors_x_min) / voxel_x);
const int grid_y = round((coors_y_max - coors_y_min) / voxel_y);
const int grid_z = round((coors_z_max - coors_z_min) / voxel_z);
// map points to voxel coors
at::Tensor temp_coors =
at::zeros({num_points, NDim}, points.options().dtype(at::kInt));
dim3 grid(std::min(at::cuda::ATenCeilDiv(num_points, 512), 4096));
dim3 block(512);
// 1. link point to corresponding voxel coors
AT_DISPATCH_ALL_TYPES(
points.type(), "hard_voxelize_kernel", ([&] {
dynamic_voxelize_kernel<scalar_t, int>
<<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(
points.contiguous().data_ptr<scalar_t>(),
temp_coors.contiguous().data_ptr<int>(), voxel_x, voxel_y,
voxel_z, coors_x_min, coors_y_min, coors_z_min, coors_x_max,
coors_y_max, coors_z_max, grid_x, grid_y, grid_z, num_points,
num_features, NDim);
}));
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
// 2. map point to the idx of the corresponding voxel, find duplicate coor
// create some temporary variables
auto point_to_pointidx = -at::ones(
{
num_points,
},
points.options().dtype(at::kInt));
auto point_to_voxelidx = -at::ones(
{
num_points,
},
points.options().dtype(at::kInt));
dim3 map_grid(std::min(at::cuda::ATenCeilDiv(num_points, 512), 4096));
dim3 map_block(512);
AT_DISPATCH_ALL_TYPES(
temp_coors.type(), "determin_duplicate", ([&] {
point_to_voxelidx_kernel<int>
<<<map_grid, map_block, 0, at::cuda::getCurrentCUDAStream()>>>(
temp_coors.contiguous().data_ptr<int>(),
point_to_voxelidx.contiguous().data_ptr<int>(),
point_to_pointidx.contiguous().data_ptr<int>(), max_points,
max_voxels, num_points, NDim);
}));
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
// 3. determine voxel num and each voxel's coor index
// running this logic on the CUDA device accelerates it by about 10x
auto coor_to_voxelidx = -at::ones(
{
num_points,
},
points.options().dtype(at::kInt));
auto voxel_num = at::zeros(
{
1,
},
points.options().dtype(at::kInt)); // must be zero from the beginning
AT_DISPATCH_ALL_TYPES(
temp_coors.type(), "determin_duplicate", ([&] {
determin_voxel_num<int><<<1, 1, 0, at::cuda::getCurrentCUDAStream()>>>(
num_points_per_voxel.contiguous().data_ptr<int>(),
point_to_voxelidx.contiguous().data_ptr<int>(),
point_to_pointidx.contiguous().data_ptr<int>(),
coor_to_voxelidx.contiguous().data_ptr<int>(),
voxel_num.contiguous().data_ptr<int>(), max_points, max_voxels,
num_points);
}));
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
// 4. copy point features to voxels
// Step 4 & 5 could be parallel
auto pts_output_size = num_points * num_features;
dim3 cp_grid(std::min(at::cuda::ATenCeilDiv(pts_output_size, 512), 4096));
dim3 cp_block(512);
AT_DISPATCH_ALL_TYPES(
points.type(), "assign_point_to_voxel", ([&] {
assign_point_to_voxel<float, int>
<<<cp_grid, cp_block, 0, at::cuda::getCurrentCUDAStream()>>>(
pts_output_size, points.contiguous().data_ptr<float>(),
point_to_voxelidx.contiguous().data_ptr<int>(),
coor_to_voxelidx.contiguous().data_ptr<int>(),
voxels.contiguous().data_ptr<float>(), max_points, num_features,
num_points, NDim);
}));
// cudaDeviceSynchronize();
// AT_CUDA_CHECK(cudaGetLastError());
// 5. copy coors of each voxels
auto coors_output_size = num_points * NDim;
dim3 coors_cp_grid(
std::min(at::cuda::ATenCeilDiv(coors_output_size, 512), 4096));
dim3 coors_cp_block(512);
AT_DISPATCH_ALL_TYPES(
points.type(), "assign_point_to_voxel", ([&] {
assign_voxel_coors<float, int><<<coors_cp_grid, coors_cp_block, 0,
at::cuda::getCurrentCUDAStream()>>>(
coors_output_size, temp_coors.contiguous().data_ptr<int>(),
point_to_voxelidx.contiguous().data_ptr<int>(),
coor_to_voxelidx.contiguous().data_ptr<int>(),
coors.contiguous().data_ptr<int>(), num_points, NDim);
}));
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
auto voxel_num_cpu = voxel_num.to(at::kCPU);
int voxel_num_int = voxel_num_cpu.data_ptr<int>()[0];
return voxel_num_int;
}
void dynamic_voxelize_gpu(const at::Tensor& points, at::Tensor& coors,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int NDim = 3) {
// current version takes about 0.04s for one frame
// check device
CHECK_INPUT(points);
at::cuda::CUDAGuard device_guard(points.device());
const int num_points = points.size(0);
const int num_features = points.size(1);
const float voxel_x = voxel_size[0];
const float voxel_y = voxel_size[1];
const float voxel_z = voxel_size[2];
const float coors_x_min = coors_range[0];
const float coors_y_min = coors_range[1];
const float coors_z_min = coors_range[2];
const float coors_x_max = coors_range[3];
const float coors_y_max = coors_range[4];
const float coors_z_max = coors_range[5];
const int grid_x = round((coors_x_max - coors_x_min) / voxel_x);
const int grid_y = round((coors_y_max - coors_y_min) / voxel_y);
const int grid_z = round((coors_z_max - coors_z_min) / voxel_z);
const int col_blocks = at::cuda::ATenCeilDiv(num_points, threadsPerBlock);
dim3 blocks(col_blocks);
dim3 threads(threadsPerBlock);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_ALL_TYPES(points.scalar_type(), "dynamic_voxelize_kernel", [&] {
dynamic_voxelize_kernel<scalar_t, int><<<blocks, threads, 0, stream>>>(
points.contiguous().data_ptr<scalar_t>(),
coors.contiguous().data_ptr<int>(), voxel_x, voxel_y, voxel_z,
coors_x_min, coors_y_min, coors_z_min, coors_x_max, coors_y_max,
coors_z_max, grid_x, grid_y, grid_z, num_points, num_features, NDim);
});
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
return;
}
} // namespace voxelization
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import torch
from torch import nn
from torch.autograd import Function
from torch.nn.modules.utils import _pair
from .voxel_layer import dynamic_voxelize, hard_voxelize
class _Voxelization(Function):
@staticmethod
def forward(ctx,
points,
voxel_size,
coors_range,
max_points=35,
max_voxels=20000):
"""convert kitti points(N, >=3) to voxels.
Args:
points: [N, ndim] float tensor. points[:, :3] contain xyz points
and points[:, 3:] contain other information like reflectivity
voxel_size: [3] list/tuple or array, float. xyz, indicate voxel
size
coors_range: [6] list/tuple or array, float. indicate voxel
range. format: xyzxyz, minmax
max_points: int. indicate maximum points contained in a voxel. if
max_points=-1, it means using dynamic_voxelize
max_voxels: int. indicates the maximum number of voxels this
function creates. For SECOND, 20000 is a good choice. Users
should shuffle points before calling this function because
max_voxels may drop points.
Returns:
voxels: [M, max_points, ndim] float tensor. only contain points
and returned when max_points != -1.
coordinates: [M, 3] int32 tensor, always returned.
num_points_per_voxel: [M] int32 tensor. Only returned when
max_points != -1.
"""
if max_points == -1 or max_voxels == -1:
coors = points.new_zeros(size=(points.size(0), 3), dtype=torch.int)
dynamic_voxelize(points, coors, voxel_size, coors_range, 3)
return coors
else:
voxels = points.new_zeros(
size=(max_voxels, max_points, points.size(1)))
coors = points.new_zeros(size=(max_voxels, 3), dtype=torch.int)
num_points_per_voxel = points.new_zeros(
size=(max_voxels, ), dtype=torch.int)
voxel_num = hard_voxelize(points, voxels, coors,
num_points_per_voxel, voxel_size,
coors_range, max_points, max_voxels, 3)
# select the valid voxels
voxels_out = voxels[:voxel_num]
coors_out = coors[:voxel_num]
num_points_per_voxel_out = num_points_per_voxel[:voxel_num]
return voxels_out, coors_out, num_points_per_voxel_out
voxelization = _Voxelization.apply
class Voxelization(nn.Module):
def __init__(self,
voxel_size,
point_cloud_range,
max_num_points,
max_voxels=20000):
super(Voxelization, self).__init__()
"""
Args:
voxel_size (list): list [x, y, z] size of three dimension
point_cloud_range (list):
[x_min, y_min, z_min, x_max, y_max, z_max]
max_num_points (int): max number of points per voxel
max_voxels (tuple or int): max number of voxels at
(training, testing) time
"""
self.voxel_size = voxel_size
self.point_cloud_range = point_cloud_range
self.max_num_points = max_num_points
if isinstance(max_voxels, tuple):
self.max_voxels = max_voxels
else:
self.max_voxels = _pair(max_voxels)
point_cloud_range = torch.tensor(
point_cloud_range, dtype=torch.float32)
# [0, -40, -3, 70.4, 40, 1]
voxel_size = torch.tensor(voxel_size, dtype=torch.float32)
grid_size = (point_cloud_range[3:] -
point_cloud_range[:3]) / voxel_size
grid_size = torch.round(grid_size).long()
input_feat_shape = grid_size[:2]
self.grid_size = grid_size
# the origin shape is as [x-len, y-len, z-len]
# [w, h, d] -> [d, h, w]
self.pcd_shape = [*input_feat_shape, 1][::-1]
def forward(self, input):
"""
Args:
input: NC points
"""
if self.training:
max_voxels = self.max_voxels[0]
else:
max_voxels = self.max_voxels[1]
return voxelization(input, self.voxel_size, self.point_cloud_range,
self.max_num_points, max_voxels)
def __repr__(self):
tmpstr = self.__class__.__name__ + '('
tmpstr += 'voxel_size=' + str(self.voxel_size)
tmpstr += ', point_cloud_range=' + str(self.point_cloud_range)
tmpstr += ', max_num_points=' + str(self.max_num_points)
tmpstr += ', max_voxels=' + str(self.max_voxels)
tmpstr += ')'
return tmpstr
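# Usage sketch (illustrative only, not part of the original file; the numbers are
# hypothetical):
#
#   voxel_layer = Voxelization(voxel_size=[0.2, 0.2, 4],
#                              point_cloud_range=[0, -40, -3, 70.4, 40, 1],
#                              max_num_points=35,
#                              max_voxels=20000)
#   points = torch.rand(1000, 4)                       # [N, C] points of one sample
#   voxels, coors, num_points = voxel_layer(points)    # hard voxelization
#   # with max_num_points=-1 (or max_voxels=-1) only the voxel coors are returned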
from mmdet.utils import (Registry, build_from_cfg, get_model_complexity_info,
get_root_logger, print_log)
from .collect_env import collect_env
__all__ = [
'Registry', 'build_from_cfg', 'get_model_complexity_info',
'get_root_logger', 'print_log', 'collect_env'
]
import os.path as osp
import subprocess
import sys
from collections import defaultdict
import cv2
import mmcv
import torch
import torchvision
import mmdet
import mmdet3d
def collect_env():
env_info = {}
env_info['sys.platform'] = sys.platform
env_info['Python'] = sys.version.replace('\n', '')
cuda_available = torch.cuda.is_available()
env_info['CUDA available'] = cuda_available
if cuda_available:
from torch.utils.cpp_extension import CUDA_HOME
env_info['CUDA_HOME'] = CUDA_HOME
if CUDA_HOME is not None and osp.isdir(CUDA_HOME):
try:
nvcc = osp.join(CUDA_HOME, 'bin/nvcc')
nvcc = subprocess.check_output(
'"{}" -V | tail -n1'.format(nvcc), shell=True)
nvcc = nvcc.decode('utf-8').strip()
except subprocess.SubprocessError:
nvcc = 'Not Available'
env_info['NVCC'] = nvcc
devices = defaultdict(list)
for k in range(torch.cuda.device_count()):
devices[torch.cuda.get_device_name(k)].append(str(k))
for name, devids in devices.items():
env_info['GPU ' + ','.join(devids)] = name
gcc = subprocess.check_output('gcc --version | head -n1', shell=True)
gcc = gcc.decode('utf-8').strip()
env_info['GCC'] = gcc
env_info['PyTorch'] = torch.__version__
env_info['PyTorch compiling details'] = torch.__config__.show()
env_info['TorchVision'] = torchvision.__version__
env_info['OpenCV'] = cv2.__version__
env_info['MMCV'] = mmcv.__version__
env_info['MMDetection'] = mmdet.__version__
env_info['MMDetection3D'] = mmdet3d.__version__
from mmdet.ops import get_compiler_version, get_compiling_cuda_version
env_info['MMDetection3D Compiler'] = get_compiler_version()
env_info['MMDetection3D CUDA Compiler'] = get_compiling_cuda_version()
return env_info
if __name__ == '__main__':
for name, val in collect_env().items():
print('{}: {}'.format(name, val))
-r requirements/build.txt
-r requirements/optional.txt
-r requirements/runtime.txt
-r requirements/tests.txt
# These must be installed before building mmdetection
numpy
torch>=1.1
# To avoid installing too many unnecessary packages
nuscenes-devkit==1.0.5
matplotlib
mmcv>=0.2.15
numpy
# need older pillow until torchvision is fixed
Pillow<=6.2.2
six
terminaltables
torch>=1.1
torchvision
asynctest
codecov
flake8
isort
# Note: used for kwarray.group_items, this may be ported to mmcv in the future.
kwarray
pytest
pytest-cov
pytest-runner
ubelt
xdoctest >= 0.10.0
yapf
import os
import platform
import subprocess
import time
from setuptools import Extension, find_packages, setup
import numpy as np
from Cython.Build import cythonize
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
def readme():
with open('README.md', encoding='utf-8') as f:
content = f.read()
return content
MAJOR = 0
MINOR = 1
PATCH = ''
SUFFIX = 'rc0'
SHORT_VERSION = '{}.{}.{}{}'.format(MAJOR, MINOR, PATCH, SUFFIX)
version_file = 'mmdet3d/version.py'
def get_git_hash():
def _minimal_ext_cmd(cmd):
# construct minimal environment
env = {}
for k in ['SYSTEMROOT', 'PATH', 'HOME']:
v = os.environ.get(k)
if v is not None:
env[k] = v
# LANGUAGE is used on win32
env['LANGUAGE'] = 'C'
env['LANG'] = 'C'
env['LC_ALL'] = 'C'
out = subprocess.Popen(
cmd, stdout=subprocess.PIPE, env=env).communicate()[0]
return out
try:
out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD'])
sha = out.strip().decode('ascii')
except OSError:
sha = 'unknown'
return sha
def get_hash():
if os.path.exists('.git'):
sha = get_git_hash()[:7]
elif os.path.exists(version_file):
try:
from mmdet3d.version import __version__
sha = __version__.split('+')[-1]
except ImportError:
raise ImportError('Unable to get git version')
else:
sha = 'unknown'
return sha
def write_version_py():
content = """# GENERATED VERSION FILE
# TIME: {}
__version__ = '{}'
short_version = '{}'
"""
sha = get_hash()
VERSION = SHORT_VERSION + '+' + sha
with open(version_file, 'w') as f:
f.write(content.format(time.asctime(), VERSION, SHORT_VERSION))
def get_version():
with open(version_file, 'r') as f:
exec(compile(f.read(), version_file, 'exec'))
return locals()['__version__']
def make_cuda_ext(name, module, sources, extra_args=[], extra_include_path=[]):
return CUDAExtension(
name='{}.{}'.format(module, name),
define_macros=[('WITH_CUDA', None)],
sources=[os.path.join(*module.split('.'), p) for p in sources],
include_dirs=extra_include_path,
extra_compile_args={
'cxx': [] + extra_args,
'nvcc':
extra_args + [
'-D__CUDA_NO_HALF_OPERATORS__',
'-D__CUDA_NO_HALF_CONVERSIONS__',
'-D__CUDA_NO_HALF2_OPERATORS__',
]
})
def make_cython_ext(name, module, sources):
extra_compile_args = None
if platform.system() != 'Windows':
extra_compile_args = {
'cxx': ['-Wno-unused-function', '-Wno-write-strings']
}
extension = Extension(
'{}.{}'.format(module, name),
[os.path.join(*module.split('.'), p) for p in sources],
include_dirs=[np.get_include()],
language='c++',
extra_compile_args=extra_compile_args)
extension, = cythonize(extension)
return extension
def parse_requirements(fname='requirements.txt', with_version=True):
"""
Parse the package dependencies listed in a requirements file but strip
specific versioning information.
Args:
fname (str): path to requirements file
with_version (bool, default=True): if True include version specs
Returns:
List[str]: list of requirements items
CommandLine:
python -c "import setup; print(setup.parse_requirements())"
"""
import sys
from os.path import exists
import re
require_fpath = fname
def parse_line(line):
"""
Parse information from a line in a requirements text file
"""
if line.startswith('-r '):
# Allow specifying requirements in other files
target = line.split(' ')[1]
for info in parse_require_file(target):
yield info
else:
info = {'line': line}
if line.startswith('-e '):
info['package'] = line.split('#egg=')[1]
else:
# Remove versioning from the package
pat = '(' + '|'.join(['>=', '==', '>']) + ')'
parts = re.split(pat, line, maxsplit=1)
parts = [p.strip() for p in parts]
info['package'] = parts[0]
if len(parts) > 1:
op, rest = parts[1:]
if ';' in rest:
# Handle platform specific dependencies
# http://setuptools.readthedocs.io/en/latest/setuptools.html#declaring-platform-specific-dependencies
version, platform_deps = map(str.strip,
rest.split(';'))
info['platform_deps'] = platform_deps
else:
version = rest # NOQA
info['version'] = (op, version)
yield info
def parse_require_file(fpath):
with open(fpath, 'r') as f:
for line in f.readlines():
line = line.strip()
if line and not line.startswith('#'):
for info in parse_line(line):
yield info
def gen_packages_items():
if exists(require_fpath):
for info in parse_require_file(require_fpath):
parts = [info['package']]
if with_version and 'version' in info:
parts.extend(info['version'])
if not sys.version.startswith('3.4'):
# apparently package_deps are broken in 3.4
platform_deps = info.get('platform_deps')
if platform_deps is not None:
parts.append(';' + platform_deps)
item = ''.join(parts)
yield item
packages = list(gen_packages_items())
return packages
if __name__ == '__main__':
write_version_py()
setup(
name='mmdet3d',
version=get_version(),
description='3D Detection Toolbox',
long_description=readme(),
keywords='computer vision, 3D object detection',
url='https://github.com/ZwwWayne/mmdetection3d',
packages=find_packages(exclude=('configs', 'tools', 'demo')),
package_data={'mmdet3d.ops': ['*/*.so']},
classifiers=[
'Development Status :: 4 - Beta',
'License :: OSI Approved :: Apache Software License',
'Operating System :: OS Independent',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.4',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
],
license='Apache License 2.0',
setup_requires=parse_requirements('requirements/build.txt'),
tests_require=parse_requirements('requirements/tests.txt'),
install_requires=parse_requirements('requirements/runtime.txt'),
extras_require={
'all': parse_requirements('requirements.txt'),
'tests': parse_requirements('requirements/tests.txt'),
'build': parse_requirements('requirements/build.txt'),
'optional': parse_requirements('requirements/optional.txt'),
},
ext_modules=[
make_cuda_ext(
name='sparse_conv_ext',
module='mmdet3d.ops.spconv',
extra_include_path=[
os.path.join(*'mmdet3d.ops.spconv'.split('.'), 'include/')
],
sources=[
'src/all.cc',
'src/reordering.cc',
'src/reordering_cuda.cu',
'src/indice.cc',
'src/indice_cuda.cu',
'src/maxpool.cc',
'src/maxpool_cuda.cu',
],
extra_args=['-w', '-std=c++14']),
make_cuda_ext(
name='iou3d_cuda',
module='mmdet3d.ops.iou3d',
sources=[
'src/iou3d.cpp',
'src/iou3d_kernel.cu',
]),
make_cuda_ext(
name='sigmoid_focal_loss_cuda',
module='mmdet3d.ops.sigmoid_focal_loss',
sources=[
'src/sigmoid_focal_loss.cpp',
'src/sigmoid_focal_loss_cuda.cu'
]),
make_cuda_ext(
name='voxel_layer',
module='mmdet3d.ops.voxel',
sources=[
'src/voxelization.cpp',
'src/scatter_points_cpu.cpp',
'src/scatter_points_cuda.cu',
'src/voxelization_cpu.cpp',
'src/voxelization_cuda.cu',
]),
],
cmdclass={'build_ext': BuildExtension},
zip_safe=False)