Commit d1aac35d authored by zhangwenwei

Initial commit
import numpy as np
import torch
def scatter_nd(indices, updates, shape):
"""pytorch edition of tensorflow scatter_nd.
this function don't contain except handle code. so use this carefully
when indice repeats, don't support repeat add which is supported
in tensorflow.
"""
ret = torch.zeros(*shape, dtype=updates.dtype, device=updates.device)
ndim = indices.shape[-1]
output_shape = list(indices.shape[:-1]) + shape[indices.shape[-1]:]
flatted_indices = indices.view(-1, ndim)
slices = [flatted_indices[:, i] for i in range(ndim)]
slices += [Ellipsis]
ret[slices] = updates.view(*output_shape)
return ret
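# Usage sketch (illustrative only, not part of the original file; shapes and
# values below are hypothetical):
#
#   indices = torch.tensor([[0, 1], [2, 3]], dtype=torch.long)  # two (row, col) sites
#   updates = torch.randn(2, 8)                                 # one feature vector per site
#   dense = scatter_nd(indices, updates, [4, 5, 8])             # zeros everywhere else
#   assert dense.shape == (4, 5, 8) and torch.equal(dense[0, 1], updates[0])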
class SparseConvTensor(object):
def __init__(self,
features,
indices,
spatial_shape,
batch_size,
grid=None):
"""
Args:
grid: pre-allocated grid tensor. Should be used when the volume
of the spatial shape is very large.
"""
self.features = features
self.indices = indices
if self.indices.dtype != torch.int32:
self.indices = self.indices.int()
self.spatial_shape = spatial_shape
self.batch_size = batch_size
self.indice_dict = {}
self.grid = grid
@property
def spatial_size(self):
return np.prod(self.spatial_shape)
def find_indice_pair(self, key):
if key is None:
return None
if key in self.indice_dict:
return self.indice_dict[key]
return None
def dense(self, channels_first=True):
output_shape = [self.batch_size] + list(
self.spatial_shape) + [self.features.shape[1]]
res = scatter_nd(self.indices.long(), self.features, output_shape)
if not channels_first:
return res
ndim = len(self.spatial_shape)
trans_params = list(range(0, ndim + 1))
trans_params.insert(1, ndim + 1)
return res.permute(*trans_params).contiguous()
@property
def sparity(self):
return (self.indices.shape[0] / np.prod(self.spatial_shape) /
self.batch_size)
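# Usage sketch (illustrative only, not part of the original file). The indices
# follow spconv's [batch_idx, z, y, x] layout, so dense() can scatter the
# features back into a dense [batch, C, *spatial_shape] tensor:
#
#   features = torch.randn(2, 4)                          # two active sites, 4 channels
#   indices = torch.tensor([[0, 1, 2, 3],
#                           [1, 0, 0, 0]], dtype=torch.int32)
#   x = SparseConvTensor(features, indices, spatial_shape=[8, 8, 8], batch_size=2)
#   dense = x.dense()                                      # shape [2, 4, 8, 8, 8]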
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
class TestCase(unittest.TestCase):
def _GetNdArray(self, a):
if not isinstance(a, np.ndarray):
a = np.array(a)
return a
def assertAllEqual(self, a, b):
"""Asserts that two numpy arrays have the same values.
Args:
a: the expected numpy ndarray or anything that can be converted to one.
b: the actual numpy ndarray or anything that can be converted to one.
"""
a = self._GetNdArray(a)
b = self._GetNdArray(b)
self.assertEqual(
a.shape, b.shape,
'Shape mismatch: expected %s, got %s.' % (a.shape, b.shape))
same = (a == b)
if a.dtype == np.float32 or a.dtype == np.float64:
same = np.logical_or(same,
np.logical_and(np.isnan(a), np.isnan(b)))
if not np.all(same):
# Prints more details than np.testing.assert_array_equal.
diff = np.logical_not(same)
if a.ndim:
x = a[np.where(diff)]
y = b[np.where(diff)]
print('not equal where = ', np.where(diff))
else:
# np.where is broken for scalars
x, y = a, b
print('not equal lhs = ', x)
print('not equal rhs = ', y)
np.testing.assert_array_equal(a, b)
def assertAllClose(self, a, b, rtol=1e-6, atol=1e-6):
"""Asserts that two numpy arrays, or dicts of same, have near values.
This does not support nested dicts.
Args:
a: The expected numpy ndarray (or anything that can be converted to
one), or dict of same. Must be a dict iff `b` is a dict.
b: The actual numpy ndarray (or anything that can be converted to
one), or dict of same. Must be a dict iff `a` is a dict.
rtol: relative tolerance.
atol: absolute tolerance.
Raises:
ValueError: if only one of `a` and `b` is a dict.
"""
is_a_dict = isinstance(a, dict)
if is_a_dict != isinstance(b, dict):
raise ValueError("Can't compare dict to non-dict, %s vs %s." %
(a, b))
if is_a_dict:
self.assertCountEqual(
a.keys(),
b.keys(),
msg='mismatched keys, expected %s, got %s' %
(a.keys(), b.keys()))
for k in a:
self._assertArrayLikeAllClose(
a[k],
b[k],
rtol=rtol,
atol=atol,
msg='%s: expected %s, got %s.' % (k, a, b))
else:
self._assertArrayLikeAllClose(a, b, rtol=rtol, atol=atol)
def _assertArrayLikeAllClose(self, a, b, rtol=1e-6, atol=1e-6, msg=None):
a = self._GetNdArray(a)
b = self._GetNdArray(b)
self.assertEqual(
a.shape, b.shape,
'Shape mismatch: expected %s, got %s.' % (a.shape, b.shape))
if not np.allclose(a, b, rtol=rtol, atol=atol):
# Prints more details than np.testing.assert_allclose.
#
# NOTE: numpy.allclose (and numpy.testing.assert_allclose)
# checks whether two arrays are element-wise equal within a
# tolerance. The relative difference (rtol * abs(b)) and the
# absolute difference atol are added together to compare against
# the absolute difference between a and b. Here, we want to
# print out which elements violate such conditions.
cond = np.logical_or(
np.abs(a - b) > atol + rtol * np.abs(b),
np.isnan(a) != np.isnan(b))
if a.ndim:
x = a[np.where(cond)]
y = b[np.where(cond)]
print('not close where = ', np.where(cond))
else:
# np.where is broken for scalars
x, y = a, b
print('not close lhs = ', x)
print('not close rhs = ', y)
print('not close dif = ', np.abs(x - y))
print('not close tol = ', atol + rtol * np.abs(y))
print('dtype = %s, shape = %s' % (a.dtype, a.shape))
np.testing.assert_allclose(a, b, rtol=rtol, atol=atol, err_msg=msg)
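# Usage sketch (illustrative only, not part of the original file): the helpers
# above are meant to be used from a subclass, e.g.
#
#   class TestScatterND(TestCase):
#       def test_close(self):
#           a = np.arange(6, dtype=np.float32).reshape(2, 3)
#           self.assertAllEqual(a, a.copy())
#           self.assertAllClose(a, a + 1e-8)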
def params_grid(*params):
size = len(params)
length = 1
for p in params:
length *= len(p)
sizes = [len(p) for p in params]
counter = [0] * size
total = []
for i in range(length):
total.append([0] * size)
for i in range(length):
for j in range(size):
total[i][j] = params[j][counter[j]]
counter[size - 1] += 1
for c in range(size - 1, -1, -1):
if (counter[c] == sizes[c] and c > 0):
counter[c - 1] += 1
counter[c] = 0
return total
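# Usage sketch (illustrative only, not part of the original file): params_grid
# builds the Cartesian product of the given parameter lists, e.g.
#
#   params_grid([1, 2], ['a', 'b'])
#   # -> [[1, 'a'], [1, 'b'], [2, 'a'], [2, 'b']]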
def generate_sparse_data(shape,
num_points,
num_channels,
integer=False,
data_range=(-1, 1),
with_dense=True,
dtype=np.float32):
dense_shape = shape
ndim = len(dense_shape)
# num_points = np.random.randint(10, 100, size=[batch_size, ndim])
num_points = np.array(num_points)
# num_points = np.array([3, 2])
batch_size = len(num_points)
batch_indices = []
coors_total = np.stack(
np.meshgrid(*[np.arange(0, s) for s in shape]), axis=-1)
coors_total = coors_total.reshape(-1, ndim)
for i in range(batch_size):
np.random.shuffle(coors_total)
inds_total = coors_total[:num_points[i]]
inds_total = np.pad(
inds_total, ((0, 0), (0, 1)), mode='constant', constant_values=i)
batch_indices.append(inds_total)
if integer:
sparse_data = np.random.randint(
data_range[0],
data_range[1],
size=[num_points.sum(), num_channels]).astype(dtype)
else:
sparse_data = np.random.uniform(
data_range[0],
data_range[1],
size=[num_points.sum(), num_channels]).astype(dtype)
res = {
'features': sparse_data.astype(dtype),
}
if with_dense:
dense_data = np.zeros([batch_size, num_channels, *dense_shape],
dtype=sparse_data.dtype)
start = 0
for i, inds in enumerate(batch_indices):
for j, ind in enumerate(inds):
dense_slice = (i, slice(None), *ind[:-1])
dense_data[dense_slice] = sparse_data[start + j]
start += len(inds)
res['features_dense'] = dense_data.astype(dtype)
batch_indices = np.concatenate(batch_indices, axis=0)
res['indices'] = batch_indices.astype(np.int32)
return res
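# Usage sketch (illustrative only, not part of the original file; shapes below
# are hypothetical):
#
#   data = generate_sparse_data([10, 10, 10], num_points=[5, 7], num_channels=4)
#   data['features']        # [12, 4] float32 features of all points
#   data['indices']         # [12, 4] int32 coordinates, last column is the batch index
#   data['features_dense']  # [2, 4, 10, 10, 10] dense tensor holding the same values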
import torch
import torch.distributed as dist
import torch.nn as nn
from torch.autograd.function import Function
class AllReduce(Function):
@staticmethod
def forward(ctx, input):
input_list = [
torch.zeros_like(input) for k in range(dist.get_world_size())
]
# Use allgather instead of allreduce: in-place operations are unreliable
dist.all_gather(input_list, input, async_op=False)
inputs = torch.stack(input_list, dim=0)
return torch.sum(inputs, dim=0)
@staticmethod
def backward(ctx, grad_output):
dist.all_reduce(grad_output, async_op=False)
return grad_output
class NaiveSyncBatchNorm1d(nn.BatchNorm1d):
"""Syncronized Batch Normalization for 3D Tensors
Note:
This implementation is modified from
https://github.com/facebookresearch/detectron2/
`torch.nn.SyncBatchNorm` has known unknown bugs.
It produces significantly worse AP (and sometimes goes NaN)
when the batch size on each worker is quite different
(e.g., when scale augmentation is used).
In 3D detection, different workers has points of different shapes,
whish also cause instability.
Use this implementation before `nn.SyncBatchNorm` is fixed.
It is slower than `nn.SyncBatchNorm`.
"""
def forward(self, input):
if dist.get_world_size() == 1 or not self.training:
return super().forward(input)
assert input.shape[0] > 0, 'SyncBN does not support empty inputs'
C = input.shape[1]
mean = torch.mean(input, dim=[0, 2])
meansqr = torch.mean(input * input, dim=[0, 2])
vec = torch.cat([mean, meansqr], dim=0)
vec = AllReduce.apply(vec) * (1.0 / dist.get_world_size())
mean, meansqr = torch.split(vec, C)
var = meansqr - mean * mean
self.running_mean += self.momentum * (
mean.detach() - self.running_mean)
self.running_var += self.momentum * (var.detach() - self.running_var)
invstd = torch.rsqrt(var + self.eps)
scale = self.weight * invstd
bias = self.bias - mean * scale
scale = scale.reshape(1, -1, 1)
bias = bias.reshape(1, -1, 1)
return input * scale + bias
class NaiveSyncBatchNorm2d(nn.BatchNorm2d):
"""Syncronized Batch Normalization for 4D Tensors
Note:
This implementation is modified from
https://github.com/facebookresearch/detectron2/
`torch.nn.SyncBatchNorm` has known unknown bugs.
It produces significantly worse AP (and sometimes goes NaN)
when the batch size on each worker is quite different
(e.g., when scale augmentation is used).
This phenomenon also occurs when the multi-modality feature fusion
modules of multi-modality detectors use SyncBN.
Use this implementation before `nn.SyncBatchNorm` is fixed.
It is slower than `nn.SyncBatchNorm`.
"""
def forward(self, input):
if dist.get_world_size() == 1 or not self.training:
return super().forward(input)
assert input.shape[0] > 0, 'SyncBN does not support empty inputs'
C = input.shape[1]
mean = torch.mean(input, dim=[0, 2, 3])
meansqr = torch.mean(input * input, dim=[0, 2, 3])
vec = torch.cat([mean, meansqr], dim=0)
vec = AllReduce.apply(vec) * (1.0 / dist.get_world_size())
mean, meansqr = torch.split(vec, C)
var = meansqr - mean * mean
self.running_mean += self.momentum * (
mean.detach() - self.running_mean)
self.running_var += self.momentum * (var.detach() - self.running_var)
invstd = torch.rsqrt(var + self.eps)
scale = self.weight * invstd
bias = self.bias - mean * scale
scale = scale.reshape(1, -1, 1, 1)
bias = bias.reshape(1, -1, 1, 1)
return input * scale + bias
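# Usage sketch (illustrative only, not part of the original file): both layers are
# drop-in replacements for torch.nn.BatchNorm1d / BatchNorm2d. Note that forward()
# calls dist.get_world_size(), so torch.distributed is assumed to be initialized:
#
#   bn = NaiveSyncBatchNorm2d(64)      # same constructor arguments as nn.BatchNorm2d
#   x = torch.randn(8, 64, 32, 32)
#   y = bn(x)                          # [8, 64, 32, 32], statistics synced across workers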
from .scatter_points import DynamicScatter, dynamic_scatter
from .voxelize import Voxelization, voxelization
__all__ = ['Voxelization', 'voxelization', 'dynamic_scatter', 'DynamicScatter']
import torch
from torch import nn
from torch.autograd import Function
from .voxel_layer import (dynamic_point_to_voxel_backward,
dynamic_point_to_voxel_forward)
class _dynamic_scatter(Function):
@staticmethod
def forward(ctx, points, coors, voxel_size, coors_range):
"""convert kitti points(N, >=3) to voxels.
Args:
points: [N, ndim] float tensor. points[:, :3] contain xyz
points and points[:, 3:] contain other information
such as reflectivity.
voxel_size: [3] list/tuple or array, float. xyz, indicate
voxel size
coors_range: [6] list/tuple or array, float. indicate voxel range.
format: xyzxyz, minmax
max_points: int. indicate maximum points contained in a voxel.
if max_points=-1, it means using dynamic_voxelize
max_voxels: int. indicate maximum voxels this function create.
for second, 20000 is a good choice. you should shuffle
points before call this function because max_voxels may
drop some points.
Returns:
tuple
voxels: [M, max_points, ndim] float tensor. only contain points
and returned when max_points != -1.
coordinates: [M, 3] int32 tensor, always returned.
num_points_per_voxel: [M] int32 tensor. Only returned when
max_points != -1.
"""
results = dynamic_point_to_voxel_forward(points, coors, voxel_size,
coors_range)
(voxels, voxel_coors, num_points_per_voxel, point_to_voxelidx,
coor_to_voxelidx) = results
ctx.save_for_backward(num_points_per_voxel, point_to_voxelidx,
coor_to_voxelidx)
return voxels, voxel_coors, num_points_per_voxel.float()
@staticmethod
def backward(ctx,
grad_output_voxel,
grad_output_voxel_coors=None,
grad_output_num_points=None):
(num_points_per_voxel, point_to_voxelidx,
coor_to_voxelidx) = ctx.saved_tensors
# grad_output_voxel shape: NxMxC
num_points = point_to_voxelidx.size(0)
num_features = grad_output_voxel.size(-1)
grad_points = grad_output_voxel.new_zeros(
size=(num_points, num_features))
# TODO: whether to use index put or use cuda_backward
# To use index put, need point to voxel index
dynamic_point_to_voxel_backward(grad_points,
grad_output_voxel.contiguous(),
point_to_voxelidx, coor_to_voxelidx)
return grad_points, None, None, None
dynamic_scatter = _dynamic_scatter.apply
class DynamicScatter(nn.Module):
def __init__(self, voxel_size, point_cloud_range, average_points: bool):
super(DynamicScatter, self).__init__()
"""Scatters points into voxels, used in the voxel encoder with
dynamic voxelization
**Note**: The CPU and GPU implementation get the same output, but
have numerical difference after summation and division (e.g., 5e-7).
Args:
average_points (bool): whether to use avg pooling to scatter
points into voxel voxel_size (list): list [x, y, z] size
of three dimension
point_cloud_range (list):
[x_min, y_min, z_min, x_max, y_max, z_max]
"""
self.voxel_size = voxel_size
self.point_cloud_range = point_cloud_range
self.average_points = average_points
def forward_single(self, points, coors):
voxels, voxel_coors, num_points = dynamic_scatter(
points.contiguous(), coors.contiguous(), self.voxel_size,
self.point_cloud_range)
if not self.average_points:
voxels = torch.max(voxels, dim=1)[0] # voxels: NxMxC -> NxC
else:
voxels = (
voxels.sum(dim=1, keepdim=False).div(num_points.view(-1, 1)))
return voxels, voxel_coors
def forward(self, points, coors):
"""
Args:
input: NC points
"""
if coors.size(-1) == 3:
return self.forward_single(points, coors)
else:
batch_size = coors[-1, 0] + 1
voxels, voxel_coors = [], []
for i in range(batch_size):
inds = torch.where(coors[:, 0] == i)
voxel, voxel_coor = self.forward_single(
points[inds], coors[inds][:, 1:])
coor_pad = nn.functional.pad(
voxel_coor, (1, 0), mode='constant', value=i)
voxel_coors.append(coor_pad)
voxels.append(voxel)
features = torch.cat(voxels, dim=0)
feature_coors = torch.cat(voxel_coors, dim=0)
return features, feature_coors
def __repr__(self):
tmpstr = self.__class__.__name__ + '('
tmpstr += 'voxel_size=' + str(self.voxel_size)
tmpstr += ', point_cloud_range=' + str(self.point_cloud_range)
tmpstr += ', average_points=' + str(self.average_points)
tmpstr += ')'
return tmpstr
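# Usage sketch (illustrative only, not part of the original file; the numbers are
# hypothetical). `coors` is typically produced by the dynamic mode of the
# Voxelization layer defined in voxelize.py:
#
#   scatter = DynamicScatter(voxel_size=[0.2, 0.2, 4],
#                            point_cloud_range=[0, -40, -3, 70.4, 40, 1],
#                            average_points=True)
#   # points: [N, C] float tensor; coors: [N, 3] (single sample) or [N, 4] (batched)
#   voxel_feats, voxel_coors = scatter(points, coors)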
#include <torch/extension.h>
#include <ATen/TensorUtils.h>
// #include "voxelization.h"
namespace {
template <typename T_int>
void determin_max_points_kernel(torch::TensorAccessor<T_int,2> coor,
torch::TensorAccessor<T_int,1> point_to_voxelidx,
torch::TensorAccessor<T_int,1> num_points_per_voxel,
torch::TensorAccessor<T_int,3> coor_to_voxelidx,
int& voxel_num,
int& max_points,
const int num_points
) {
int voxelidx, num;
for (int i = 0; i < num_points; ++i) {
if (coor[i][0] == -1)
continue;
voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];
// record voxel
if (voxelidx == -1) {
voxelidx = voxel_num;
voxel_num += 1;
coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]] = voxelidx;
}
// put points into voxel
num = num_points_per_voxel[voxelidx];
point_to_voxelidx[i] = num;
num_points_per_voxel[voxelidx] += 1;
// update max points per voxel
max_points = std::max(max_points, num+1);
}
return;
}
template <typename T, typename T_int>
void scatter_point_to_voxel_kernel(
const torch::TensorAccessor<T,2> points,
torch::TensorAccessor<T_int,2> coor,
torch::TensorAccessor<T_int,1> point_to_voxelidx,
torch::TensorAccessor<T_int,3> coor_to_voxelidx,
torch::TensorAccessor<T,3> voxels,
torch::TensorAccessor<T_int,2> voxel_coors,
const int num_features,
const int num_points,
const int NDim
){
for (int i = 0; i < num_points; ++i) {
int num = point_to_voxelidx[i];
int voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];
for (int k = 0; k < num_features; ++k) {
voxels[voxelidx][num][k] = points[i][k];
}
for (int k = 0; k < NDim; ++k) {
voxel_coors[voxelidx][k] = coor[i][k];
}
}
}
} // namespace
namespace voxelization {
std::vector<at::Tensor> dynamic_point_to_voxel_cpu(
const at::Tensor& points,
const at::Tensor& voxel_mapping,
const std::vector<float> voxel_size,
const std::vector<float> coors_range) {
// current version takes about 0.02s~0.03s for one frame on cpu
// check device
AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");
const int NDim = voxel_mapping.size(1);
const int num_points = points.size(0);
const int num_features = points.size(1);
std::vector<int> grid_size(NDim);
for (int i = 0; i < NDim; ++i) {
grid_size[i] = round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
at::Tensor num_points_per_voxel = at::zeros({num_points,}, voxel_mapping.options());
at::Tensor coor_to_voxelidx = -at::ones({grid_size[2], grid_size[1], grid_size[0]}, voxel_mapping.options());
at::Tensor point_to_voxelidx = -at::ones({num_points,}, voxel_mapping.options());
int voxel_num = 0;
int max_points = 0;
AT_DISPATCH_ALL_TYPES(voxel_mapping.type(), "determin_max_point", [&] {
determin_max_points_kernel<scalar_t>(
voxel_mapping.accessor<scalar_t,2>(),
point_to_voxelidx.accessor<scalar_t,1>(),
num_points_per_voxel.accessor<scalar_t,1>(),
coor_to_voxelidx.accessor<scalar_t,3>(),
voxel_num,
max_points,
num_points
);
});
at::Tensor voxels = at::zeros({voxel_num, max_points, num_features}, points.options());
at::Tensor voxel_coors = at::zeros({voxel_num, NDim}, points.options().dtype(at::kInt));
AT_DISPATCH_ALL_TYPES(points.type(), "scatter_point_to_voxel", [&] {
scatter_point_to_voxel_kernel<scalar_t, int>(
points.accessor<scalar_t,2>(),
voxel_mapping.accessor<int,2>(),
point_to_voxelidx.accessor<int,1>(),
coor_to_voxelidx.accessor<int,3>(),
voxels.accessor<scalar_t,3>(),
voxel_coors.accessor<int,2>(),
num_features,
num_points,
NDim
);
});
at::Tensor num_points_per_voxel_out = num_points_per_voxel.slice(/*dim=*/0, /*start=*/0, /*end=*/voxel_num);
return {voxels, voxel_coors, num_points_per_voxel_out};
}
}
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <torch/types.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
#define CHECK_CUDA(x) \
TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
namespace {
int const threadsPerBlock = sizeof(unsigned long long) * 8;
}
template <typename T, typename T_int>
__global__ void scatter_point_to_voxel_kernel(
const T* points, T_int* coor, T_int* point_to_voxelidx,
T_int* coor_to_voxelidx, T* voxels, T_int* coors, const int num_features,
const int num_points, const int max_points, const int NDim) {
const int index = blockIdx.x * threadsPerBlock + threadIdx.x;
if (index >= num_points) return;
int num = point_to_voxelidx[index];
int voxelidx = coor_to_voxelidx[index];
if (num > -1 && voxelidx > -1) {
const int feature_per_thread = num_features / 4;
int start = threadIdx.y * feature_per_thread;
auto voxels_offset =
voxels + voxelidx * max_points * num_features + num * num_features;
auto points_offset = points + index * num_features;
for (int k = start; k < start + feature_per_thread; k++) {
voxels_offset[k] = points_offset[k];
}
if (num == 0 && start < NDim) {
auto coors_offset = coors + voxelidx * NDim;
auto coor_offset = coor + index * NDim;
for (int k = start; k < NDim; k++) {
coors_offset[k] = coor_offset[k];
}
}
}
}
template <typename T, typename T_int>
__global__ void map_voxel_to_point_kernel(
T* points, T* voxels, T_int* point_to_voxelidx, T_int* coor_to_voxelidx,
const int num_features, const int num_points, const int max_points) {
const int index = blockIdx.x * threadsPerBlock + threadIdx.x;
if (index >= num_points) return;
auto num = point_to_voxelidx[index];
if (num > -1) {
const int feature_per_thread = num_features / 4;
auto voxelidx = coor_to_voxelidx[index];
int start = threadIdx.y * feature_per_thread;
auto voxels_offset =
voxels + voxelidx * max_points * num_features + num * num_features;
auto points_offset = points + index * num_features;
for (int k = start; k < start + feature_per_thread; k++) {
points_offset[k] = voxels_offset[k];
}
}
}
template <typename T_int>
__global__ void point_to_voxelidx_kernel(const T_int* coor,
T_int* point_to_voxelidx,
T_int* point_to_pointidx,
const int num_points, const int NDim) {
const int index = blockIdx.x * threadsPerBlock + threadIdx.x;
auto coor_offset = coor + index * NDim;
// skip invalid points
if ((index >= num_points) || (coor_offset[0] == -1)) return;
int num = 0;
int coor_x = coor_offset[0];
int coor_y = coor_offset[1];
int coor_z = coor_offset[2];
// only calculate the coors before this coor[index]
for (int i = 0; i < index; ++i) {
auto prev_coor = coor + i * NDim;
if (prev_coor[0] == -1) continue;
// record voxel
if ((prev_coor[0] == coor_x) && (prev_coor[1] == coor_y) &&
(prev_coor[2] == coor_z)) {
num++;
if (num == 1) {
point_to_pointidx[index] = i;
}
}
}
if (num == 0) {
point_to_pointidx[index] = index;
}
point_to_voxelidx[index] = num;
}
template <typename T_int>
__global__ void determin_voxel_num(
const T_int* coor, T_int* num_points_per_voxel, T_int* point_to_voxelidx,
T_int* point_to_pointidx, T_int* coor_to_voxelidx, T_int* voxel_num,
T_int* max_points, const int num_points, const int NDim) {
// only calculate the coors before this coor[index]
for (int i = 0; i < num_points; ++i) {
auto coor_offset = coor + i * NDim;
if (coor_offset[0] == -1) continue;
int point_pos_in_voxel = point_to_voxelidx[i];
// record voxel
if (point_pos_in_voxel == -1) {
// out of max_points or invalid point
printf("point_pos_in_voxel == -1, point:%d", i);
continue;
} else if (point_pos_in_voxel == 0) {
// record new voxel
int voxelidx = voxel_num[0];
voxel_num[0] += 1;
coor_to_voxelidx[i] = voxelidx;
num_points_per_voxel[voxelidx] = 1;
} else {
int point_idx = point_to_pointidx[i];
int voxelidx = coor_to_voxelidx[point_idx];
if (voxelidx != -1) {
num_points_per_voxel[voxelidx] += 1;
coor_to_voxelidx[i] = voxelidx;
max_points[0] = max(max_points[0], point_pos_in_voxel + 1);
} else {
printf("voxelidx = -1, point:%d", i);
}
}
}
}
namespace voxelization {
std::vector<at::Tensor> dynamic_point_to_voxel_forward_gpu(
const at::Tensor& points, const at::Tensor& voxel_mapping,
const std::vector<float> voxel_size, const std::vector<float> coors_range) {
CHECK_INPUT(points);
at::cuda::CUDAGuard device_guard(points.device());
const int NDim = voxel_mapping.size(1);
const int num_points = points.size(0);
const int num_features = points.size(1);
std::vector<int> grid_size(NDim);
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
// assume the mapping is already given
auto point_to_pointidx = -at::ones(
{
num_points,
},
voxel_mapping.options());
auto point_to_voxelidx = -at::ones(
{
num_points,
},
voxel_mapping.options());
auto max_points = at::zeros(
{
1,
},
voxel_mapping.options()); // must be zero from the beginning
int col_blocks = at::cuda::ATenCeilDiv(num_points, threadsPerBlock);
dim3 blocks(col_blocks);
dim3 threads(threadsPerBlock);
cudaStream_t map_stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_ALL_TYPES(
voxel_mapping.type(), "determin_duplicate", ([&] {
point_to_voxelidx_kernel<int><<<blocks, threads, 0, map_stream>>>(
voxel_mapping.data_ptr<int>(), point_to_voxelidx.data_ptr<int>(),
point_to_pointidx.data_ptr<int>(), num_points, NDim);
}));
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
// running this logic on the CUDA device accelerates it by about 10x
auto num_points_per_voxel = at::zeros(
{
num_points,
},
voxel_mapping.options());
auto coor_to_voxelidx = -at::ones(
{
num_points,
},
voxel_mapping.options());
auto voxel_num = at::zeros(
{
1,
},
voxel_mapping.options()); // must be zero from the beginning
cudaStream_t logic_stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_ALL_TYPES(
voxel_mapping.type(), "determin_duplicate", ([&] {
determin_voxel_num<int><<<1, 1, 0, logic_stream>>>(
voxel_mapping.data_ptr<int>(), num_points_per_voxel.data_ptr<int>(),
point_to_voxelidx.data_ptr<int>(),
point_to_pointidx.data_ptr<int>(), coor_to_voxelidx.data_ptr<int>(),
voxel_num.data_ptr<int>(), max_points.data_ptr<int>(), num_points,
NDim);
}));
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
// some temporary data
auto max_points_cpu = max_points.to(at::kCPU);
int max_points_int = max_points_cpu.data_ptr<int>()[0];
auto voxel_num_cpu = voxel_num.to(at::kCPU);
int voxel_num_int = voxel_num_cpu.data_ptr<int>()[0];
at::Tensor coors =
at::zeros({voxel_num_int, NDim}, points.options().dtype(at::kInt));
at::Tensor voxels = at::zeros({voxel_num_int, max_points_int, num_features},
points.options());
// copy point features to voxels
dim3 cp_threads(threadsPerBlock, 4);
cudaStream_t cp_stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_ALL_TYPES(
points.type(), "scatter_point_to_voxel", ([&] {
scatter_point_to_voxel_kernel<float, int>
<<<blocks, cp_threads, 0, cp_stream>>>(
points.data_ptr<float>(), voxel_mapping.data_ptr<int>(),
point_to_voxelidx.data_ptr<int>(),
coor_to_voxelidx.data_ptr<int>(), voxels.data_ptr<float>(),
coors.data_ptr<int>(), num_features, num_points, max_points_int,
NDim);
}));
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
at::Tensor num_points_per_voxel_out =
num_points_per_voxel.slice(/*dim=*/0, /*start=*/0, /*end=*/voxel_num_int);
return {voxels, coors, num_points_per_voxel_out, point_to_voxelidx,
coor_to_voxelidx};
}
void dynamic_point_to_voxel_backward_gpu(at::Tensor& grad_input_points,
const at::Tensor& grad_output_voxels,
const at::Tensor& point_to_voxelidx,
const at::Tensor& coor_to_voxelidx) {
CHECK_INPUT(grad_input_points);
CHECK_INPUT(grad_output_voxels);
CHECK_INPUT(point_to_voxelidx);
CHECK_INPUT(coor_to_voxelidx);
at::cuda::CUDAGuard device_guard(grad_input_points.device());
const int num_points = grad_input_points.size(0);
const int num_features = grad_input_points.size(1);
const int max_points = grad_output_voxels.size(1);
// copy voxel grad to points
int col_blocks = at::cuda::ATenCeilDiv(num_points, threadsPerBlock);
dim3 blocks(col_blocks);
dim3 cp_threads(threadsPerBlock, 4);
cudaStream_t cp_stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_ALL_TYPES(grad_input_points.type(), "scatter_point_to_voxel",
([&] {
map_voxel_to_point_kernel<float, int>
<<<blocks, cp_threads, 0, cp_stream>>>(
grad_input_points.data_ptr<float>(),
grad_output_voxels.data_ptr<float>(),
point_to_voxelidx.data_ptr<int>(),
coor_to_voxelidx.data_ptr<int>(),
num_features, num_points, max_points);
}));
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
return;
}
} // namespace voxelization
#include <torch/extension.h>
#include "voxelization.h"
namespace voxelization {
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("hard_voxelize", &hard_voxelize, "hard voxelize");
m.def("dynamic_voxelize", &dynamic_voxelize, "dynamic voxelization");
m.def("dynamic_point_to_voxel_forward", &dynamic_point_to_voxel_forward, "dynamic point to voxel forward");
m.def("dynamic_point_to_voxel_backward", &dynamic_point_to_voxel_backward, "dynamic point to voxel backward");
}
} // namespace voxelization
#pragma once
#include <torch/extension.h>
namespace voxelization {
int hard_voxelize_cpu(const at::Tensor& points, at::Tensor& voxels,
at::Tensor& coors, at::Tensor& num_points_per_voxel,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int max_points, const int max_voxels,
const int NDim = 3);
void dynamic_voxelize_cpu(const at::Tensor& points, at::Tensor& coors,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int NDim = 3);
std::vector<at::Tensor> dynamic_point_to_voxel_cpu(
const at::Tensor& points, const at::Tensor& voxel_mapping,
const std::vector<float> voxel_size, const std::vector<float> coors_range);
#ifdef WITH_CUDA
int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels,
at::Tensor& coors, at::Tensor& num_points_per_voxel,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int max_points, const int max_voxels,
const int NDim = 3);
void dynamic_voxelize_gpu(const at::Tensor& points, at::Tensor& coors,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int NDim = 3);
std::vector<at::Tensor> dynamic_point_to_voxel_forward_gpu(
const at::Tensor& points, const at::Tensor& voxel_mapping,
const std::vector<float> voxel_size, const std::vector<float> coors_range);
void dynamic_point_to_voxel_backward_gpu(at::Tensor& grad_input_points,
const at::Tensor& grad_output_voxels,
const at::Tensor& point_to_voxelidx,
const at::Tensor& coor_to_voxelidx);
#endif
// Interface for Python
inline int hard_voxelize(const at::Tensor& points, at::Tensor& voxels,
at::Tensor& coors, at::Tensor& num_points_per_voxel,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int max_points, const int max_voxels,
const int NDim = 3) {
if (points.type().is_cuda()) {
#ifdef WITH_CUDA
return hard_voxelize_gpu(points, voxels, coors, num_points_per_voxel,
voxel_size, coors_range, max_points, max_voxels,
NDim);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
return hard_voxelize_cpu(points, voxels, coors, num_points_per_voxel,
voxel_size, coors_range, max_points, max_voxels,
NDim);
}
inline void dynamic_voxelize(const at::Tensor& points, at::Tensor& coors,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int NDim = 3) {
if (points.type().is_cuda()) {
#ifdef WITH_CUDA
return dynamic_voxelize_gpu(points, coors, voxel_size, coors_range, NDim);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
return dynamic_voxelize_cpu(points, coors, voxel_size, coors_range, NDim);
}
inline std::vector<torch::Tensor> dynamic_point_to_voxel_forward(
const at::Tensor& points, const at::Tensor& voxel_mapping,
const std::vector<float> voxel_size, const std::vector<float> coors_range) {
if (points.type().is_cuda()) {
#ifdef WITH_CUDA
return dynamic_point_to_voxel_forward_gpu(points, voxel_mapping, voxel_size,
coors_range);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
return dynamic_point_to_voxel_cpu(points, voxel_mapping, voxel_size,
coors_range);
}
inline void dynamic_point_to_voxel_backward(
at::Tensor& grad_input_points, const at::Tensor& grad_output_voxels,
const at::Tensor& point_to_voxelidx, const at::Tensor& coor_to_voxelidx) {
if (grad_input_points.type().is_cuda()) {
#ifdef WITH_CUDA
return dynamic_point_to_voxel_backward_gpu(
grad_input_points, grad_output_voxels, point_to_voxelidx,
coor_to_voxelidx);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
// return dynamic_point_to_voxel_cpu(points,
// voxel_mapping,
// voxel_size,
// coors_range);
}
} // namespace voxelization
#include <torch/extension.h>
#include <ATen/TensorUtils.h>
// #include "voxelization.h"
namespace {
template <typename T, typename T_int>
void dynamic_voxelize_kernel(const torch::TensorAccessor<T,2> points,
torch::TensorAccessor<T_int, 2> coors,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const std::vector<int> grid_size,
const int num_points,
const int num_features,
const int NDim
) {
const int ndim_minus_1 = NDim - 1;
bool failed = false;
int coor[NDim];
int c;
for (int i = 0; i < num_points; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points[i][j] - coors_range[j]) / voxel_size[j]);
// necessary to rm points out of range
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
for (int k = 0; k < NDim; ++k) {
if (failed)
coors[i][k] = -1;
else
coors[i][k] = coor[k];
}
}
return;
}
template <typename T, typename T_int>
void hard_voxelize_kernel(const torch::TensorAccessor<T,2> points,
torch::TensorAccessor<T,3> voxels,
torch::TensorAccessor<T_int,2> coors,
torch::TensorAccessor<T_int,1> num_points_per_voxel,
torch::TensorAccessor<T_int,3> coor_to_voxelidx,
int& voxel_num,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const std::vector<int> grid_size,
const int max_points,
const int max_voxels,
const int num_points,
const int num_features,
const int NDim
) {
// declare a temp coors
at::Tensor temp_coors = at::zeros({num_points, NDim}, at::TensorOptions().dtype(at::kInt).device(at::kCPU));
// First use dynamic voxelization to get coors,
// then check max points/voxels constraints
dynamic_voxelize_kernel<T, int>(
points,
temp_coors.accessor<int,2>(),
voxel_size,
coors_range,
grid_size,
num_points,
num_features,
NDim
);
int voxelidx, num;
auto coor = temp_coors.accessor<int,2>();
for (int i = 0; i < num_points; ++i) {
// T_int* coor = temp_coors.data_ptr<int>() + i * NDim;
if (coor[i][0] == -1)
continue;
voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];
// record voxel
if (voxelidx == -1) {
voxelidx = voxel_num;
if (max_voxels != -1 && voxel_num >= max_voxels)
break;
voxel_num += 1;
coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]] = voxelidx;
for (int k = 0; k < NDim; ++k) {
coors[voxelidx][k] = coor[i][k];
}
}
// put points into voxel
num = num_points_per_voxel[voxelidx];
if (max_points == -1 || num < max_points) {
for (int k = 0; k < num_features; ++k) {
voxels[voxelidx][num][k] = points[i][k];
}
num_points_per_voxel[voxelidx] += 1;
}
}
return;
}
} // namespace
namespace voxelization {
int hard_voxelize_cpu(
const at::Tensor& points,
at::Tensor& voxels,
at::Tensor& coors,
at::Tensor& num_points_per_voxel,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int max_points,
const int max_voxels,
const int NDim=3) {
// current version takes about 0.02s~0.03s for one frame on cpu
// check device
AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");
std::vector<int> grid_size(NDim);
const int num_points = points.size(0);
const int num_features = points.size(1);
for (int i = 0; i < NDim; ++i) {
grid_size[i] = round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
// coors, num_points_per_voxel, coor_to_voxelidx are int Tensor
//printf("cpu coor_to_voxelidx size: [%d, %d, %d]\n", grid_size[2], grid_size[1], grid_size[0]);
at::Tensor coor_to_voxelidx = -at::ones({grid_size[2], grid_size[1], grid_size[0]}, coors.options());
int voxel_num = 0;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(points.type(), "hard_voxelize_forward", [&] {
hard_voxelize_kernel<scalar_t, int>(
points.accessor<scalar_t,2>(),
voxels.accessor<scalar_t,3>(),
coors.accessor<int,2>(),
num_points_per_voxel.accessor<int,1>(),
coor_to_voxelidx.accessor<int,3>(),
voxel_num,
voxel_size,
coors_range,
grid_size,
max_points,
max_voxels,
num_points,
num_features,
NDim
);
});
return voxel_num;
}
void dynamic_voxelize_cpu(
const at::Tensor& points,
at::Tensor& coors,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int NDim=3) {
// check device
AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");
std::vector<int> grid_size(NDim);
const int num_points = points.size(0);
const int num_features = points.size(1);
for (int i = 0; i < NDim; ++i) {
grid_size[i] = round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
// coors, num_points_per_voxel, coor_to_voxelidx are int Tensor
AT_DISPATCH_FLOATING_TYPES_AND_HALF(points.type(), "hard_voxelize_forward", [&] {
dynamic_voxelize_kernel<scalar_t, int>(
points.accessor<scalar_t,2>(),
coors.accessor<int,2>(),
voxel_size,
coors_range,
grid_size,
num_points,
num_features,
NDim
);
});
return;
}
}
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <torch/types.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
#define CHECK_CUDA(x) \
TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
namespace {
int const threadsPerBlock = sizeof(unsigned long long) * 8;
}
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
i += blockDim.x * gridDim.x)
template <typename T, typename T_int>
__global__ void dynamic_voxelize_kernel(
const T* points, T_int* coors, const float voxel_x, const float voxel_y,
const float voxel_z, const float coors_x_min, const float coors_y_min,
const float coors_z_min, const float coors_x_max, const float coors_y_max,
const float coors_z_max, const int grid_x, const int grid_y,
const int grid_z, const int num_points, const int num_features,
const int NDim) {
// const int index = blockIdx.x * threadsPerBlock + threadIdx.x;
CUDA_1D_KERNEL_LOOP(index, num_points) {
// To save some computation
auto points_offset = points + index * num_features;
auto coors_offset = coors + index * NDim;
int c_x = floor((points_offset[0] - coors_x_min) / voxel_x);
if (c_x < 0 || c_x >= grid_x) {
coors_offset[0] = -1;
return;
}
int c_y = floor((points_offset[1] - coors_y_min) / voxel_y);
if (c_y < 0 || c_y >= grid_y) {
coors_offset[0] = -1;
coors_offset[1] = -1;
return;
}
int c_z = floor((points_offset[2] - coors_z_min) / voxel_z);
if (c_z < 0 || c_z >= grid_z) {
coors_offset[0] = -1;
coors_offset[1] = -1;
coors_offset[2] = -1;
} else {
coors_offset[0] = c_z;
coors_offset[1] = c_y;
coors_offset[2] = c_x;
}
}
}
template <typename T, typename T_int>
__global__ void assign_point_to_voxel(const int nthreads, const T* points,
T_int* point_to_voxelidx,
T_int* coor_to_voxelidx, T* voxels,
const int max_points,
const int num_features,
const int num_points, const int NDim) {
CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
// const int index = blockIdx.x * threadsPerBlock + threadIdx.x;
int index = thread_idx / num_features;
int num = point_to_voxelidx[index];
int voxelidx = coor_to_voxelidx[index];
if (num > -1 && voxelidx > -1) {
auto voxels_offset =
voxels + voxelidx * max_points * num_features + num * num_features;
int k = thread_idx % num_features;
voxels_offset[k] = points[thread_idx];
}
}
}
template <typename T, typename T_int>
__global__ void assign_voxel_coors(const int nthreads, T_int* coor,
T_int* point_to_voxelidx,
T_int* coor_to_voxelidx, T_int* voxel_coors,
const int num_points, const int NDim) {
CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
// const int index = blockIdx.x * threadsPerBlock + threadIdx.x;
// if (index >= num_points) return;
int index = thread_idx / NDim;
int num = point_to_voxelidx[index];
int voxelidx = coor_to_voxelidx[index];
if (num == 0 && voxelidx > -1) {
auto coors_offset = voxel_coors + voxelidx * NDim;
int k = thread_idx % NDim;
coors_offset[k] = coor[thread_idx];
}
}
}
template <typename T_int>
__global__ void point_to_voxelidx_kernel(const T_int* coor,
T_int* point_to_voxelidx,
T_int* point_to_pointidx,
const int max_points,
const int max_voxels,
const int num_points, const int NDim) {
CUDA_1D_KERNEL_LOOP(index, num_points) {
auto coor_offset = coor + index * NDim;
// skip invalid points
if ((index >= num_points) || (coor_offset[0] == -1)) return;
int num = 0;
int coor_x = coor_offset[0];
int coor_y = coor_offset[1];
int coor_z = coor_offset[2];
// only calculate the coors before this coor[index]
for (int i = 0; i < index; ++i) {
auto prev_coor = coor + i * NDim;
if (prev_coor[0] == -1) continue;
// Find all previous points that have the same coors;
// if the same coor is found, record it
if ((prev_coor[0] == coor_x) && (prev_coor[1] == coor_y) &&
(prev_coor[2] == coor_z)) {
num++;
if (num == 1) {
// point to the first point that has the same coor
point_to_pointidx[index] = i;
} else if (num >= max_points) {
// out of boundary
return;
}
}
}
if (num == 0) {
point_to_pointidx[index] = index;
}
if (num < max_points) {
point_to_voxelidx[index] = num;
}
}
}
template <typename T_int>
__global__ void determin_voxel_num(
// const T_int* coor,
T_int* num_points_per_voxel, T_int* point_to_voxelidx,
T_int* point_to_pointidx, T_int* coor_to_voxelidx, T_int* voxel_num,
const int max_points, const int max_voxels, const int num_points) {
// only calculate the coors before this coor[index]
for (int i = 0; i < num_points; ++i) {
// if (coor[i][0] == -1)
// continue;
int point_pos_in_voxel = point_to_voxelidx[i];
// record voxel
if (point_pos_in_voxel == -1) {
// out of max_points or invalid point
continue;
} else if (point_pos_in_voxel == 0) {
// record new voxel
int voxelidx = voxel_num[0];
if (voxel_num[0] >= max_voxels) break;
voxel_num[0] += 1;
coor_to_voxelidx[i] = voxelidx;
num_points_per_voxel[voxelidx] = 1;
} else {
int point_idx = point_to_pointidx[i];
int voxelidx = coor_to_voxelidx[point_idx];
if (voxelidx != -1) {
coor_to_voxelidx[i] = voxelidx;
num_points_per_voxel[voxelidx] += 1;
}
}
}
}
namespace voxelization {
int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels,
at::Tensor& coors, at::Tensor& num_points_per_voxel,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int max_points, const int max_voxels,
const int NDim = 3) {
// current version takes about 0.04s for one frame
// check device
CHECK_INPUT(points);
at::cuda::CUDAGuard device_guard(points.device());
const int num_points = points.size(0);
const int num_features = points.size(1);
const float voxel_x = voxel_size[0];
const float voxel_y = voxel_size[1];
const float voxel_z = voxel_size[2];
const float coors_x_min = coors_range[0];
const float coors_y_min = coors_range[1];
const float coors_z_min = coors_range[2];
const float coors_x_max = coors_range[3];
const float coors_y_max = coors_range[4];
const float coors_z_max = coors_range[5];
const int grid_x = round((coors_x_max - coors_x_min) / voxel_x);
const int grid_y = round((coors_y_max - coors_y_min) / voxel_y);
const int grid_z = round((coors_z_max - coors_z_min) / voxel_z);
// map points to voxel coors
at::Tensor temp_coors =
at::zeros({num_points, NDim}, points.options().dtype(at::kInt));
dim3 grid(std::min(at::cuda::ATenCeilDiv(num_points, 512), 4096));
dim3 block(512);
// 1. link point to corresponding voxel coors
AT_DISPATCH_ALL_TYPES(
points.type(), "hard_voxelize_kernel", ([&] {
dynamic_voxelize_kernel<scalar_t, int>
<<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(
points.contiguous().data_ptr<scalar_t>(),
temp_coors.contiguous().data_ptr<int>(), voxel_x, voxel_y,
voxel_z, coors_x_min, coors_y_min, coors_z_min, coors_x_max,
coors_y_max, coors_z_max, grid_x, grid_y, grid_z, num_points,
num_features, NDim);
}));
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
// 2. map point to the idx of the corresponding voxel, find duplicate coor
// create some temporary variables
auto point_to_pointidx = -at::ones(
{
num_points,
},
points.options().dtype(at::kInt));
auto point_to_voxelidx = -at::ones(
{
num_points,
},
points.options().dtype(at::kInt));
dim3 map_grid(std::min(at::cuda::ATenCeilDiv(num_points, 512), 4096));
dim3 map_block(512);
AT_DISPATCH_ALL_TYPES(
temp_coors.type(), "determin_duplicate", ([&] {
point_to_voxelidx_kernel<int>
<<<map_grid, map_block, 0, at::cuda::getCurrentCUDAStream()>>>(
temp_coors.contiguous().data_ptr<int>(),
point_to_voxelidx.contiguous().data_ptr<int>(),
point_to_pointidx.contiguous().data_ptr<int>(), max_points,
max_voxels, num_points, NDim);
}));
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
// 3. determine voxel num and each voxel's coor index
// running this logic on the CUDA device accelerates it by about 10x
auto coor_to_voxelidx = -at::ones(
{
num_points,
},
points.options().dtype(at::kInt));
auto voxel_num = at::zeros(
{
1,
},
points.options().dtype(at::kInt)); // must be zero from the beginning
AT_DISPATCH_ALL_TYPES(
temp_coors.type(), "determin_duplicate", ([&] {
determin_voxel_num<int><<<1, 1, 0, at::cuda::getCurrentCUDAStream()>>>(
num_points_per_voxel.contiguous().data_ptr<int>(),
point_to_voxelidx.contiguous().data_ptr<int>(),
point_to_pointidx.contiguous().data_ptr<int>(),
coor_to_voxelidx.contiguous().data_ptr<int>(),
voxel_num.contiguous().data_ptr<int>(), max_points, max_voxels,
num_points);
}));
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
// 4. copy point features to voxels
// Step 4 & 5 could be parallel
auto pts_output_size = num_points * num_features;
dim3 cp_grid(std::min(at::cuda::ATenCeilDiv(pts_output_size, 512), 4096));
dim3 cp_block(512);
AT_DISPATCH_ALL_TYPES(
points.type(), "assign_point_to_voxel", ([&] {
assign_point_to_voxel<float, int>
<<<cp_grid, cp_block, 0, at::cuda::getCurrentCUDAStream()>>>(
pts_output_size, points.contiguous().data_ptr<float>(),
point_to_voxelidx.contiguous().data_ptr<int>(),
coor_to_voxelidx.contiguous().data_ptr<int>(),
voxels.contiguous().data_ptr<float>(), max_points, num_features,
num_points, NDim);
}));
// cudaDeviceSynchronize();
// AT_CUDA_CHECK(cudaGetLastError());
// 5. copy coors of each voxels
auto coors_output_size = num_points * NDim;
dim3 coors_cp_grid(
std::min(at::cuda::ATenCeilDiv(coors_output_size, 512), 4096));
dim3 coors_cp_block(512);
AT_DISPATCH_ALL_TYPES(
points.type(), "assign_point_to_voxel", ([&] {
assign_voxel_coors<float, int><<<coors_cp_grid, coors_cp_block, 0,
at::cuda::getCurrentCUDAStream()>>>(
coors_output_size, temp_coors.contiguous().data_ptr<int>(),
point_to_voxelidx.contiguous().data_ptr<int>(),
coor_to_voxelidx.contiguous().data_ptr<int>(),
coors.contiguous().data_ptr<int>(), num_points, NDim);
}));
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
auto voxel_num_cpu = voxel_num.to(at::kCPU);
int voxel_num_int = voxel_num_cpu.data_ptr<int>()[0];
return voxel_num_int;
}
void dynamic_voxelize_gpu(const at::Tensor& points, at::Tensor& coors,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int NDim = 3) {
// current version takes about 0.04s for one frame
// check device
CHECK_INPUT(points);
at::cuda::CUDAGuard device_guard(points.device());
const int num_points = points.size(0);
const int num_features = points.size(1);
const float voxel_x = voxel_size[0];
const float voxel_y = voxel_size[1];
const float voxel_z = voxel_size[2];
const float coors_x_min = coors_range[0];
const float coors_y_min = coors_range[1];
const float coors_z_min = coors_range[2];
const float coors_x_max = coors_range[3];
const float coors_y_max = coors_range[4];
const float coors_z_max = coors_range[5];
const int grid_x = round((coors_x_max - coors_x_min) / voxel_x);
const int grid_y = round((coors_y_max - coors_y_min) / voxel_y);
const int grid_z = round((coors_z_max - coors_z_min) / voxel_z);
const int col_blocks = at::cuda::ATenCeilDiv(num_points, threadsPerBlock);
dim3 blocks(col_blocks);
dim3 threads(threadsPerBlock);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_ALL_TYPES(points.scalar_type(), "dynamic_voxelize_kernel", [&] {
dynamic_voxelize_kernel<scalar_t, int><<<blocks, threads, 0, stream>>>(
points.contiguous().data_ptr<scalar_t>(),
coors.contiguous().data_ptr<int>(), voxel_x, voxel_y, voxel_z,
coors_x_min, coors_y_min, coors_z_min, coors_x_max, coors_y_max,
coors_z_max, grid_x, grid_y, grid_z, num_points, num_features, NDim);
});
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
return;
}
} // namespace voxelization
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import torch
from torch import nn
from torch.autograd import Function
from torch.nn.modules.utils import _pair
from .voxel_layer import dynamic_voxelize, hard_voxelize
class _Voxelization(Function):
@staticmethod
def forward(ctx,
points,
voxel_size,
coors_range,
max_points=35,
max_voxels=20000):
"""convert kitti points(N, >=3) to voxels.
Args:
points: [N, ndim] float tensor. points[:, :3] contain xyz points
and points[:, 3:] contain other information like reflectivity
voxel_size: [3] list/tuple or array, float. xyz, indicate voxel
size
coors_range: [6] list/tuple or array, float. indicate voxel
range. format: xyzxyz, minmax
max_points: int. indicate maximum points contained in a voxel. if
max_points=-1, it means using dynamic_voxelize
max_voxels: int. indicates the maximum number of voxels this
function creates. For SECOND, 20000 is a good choice. Users
should shuffle points before calling this function because
max_voxels may drop points.
Returns:
voxels: [M, max_points, ndim] float tensor. only contain points
and returned when max_points != -1.
coordinates: [M, 3] int32 tensor, always returned.
num_points_per_voxel: [M] int32 tensor. Only returned when
max_points != -1.
"""
if max_points == -1 or max_voxels == -1:
coors = points.new_zeros(size=(points.size(0), 3), dtype=torch.int)
dynamic_voxelize(points, coors, voxel_size, coors_range, 3)
return coors
else:
voxels = points.new_zeros(
size=(max_voxels, max_points, points.size(1)))
coors = points.new_zeros(size=(max_voxels, 3), dtype=torch.int)
num_points_per_voxel = points.new_zeros(
size=(max_voxels, ), dtype=torch.int)
voxel_num = hard_voxelize(points, voxels, coors,
num_points_per_voxel, voxel_size,
coors_range, max_points, max_voxels, 3)
# select the valid voxels
voxels_out = voxels[:voxel_num]
coors_out = coors[:voxel_num]
num_points_per_voxel_out = num_points_per_voxel[:voxel_num]
return voxels_out, coors_out, num_points_per_voxel_out
voxelization = _Voxelization.apply
class Voxelization(nn.Module):
def __init__(self,
voxel_size,
point_cloud_range,
max_num_points,
max_voxels=20000):
super(Voxelization, self).__init__()
"""
Args:
voxel_size (list): list [x, y, z] size of three dimension
point_cloud_range (list):
[x_min, y_min, z_min, x_max, y_max, z_max]
max_num_points (int): max number of points per voxel
max_voxels (tuple or int): max number of voxels at
(training, testing) time
"""
self.voxel_size = voxel_size
self.point_cloud_range = point_cloud_range
self.max_num_points = max_num_points
if isinstance(max_voxels, tuple):
self.max_voxels = max_voxels
else:
self.max_voxels = _pair(max_voxels)
point_cloud_range = torch.tensor(
point_cloud_range, dtype=torch.float32)
# [0, -40, -3, 70.4, 40, 1]
voxel_size = torch.tensor(voxel_size, dtype=torch.float32)
grid_size = (point_cloud_range[3:] -
point_cloud_range[:3]) / voxel_size
grid_size = torch.round(grid_size).long()
input_feat_shape = grid_size[:2]
self.grid_size = grid_size
# the origin shape is as [x-len, y-len, z-len]
# [w, h, d] -> [d, h, w]
self.pcd_shape = [*input_feat_shape, 1][::-1]
def forward(self, input):
"""
Args:
input: NC points
"""
if self.training:
max_voxels = self.max_voxels[0]
else:
max_voxels = self.max_voxels[1]
return voxelization(input, self.voxel_size, self.point_cloud_range,
self.max_num_points, max_voxels)
def __repr__(self):
tmpstr = self.__class__.__name__ + '('
tmpstr += 'voxel_size=' + str(self.voxel_size)
tmpstr += ', point_cloud_range=' + str(self.point_cloud_range)
tmpstr += ', max_num_points=' + str(self.max_num_points)
tmpstr += ', max_voxels=' + str(self.max_voxels)
tmpstr += ')'
return tmpstr
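# Usage sketch (illustrative only, not part of the original file; the numbers are
# hypothetical):
#
#   voxel_layer = Voxelization(voxel_size=[0.2, 0.2, 4],
#                              point_cloud_range=[0, -40, -3, 70.4, 40, 1],
#                              max_num_points=35,
#                              max_voxels=20000)
#   points = torch.rand(1000, 4)                       # [N, C] points of one sample
#   voxels, coors, num_points = voxel_layer(points)    # hard voxelization
#   # with max_num_points=-1 (or max_voxels=-1) only the voxel coors are returned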
from mmdet.utils import (Registry, build_from_cfg, get_model_complexity_info,
get_root_logger, print_log)
from .collect_env import collect_env
__all__ = [
'Registry', 'build_from_cfg', 'get_model_complexity_info',
'get_root_logger', 'print_log', 'collect_env'
]
import os.path as osp
import subprocess
import sys
from collections import defaultdict
import cv2
import mmcv
import torch
import torchvision
import mmdet
import mmdet3d
def collect_env():
env_info = {}
env_info['sys.platform'] = sys.platform
env_info['Python'] = sys.version.replace('\n', '')
cuda_available = torch.cuda.is_available()
env_info['CUDA available'] = cuda_available
if cuda_available:
from torch.utils.cpp_extension import CUDA_HOME
env_info['CUDA_HOME'] = CUDA_HOME
if CUDA_HOME is not None and osp.isdir(CUDA_HOME):
try:
nvcc = osp.join(CUDA_HOME, 'bin/nvcc')
nvcc = subprocess.check_output(
'"{}" -V | tail -n1'.format(nvcc), shell=True)
nvcc = nvcc.decode('utf-8').strip()
except subprocess.SubprocessError:
nvcc = 'Not Available'
env_info['NVCC'] = nvcc
devices = defaultdict(list)
for k in range(torch.cuda.device_count()):
devices[torch.cuda.get_device_name(k)].append(str(k))
for name, devids in devices.items():
env_info['GPU ' + ','.join(devids)] = name
gcc = subprocess.check_output('gcc --version | head -n1', shell=True)
gcc = gcc.decode('utf-8').strip()
env_info['GCC'] = gcc
env_info['PyTorch'] = torch.__version__
env_info['PyTorch compiling details'] = torch.__config__.show()
env_info['TorchVision'] = torchvision.__version__
env_info['OpenCV'] = cv2.__version__
env_info['MMCV'] = mmcv.__version__
env_info['MMDetection'] = mmdet.__version__
env_info['MMDetection3D'] = mmdet3d.__version__
from mmdet.ops import get_compiler_version, get_compiling_cuda_version
env_info['MMDetection3D Compiler'] = get_compiler_version()
env_info['MMDetection3D CUDA Compiler'] = get_compiling_cuda_version()
return env_info
if __name__ == '__main__':
for name, val in collect_env().items():
print('{}: {}'.format(name, val))
-r requirements/build.txt
-r requirements/optional.txt
-r requirements/runtime.txt
-r requirements/tests.txt
# These must be installed before building mmdetection
numpy
torch>=1.1
# To avoid installing too many unnecessary packages
nuscenes-devkit==1.0.5
matplotlib
mmcv>=0.2.15
numpy
# need older pillow until torchvision is fixed
Pillow<=6.2.2
six
terminaltables
torch>=1.1
torchvision
asynctest
codecov
flake8
isort
# Note: used for kwarray.group_items, this may be ported to mmcv in the future.
kwarray
pytest
pytest-cov
pytest-runner
ubelt
xdoctest >= 0.10.0
yapf
import os
import platform
import subprocess
import time
from setuptools import Extension, find_packages, setup
import numpy as np
from Cython.Build import cythonize
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
def readme():
with open('README.md', encoding='utf-8') as f:
content = f.read()
return content
MAJOR = 0
MINOR = 1
PATCH = ''
SUFFIX = 'rc0'
SHORT_VERSION = '{}.{}.{}{}'.format(MAJOR, MINOR, PATCH, SUFFIX)
version_file = 'mmdet3d/version.py'
def get_git_hash():
def _minimal_ext_cmd(cmd):
# construct minimal environment
env = {}
for k in ['SYSTEMROOT', 'PATH', 'HOME']:
v = os.environ.get(k)
if v is not None:
env[k] = v
# LANGUAGE is used on win32
env['LANGUAGE'] = 'C'
env['LANG'] = 'C'
env['LC_ALL'] = 'C'
out = subprocess.Popen(
cmd, stdout=subprocess.PIPE, env=env).communicate()[0]
return out
try:
out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD'])
sha = out.strip().decode('ascii')
except OSError:
sha = 'unknown'
return sha
def get_hash():
if os.path.exists('.git'):
sha = get_git_hash()[:7]
elif os.path.exists(version_file):
try:
from mmdet3d.version import __version__
sha = __version__.split('+')[-1]
except ImportError:
raise ImportError('Unable to get git version')
else:
sha = 'unknown'
return sha
def write_version_py():
content = """# GENERATED VERSION FILE
# TIME: {}
__version__ = '{}'
short_version = '{}'
"""
sha = get_hash()
VERSION = SHORT_VERSION + '+' + sha
with open(version_file, 'w') as f:
f.write(content.format(time.asctime(), VERSION, SHORT_VERSION))
def get_version():
with open(version_file, 'r') as f:
exec(compile(f.read(), version_file, 'exec'))
return locals()['__version__']
def make_cuda_ext(name, module, sources, extra_args=[], extra_include_path=[]):
return CUDAExtension(
name='{}.{}'.format(module, name),
define_macros=[('WITH_CUDA', None)],
sources=[os.path.join(*module.split('.'), p) for p in sources],
include_dirs=extra_include_path,
extra_compile_args={
'cxx': [] + extra_args,
'nvcc':
extra_args + [
'-D__CUDA_NO_HALF_OPERATORS__',
'-D__CUDA_NO_HALF_CONVERSIONS__',
'-D__CUDA_NO_HALF2_OPERATORS__',
]
})
def make_cython_ext(name, module, sources):
extra_compile_args = None
if platform.system() != 'Windows':
extra_compile_args = {
'cxx': ['-Wno-unused-function', '-Wno-write-strings']
}
extension = Extension(
'{}.{}'.format(module, name),
[os.path.join(*module.split('.'), p) for p in sources],
include_dirs=[np.get_include()],
language='c++',
extra_compile_args=extra_compile_args)
extension, = cythonize(extension)
return extension
def parse_requirements(fname='requirements.txt', with_version=True):
"""
Parse the package dependencies listed in a requirements file but strip
specific versioning information.
Args:
fname (str): path to requirements file
with_version (bool, default=True): if True include version specs
Returns:
List[str]: list of requirements items
CommandLine:
python -c "import setup; print(setup.parse_requirements())"
"""
import sys
from os.path import exists
import re
require_fpath = fname
def parse_line(line):
"""
Parse information from a line in a requirements text file
"""
if line.startswith('-r '):
# Allow specifying requirements in other files
target = line.split(' ')[1]
for info in parse_require_file(target):
yield info
else:
info = {'line': line}
if line.startswith('-e '):
info['package'] = line.split('#egg=')[1]
else:
# Remove versioning from the package
pat = '(' + '|'.join(['>=', '==', '>']) + ')'
parts = re.split(pat, line, maxsplit=1)
parts = [p.strip() for p in parts]
info['package'] = parts[0]
if len(parts) > 1:
op, rest = parts[1:]
if ';' in rest:
# Handle platform specific dependencies
# http://setuptools.readthedocs.io/en/latest/setuptools.html#declaring-platform-specific-dependencies
version, platform_deps = map(str.strip,
rest.split(';'))
info['platform_deps'] = platform_deps
else:
version = rest # NOQA
info['version'] = (op, version)
yield info
def parse_require_file(fpath):
with open(fpath, 'r') as f:
for line in f.readlines():
line = line.strip()
if line and not line.startswith('#'):
for info in parse_line(line):
yield info
def gen_packages_items():
if exists(require_fpath):
for info in parse_require_file(require_fpath):
parts = [info['package']]
if with_version and 'version' in info:
parts.extend(info['version'])
if not sys.version.startswith('3.4'):
# apparently package_deps are broken in 3.4
platform_deps = info.get('platform_deps')
if platform_deps is not None:
parts.append(';' + platform_deps)
item = ''.join(parts)
yield item
packages = list(gen_packages_items())
return packages
if __name__ == '__main__':
write_version_py()
setup(
name='mmdet3d',
version=get_version(),
description='3D Detection Toolbox',
long_description=readme(),
keywords='computer vision, 3D object detection',
url='https://github.com/ZwwWayne/mmdetection3d',
packages=find_packages(exclude=('configs', 'tools', 'demo')),
package_data={'mmdet3d.ops': ['*/*.so']},
classifiers=[
'Development Status :: 4 - Beta',
'License :: OSI Approved :: Apache Software License',
'Operating System :: OS Independent',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.4',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
],
license='Apache License 2.0',
setup_requires=parse_requirements('requirements/build.txt'),
tests_require=parse_requirements('requirements/tests.txt'),
install_requires=parse_requirements('requirements/runtime.txt'),
extras_require={
'all': parse_requirements('requirements.txt'),
'tests': parse_requirements('requirements/tests.txt'),
'build': parse_requirements('requirements/build.txt'),
'optional': parse_requirements('requirements/optional.txt'),
},
ext_modules=[
make_cuda_ext(
name='sparse_conv_ext',
module='mmdet3d.ops.spconv',
extra_include_path=[
os.path.join(*'mmdet3d.ops.spconv'.split('.'), 'include/')
],
sources=[
'src/all.cc',
'src/reordering.cc',
'src/reordering_cuda.cu',
'src/indice.cc',
'src/indice_cuda.cu',
'src/maxpool.cc',
'src/maxpool_cuda.cu',
],
extra_args=['-w', '-std=c++14']),
make_cuda_ext(
name='iou3d_cuda',
module='mmdet3d.ops.iou3d',
sources=[
'src/iou3d.cpp',
'src/iou3d_kernel.cu',
]),
make_cuda_ext(
name='sigmoid_focal_loss_cuda',
module='mmdet3d.ops.sigmoid_focal_loss',
sources=[
'src/sigmoid_focal_loss.cpp',
'src/sigmoid_focal_loss_cuda.cu'
]),
make_cuda_ext(
name='voxel_layer',
module='mmdet3d.ops.voxel',
sources=[
'src/voxelization.cpp',
'src/scatter_points_cpu.cpp',
'src/scatter_points_cuda.cu',
'src/voxelization_cpu.cpp',
'src/voxelization_cuda.cu',
]),
],
cmdclass={'build_ext': BuildExtension},
zip_safe=False)