Unverified Commit 2f88c124 authored by Wenhao Wu's avatar Wenhao Wu Committed by GitHub
Browse files

[Enhance] Replace mmdet3d ops with mmcv ops (#1240)

* import some ops from mmcv instead of mmdet3d

* use mmcv ops in primitive_head.py

* use mmcv ops in PAConv

* remove ops in mmdet3d & fix some bugs

* remove spconv & fix some bugs

* fix voxelization unittest

* remove spconv in ops/__init__.py

* refine ops/__init__.py

* recover sparse_block in ops/__init__

* fix parta2_bbox_head unittest

* remove remaining ops

* recover ops/__init__.py for bc breaking

* add source of ops from mmcv

* recover the unittest for voxelization

* remove unittest
parent 41d77dad
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from mmcv.cnn import ConvModule
from mmcv.ops import GroupAll
from mmcv.ops import PointsSampler as Points_Sampler
from mmcv.ops import QueryAndGroup, gather_points
from torch import nn as nn
from torch.nn import functional as F
from mmdet3d.ops import (GroupAll, PAConv, Points_Sampler, QueryAndGroup,
gather_points)
from mmdet3d.ops import PAConv
from .builder import SA_MODULES
......
# Copyright (c) OpenMMLab. All rights reserved.
from .points_in_boxes import (points_in_boxes_all, points_in_boxes_cpu,
points_in_boxes_part)
from .roiaware_pool3d import RoIAwarePool3d
__all__ = [
'RoIAwarePool3d', 'points_in_boxes_part', 'points_in_boxes_cpu',
'points_in_boxes_all'
]
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from . import roiaware_pool3d_ext
def points_in_boxes_part(points, boxes):
"""Find the box in which each point is (CUDA).
Args:
points (torch.Tensor): [B, M, 3], [x, y, z] in LiDAR/DEPTH coordinate
boxes (torch.Tensor): [B, T, 7],
num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz] in
LiDAR/DEPTH coordinate, (x, y, z) is the bottom center
Returns:
box_idxs_of_pts (torch.Tensor): (B, M), default background = -1
"""
assert points.shape[0] == boxes.shape[0], \
f'Points and boxes should have the same batch size, ' \
f'got {points.shape[0]} and {boxes.shape[0]}'
assert boxes.shape[2] == 7, \
f'boxes dimension should be 7, ' \
f'got unexpected shape {boxes.shape[2]}'
assert points.shape[2] == 3, \
f'points dimension should be 3, ' \
f'got unexpected shape {points.shape[2]}'
batch_size, num_points, _ = points.shape
box_idxs_of_pts = points.new_zeros((batch_size, num_points),
dtype=torch.int).fill_(-1)
# If manually put the tensor 'points' or 'boxes' on a device
# which is not the current device, some temporary variables
# will be created on the current device in the cuda op,
# and the output will be incorrect.
# Therefore, we force the current device to be the same
# as the device of the tensors if it was not.
# Please refer to https://github.com/open-mmlab/mmdetection3d/issues/305
# for the incorrect output before the fix.
points_device = points.get_device()
assert points_device == boxes.get_device(), \
'Points and boxes should be put on the same device'
if torch.cuda.current_device() != points_device:
torch.cuda.set_device(points_device)
roiaware_pool3d_ext.points_in_boxes_part(boxes.contiguous(),
points.contiguous(),
box_idxs_of_pts)
return box_idxs_of_pts
def points_in_boxes_cpu(points, boxes):
"""Find all boxes in which each point is (CPU). The CPU version of
:meth:`points_in_boxes_all`.
Args:
points (torch.Tensor): [B, M, 3], [x, y, z] in
LiDAR/DEPTH coordinate
boxes (torch.Tensor): [B, T, 7],
num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz],
(x, y, z) is the bottom center.
Returns:
box_idxs_of_pts (torch.Tensor): (B, M, T), default background = 0.
"""
assert points.shape[0] == boxes.shape[0], \
f'Points and boxes should have the same batch size, ' \
f'got {points.shape[0]} and {boxes.shape[0]}'
assert boxes.shape[2] == 7, \
f'boxes dimension should be 7, ' \
f'got unexpected shape {boxes.shape[2]}'
assert points.shape[2] == 3, \
f'points dimension should be 3, ' \
f'got unexpected shape {points.shape[2]}'
batch_size, num_points, _ = points.shape
num_boxes = boxes.shape[1]
point_indices = points.new_zeros((batch_size, num_boxes, num_points),
dtype=torch.int)
for b in range(batch_size):
roiaware_pool3d_ext.points_in_boxes_cpu(boxes[b].float().contiguous(),
points[b].float().contiguous(),
point_indices[b])
point_indices = point_indices.transpose(1, 2)
return point_indices
def points_in_boxes_all(points, boxes):
"""Find all boxes in which each point is (CUDA).
Args:
points (torch.Tensor): [B, M, 3], [x, y, z] in LiDAR/DEPTH coordinate
boxes (torch.Tensor): [B, T, 7],
num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz],
(x, y, z) is the bottom center.
Returns:
box_idxs_of_pts (torch.Tensor): (B, M, T), default background = 0.
"""
assert boxes.shape[0] == points.shape[0], \
f'Points and boxes should have the same batch size, ' \
f'got {boxes.shape[0]} and {boxes.shape[0]}'
assert boxes.shape[2] == 7, \
f'boxes dimension should be 7, ' \
f'got unexpected shape {boxes.shape[2]}'
assert points.shape[2] == 3, \
f'points dimension should be 3, ' \
f'got unexpected shape {points.shape[2]}'
batch_size, num_points, _ = points.shape
num_boxes = boxes.shape[1]
box_idxs_of_pts = points.new_zeros((batch_size, num_points, num_boxes),
dtype=torch.int).fill_(0)
# Same reason as line 25-32
points_device = points.get_device()
assert points_device == boxes.get_device(), \
'Points and boxes should be put on the same device'
if torch.cuda.current_device() != points_device:
torch.cuda.set_device(points_device)
roiaware_pool3d_ext.points_in_boxes_all(boxes.contiguous(),
points.contiguous(),
box_idxs_of_pts)
return box_idxs_of_pts
# Copyright (c) OpenMMLab. All rights reserved.
import mmcv
import torch
from torch import nn as nn
from torch.autograd import Function
from . import roiaware_pool3d_ext
class RoIAwarePool3d(nn.Module):
def __init__(self, out_size, max_pts_per_voxel=128, mode='max'):
super().__init__()
"""RoIAwarePool3d module
Args:
out_size (int or tuple): n or [n1, n2, n3]
max_pts_per_voxel (int): m
mode (str): 'max' or 'avg'
"""
self.out_size = out_size
self.max_pts_per_voxel = max_pts_per_voxel
assert mode in ['max', 'avg']
pool_method_map = {'max': 0, 'avg': 1}
self.mode = pool_method_map[mode]
def forward(self, rois, pts, pts_feature):
"""RoIAwarePool3d module forward.
Args:
rois (torch.Tensor): [N, 7],in LiDAR coordinate,
(x, y, z) is the bottom center of rois
pts (torch.Tensor): [npoints, 3]
pts_feature (torch.Tensor): [npoints, C]
Returns:
pooled_features (torch.Tensor): [N, out_x, out_y, out_z, C]
"""
return RoIAwarePool3dFunction.apply(rois, pts, pts_feature,
self.out_size,
self.max_pts_per_voxel, self.mode)
class RoIAwarePool3dFunction(Function):
@staticmethod
def forward(ctx, rois, pts, pts_feature, out_size, max_pts_per_voxel,
mode):
"""RoIAwarePool3d function forward.
Args:
rois (torch.Tensor): [N, 7], in LiDAR coordinate,
(x, y, z) is the bottom center of rois
pts (torch.Tensor): [npoints, 3]
pts_feature (torch.Tensor): [npoints, C]
out_size (int or tuple): n or [n1, n2, n3]
max_pts_per_voxel (int): m
mode (int): 0 (max pool) or 1 (average pool)
Returns:
pooled_features (torch.Tensor): [N, out_x, out_y, out_z, C]
"""
if isinstance(out_size, int):
out_x = out_y = out_z = out_size
else:
assert len(out_size) == 3
assert mmcv.is_tuple_of(out_size, int)
out_x, out_y, out_z = out_size
num_rois = rois.shape[0]
num_channels = pts_feature.shape[-1]
num_pts = pts.shape[0]
pooled_features = pts_feature.new_zeros(
(num_rois, out_x, out_y, out_z, num_channels))
argmax = pts_feature.new_zeros(
(num_rois, out_x, out_y, out_z, num_channels), dtype=torch.int)
pts_idx_of_voxels = pts_feature.new_zeros(
(num_rois, out_x, out_y, out_z, max_pts_per_voxel),
dtype=torch.int)
roiaware_pool3d_ext.forward(rois, pts, pts_feature, argmax,
pts_idx_of_voxels, pooled_features, mode)
ctx.roiaware_pool3d_for_backward = (pts_idx_of_voxels, argmax, mode,
num_pts, num_channels)
return pooled_features
@staticmethod
def backward(ctx, grad_out):
"""RoIAwarePool3d function forward.
Args:
grad_out (torch.Tensor): [N, out_x, out_y, out_z, C]
Returns:
grad_in (torch.Tensor): [npoints, C]
"""
ret = ctx.roiaware_pool3d_for_backward
pts_idx_of_voxels, argmax, mode, num_pts, num_channels = ret
grad_in = grad_out.new_zeros((num_pts, num_channels))
roiaware_pool3d_ext.backward(pts_idx_of_voxels, argmax,
grad_out.contiguous(), grad_in, mode)
return None, None, grad_in, None, None, None
if __name__ == '__main__':
pass
// Modified from
// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
// Written by Shaoshuai Shi
// All Rights Reserved 2019.
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>
#define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
// #define DEBUG
inline void lidar_to_local_coords_cpu(float shift_x, float shift_y, float rz,
float &local_x, float &local_y) {
float cosa = cos(-rz), sina = sin(-rz);
local_x = shift_x * cosa + shift_y * (-sina);
local_y = shift_x * sina + shift_y * cosa;
}
inline int check_pt_in_box3d_cpu(const float *pt, const float *box3d,
float &local_x, float &local_y) {
// param pt: (x, y, z)
// param box3d: (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR coordinate, cz in the
// bottom center
float x = pt[0], y = pt[1], z = pt[2];
float cx = box3d[0], cy = box3d[1], cz = box3d[2];
float x_size = box3d[3], y_size = box3d[4], z_size = box3d[5], rz = box3d[6];
cz += z_size / 2.0; // shift to the center since cz in box3d is the bottom center
if (fabsf(z - cz) > z_size / 2.0) return 0;
lidar_to_local_coords_cpu(x - cx, y - cy, rz, local_x, local_y);
float in_flag = (local_x > -x_size / 2.0) & (local_x < x_size / 2.0) &
(local_y > -y_size / 2.0) & (local_y < y_size / 2.0);
return in_flag;
}
int points_in_boxes_cpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
at::Tensor pts_indices_tensor) {
// params boxes: (N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR coordinate, z is the
// bottom center, each box DO NOT overlaps params pts: (npoints, 3) [x, y, z]
// in LiDAR coordinate params pts_indices: (N, npoints)
CHECK_CONTIGUOUS(boxes_tensor);
CHECK_CONTIGUOUS(pts_tensor);
CHECK_CONTIGUOUS(pts_indices_tensor);
int boxes_num = boxes_tensor.size(0);
int pts_num = pts_tensor.size(0);
const float *boxes = boxes_tensor.data_ptr<float>();
const float *pts = pts_tensor.data_ptr<float>();
int *pts_indices = pts_indices_tensor.data_ptr<int>();
float local_x = 0, local_y = 0;
for (int i = 0; i < boxes_num; i++) {
for (int j = 0; j < pts_num; j++) {
int cur_in_flag =
check_pt_in_box3d_cpu(pts + j * 3, boxes + i * 7, local_x, local_y);
pts_indices[i * pts_num + j] = cur_in_flag;
}
}
return 1;
}
// Modified from
// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
// Written by Shaoshuai Shi
// All Rights Reserved 2019.
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <torch/serialize/tensor.h>
#include <torch/types.h>
#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
#define CHECK_CUDA(x) \
TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
// #define DEBUG
__device__ inline void lidar_to_local_coords(float shift_x, float shift_y,
float rz, float &local_x,
float &local_y) {
float cosa = cos(-rz), sina = sin(-rz);
local_x = shift_x * cosa + shift_y * (-sina);
local_y = shift_x * sina + shift_y * cosa;
}
__device__ inline int check_pt_in_box3d(const float *pt, const float *box3d,
float &local_x, float &local_y) {
// param pt: (x, y, z)
// param box3d: (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR coordinate, cz in the
// bottom center
float x = pt[0], y = pt[1], z = pt[2];
float cx = box3d[0], cy = box3d[1], cz = box3d[2];
float x_size = box3d[3], y_size = box3d[4], z_size = box3d[5], rz = box3d[6];
cz += z_size / 2.0; // shift to the center since cz in box3d is the bottom center
if (fabsf(z - cz) > z_size / 2.0) return 0;
lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y);
float in_flag = (local_x > -x_size / 2.0) & (local_x < x_size / 2.0) &
(local_y > -y_size / 2.0) & (local_y < y_size / 2.0);
return in_flag;
}
__global__ void points_in_boxes_part_kernel(int batch_size, int boxes_num,
int pts_num, const float *boxes,
const float *pts,
int *box_idx_of_points) {
// params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR coordinate, z is
// the bottom center, each box DO NOT overlaps params pts: (B, npoints, 3) [x,
// y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints), default
// -1
int bs_idx = blockIdx.y;
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (bs_idx >= batch_size || pt_idx >= pts_num) return;
boxes += bs_idx * boxes_num * 7;
pts += bs_idx * pts_num * 3 + pt_idx * 3;
box_idx_of_points += bs_idx * pts_num + pt_idx;
float local_x = 0, local_y = 0;
int cur_in_flag = 0;
for (int k = 0; k < boxes_num; k++) {
cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y);
if (cur_in_flag) {
box_idx_of_points[0] = k;
break;
}
}
}
__global__ void points_in_boxes_all_kernel(int batch_size, int boxes_num,
int pts_num, const float *boxes,
const float *pts,
int *box_idx_of_points) {
// params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR coordinate, z is
// the bottom center, each box DO NOT overlaps params pts: (B, npoints, 3) [x,
// y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints), default
// -1
int bs_idx = blockIdx.y;
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (bs_idx >= batch_size || pt_idx >= pts_num) return;
boxes += bs_idx * boxes_num * 7;
pts += bs_idx * pts_num * 3 + pt_idx * 3;
box_idx_of_points += bs_idx * pts_num * boxes_num + pt_idx * boxes_num;
float local_x = 0, local_y = 0;
int cur_in_flag = 0;
for (int k = 0; k < boxes_num; k++) {
cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y);
if (cur_in_flag) {
box_idx_of_points[k] = 1;
}
cur_in_flag = 0;
}
}
void points_in_boxes_part_launcher(int batch_size, int boxes_num, int pts_num,
const float *boxes, const float *pts,
int *box_idx_of_points) {
// params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR coordinate, z is
// the bottom center, each box DO NOT overlaps params pts: (B, npoints, 3) [x,
// y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints), default
// -1
cudaError_t err;
dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), batch_size);
dim3 threads(THREADS_PER_BLOCK);
points_in_boxes_part_kernel<<<blocks, threads>>>(batch_size, boxes_num, pts_num,
boxes, pts, box_idx_of_points);
err = cudaGetLastError();
if (cudaSuccess != err) {
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
exit(-1);
}
#ifdef DEBUG
cudaDeviceSynchronize(); // for using printf in kernel function
#endif
}
void points_in_boxes_all_launcher(int batch_size, int boxes_num, int pts_num,
const float *boxes, const float *pts,
int *box_idx_of_points) {
// params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR coordinate, z is
// the bottom center, each box params pts: (B, npoints, 3) [x, y, z] in
// LiDAR coordinate params boxes_idx_of_points: (B, npoints), default -1
cudaError_t err;
dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), batch_size);
dim3 threads(THREADS_PER_BLOCK);
points_in_boxes_all_kernel<<<blocks, threads>>>(
batch_size, boxes_num, pts_num, boxes, pts, box_idx_of_points);
err = cudaGetLastError();
if (cudaSuccess != err) {
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
exit(-1);
}
#ifdef DEBUG
cudaDeviceSynchronize(); // for using printf in kernel function
#endif
}
int points_in_boxes_part(at::Tensor boxes_tensor, at::Tensor pts_tensor,
at::Tensor box_idx_of_points_tensor) {
// params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR coordinate, z is
// the bottom center, each box DO NOT overlaps params pts: (B, npoints, 3) [x,
// y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints), default
// -1
CHECK_INPUT(boxes_tensor);
CHECK_INPUT(pts_tensor);
CHECK_INPUT(box_idx_of_points_tensor);
int batch_size = boxes_tensor.size(0);
int boxes_num = boxes_tensor.size(1);
int pts_num = pts_tensor.size(1);
const float *boxes = boxes_tensor.data_ptr<float>();
const float *pts = pts_tensor.data_ptr<float>();
int *box_idx_of_points = box_idx_of_points_tensor.data_ptr<int>();
points_in_boxes_part_launcher(batch_size, boxes_num, pts_num, boxes, pts,
box_idx_of_points);
return 1;
}
int points_in_boxes_all(at::Tensor boxes_tensor, at::Tensor pts_tensor,
at::Tensor box_idx_of_points_tensor) {
// params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR coordinate, z is
// the bottom center. params pts: (B, npoints, 3) [x, y, z] in LiDAR
// coordinate params boxes_idx_of_points: (B, npoints), default -1
CHECK_INPUT(boxes_tensor);
CHECK_INPUT(pts_tensor);
CHECK_INPUT(box_idx_of_points_tensor);
int batch_size = boxes_tensor.size(0);
int boxes_num = boxes_tensor.size(1);
int pts_num = pts_tensor.size(1);
const float *boxes = boxes_tensor.data_ptr<float>();
const float *pts = pts_tensor.data_ptr<float>();
int *box_idx_of_points = box_idx_of_points_tensor.data_ptr<int>();
points_in_boxes_all_launcher(batch_size, boxes_num, pts_num, boxes, pts,
box_idx_of_points);
return 1;
}
// Modified from
// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
// Written by Shaoshuai Shi
// All Rights Reserved 2019.
#include <assert.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>
#define CHECK_CUDA(x) \
TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
void roiaware_pool3d_launcher(int boxes_num, int pts_num, int channels,
int max_pts_each_voxel, int out_x, int out_y,
int out_z, const float *rois, const float *pts,
const float *pts_feature, int *argmax,
int *pts_idx_of_voxels, float *pooled_features,
int pool_method);
void roiaware_pool3d_backward_launcher(int boxes_num, int out_x, int out_y,
int out_z, int channels,
int max_pts_each_voxel,
const int *pts_idx_of_voxels,
const int *argmax, const float *grad_out,
float *grad_in, int pool_method);
int roiaware_pool3d_gpu(at::Tensor rois, at::Tensor pts, at::Tensor pts_feature,
at::Tensor argmax, at::Tensor pts_idx_of_voxels,
at::Tensor pooled_features, int pool_method);
int roiaware_pool3d_gpu_backward(at::Tensor pts_idx_of_voxels,
at::Tensor argmax, at::Tensor grad_out,
at::Tensor grad_in, int pool_method);
int points_in_boxes_cpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
at::Tensor pts_indices_tensor);
int points_in_boxes_part(at::Tensor boxes_tensor, at::Tensor pts_tensor,
at::Tensor box_idx_of_points_tensor);
int points_in_boxes_all(at::Tensor boxes_tensor, at::Tensor pts_tensor,
at::Tensor box_idx_of_points_tensor);
int roiaware_pool3d_gpu(at::Tensor rois, at::Tensor pts, at::Tensor pts_feature,
at::Tensor argmax, at::Tensor pts_idx_of_voxels,
at::Tensor pooled_features, int pool_method) {
// params rois: (N, 7) [x, y, z, x_size, y_size, z_size, ry] in LiDAR coordinate
// params pts: (npoints, 3) [x, y, z] in LiDAR coordinate
// params pts_feature: (npoints, C)
// params argmax: (N, out_x, out_y, out_z, C)
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
// params pooled_features: (N, out_x, out_y, out_z, C)
// params pool_method: 0: max_pool 1: avg_pool
CHECK_INPUT(rois);
CHECK_INPUT(pts);
CHECK_INPUT(pts_feature);
CHECK_INPUT(argmax);
CHECK_INPUT(pts_idx_of_voxels);
CHECK_INPUT(pooled_features);
int boxes_num = rois.size(0);
int pts_num = pts.size(0);
int channels = pts_feature.size(1);
int max_pts_each_voxel = pts_idx_of_voxels.size(4); // index 0 is the counter
int out_x = pts_idx_of_voxels.size(1);
int out_y = pts_idx_of_voxels.size(2);
int out_z = pts_idx_of_voxels.size(3);
assert((out_x < 256) && (out_y < 256) &&
(out_z < 256)); // we encode index with 8bit
const float *rois_data = rois.data_ptr<float>();
const float *pts_data = pts.data_ptr<float>();
const float *pts_feature_data = pts_feature.data_ptr<float>();
int *argmax_data = argmax.data_ptr<int>();
int *pts_idx_of_voxels_data = pts_idx_of_voxels.data_ptr<int>();
float *pooled_features_data = pooled_features.data_ptr<float>();
roiaware_pool3d_launcher(
boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z,
rois_data, pts_data, pts_feature_data, argmax_data,
pts_idx_of_voxels_data, pooled_features_data, pool_method);
return 1;
}
int roiaware_pool3d_gpu_backward(at::Tensor pts_idx_of_voxels,
at::Tensor argmax, at::Tensor grad_out,
at::Tensor grad_in, int pool_method) {
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
// params argmax: (N, out_x, out_y, out_z, C)
// params grad_out: (N, out_x, out_y, out_z, C)
// params grad_in: (npoints, C), return value
// params pool_method: 0: max_pool 1: avg_pool
CHECK_INPUT(pts_idx_of_voxels);
CHECK_INPUT(argmax);
CHECK_INPUT(grad_out);
CHECK_INPUT(grad_in);
int boxes_num = pts_idx_of_voxels.size(0);
int out_x = pts_idx_of_voxels.size(1);
int out_y = pts_idx_of_voxels.size(2);
int out_z = pts_idx_of_voxels.size(3);
int max_pts_each_voxel = pts_idx_of_voxels.size(4); // index 0 is the counter
int channels = grad_out.size(4);
const int *pts_idx_of_voxels_data = pts_idx_of_voxels.data_ptr<int>();
const int *argmax_data = argmax.data_ptr<int>();
const float *grad_out_data = grad_out.data_ptr<float>();
float *grad_in_data = grad_in.data_ptr<float>();
roiaware_pool3d_backward_launcher(boxes_num, out_x, out_y, out_z, channels,
max_pts_each_voxel, pts_idx_of_voxels_data,
argmax_data, grad_out_data, grad_in_data,
pool_method);
return 1;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &roiaware_pool3d_gpu, "roiaware pool3d forward (CUDA)");
m.def("backward", &roiaware_pool3d_gpu_backward,
"roiaware pool3d backward (CUDA)");
m.def("points_in_boxes_part", &points_in_boxes_part,
"points_in_boxes_part forward (CUDA)");
m.def("points_in_boxes_all", &points_in_boxes_all,
"points_in_boxes_all forward (CUDA)");
m.def("points_in_boxes_cpu", &points_in_boxes_cpu,
"points_in_boxes_cpu forward (CPU)");
}
// Modified from
// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
// Written by Shaoshuai Shi
// All Rights Reserved 2019.
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <torch/serialize/tensor.h>
#include <torch/types.h>
#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
// #define DEBUG
__device__ inline void lidar_to_local_coords(float shift_x, float shift_y,
float rz, float &local_x,
float &local_y) {
float cosa = cos(-rz), sina = sin(-rz);
local_x = shift_x * cosa + shift_y * (-sina);
local_y = shift_x * sina + shift_y * cosa;
}
__device__ inline int check_pt_in_box3d(const float *pt, const float *box3d,
float &local_x, float &local_y) {
// param pt: (x, y, z)
// param box3d: (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR coordinate, cz in the
// bottom center
float x = pt[0], y = pt[1], z = pt[2];
float cx = box3d[0], cy = box3d[1], cz = box3d[2];
float x_size = box3d[3], y_size = box3d[4], z_size = box3d[5], rz = box3d[6];
cz += z_size / 2.0; // shift to the center since cz in box3d is the bottom center
if (fabsf(z - cz) > z_size / 2.0) return 0;
lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y);
float in_flag = (local_x > -x_size / 2.0) & (local_x < x_size / 2.0) &
(local_y > -y_size / 2.0) & (local_y < y_size / 2.0);
return in_flag;
}
__global__ void generate_pts_mask_for_box3d(int boxes_num, int pts_num,
int out_x, int out_y, int out_z,
const float *rois, const float *pts,
int *pts_mask) {
// params rois: (N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR coordinate
// params pts: (npoints, 3) [x, y, z]
// params pts_mask: (N, npoints): -1 means point does not in this box,
// otherwise: encode (x_idxs, y_idxs, z_idxs) by binary bit
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
int box_idx = blockIdx.y;
if (pt_idx >= pts_num || box_idx >= boxes_num) return;
pts += pt_idx * 3;
rois += box_idx * 7;
pts_mask += box_idx * pts_num + pt_idx;
float local_x = 0, local_y = 0;
int cur_in_flag = check_pt_in_box3d(pts, rois, local_x, local_y);
pts_mask[0] = -1;
if (cur_in_flag > 0) {
float local_z = pts[2] - rois[2];
float x_size = rois[3], y_size = rois[4], z_size = rois[5];
float x_res = x_size / out_x;
float y_res = y_size / out_y;
float z_res = z_size / out_z;
unsigned int x_idx = int((local_x + x_size / 2) / x_res);
unsigned int y_idx = int((local_y + y_size / 2) / y_res);
unsigned int z_idx = int(local_z / z_res);
x_idx = min(max(x_idx, 0), out_x - 1);
y_idx = min(max(y_idx, 0), out_y - 1);
z_idx = min(max(z_idx, 0), out_z - 1);
unsigned int idx_encoding = (x_idx << 16) + (y_idx << 8) + z_idx;
#ifdef DEBUG
printf(
"mask: pts_%d(%.3f, %.3f, %.3f), local(%.3f, %.3f, %.3f), idx(%d, %d, "
"%d), res(%.3f, %.3f, %.3f), idx_encoding=%x\n",
pt_idx, pts[0], pts[1], pts[2], local_x, local_y, local_z, x_idx, y_idx,
z_idx, x_res, y_res, z_res, idx_encoding);
#endif
pts_mask[0] = idx_encoding;
}
}
__global__ void collect_inside_pts_for_box3d(int boxes_num, int pts_num,
int max_pts_each_voxel, int out_x,
int out_y, int out_z,
const int *pts_mask,
int *pts_idx_of_voxels) {
// params pts_mask: (N, npoints) 0 or 1
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
int box_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (box_idx >= boxes_num) return;
int max_num_pts = max_pts_each_voxel - 1; // index 0 is the counter
pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel;
for (int k = 0; k < pts_num; k++) {
if (pts_mask[box_idx * pts_num + k] != -1) {
unsigned int idx_encoding = pts_mask[box_idx * pts_num + k];
unsigned int x_idx = (idx_encoding >> 16) & 0xFF;
unsigned int y_idx = (idx_encoding >> 8) & 0xFF;
unsigned int z_idx = idx_encoding & 0xFF;
unsigned int base_offset = x_idx * out_y * out_z * max_pts_each_voxel +
y_idx * out_z * max_pts_each_voxel +
z_idx * max_pts_each_voxel;
unsigned int cnt = pts_idx_of_voxels[base_offset];
if (cnt < max_num_pts) {
pts_idx_of_voxels[base_offset + cnt + 1] = k;
pts_idx_of_voxels[base_offset]++;
}
#ifdef DEBUG
printf("collect: pts_%d, idx(%d, %d, %d), idx_encoding=%x\n", k, x_idx,
y_idx, z_idx, idx_encoding);
#endif
}
}
}
__global__ void roiaware_maxpool3d(int boxes_num, int pts_num, int channels,
int max_pts_each_voxel, int out_x, int out_y,
int out_z, const float *pts_feature,
const int *pts_idx_of_voxels,
float *pooled_features, int *argmax) {
// params pts_feature: (npoints, C)
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel),
// index 0 is the counter params pooled_features: (N, out_x, out_y, out_z, C)
// params argmax: (N, out_x, out_y, out_z, C)
int box_idx = blockIdx.z;
int channel_idx = blockIdx.y;
int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
int x_idx = voxel_idx_flat / (out_y * out_z);
int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
int z_idx = voxel_idx_flat % out_z;
if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
y_idx >= out_y || z_idx >= out_z)
return;
#ifdef DEBUG
printf("src pts_idx_of_voxels: (%p, ), argmax: %p\n", pts_idx_of_voxels,
argmax);
#endif
int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
offset_base * max_pts_each_voxel;
pooled_features += box_idx * out_x * out_y * out_z * channels +
offset_base * channels + channel_idx;
argmax += box_idx * out_x * out_y * out_z * channels +
offset_base * channels + channel_idx;
int argmax_idx = -1;
float max_val = -1e50;
int total_pts = pts_idx_of_voxels[0];
for (int k = 1; k <= total_pts; k++) {
if (pts_feature[pts_idx_of_voxels[k] * channels + channel_idx] > max_val) {
max_val = pts_feature[pts_idx_of_voxels[k] * channels + channel_idx];
argmax_idx = pts_idx_of_voxels[k];
}
}
if (argmax_idx != -1) {
pooled_features[0] = max_val;
}
argmax[0] = argmax_idx;
#ifdef DEBUG
printf(
"channel_%d idx(%d, %d, %d), argmax_idx=(%d, %.3f), total=%d, after "
"pts_idx: %p, argmax: (%p, %d)\n",
channel_idx, x_idx, y_idx, z_idx, argmax_idx, max_val, total_pts,
pts_idx_of_voxels, argmax, argmax_idx);
#endif
}
__global__ void roiaware_avgpool3d(int boxes_num, int pts_num, int channels,
int max_pts_each_voxel, int out_x, int out_y,
int out_z, const float *pts_feature,
const int *pts_idx_of_voxels,
float *pooled_features) {
// params pts_feature: (npoints, C)
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel),
// index 0 is the counter params pooled_features: (N, out_x, out_y, out_z, C)
// params argmax: (N, out_x, out_y, out_z, C)
int box_idx = blockIdx.z;
int channel_idx = blockIdx.y;
int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
int x_idx = voxel_idx_flat / (out_y * out_z);
int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
int z_idx = voxel_idx_flat % out_z;
if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
y_idx >= out_y || z_idx >= out_z)
return;
int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
offset_base * max_pts_each_voxel;
pooled_features += box_idx * out_x * out_y * out_z * channels +
offset_base * channels + channel_idx;
float sum_val = 0;
int total_pts = pts_idx_of_voxels[0];
for (int k = 1; k <= total_pts; k++) {
sum_val += pts_feature[pts_idx_of_voxels[k] * channels + channel_idx];
}
if (total_pts > 0) {
pooled_features[0] = sum_val / total_pts;
}
}
void roiaware_pool3d_launcher(int boxes_num, int pts_num, int channels,
int max_pts_each_voxel, int out_x, int out_y,
int out_z, const float *rois, const float *pts,
const float *pts_feature, int *argmax,
int *pts_idx_of_voxels, float *pooled_features,
int pool_method) {
// params rois: (N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR coordinate
// params pts: (npoints, 3) [x, y, z] in LiDAR coordinate
// params pts_feature: (npoints, C)
// params argmax: (N, out_x, out_y, out_z, C)
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
// params pooled_features: (N, out_x, out_y, out_z, C)
// params pool_method: 0: max_pool 1: avg_pool
int *pts_mask = NULL;
cudaMalloc(&pts_mask, boxes_num * pts_num * sizeof(int)); // (N, M)
cudaMemset(pts_mask, -1, boxes_num * pts_num * sizeof(int));
dim3 blocks_mask(DIVUP(pts_num, THREADS_PER_BLOCK), boxes_num);
dim3 threads(THREADS_PER_BLOCK);
generate_pts_mask_for_box3d<<<blocks_mask, threads>>>(
boxes_num, pts_num, out_x, out_y, out_z, rois, pts, pts_mask);
// TODO: Merge the collect and pool functions, SS
dim3 blocks_collect(DIVUP(boxes_num, THREADS_PER_BLOCK));
collect_inside_pts_for_box3d<<<blocks_collect, threads>>>(
boxes_num, pts_num, max_pts_each_voxel, out_x, out_y, out_z, pts_mask,
pts_idx_of_voxels);
dim3 blocks_pool(DIVUP(out_x * out_y * out_z, THREADS_PER_BLOCK), channels,
boxes_num);
if (pool_method == 0) {
roiaware_maxpool3d<<<blocks_pool, threads>>>(
boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z,
pts_feature, pts_idx_of_voxels, pooled_features, argmax);
} else if (pool_method == 1) {
roiaware_avgpool3d<<<blocks_pool, threads>>>(
boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z,
pts_feature, pts_idx_of_voxels, pooled_features);
}
cudaFree(pts_mask);
#ifdef DEBUG
cudaDeviceSynchronize(); // for using printf in kernel function
#endif
}
__global__ void roiaware_maxpool3d_backward(int boxes_num, int channels,
int out_x, int out_y, int out_z,
const int *argmax,
const float *grad_out,
float *grad_in) {
// params argmax: (N, out_x, out_y, out_z, C)
// params grad_out: (N, out_x, out_y, out_z, C)
// params grad_in: (npoints, C), return value
int box_idx = blockIdx.z;
int channel_idx = blockIdx.y;
int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
int x_idx = voxel_idx_flat / (out_y * out_z);
int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
int z_idx = voxel_idx_flat % out_z;
if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
y_idx >= out_y || z_idx >= out_z)
return;
int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
argmax += box_idx * out_x * out_y * out_z * channels +
offset_base * channels + channel_idx;
grad_out += box_idx * out_x * out_y * out_z * channels +
offset_base * channels + channel_idx;
if (argmax[0] == -1) return;
atomicAdd(grad_in + argmax[0] * channels + channel_idx, grad_out[0] * 1);
}
__global__ void roiaware_avgpool3d_backward(int boxes_num, int channels,
int out_x, int out_y, int out_z,
int max_pts_each_voxel,
const int *pts_idx_of_voxels,
const float *grad_out,
float *grad_in) {
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
// params grad_out: (N, out_x, out_y, out_z, C)
// params grad_in: (npoints, C), return value
int box_idx = blockIdx.z;
int channel_idx = blockIdx.y;
int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
int x_idx = voxel_idx_flat / (out_y * out_z);
int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
int z_idx = voxel_idx_flat % out_z;
if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
y_idx >= out_y || z_idx >= out_z)
return;
int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
offset_base * max_pts_each_voxel;
grad_out += box_idx * out_x * out_y * out_z * channels +
offset_base * channels + channel_idx;
int total_pts = pts_idx_of_voxels[0];
float cur_grad = 1 / fmaxf(float(total_pts), 1.0);
for (int k = 1; k <= total_pts; k++) {
atomicAdd(grad_in + pts_idx_of_voxels[k] * channels + channel_idx,
grad_out[0] * cur_grad);
}
}
void roiaware_pool3d_backward_launcher(int boxes_num, int out_x, int out_y,
int out_z, int channels,
int max_pts_each_voxel,
const int *pts_idx_of_voxels,
const int *argmax, const float *grad_out,
float *grad_in, int pool_method) {
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
// params argmax: (N, out_x, out_y, out_z, C)
// params grad_out: (N, out_x, out_y, out_z, C)
// params grad_in: (npoints, C), return value
// params pool_method: 0: max_pool, 1: avg_pool
dim3 blocks(DIVUP(out_x * out_y * out_z, THREADS_PER_BLOCK), channels,
boxes_num);
dim3 threads(THREADS_PER_BLOCK);
if (pool_method == 0) {
roiaware_maxpool3d_backward<<<blocks, threads>>>(
boxes_num, channels, out_x, out_y, out_z, argmax, grad_out, grad_in);
} else if (pool_method == 1) {
roiaware_avgpool3d_backward<<<blocks, threads>>>(
boxes_num, channels, out_x, out_y, out_z, max_pts_each_voxel,
pts_idx_of_voxels, grad_out, grad_in);
}
}
# Copyright (c) OpenMMLab. All rights reserved.
from .roipoint_pool3d import RoIPointPool3d
__all__ = ['RoIPointPool3d']
# Copyright (c) OpenMMLab. All rights reserved.
from torch import nn as nn
from torch.autograd import Function
from . import roipoint_pool3d_ext
class RoIPointPool3d(nn.Module):
def __init__(self, num_sampled_points=512):
super().__init__()
"""
Args:
num_sampled_points (int): Number of samples in each roi
"""
self.num_sampled_points = num_sampled_points
def forward(self, points, point_features, boxes3d):
"""
Args:
points (torch.Tensor): Input points whose shape is BxNx3
point_features: (B, N, C)
boxes3d: (B, M, 7), [x, y, z, dx, dy, dz, heading]
Returns:
torch.Tensor: (B, M, 512, 3 + C) pooled_features
torch.Tensor: (B, M) pooled_empty_flag
"""
return RoIPointPool3dFunction.apply(points, point_features, boxes3d,
self.num_sampled_points)
class RoIPointPool3dFunction(Function):
@staticmethod
def forward(ctx, points, point_features, boxes3d, num_sampled_points=512):
"""
Args:
points (torch.Tensor): Input points whose shape is (B, N, 3)
point_features (torch.Tensor): Input points features shape is \
(B, N, C)
boxes3d (torch.Tensor): Input bounding boxes whose shape is \
(B, M, 7)
num_sampled_points (int): the num of sampled points
Returns:
torch.Tensor: (B, M, 512, 3 + C) pooled_features
torch.Tensor: (B, M) pooled_empty_flag
"""
assert points.shape.__len__() == 3 and points.shape[2] == 3
batch_size, boxes_num, feature_len = points.shape[0], boxes3d.shape[
1], point_features.shape[2]
pooled_boxes3d = boxes3d.view(batch_size, -1, 7)
pooled_features = point_features.new_zeros(
(batch_size, boxes_num, num_sampled_points, 3 + feature_len))
pooled_empty_flag = point_features.new_zeros(
(batch_size, boxes_num)).int()
roipoint_pool3d_ext.forward(points.contiguous(),
pooled_boxes3d.contiguous(),
point_features.contiguous(),
pooled_features, pooled_empty_flag)
return pooled_features, pooled_empty_flag
@staticmethod
def backward(ctx, grad_out):
raise NotImplementedError
if __name__ == '__main__':
pass
/*
Modified for
https://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/roipoint_pool3d/src/roipoint_pool3d_kernel.cu
Point cloud feature pooling
Written by Shaoshuai Shi
All Rights Reserved 2018.
*/
#include <torch/serialize/tensor.h>
#include <torch/extension.h>
#define CHECK_CUDA(x) do { \
if (!x.type().is_cuda()) { \
fprintf(stderr, "%s must be CUDA tensor at %s:%d\n", #x, __FILE__, __LINE__); \
exit(-1); \
} \
} while (0)
#define CHECK_CONTIGUOUS(x) do { \
if (!x.is_contiguous()) { \
fprintf(stderr, "%s must be contiguous tensor at %s:%d\n", #x, __FILE__, __LINE__); \
exit(-1); \
} \
} while (0)
#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
void roipool3dLauncher(int batch_size, int pts_num, int boxes_num, int feature_in_len, int sampled_pts_num,
const float *xyz, const float *boxes3d, const float *pts_feature, float *pooled_features, int *pooled_empty_flag);
int roipool3d_gpu(at::Tensor xyz, at::Tensor boxes3d, at::Tensor pts_feature, at::Tensor pooled_features, at::Tensor pooled_empty_flag){
// params xyz: (B, N, 3)
// params boxes3d: (B, M, 7)
// params pts_feature: (B, N, C)
// params pooled_features: (B, M, 512, 3+C)
// params pooled_empty_flag: (B, M)
CHECK_INPUT(xyz);
CHECK_INPUT(boxes3d);
CHECK_INPUT(pts_feature);
CHECK_INPUT(pooled_features);
CHECK_INPUT(pooled_empty_flag);
int batch_size = xyz.size(0);
int pts_num = xyz.size(1);
int boxes_num = boxes3d.size(1);
int feature_in_len = pts_feature.size(2);
int sampled_pts_num = pooled_features.size(2);
const float * xyz_data = xyz.data<float>();
const float * boxes3d_data = boxes3d.data<float>();
const float * pts_feature_data = pts_feature.data<float>();
float * pooled_features_data = pooled_features.data<float>();
int * pooled_empty_flag_data = pooled_empty_flag.data<int>();
roipool3dLauncher(batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num,
xyz_data, boxes3d_data, pts_feature_data, pooled_features_data, pooled_empty_flag_data);
return 1;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &roipool3d_gpu, "roipool3d forward (CUDA)");
}
/*
Modified from
https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roipoint_pool3d/src/roipoint_pool3d_kernel.cu
Point cloud feature pooling
Written by Shaoshuai Shi
All Rights Reserved 2018.
*/
#include <math.h>
#include <stdio.h>
#define THREADS_PER_BLOCK 256
#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
// #define DEBUG
__device__ inline void lidar_to_local_coords(float shift_x, float shift_y,
float rz, float &local_x,
float &local_y) {
float cosa = cos(-rz), sina = sin(-rz);
local_x = shift_x * cosa + shift_y * (-sina);
local_y = shift_x * sina + shift_y * cosa;
}
__device__ inline int check_pt_in_box3d(const float *pt, const float *box3d,
float &local_x, float &local_y) {
// param pt: (x, y, z)
// param box3d: (cx, cy, cz, dx, dy, dz, rz) in LiDAR coordinate, cz in the
// bottom center
float x = pt[0], y = pt[1], z = pt[2];
float cx = box3d[0], cy = box3d[1], cz = box3d[2];
float dx = box3d[3], dy = box3d[4], dz = box3d[5], rz = box3d[6];
cz += dz / 2.0; // shift to the center since cz in box3d is the bottom center
if (fabsf(z - cz) > dz / 2.0) return 0;
lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y);
float in_flag = (local_x > -dx / 2.0) & (local_x < dx / 2.0) &
(local_y > -dy / 2.0) & (local_y < dy / 2.0);
return in_flag;
}
__global__ void assign_pts_to_box3d(int batch_size, int pts_num, int boxes_num, const float *xyz, const float *boxes3d, int *pts_assign){
// params xyz: (B, N, 3)
// params boxes3d: (B, M, 7)
// params pts_assign: (B, N, M): idx of the corresponding box3d, -1 means background points
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
int box_idx = blockIdx.y;
int bs_idx = blockIdx.z;
if (pt_idx >= pts_num || box_idx >= boxes_num || bs_idx >= batch_size){
return;
}
int assign_idx = bs_idx * pts_num * boxes_num + pt_idx * boxes_num + box_idx;
pts_assign[assign_idx] = 0;
int box_offset = bs_idx * boxes_num * 7 + box_idx * 7;
int pt_offset = bs_idx * pts_num * 3 + pt_idx * 3;
float local_x = 0, local_y = 0;
int cur_in_flag = check_pt_in_box3d(xyz + pt_offset, boxes3d + box_offset, local_x, local_y);
pts_assign[assign_idx] = cur_in_flag;
// printf("bs=%d, pt=%d, in=%d\n", bs_idx, pt_idx, pts_assign[bs_idx * pts_num + pt_idx]);
}
__global__ void get_pooled_idx(int batch_size, int pts_num, int boxes_num, int sampled_pts_num,
const int *pts_assign, int *pts_idx, int *pooled_empty_flag){
// params xyz: (B, N, 3)
// params pts_feature: (B, N, C)
// params pts_assign: (B, N)
// params pts_idx: (B, M, 512)
// params pooled_empty_flag: (B, M)
int boxes_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (boxes_idx >= boxes_num){
return;
}
int bs_idx = blockIdx.y;
int cnt = 0;
for (int k = 0; k < pts_num; k++){
if (pts_assign[bs_idx * pts_num * boxes_num + k * boxes_num + boxes_idx]){
if (cnt < sampled_pts_num){
pts_idx[bs_idx * boxes_num * sampled_pts_num + boxes_idx * sampled_pts_num + cnt] = k;
cnt++;
}
else break;
}
}
if (cnt == 0){
pooled_empty_flag[bs_idx * boxes_num + boxes_idx] = 1;
}
else if (cnt < sampled_pts_num){
// duplicate same points for sampling
for (int k = cnt; k < sampled_pts_num; k++){
int duplicate_idx = k % cnt;
int base_offset = bs_idx * boxes_num * sampled_pts_num + boxes_idx * sampled_pts_num;
pts_idx[base_offset + k] = pts_idx[base_offset + duplicate_idx];
}
}
}
__global__ void roipool3d_forward(int batch_size, int pts_num, int boxes_num, int feature_in_len, int sampled_pts_num,
const float *xyz, const int *pts_idx, const float *pts_feature,
float *pooled_features, int *pooled_empty_flag){
// params xyz: (B, N, 3)
// params pts_idx: (B, M, 512)
// params pts_feature: (B, N, C)
// params pooled_features: (B, M, 512, 3+C)
// params pooled_empty_flag: (B, M)
int sample_pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
int box_idx = blockIdx.y;
int bs_idx = blockIdx.z;
if (sample_pt_idx >= sampled_pts_num || box_idx >= boxes_num || bs_idx >= batch_size){
return;
}
if (pooled_empty_flag[bs_idx * boxes_num + box_idx]){
return;
}
int temp_idx = bs_idx * boxes_num * sampled_pts_num + box_idx * sampled_pts_num + sample_pt_idx;
int src_pt_idx = pts_idx[temp_idx];
int dst_feature_offset = temp_idx * (3 + feature_in_len);
for (int j = 0; j < 3; j++)
pooled_features[dst_feature_offset + j] = xyz[bs_idx * pts_num * 3 + src_pt_idx * 3 + j];
int src_feature_offset = bs_idx * pts_num * feature_in_len + src_pt_idx * feature_in_len;
for (int j = 0; j < feature_in_len; j++)
pooled_features[dst_feature_offset + 3 + j] = pts_feature[src_feature_offset + j];
}
void roipool3dLauncher(int batch_size, int pts_num, int boxes_num, int feature_in_len, int sampled_pts_num,
const float *xyz, const float *boxes3d, const float *pts_feature, float *pooled_features, int *pooled_empty_flag){
// printf("batch_size=%d, pts_num=%d, boxes_num=%d\n", batch_size, pts_num, boxes_num);
int *pts_assign = NULL;
cudaMalloc(&pts_assign, batch_size * pts_num * boxes_num * sizeof(int)); // (batch_size, N, M)
// cudaMemset(&pts_assign, -1, batch_size * pts_num * boxes_num * sizeof(int));
dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), boxes_num, batch_size); // blockIdx.x(col), blockIdx.y(row)
dim3 threads(THREADS_PER_BLOCK);
assign_pts_to_box3d<<<blocks, threads>>>(batch_size, pts_num, boxes_num, xyz, boxes3d, pts_assign);
int *pts_idx = NULL;
cudaMalloc(&pts_idx, batch_size * boxes_num * sampled_pts_num * sizeof(int)); // (batch_size, M, sampled_pts_num)
dim3 blocks2(DIVUP(boxes_num, THREADS_PER_BLOCK), batch_size); // blockIdx.x(col), blockIdx.y(row)
get_pooled_idx<<<blocks2, threads>>>(batch_size, pts_num, boxes_num, sampled_pts_num, pts_assign, pts_idx, pooled_empty_flag);
dim3 blocks_pool(DIVUP(sampled_pts_num, THREADS_PER_BLOCK), boxes_num, batch_size);
roipool3d_forward<<<blocks_pool, threads>>>(batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num,
xyz, pts_idx, pts_feature, pooled_features, pooled_empty_flag);
cudaFree(pts_assign);
cudaFree(pts_idx);
#ifdef DEBUG
cudaDeviceSynchronize(); // for using printf in kernel function
#endif
}
# Copyright (c) OpenMMLab. All rights reserved.
from mmcv.cnn import build_conv_layer, build_norm_layer
from mmcv.ops import SparseModule, SparseSequential
from torch import nn
from mmdet3d.ops import spconv
from mmdet.models.backbones.resnet import BasicBlock, Bottleneck
class SparseBottleneck(Bottleneck, spconv.SparseModule):
class SparseBottleneck(Bottleneck, SparseModule):
"""Sparse bottleneck block for PartA^2.
Bottleneck block implemented with submanifold sparse convolution.
......@@ -32,7 +32,7 @@ class SparseBottleneck(Bottleneck, spconv.SparseModule):
conv_cfg=None,
norm_cfg=None):
spconv.SparseModule.__init__(self)
SparseModule.__init__(self)
Bottleneck.__init__(
self,
inplanes,
......@@ -65,7 +65,7 @@ class SparseBottleneck(Bottleneck, spconv.SparseModule):
return out
class SparseBasicBlock(BasicBlock, spconv.SparseModule):
class SparseBasicBlock(BasicBlock, SparseModule):
"""Sparse basic block for PartA^2.
Sparse basic block implemented with submanifold sparse convolution.
......@@ -90,7 +90,7 @@ class SparseBasicBlock(BasicBlock, spconv.SparseModule):
downsample=None,
conv_cfg=None,
norm_cfg=None):
spconv.SparseModule.__init__(self)
SparseModule.__init__(self)
BasicBlock.__init__(
self,
inplanes,
......@@ -182,5 +182,5 @@ def make_sparse_convmodule(in_channels,
elif layer == 'act':
layers.append(nn.ReLU(inplace=True))
layers = spconv.SparseSequential(*layers)
layers = SparseSequential(*layers)
return layers
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .conv import (SparseConv2d, SparseConv3d, SparseConvTranspose2d,
SparseConvTranspose3d, SparseInverseConv2d,
SparseInverseConv3d, SubMConv2d, SubMConv3d)
from .modules import SparseModule, SparseSequential
from .pool import SparseMaxPool2d, SparseMaxPool3d
from .structure import SparseConvTensor, scatter_nd
__all__ = [
'SparseConv2d',
'SparseConv3d',
'SubMConv2d',
'SubMConv3d',
'SparseConvTranspose2d',
'SparseConvTranspose3d',
'SparseInverseConv2d',
'SparseInverseConv3d',
'SparseModule',
'SparseSequential',
'SparseMaxPool2d',
'SparseMaxPool3d',
'SparseConvTensor',
'scatter_nd',
]
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import numpy as np
import torch
from mmcv.cnn import CONV_LAYERS
from torch.nn import init
from torch.nn.parameter import Parameter
from . import functional as Fsp
from . import ops
from .modules import SparseModule
from .structure import SparseConvTensor
def _calculate_fan_in_and_fan_out_hwio(tensor):
dimensions = tensor.ndimension()
if dimensions < 2:
raise ValueError('fan in and fan out can not be computed for tensor'
'with fewer than 2 dimensions')
if dimensions == 2: # Linear
fan_in = tensor.size(-2)
fan_out = tensor.size(-1)
else:
num_input_fmaps = tensor.size(-2)
num_output_fmaps = tensor.size(-1)
receptive_field_size = 1
if tensor.dim() > 2:
receptive_field_size = tensor[..., 0, 0].numel()
fan_in = num_input_fmaps * receptive_field_size
fan_out = num_output_fmaps * receptive_field_size
return fan_in, fan_out
class SparseConvolution(SparseModule):
def __init__(self,
ndim,
in_channels,
out_channels,
kernel_size=3,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
subm=False,
output_padding=0,
transposed=False,
inverse=False,
indice_key=None,
fused_bn=False):
super(SparseConvolution, self).__init__()
assert groups == 1
if not isinstance(kernel_size, (list, tuple)):
kernel_size = [kernel_size] * ndim
if not isinstance(stride, (list, tuple)):
stride = [stride] * ndim
if not isinstance(padding, (list, tuple)):
padding = [padding] * ndim
if not isinstance(dilation, (list, tuple)):
dilation = [dilation] * ndim
if not isinstance(output_padding, (list, tuple)):
output_padding = [output_padding] * ndim
for d, s in zip(dilation, stride):
assert any([s == 1, d == 1]), "don't support this."
self.ndim = ndim
self.in_channels = in_channels
self.out_channels = out_channels
self.kernel_size = kernel_size
self.conv1x1 = np.prod(kernel_size) == 1
self.stride = stride
self.padding = padding
self.dilation = dilation
self.transposed = transposed
self.inverse = inverse
self.output_padding = output_padding
self.groups = groups
self.subm = subm
self.indice_key = indice_key
self.fused_bn = fused_bn
self.weight = Parameter(
torch.Tensor(*kernel_size, in_channels, out_channels))
if bias:
self.bias = Parameter(torch.Tensor(out_channels))
else:
self.register_parameter('bias', None)
self.reset_parameters()
def reset_parameters(self):
init.kaiming_uniform_(self.weight, a=math.sqrt(5))
if self.bias is not None:
fan_in, _ = _calculate_fan_in_and_fan_out_hwio(self.weight)
bound = 1 / math.sqrt(fan_in)
init.uniform_(self.bias, -bound, bound)
def forward(self, input):
assert isinstance(input, SparseConvTensor)
features = input.features
device = features.device
indices = input.indices
spatial_shape = input.spatial_shape
batch_size = input.batch_size
if not self.subm:
if self.transposed:
out_spatial_shape = ops.get_deconv_output_size(
spatial_shape, self.kernel_size, self.stride, self.padding,
self.dilation, self.output_padding)
else:
out_spatial_shape = ops.get_conv_output_size(
spatial_shape, self.kernel_size, self.stride, self.padding,
self.dilation)
else:
out_spatial_shape = spatial_shape
# input.update_grid(out_spatial_shape)
# t = time.time()
if self.conv1x1:
features = torch.mm(
input.features,
self.weight.view(self.in_channels, self.out_channels))
if self.bias is not None:
features += self.bias
out_tensor = SparseConvTensor(features, input.indices,
input.spatial_shape,
input.batch_size)
out_tensor.indice_dict = input.indice_dict
out_tensor.grid = input.grid
return out_tensor
data = input.find_indice_pair(self.indice_key)
if self.inverse:
assert data is not None and self.indice_key is not None
_, outids, indice_pairs, indice_pair_num, out_spatial_shape = data
assert indice_pairs.shape[0] == np.prod(
self.kernel_size
), 'inverse conv must have same kernel size as its couple conv'
else:
if self.indice_key is not None and data is not None:
outids, _, indice_pairs, indice_pair_num, _ = data
else:
outids, indice_pairs, indice_pair_num = ops.get_indice_pairs(
indices,
batch_size,
spatial_shape,
self.kernel_size,
self.stride,
self.padding,
self.dilation,
self.output_padding,
self.subm,
self.transposed,
grid=input.grid)
input.indice_dict[self.indice_key] = (outids, indices,
indice_pairs,
indice_pair_num,
spatial_shape)
if self.fused_bn:
assert self.bias is not None
out_features = ops.fused_indice_conv(features, self.weight,
self.bias,
indice_pairs.to(device),
indice_pair_num,
outids.shape[0], self.inverse,
self.subm)
else:
if self.subm:
out_features = Fsp.indice_subm_conv(features, self.weight,
indice_pairs.to(device),
indice_pair_num,
outids.shape[0])
else:
if self.inverse:
out_features = Fsp.indice_inverse_conv(
features, self.weight, indice_pairs.to(device),
indice_pair_num, outids.shape[0])
else:
out_features = Fsp.indice_conv(features, self.weight,
indice_pairs.to(device),
indice_pair_num,
outids.shape[0])
if self.bias is not None:
out_features += self.bias
out_tensor = SparseConvTensor(out_features, outids, out_spatial_shape,
batch_size)
out_tensor.indice_dict = input.indice_dict
out_tensor.grid = input.grid
return out_tensor
@CONV_LAYERS.register_module(force=True)
class SparseConv2d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None):
super(SparseConv2d, self).__init__(
2,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
indice_key=indice_key)
@CONV_LAYERS.register_module(force=True)
class SparseConv3d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None):
super(SparseConv3d, self).__init__(
3,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
indice_key=indice_key)
@CONV_LAYERS.register_module(force=True)
class SparseConv4d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None):
super(SparseConv4d, self).__init__(
4,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
indice_key=indice_key)
@CONV_LAYERS.register_module(force=True)
class SparseConvTranspose2d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None):
super(SparseConvTranspose2d, self).__init__(
2,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
transposed=True,
indice_key=indice_key)
@CONV_LAYERS.register_module(force=True)
class SparseConvTranspose3d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None):
super(SparseConvTranspose3d, self).__init__(
3,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
transposed=True,
indice_key=indice_key)
@CONV_LAYERS.register_module(force=True)
class SparseInverseConv2d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
indice_key,
bias=True):
super(SparseInverseConv2d, self).__init__(
2,
in_channels,
out_channels,
kernel_size,
bias=bias,
inverse=True,
indice_key=indice_key)
@CONV_LAYERS.register_module(force=True)
class SparseInverseConv3d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
indice_key,
bias=True):
super(SparseInverseConv3d, self).__init__(
3,
in_channels,
out_channels,
kernel_size,
bias=bias,
inverse=True,
indice_key=indice_key)
@CONV_LAYERS.register_module(force=True)
class SubMConv2d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None):
super(SubMConv2d, self).__init__(
2,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
True,
indice_key=indice_key)
@CONV_LAYERS.register_module(force=True)
class SubMConv3d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None):
super(SubMConv3d, self).__init__(
3,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
True,
indice_key=indice_key)
@CONV_LAYERS.register_module(force=True)
class SubMConv4d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None):
super(SubMConv4d, self).__init__(
4,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
True,
indice_key=indice_key)
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from torch.autograd import Function
from . import ops as ops
class SparseConvFunction(Function):
@staticmethod
def forward(ctx, features, filters, indice_pairs, indice_pair_num,
num_activate_out):
ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)
return ops.indice_conv(features, filters, indice_pairs,
indice_pair_num, num_activate_out, False)
@staticmethod
def backward(ctx, grad_output):
indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
input_bp, filters_bp = ops.indice_conv_backward(
features, filters, grad_output, indice_pairs, indice_pair_num,
False)
return input_bp, filters_bp, None, None, None
class SparseInverseConvFunction(Function):
@staticmethod
def forward(ctx, features, filters, indice_pairs, indice_pair_num,
num_activate_out):
ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)
return ops.indice_conv(features, filters, indice_pairs,
indice_pair_num, num_activate_out, True, False)
@staticmethod
def backward(ctx, grad_output):
indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
input_bp, filters_bp = ops.indice_conv_backward(
features, filters, grad_output, indice_pairs, indice_pair_num,
True, False)
return input_bp, filters_bp, None, None, None
class SubMConvFunction(Function):
@staticmethod
def forward(ctx, features, filters, indice_pairs, indice_pair_num,
num_activate_out):
ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)
return ops.indice_conv(features, filters, indice_pairs,
indice_pair_num, num_activate_out, False, True)
@staticmethod
def backward(ctx, grad_output):
indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
input_bp, filters_bp = ops.indice_conv_backward(
features, filters, grad_output, indice_pairs, indice_pair_num,
False, True)
return input_bp, filters_bp, None, None, None
class SparseMaxPoolFunction(Function):
@staticmethod
def forward(ctx, features, indice_pairs, indice_pair_num,
num_activate_out):
out = ops.indice_maxpool(features, indice_pairs, indice_pair_num,
num_activate_out)
ctx.save_for_backward(indice_pairs, indice_pair_num, features, out)
return out
@staticmethod
def backward(ctx, grad_output):
indice_pairs, indice_pair_num, features, out = ctx.saved_tensors
input_bp = ops.indice_maxpool_backward(features, out, grad_output,
indice_pairs, indice_pair_num)
return input_bp, None, None, None
indice_conv = SparseConvFunction.apply
indice_inverse_conv = SparseInverseConvFunction.apply
indice_subm_conv = SubMConvFunction.apply
indice_maxpool = SparseMaxPoolFunction.apply
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef PARAMS_GRID_H_
#define PARAMS_GRID_H_
#include <tuple>
#include <vector>
namespace detail {
template <class T>
int getTotalSize(std::vector<T> arg) {
return arg.size();
}
template <class T, class... TArgs>
int getTotalSize(std::vector<T> arg, std::vector<TArgs>... args) {
return arg.size() * getTotalSize(args...);
}
template <typename T>
int getSize(std::vector<T> arg) {
return arg.size();
}
template <int Idx, class TT, class T>
void assigner(TT &src, std::vector<int> counter, std::vector<T> &arg) {
std::get<Idx>(src) = arg[counter[Idx]];
}
template <int Idx, class TT, class T, class... TArgs>
void assigner(TT &src, std::vector<int> counter, std::vector<T> &arg,
std::vector<TArgs> &... args) {
std::get<Idx>(src) = arg[counter[Idx]];
assigner<Idx + 1>(src, counter, args...);
}
} // namespace detail
template <class... TArgs>
std::vector<std::tuple<TArgs...>> paramsGrid(std::vector<TArgs>... args) {
int length = detail::getTotalSize(args...);
std::vector<int> sizes = {detail::getSize(args)...};
int size = sizes.size();
std::vector<std::tuple<TArgs...>> params(length);
std::vector<int> counter(size);
for (int i = 0; i < length; ++i) {
detail::assigner<0>(params[i], counter, args...);
counter[size - 1] += 1;
for (int c = size - 1; c >= 0; --c) {
if (counter[c] == sizes[c] && c > 0) {
counter[c - 1] += 1;
counter[c] = 0;
}
}
}
return params;
}
#endif
// Copyright Louis Delacroix 2010 - 2014.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
//
// A pretty printing library for C++
//
// Usage:
// Include this header, and operator<< will "just work".
#ifndef H_PRETTY_PRINT
#define H_PRETTY_PRINT
#include <cstddef>
#include <iterator>
#include <memory>
#include <ostream>
#include <set>
#include <tuple>
#include <type_traits>
#include <unordered_set>
#include <utility>
#include <valarray>
namespace pretty_print {
namespace detail {
// SFINAE type trait to detect whether T::const_iterator exists.
struct sfinae_base {
using yes = char;
using no = yes[2];
};
template <typename T>
struct has_const_iterator : private sfinae_base {
private:
template <typename C>
static yes &test(typename C::const_iterator *);
template <typename C>
static no &test(...);
public:
static const bool value = sizeof(test<T>(nullptr)) == sizeof(yes);
using type = T;
};
template <typename T>
struct has_begin_end : private sfinae_base {
private:
template <typename C>
static yes &
f(typename std::enable_if<
std::is_same<decltype(static_cast<typename C::const_iterator (C::*)()
const>(&C::begin)),
typename C::const_iterator (C::*)() const>::value>::type *);
template <typename C>
static no &f(...);
template <typename C>
static yes &g(typename std::enable_if<
std::is_same<decltype(static_cast<typename C::const_iterator (
C::*)() const>(&C::end)),
typename C::const_iterator (C::*)() const>::value,
void>::type *);
template <typename C>
static no &g(...);
public:
static bool const beg_value = sizeof(f<T>(nullptr)) == sizeof(yes);
static bool const end_value = sizeof(g<T>(nullptr)) == sizeof(yes);
};
} // namespace detail
// Holds the delimiter values for a specific character type
template <typename TChar>
struct delimiters_values {
using char_type = TChar;
const char_type *prefix;
const char_type *delimiter;
const char_type *postfix;
};
// Defines the delimiter values for a specific container and character type
template <typename T, typename TChar>
struct delimiters {
using type = delimiters_values<TChar>;
static const type values;
};
// Functor to print containers. You can use this directly if you want
// to specify a non-default delimiters type. The printing logic can
// be customized by specializing the nested template.
template <typename T, typename TChar = char,
typename TCharTraits = ::std::char_traits<TChar>,
typename TDelimiters = delimiters<T, TChar>>
struct print_container_helper {
using delimiters_type = TDelimiters;
using ostream_type = std::basic_ostream<TChar, TCharTraits>;
template <typename U>
struct printer {
static void print_body(const U &c, ostream_type &stream) {
using std::begin;
using std::end;
auto it = begin(c);
const auto the_end = end(c);
if (it != the_end) {
for (;;) {
stream << *it;
if (++it == the_end) break;
if (delimiters_type::values.delimiter != NULL)
stream << delimiters_type::values.delimiter;
}
}
}
};
print_container_helper(const T &container) : container_(container) {}
inline void operator()(ostream_type &stream) const {
if (delimiters_type::values.prefix != NULL)
stream << delimiters_type::values.prefix;
printer<T>::print_body(container_, stream);
if (delimiters_type::values.postfix != NULL)
stream << delimiters_type::values.postfix;
}
private:
const T &container_;
};
// Specialization for pairs
template <typename T, typename TChar, typename TCharTraits,
typename TDelimiters>
template <typename T1, typename T2>
struct print_container_helper<T, TChar, TCharTraits,
TDelimiters>::printer<std::pair<T1, T2>> {
using ostream_type =
typename print_container_helper<T, TChar, TCharTraits,
TDelimiters>::ostream_type;
static void print_body(const std::pair<T1, T2> &c, ostream_type &stream) {
stream << c.first;
if (print_container_helper<T, TChar, TCharTraits,
TDelimiters>::delimiters_type::values
.delimiter != NULL)
stream << print_container_helper<T, TChar, TCharTraits,
TDelimiters>::delimiters_type::values
.delimiter;
stream << c.second;
}
};
// Specialization for tuples
template <typename T, typename TChar, typename TCharTraits,
typename TDelimiters>
template <typename... Args>
struct print_container_helper<T, TChar, TCharTraits,
TDelimiters>::printer<std::tuple<Args...>> {
using ostream_type =
typename print_container_helper<T, TChar, TCharTraits,
TDelimiters>::ostream_type;
using element_type = std::tuple<Args...>;
template <std::size_t I>
struct Int {};
static void print_body(const element_type &c, ostream_type &stream) {
tuple_print(c, stream, Int<0>());
}
static void tuple_print(const element_type &, ostream_type &,
Int<sizeof...(Args)>) {}
static void tuple_print(
const element_type &c, ostream_type &stream,
typename std::conditional<sizeof...(Args) != 0, Int<0>,
std::nullptr_t>::type) {
stream << std::get<0>(c);
tuple_print(c, stream, Int<1>());
}
template <std::size_t N>
static void tuple_print(const element_type &c, ostream_type &stream, Int<N>) {
if (print_container_helper<T, TChar, TCharTraits,
TDelimiters>::delimiters_type::values
.delimiter != NULL)
stream << print_container_helper<T, TChar, TCharTraits,
TDelimiters>::delimiters_type::values
.delimiter;
stream << std::get<N>(c);
tuple_print(c, stream, Int<N + 1>());
}
};
// Prints a print_container_helper to the specified stream.
template <typename T, typename TChar, typename TCharTraits,
typename TDelimiters>
inline std::basic_ostream<TChar, TCharTraits> &operator<<(
std::basic_ostream<TChar, TCharTraits> &stream,
const print_container_helper<T, TChar, TCharTraits, TDelimiters> &helper) {
helper(stream);
return stream;
}
// Basic is_container template; specialize to derive from std::true_type for all
// desired container types
template <typename T>
struct is_container
: public std::integral_constant<bool,
detail::has_const_iterator<T>::value &&
detail::has_begin_end<T>::beg_value &&
detail::has_begin_end<T>::end_value> {};
template <typename T, std::size_t N>
struct is_container<T[N]> : std::true_type {};
template <std::size_t N>
struct is_container<char[N]> : std::false_type {};
template <typename T>
struct is_container<std::valarray<T>> : std::true_type {};
template <typename T1, typename T2>
struct is_container<std::pair<T1, T2>> : std::true_type {};
template <typename... Args>
struct is_container<std::tuple<Args...>> : std::true_type {};
// Default delimiters
template <typename T>
struct delimiters<T, char> {
static const delimiters_values<char> values;
};
template <typename T>
const delimiters_values<char> delimiters<T, char>::values = {"[", ", ", "]"};
template <typename T>
struct delimiters<T, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T>
const delimiters_values<wchar_t> delimiters<T, wchar_t>::values = {L"[", L", ",
L"]"};
// Delimiters for (multi)set and unordered_(multi)set
template <typename T, typename TComp, typename TAllocator>
struct delimiters<::std::set<T, TComp, TAllocator>, char> {
static const delimiters_values<char> values;
};
template <typename T, typename TComp, typename TAllocator>
const delimiters_values<char>
delimiters<::std::set<T, TComp, TAllocator>, char>::values = {"{", ", ",
"}"};
template <typename T, typename TComp, typename TAllocator>
struct delimiters<::std::set<T, TComp, TAllocator>, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T, typename TComp, typename TAllocator>
const delimiters_values<wchar_t>
delimiters<::std::set<T, TComp, TAllocator>, wchar_t>::values = {
L"{", L", ", L"}"};
template <typename T, typename TComp, typename TAllocator>
struct delimiters<::std::multiset<T, TComp, TAllocator>, char> {
static const delimiters_values<char> values;
};
template <typename T, typename TComp, typename TAllocator>
const delimiters_values<char>
delimiters<::std::multiset<T, TComp, TAllocator>, char>::values = {
"{", ", ", "}"};
template <typename T, typename TComp, typename TAllocator>
struct delimiters<::std::multiset<T, TComp, TAllocator>, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T, typename TComp, typename TAllocator>
const delimiters_values<wchar_t>
delimiters<::std::multiset<T, TComp, TAllocator>, wchar_t>::values = {
L"{", L", ", L"}"};
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters<::std::unordered_set<T, THash, TEqual, TAllocator>, char> {
static const delimiters_values<char> values;
};
template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<char> delimiters<
::std::unordered_set<T, THash, TEqual, TAllocator>, char>::values = {
"{", ", ", "}"};
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters<::std::unordered_set<T, THash, TEqual, TAllocator>, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<wchar_t> delimiters<
::std::unordered_set<T, THash, TEqual, TAllocator>, wchar_t>::values = {
L"{", L", ", L"}"};
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters<::std::unordered_multiset<T, THash, TEqual, TAllocator>,
char> {
static const delimiters_values<char> values;
};
template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<char> delimiters<
::std::unordered_multiset<T, THash, TEqual, TAllocator>, char>::values = {
"{", ", ", "}"};
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters<::std::unordered_multiset<T, THash, TEqual, TAllocator>,
wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<wchar_t>
delimiters<::std::unordered_multiset<T, THash, TEqual, TAllocator>,
wchar_t>::values = {L"{", L", ", L"}"};
// Delimiters for pair and tuple
template <typename T1, typename T2>
struct delimiters<std::pair<T1, T2>, char> {
static const delimiters_values<char> values;
};
template <typename T1, typename T2>
const delimiters_values<char> delimiters<std::pair<T1, T2>, char>::values = {
"(", ", ", ")"};
template <typename T1, typename T2>
struct delimiters<::std::pair<T1, T2>, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T1, typename T2>
const delimiters_values<wchar_t>
delimiters<::std::pair<T1, T2>, wchar_t>::values = {L"(", L", ", L")"};
template <typename... Args>
struct delimiters<std::tuple<Args...>, char> {
static const delimiters_values<char> values;
};
template <typename... Args>
const delimiters_values<char> delimiters<std::tuple<Args...>, char>::values = {
"(", ", ", ")"};
template <typename... Args>
struct delimiters<::std::tuple<Args...>, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename... Args>
const delimiters_values<wchar_t>
delimiters<::std::tuple<Args...>, wchar_t>::values = {L"(", L", ", L")"};
// Type-erasing helper class for easy use of custom delimiters.
// Requires TCharTraits = std::char_traits<TChar> and TChar = char or wchar_t,
// and MyDelims needs to be defined for TChar. Usage: "cout <<
// pretty_print::custom_delims<MyDelims>(x)".
struct custom_delims_base {
virtual ~custom_delims_base() {}
virtual std::ostream &stream(::std::ostream &) = 0;
virtual std::wostream &stream(::std::wostream &) = 0;
};
template <typename T, typename Delims>
struct custom_delims_wrapper : custom_delims_base {
custom_delims_wrapper(const T &t_) : t(t_) {}
std::ostream &stream(std::ostream &s) {
return s << print_container_helper<T, char, std::char_traits<char>, Delims>(
t);
}
std::wostream &stream(std::wostream &s) {
return s << print_container_helper<T, wchar_t, std::char_traits<wchar_t>,
Delims>(t);
}
private:
const T &t;
};
template <typename Delims>
struct custom_delims {
template <typename Container>
custom_delims(const Container &c)
: base(new custom_delims_wrapper<Container, Delims>(c)) {}
std::unique_ptr<custom_delims_base> base;
};
template <typename TChar, typename TCharTraits, typename Delims>
inline std::basic_ostream<TChar, TCharTraits> &operator<<(
std::basic_ostream<TChar, TCharTraits> &s, const custom_delims<Delims> &p) {
return p.base->stream(s);
}
// A wrapper for a C-style array given as pointer-plus-size.
// Usage: std::cout << pretty_print_array(arr, n) << std::endl;
template <typename T>
struct array_wrapper_n {
typedef const T *const_iterator;
typedef T value_type;
array_wrapper_n(const T *const a, size_t n) : _array(a), _n(n) {}
inline const_iterator begin() const { return _array; }
inline const_iterator end() const { return _array + _n; }
private:
const T *const _array;
size_t _n;
};
// A wrapper for hash-table based containers that offer local iterators to each
// bucket. Usage: std::cout << bucket_print(m, 4) << std::endl; (Prints bucket
// 5 of container m.)
template <typename T>
struct bucket_print_wrapper {
typedef typename T::const_local_iterator const_iterator;
typedef typename T::size_type size_type;
const_iterator begin() const { return m_map.cbegin(n); }
const_iterator end() const { return m_map.cend(n); }
bucket_print_wrapper(const T &m, size_type bucket) : m_map(m), n(bucket) {}
private:
const T &m_map;
const size_type n;
};
} // namespace pretty_print
// Global accessor functions for the convenience wrappers
template <typename T>
inline pretty_print::array_wrapper_n<T> pretty_print_array(const T *const a,
size_t n) {
return pretty_print::array_wrapper_n<T>(a, n);
}
template <typename T>
pretty_print::bucket_print_wrapper<T> bucket_print(const T &m,
typename T::size_type n) {
return pretty_print::bucket_print_wrapper<T>(m, n);
}
// Main magic entry point: An overload snuck into namespace std.
// Can we do better?
namespace std {
// Prints a container to the stream using default delimiters
template <typename T, typename TChar, typename TCharTraits>
inline typename enable_if<::pretty_print::is_container<T>::value,
basic_ostream<TChar, TCharTraits> &>::type
operator<<(basic_ostream<TChar, TCharTraits> &stream, const T &container) {
return stream
<< ::pretty_print::print_container_helper<T, TChar, TCharTraits>(
container);
}
} // namespace std
#endif // H_PRETTY_PRINT
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <iostream>
#include <pybind11/embed.h> // everything needed for embedding
#include <pybind11/functional.h>
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <tensorview/tensorview.h>
namespace py = pybind11;
template <typename T, typename TPyObject>
std::vector<T> array2Vector(TPyObject arr){
py::array arr_np = arr;
size_t size = arr.attr("size").template cast<size_t>();
py::array_t<T> arr_cc = arr_np;
std::vector<T> data(arr_cc.data(), arr_cc.data() + size);
return data;
}
template <typename T>
std::vector<T> arrayT2Vector(py::array_t<T> arr)
{
std::vector<T> data(arr.data(), arr.data() + arr.size());
return data;
}
template <typename T, typename TPyObject>
tv::TensorView<T> array2TensorView(TPyObject arr){
py::array arr_np = arr;
py::array_t<T> arr_cc = arr_np;
tv::Shape shape;
for (int i = 0; i < arr_cc.ndim(); ++i){
shape.push_back(arr_cc.shape(i));
}
return tv::TensorView<T>(arr_cc.mutable_data(), shape);
}
template <typename T>
tv::TensorView<T> arrayT2TensorView(py::array_t<T> arr){
tv::Shape shape;
for (int i = 0; i < arr.ndim(); ++i){
shape.push_back(arr.shape(i));
}
return tv::TensorView<T>(arr.mutable_data(), shape);
}
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef FUSED_SPARSE_CONV_OP_H_
#define FUSED_SPARSE_CONV_OP_H_
#include <cuda_runtime_api.h>
#include <spconv/indice.h>
#include <spconv/reordering.h>
#include <torch/script.h>
#include <torch_utils.h>
#include <utility/timer.h>
namespace spconv {
// torch.jit's doc says only support int64, so we need to convert to int32.
template <typename T>
torch::Tensor fusedIndiceConvBatchNorm(
torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM) {
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto kernelVolume = indicePairs.size(0);
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairMaxSizeIter =
std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
int indicePairMaxSize = *indicePairMaxSizeIter;
/*if (_subM){
std::vector<int> indicePairNumVec(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
auto indicePairVecMaxSizeIter = std::max_element(
indicePairNumVec.begin(), indicePairNumVec.end());
indicePairMaxSize = *indicePairVecMaxSizeIter;
}*/
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
// auto indicePairOptions =
// torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
torch::Tensor output =
torch::zeros({numActOut, numOutPlanes}, options).copy_(bias);
torch::Tensor inputBuffer =
torch::zeros({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
if (subM) { // the center index of subm conv don't need gather and scatter
// add.
torch::mm_out(output, features, filters[indicePairMaxOffset]);
}
double totalGatherTime = 0;
double totalGEMMTime = 0;
double totalSAddTime = 0;
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
// auto timer = spconv::CudaContextTimer<>();
auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr<T>(),
{nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr<T>(),
{nHot, numInPlanes}, options);
if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
} else {
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
/* slower than SparseGatherFunctor, may due to int->long conversion
auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
auto indicePairBlob = torch::from_blob(indicePairLong.data_ptr<long>(),
{nHot}, indicePairOptions); torch::index_select_out(inputBufferBlob,
features, 0, indicePairBlob);*/
}
// totalGatherTime += timer.report() / 1000.0;
torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
// totalGEMMTime += timer.report() / 1000.0;
if (device == torch::kCPU) {
functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot, true);
} else {
functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot, true);
TV_CHECK_CUDA_ERR();
}
// totalSAddTime += timer.report() / 1000.0;
}
// std::cout << "gather time " << totalGatherTime << std::endl;
// std::cout << "gemm time " << totalGEMMTime << std::endl;
// std::cout << "scatteradd time " << totalSAddTime << std::endl;
return output;
}
} // namespace spconv
#endif
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment