Commit ee5667c1 authored by zhangwenwei

Reformat cpp code to suppress warning

parent 2bb43004
@@ -274,7 +274,7 @@ class KittiDataset(torch_data.Dataset):
                              out)
        return result_files

    def evaluate(self, result_files, logger=None, eval_types=None):
        from mmdet3d.core.evaluation import kitti_eval
        gt_annos = [info['annos'] for info in self.kitti_infos]
        if eval_types == 'img_bbox':
@@ -283,7 +283,7 @@ class KittiDataset(torch_data.Dataset):
        else:
            ap_result_str, ap_dict = kitti_eval(gt_annos, result_files,
                                                self.class_names)
        return ap_dict

    def bbox2result_kitti(self, net_outputs, class_names, out=None):
        if out:
...
@@ -15,12 +15,32 @@ from .train_mixins import AnchorTrainMixin
@HEADS.register_module
class SECONDHead(nn.Module, AnchorTrainMixin):
    """Anchor-based head for VoxelNet detectors.

    Args:
        class_name (list[str]): name of classes (TODO: to be removed)
        in_channels (int): Number of channels in the input feature map.
        train_cfg (dict): train configs
        test_cfg (dict): test configs
        feat_channels (int): Number of channels of the feature map.
        use_direction_classifier (bool): Whether to add a direction classifier.
        encode_bg_as_zeros (bool): Whether to use sigmoid or softmax
            (TODO: to be removed)
        box_code_size (int): The size of box code.
        anchor_generator (dict): Config dict of anchor generator.
        assigner_per_size (bool): Whether to do assignment for each separate
            anchor size.
        assign_per_class (bool): Whether to do assignment for each class.
        diff_rad_by_sin (bool): Whether to change the difference into sin
            difference for box regression loss.
        dir_offset (float | int): The offset of BEV rotation angles
            (TODO: may be moved into box coder)
        dir_limit_offset (float | int): The limited range of BEV rotation angles
            (TODO: may be moved into box coder)
        box_coder (dict): Config dict of box coders.
        loss_cls (dict): Config of classification loss.
        loss_bbox (dict): Config of localization loss.
        loss_dir (dict): Config of direction classifier loss.
    """  # noqa: W605

    def __init__(self,
@@ -253,7 +273,7 @@ class SECONDHead(nn.Module, AnchorTrainMixin):
        num_levels = len(cls_scores)
        featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)]
        device = cls_scores[0].device
        mlvl_anchors = self.anchor_generator.grid_anchors(
            featmap_sizes, device=device)
        mlvl_anchors = [
            anchor.reshape(-1, self.box_code_size) for anchor in mlvl_anchors
...
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>
#include <vector>

#define CHECK_CUDA(x) \
  TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) \
  TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) \
  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

#define CHECK_ERROR(ans) \
  { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line,
                      bool abort = true) {
  if (code != cudaSuccess) {
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
            line);
    if (abort) exit(code);
  }
}

const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;

void boxesoverlapLauncher(const int num_a, const float *boxes_a,
                          const int num_b, const float *boxes_b,
                          float *ans_overlap);
void boxesioubevLauncher(const int num_a, const float *boxes_a, const int num_b,
                         const float *boxes_b, float *ans_iou);
void nmsLauncher(const float *boxes, unsigned long long *mask, int boxes_num,
                 float nms_overlap_thresh);
void nmsNormalLauncher(const float *boxes, unsigned long long *mask,
                       int boxes_num, float nms_overlap_thresh);

int boxes_overlap_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b,
                          at::Tensor ans_overlap) {
  // params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
  // params boxes_b: (M, 5)
  // params ans_overlap: (N, M)
@@ -40,16 +51,18 @@ int boxes_overlap_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b, at::Tensor ans
  int num_a = boxes_a.size(0);
  int num_b = boxes_b.size(0);

  const float *boxes_a_data = boxes_a.data_ptr<float>();
  const float *boxes_b_data = boxes_b.data_ptr<float>();
  float *ans_overlap_data = ans_overlap.data_ptr<float>();

  boxesoverlapLauncher(num_a, boxes_a_data, num_b, boxes_b_data,
                       ans_overlap_data);

  return 1;
}
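The DIVUP helper defined above is integer ceiling division; it is reused below to size the NMS bit mask and the CUDA grids. A minimal host-side sketch of what it computes (plain C++, illustration only):

#include <cassert>

#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

int main() {
  // Ceiling division: how many blocks of size n are needed to cover m items.
  assert(DIVUP(64, 64) == 1);   // exact multiple -> no extra block
  assert(DIVUP(65, 64) == 2);   // one leftover item -> one extra block
  assert(DIVUP(1, 256) == 1);
  return 0;
}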
int boxes_iou_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b,
                      at::Tensor ans_iou) {
  // params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
  // params boxes_b: (M, 5)
  // params ans_overlap: (N, M)
@@ -61,16 +74,16 @@ int boxes_iou_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b, at::Tensor ans_iou
  int num_a = boxes_a.size(0);
  int num_b = boxes_b.size(0);

  const float *boxes_a_data = boxes_a.data_ptr<float>();
  const float *boxes_b_data = boxes_b.data_ptr<float>();
  float *ans_iou_data = ans_iou.data_ptr<float>();

  boxesioubevLauncher(num_a, boxes_a_data, num_b, boxes_b_data, ans_iou_data);

  return 1;
}
int nms_gpu(at::Tensor boxes, at::Tensor keep, float nms_overlap_thresh) {
  // params boxes: (N, 5) [x1, y1, x2, y2, ry]
  // params keep: (N)
@@ -78,21 +91,24 @@ int nms_gpu(at::Tensor boxes, at::Tensor keep, float nms_overlap_thresh){
  CHECK_CONTIGUOUS(keep);

  int boxes_num = boxes.size(0);
  const float *boxes_data = boxes.data_ptr<float>();
  long *keep_data = keep.data_ptr<long>();

  const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);

  unsigned long long *mask_data = NULL;
  CHECK_ERROR(cudaMalloc((void **)&mask_data,
                         boxes_num * col_blocks * sizeof(unsigned long long)));
  nmsLauncher(boxes_data, mask_data, boxes_num, nms_overlap_thresh);

  // unsigned long long mask_cpu[boxes_num * col_blocks];
  // unsigned long long *mask_cpu = new unsigned long long [boxes_num *
  // col_blocks];
  std::vector<unsigned long long> mask_cpu(boxes_num * col_blocks);

  // printf("boxes_num=%d, col_blocks=%d\n", boxes_num, col_blocks);
  CHECK_ERROR(cudaMemcpy(&mask_cpu[0], mask_data,
                         boxes_num * col_blocks * sizeof(unsigned long long),
                         cudaMemcpyDeviceToHost));

  cudaFree(mask_data);
@@ -102,25 +118,25 @@ int nms_gpu(at::Tensor boxes, at::Tensor keep, float nms_overlap_thresh){
  int num_to_keep = 0;

  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / THREADS_PER_BLOCK_NMS;
    int inblock = i % THREADS_PER_BLOCK_NMS;

    if (!(remv_cpu[nblock] & (1ULL << inblock))) {
      keep_data[num_to_keep++] = i;
      unsigned long long *p = &mask_cpu[0] + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv_cpu[j] |= p[j];
      }
    }
  }
  if (cudaSuccess != cudaGetLastError()) printf("Error!\n");

  return num_to_keep;
}
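The keep loop above walks a suppression bit mask: each candidate box i owns col_blocks 64-bit words (THREADS_PER_BLOCK_NMS is 64 bits), and bit j in row i means box j overlaps box i above the threshold. A CPU-only sketch of the same bookkeeping on a hypothetical 3-box mask, just to illustrate the bit layout (not the project's API):

#include <cstdio>
#include <vector>

int main() {
  const int kBits = 64;            // bits per mask word, as in THREADS_PER_BLOCK_NMS
  int boxes_num = 3;
  int col_blocks = (boxes_num + kBits - 1) / kBits;  // DIVUP(boxes_num, 64) == 1

  // Hypothetical mask: box 0 suppresses box 1 (bit 1 set in row 0), nothing else.
  std::vector<unsigned long long> mask(boxes_num * col_blocks, 0ULL);
  mask[0 * col_blocks + 0] = (1ULL << 1);

  std::vector<unsigned long long> remv(col_blocks, 0ULL);
  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / kBits, inblock = i % kBits;
    if (!(remv[nblock] & (1ULL << inblock))) {  // box i not yet suppressed
      printf("keep %d\n", i);                   // prints: keep 0, keep 2
      for (int j = nblock; j < col_blocks; j++)
        remv[j] |= mask[i * col_blocks + j];    // suppress everything box i overlaps
    }
  }
  return 0;
}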
int nms_normal_gpu(at::Tensor boxes, at::Tensor keep,
                   float nms_overlap_thresh) {
  // params boxes: (N, 5) [x1, y1, x2, y2, ry]
  // params keep: (N)
@@ -128,21 +144,24 @@ int nms_normal_gpu(at::Tensor boxes, at::Tensor keep, float nms_overlap_thresh){
  CHECK_CONTIGUOUS(keep);

  int boxes_num = boxes.size(0);
  const float *boxes_data = boxes.data_ptr<float>();
  long *keep_data = keep.data_ptr<long>();

  const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);

  unsigned long long *mask_data = NULL;
  CHECK_ERROR(cudaMalloc((void **)&mask_data,
                         boxes_num * col_blocks * sizeof(unsigned long long)));
  nmsNormalLauncher(boxes_data, mask_data, boxes_num, nms_overlap_thresh);

  // unsigned long long mask_cpu[boxes_num * col_blocks];
  // unsigned long long *mask_cpu = new unsigned long long [boxes_num *
  // col_blocks];
  std::vector<unsigned long long> mask_cpu(boxes_num * col_blocks);

  // printf("boxes_num=%d, col_blocks=%d\n", boxes_num, col_blocks);
  CHECK_ERROR(cudaMemcpy(&mask_cpu[0], mask_data,
                         boxes_num * col_blocks * sizeof(unsigned long long),
                         cudaMemcpyDeviceToHost));

  cudaFree(mask_data);
@@ -152,27 +171,26 @@ int nms_normal_gpu(at::Tensor boxes, at::Tensor keep, float nms_overlap_thresh){
  int num_to_keep = 0;

  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / THREADS_PER_BLOCK_NMS;
    int inblock = i % THREADS_PER_BLOCK_NMS;

    if (!(remv_cpu[nblock] & (1ULL << inblock))) {
      keep_data[num_to_keep++] = i;
      unsigned long long *p = &mask_cpu[0] + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv_cpu[j] |= p[j];
      }
    }
  }
  if (cudaSuccess != cudaGetLastError()) printf("Error!\n");

  return num_to_keep;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("boxes_overlap_bev_gpu", &boxes_overlap_bev_gpu,
        "oriented boxes overlap");
  m.def("boxes_iou_bev_gpu", &boxes_iou_bev_gpu, "oriented boxes iou");
  m.def("nms_gpu", &nms_gpu, "oriented nms gpu");
  m.def("nms_normal_gpu", &nms_normal_gpu, "nms gpu");
...
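For a sense of scale, the device mask that nms_gpu and nms_normal_gpu allocate above holds boxes_num * col_blocks 64-bit words. A tiny sketch of the arithmetic, with illustrative numbers only:

#include <cstdio>

int main() {
  const int kBits = 64;                      // THREADS_PER_BLOCK_NMS
  long long boxes_num = 10000;               // example box count, not from the repo
  long long col_blocks = (boxes_num + kBits - 1) / kBits;  // 157 words per box
  long long bytes = boxes_num * col_blocks * sizeof(unsigned long long);
  printf("%lld words -> %.1f MB\n", boxes_num * col_blocks, bytes / 1e6);  // ~12.6 MB
  return 0;
}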
// Modified from
// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
// Points in boxes cpu
// Written by Shaoshuai Shi
// All Rights Reserved 2019.

#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>

#define CHECK_CONTIGUOUS(x) \
  TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")

// #define DEBUG

inline void lidar_to_local_coords_cpu(float shift_x, float shift_y, float rz,
                                      float &local_x, float &local_y) {
  // should rotate pi/2 + alpha to translate LiDAR to local
  float rot_angle = rz + M_PI / 2;
  float cosa = cos(rot_angle), sina = sin(rot_angle);
@@ -24,10 +23,11 @@ inline void lidar_to_local_coords_cpu(float shift_x, float shift_y, float rz, fl
  local_y = shift_x * sina + shift_y * cosa;
}

inline int check_pt_in_box3d_cpu(const float *pt, const float *box3d,
                                 float &local_x, float &local_y) {
  // param pt: (x, y, z)
  // param box3d: (cx, cy, cz, w, l, h, rz) in LiDAR coordinate, cz in the
  // bottom center
  float x = pt[0], y = pt[1], z = pt[2];
  float cx = box3d[0], cy = box3d[1], cz = box3d[2];
  float w = box3d[3], l = box3d[4], h = box3d[5], rz = box3d[6];
@@ -35,15 +35,16 @@ inline int check_pt_in_box3d_cpu(const float *pt, const float *box3d, float &loc
  if (fabsf(z - cz) > h / 2.0) return 0;
  lidar_to_local_coords_cpu(x - cx, y - cy, rz, local_x, local_y);
  float in_flag = (local_x > -l / 2.0) & (local_x < l / 2.0) &
                  (local_y > -w / 2.0) & (local_y < w / 2.0);
  return in_flag;
}
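The check above shifts the point into the box frame, rotates it by rz + pi/2, and compares against the half extents. A standalone sketch of the same math with a made-up box and point; the local_x line is an assumed counterpart of the local_y line shown in the hunk (illustration, not code from the repo):

#include <cmath>
#include <cstdio>

// Mirrors lidar_to_local_coords_cpu above: rotate by rz + pi/2.
static void to_local(float sx, float sy, float rz, float &lx, float &ly) {
  const float kHalfPi = 1.57079632679f;
  float a = rz + kHalfPi, c = cosf(a), s = sinf(a);
  lx = sx * c - sy * s;  // assumed counterpart of the local_y line shown above
  ly = sx * s + sy * c;
}

int main() {
  // Hypothetical box: center (0, 0, 1), w = 2, l = 4, h = 2, yaw rz = 0.
  float cx = 0, cy = 0, cz = 1, w = 2, l = 4, h = 2, rz = 0;
  float pt[3] = {0.5f, 1.5f, 1.0f};  // hypothetical query point
  float lx, ly;
  if (fabsf(pt[2] - cz) > h / 2) {
    printf("outside (z)\n");
    return 0;
  }
  to_local(pt[0] - cx, pt[1] - cy, rz, lx, ly);
  int in = (lx > -l / 2) && (lx < l / 2) && (ly > -w / 2) && (ly < w / 2);
  printf("local = (%.2f, %.2f), inside = %d\n", lx, ly, in);  // (-1.50, 0.50), 1
  return 0;
}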
int points_in_boxes_cpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
                        at::Tensor pts_indices_tensor) {
  // params boxes: (N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is the
  // bottom center, each box DO NOT overlaps
  // params pts: (npoints, 3) [x, y, z] in LiDAR coordinate
  // params pts_indices: (N, npoints)
  CHECK_CONTIGUOUS(boxes_tensor);
  CHECK_CONTIGUOUS(pts_tensor);
@@ -52,14 +53,15 @@ int points_in_boxes_cpu(at::Tensor boxes_tensor, at::Tensor pts_tensor, at::Tens
  int boxes_num = boxes_tensor.size(0);
  int pts_num = pts_tensor.size(0);

  const float *boxes = boxes_tensor.data_ptr<float>();
  const float *pts = pts_tensor.data_ptr<float>();
  int *pts_indices = pts_indices_tensor.data_ptr<int>();

  float local_x = 0, local_y = 0;
  for (int i = 0; i < boxes_num; i++) {
    for (int j = 0; j < pts_num; j++) {
      int cur_in_flag =
          check_pt_in_box3d_cpu(pts + j * 3, boxes + i * 7, local_x, local_y);
      pts_indices[i * pts_num + j] = cur_in_flag;
    }
  }
...
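pts_indices is a flat (N, npoints) row-major buffer: row i holds one 0/1 flag per point for box i. A minimal indexing sketch with made-up sizes:

#include <cstdio>
#include <vector>

int main() {
  int boxes_num = 2, pts_num = 4;  // hypothetical sizes
  std::vector<int> pts_indices(boxes_num * pts_num, 0);
  // Mark point 3 as inside box 1, the same way the nested loop above writes it.
  int i = 1, j = 3;
  pts_indices[i * pts_num + j] = 1;
  printf("flag(box=1, pt=3) = %d\n", pts_indices[1 * pts_num + 3]);  // 1
  return 0;
}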
// Modified from
// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
// Points in boxes gpu
// Written by Shaoshuai Shi
// All Rights Reserved 2019.

#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>

#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

#define CHECK_CUDA(x) \
  TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) \
  TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) \
  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)
// #define DEBUG

__device__ inline void lidar_to_local_coords(float shift_x, float shift_y,
                                             float rz, float &local_x,
                                             float &local_y) {
  // should rotate pi/2 + alpha to translate LiDAR to local
  float rot_angle = rz + M_PI / 2;
  float cosa = cos(rot_angle), sina = sin(rot_angle);
@@ -29,10 +32,11 @@ __device__ inline void lidar_to_local_coords(float shift_x, float shift_y, float
  local_y = shift_x * sina + shift_y * cosa;
}

__device__ inline int check_pt_in_box3d(const float *pt, const float *box3d,
                                        float &local_x, float &local_y) {
  // param pt: (x, y, z)
  // param box3d: (cx, cy, cz, w, l, h, rz) in LiDAR coordinate, cz in the
  // bottom center
  float x = pt[0], y = pt[1], z = pt[2];
  float cx = box3d[0], cy = box3d[1], cz = box3d[2];
  float w = box3d[3], l = box3d[4], h = box3d[5], rz = box3d[6];
@@ -40,16 +44,19 @@ __device__ inline int check_pt_in_box3d(const float *pt, const float *box3d, flo
  if (fabsf(z - cz) > h / 2.0) return 0;
  lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y);
  float in_flag = (local_x > -l / 2.0) & (local_x < l / 2.0) &
                  (local_y > -w / 2.0) & (local_y < w / 2.0);
  return in_flag;
}

__global__ void points_in_boxes_kernel(int batch_size, int boxes_num,
                                       int pts_num, const float *boxes,
                                       const float *pts,
                                       int *box_idx_of_points) {
  // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is
  // the bottom center, each box DO NOT overlaps
  // params pts: (B, npoints, 3) [x, y, z] in LiDAR coordinate
  // params boxes_idx_of_points: (B, npoints), default -1
  int bs_idx = blockIdx.y;
  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -61,26 +68,28 @@ __global__ void points_in_boxes_kernel(int batch_size, int boxes_num, int pts_nu
  float local_x = 0, local_y = 0;
  int cur_in_flag = 0;
  for (int k = 0; k < boxes_num; k++) {
    cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y);
    if (cur_in_flag) {
      box_idx_of_points[0] = k;
      break;
    }
  }
}
void points_in_boxes_launcher(int batch_size, int boxes_num, int pts_num,
                              const float *boxes, const float *pts,
                              int *box_idx_of_points) {
  // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is
  // the bottom center, each box DO NOT overlaps
  // params pts: (B, npoints, 3) [x, y, z] in LiDAR coordinate
  // params boxes_idx_of_points: (B, npoints), default -1
  cudaError_t err;

  dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), batch_size);
  dim3 threads(THREADS_PER_BLOCK);
  points_in_boxes_kernel<<<blocks, threads>>>(batch_size, boxes_num, pts_num,
                                              boxes, pts, box_idx_of_points);

  err = cudaGetLastError();
  if (cudaSuccess != err) {
@@ -93,10 +102,12 @@ void points_in_boxes_launcher(int batch_size, int boxes_num, int pts_num, const
#endif
}

int points_in_boxes_gpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
                        at::Tensor box_idx_of_points_tensor) {
  // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is
  // the bottom center, each box DO NOT overlaps
  // params pts: (B, npoints, 3) [x, y, z] in LiDAR coordinate
  // params boxes_idx_of_points: (B, npoints), default -1
  CHECK_INPUT(boxes_tensor);
  CHECK_INPUT(pts_tensor);
@@ -106,11 +117,12 @@ int points_in_boxes_gpu(at::Tensor boxes_tensor, at::Tensor pts_tensor, at::Tens
  int boxes_num = boxes_tensor.size(1);
  int pts_num = pts_tensor.size(1);

  const float *boxes = boxes_tensor.data_ptr<float>();
  const float *pts = pts_tensor.data_ptr<float>();
  int *box_idx_of_points = box_idx_of_points_tensor.data_ptr<int>();

  points_in_boxes_launcher(batch_size, boxes_num, pts_num, boxes, pts,
                           box_idx_of_points);

  return 1;
}
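The launcher maps one thread to one point: gridDim.x covers the points of a sample in chunks of THREADS_PER_BLOCK and gridDim.y indexes the batch. A host-side sketch of the sizing arithmetic, with made-up sizes:

#include <cstdio>

#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

int main() {
  int batch_size = 2, pts_num = 20000;             // hypothetical sizes
  int grid_x = DIVUP(pts_num, THREADS_PER_BLOCK);  // 79 blocks along x
  // blocks = dim3(grid_x, batch_size), threads = dim3(256): thread threadIdx.x of
  // block (blockIdx.x, blockIdx.y) handles pt_idx = blockIdx.x * blockDim.x + threadIdx.x,
  // guarded in the kernel against pt_idx >= pts_num.
  printf("grid = (%d, %d), block = %d, covered points = %d\n",
         grid_x, batch_size, THREADS_PER_BLOCK, grid_x * THREADS_PER_BLOCK);
  return 0;
}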
// Modified from
// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
// RoI-aware point cloud feature pooling
// Written by Shaoshuai Shi
// All Rights Reserved 2019.

#include <assert.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>

#define CHECK_CUDA(x) \
  TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) \
  TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) \
  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)

void roiaware_pool3d_launcher(int boxes_num, int pts_num, int channels,
                              int max_pts_each_voxel, int out_x, int out_y,
                              int out_z, const float *rois, const float *pts,
                              const float *pts_feature, int *argmax,
                              int *pts_idx_of_voxels, float *pooled_features,
                              int pool_method);

void roiaware_pool3d_backward_launcher(int boxes_num, int out_x, int out_y,
                                       int out_z, int channels,
                                       int max_pts_each_voxel,
                                       const int *pts_idx_of_voxels,
                                       const int *argmax, const float *grad_out,
                                       float *grad_in, int pool_method);

int roiaware_pool3d_gpu(at::Tensor rois, at::Tensor pts, at::Tensor pts_feature,
                        at::Tensor argmax, at::Tensor pts_idx_of_voxels,
                        at::Tensor pooled_features, int pool_method);

int roiaware_pool3d_gpu_backward(at::Tensor pts_idx_of_voxels,
                                 at::Tensor argmax, at::Tensor grad_out,
                                 at::Tensor grad_in, int pool_method);

int points_in_boxes_cpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
                        at::Tensor pts_indices_tensor);

int points_in_boxes_gpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
                        at::Tensor box_idx_of_points_tensor);

int roiaware_pool3d_gpu(at::Tensor rois, at::Tensor pts, at::Tensor pts_feature,
                        at::Tensor argmax, at::Tensor pts_idx_of_voxels,
                        at::Tensor pooled_features, int pool_method) {
  // params rois: (N, 7) [x, y, z, w, l, h, ry] in LiDAR coordinate
  // params pts: (npoints, 3) [x, y, z] in LiDAR coordinate
  // params pts_feature: (npoints, C)
@@ -56,22 +69,27 @@ int roiaware_pool3d_gpu(at::Tensor rois, at::Tensor pts, at::Tensor pts_feature,
  int out_x = pts_idx_of_voxels.size(1);
  int out_y = pts_idx_of_voxels.size(2);
  int out_z = pts_idx_of_voxels.size(3);
  assert((out_x < 256) && (out_y < 256) &&
         (out_z < 256));  // we encode index with 8bit

  const float *rois_data = rois.data_ptr<float>();
  const float *pts_data = pts.data_ptr<float>();
  const float *pts_feature_data = pts_feature.data_ptr<float>();
  int *argmax_data = argmax.data_ptr<int>();
  int *pts_idx_of_voxels_data = pts_idx_of_voxels.data_ptr<int>();
  float *pooled_features_data = pooled_features.data_ptr<float>();

  roiaware_pool3d_launcher(
      boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z,
      rois_data, pts_data, pts_feature_data, argmax_data,
      pts_idx_of_voxels_data, pooled_features_data, pool_method);

  return 1;
}
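The assert above exists because, per its comment, the kernels encode a point's voxel index with 8 bits per axis, presumably packing (x, y, z) into a single int, so every output dimension must stay below 256. A small pack/unpack sketch of that kind of scheme, as an illustration of the constraint rather than the repo's exact bit layout:

#include <cassert>
#include <cstdio>

int main() {
  int out_x = 12, out_y = 7, out_z = 3;  // hypothetical voxel coordinate
  assert(out_x < 256 && out_y < 256 && out_z < 256);
  // Pack three 8-bit fields into one int.
  int packed = (out_x << 16) | (out_y << 8) | out_z;
  // Unpack them again.
  int x = (packed >> 16) & 0xff, y = (packed >> 8) & 0xff, z = packed & 0xff;
  printf("packed=0x%06x -> (%d, %d, %d)\n", packed, x, y, z);  // (12, 7, 3)
  return 0;
}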
int roiaware_pool3d_gpu_backward(at::Tensor pts_idx_of_voxels,
                                 at::Tensor argmax, at::Tensor grad_out,
                                 at::Tensor grad_in, int pool_method) {
  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
  // params argmax: (N, out_x, out_y, out_z, C)
  // params grad_out: (N, out_x, out_y, out_z, C)
@@ -90,20 +108,25 @@ int roiaware_pool3d_gpu_backward(at::Tensor pts_idx_of_voxels, at::Tensor argmax
  int max_pts_each_voxel = pts_idx_of_voxels.size(4);  // index 0 is the counter
  int channels = grad_out.size(4);

  const int *pts_idx_of_voxels_data = pts_idx_of_voxels.data_ptr<int>();
  const int *argmax_data = argmax.data_ptr<int>();
  const float *grad_out_data = grad_out.data_ptr<float>();
  float *grad_in_data = grad_in.data_ptr<float>();

  roiaware_pool3d_backward_launcher(boxes_num, out_x, out_y, out_z, channels,
                                    max_pts_each_voxel, pts_idx_of_voxels_data,
                                    argmax_data, grad_out_data, grad_in_data,
                                    pool_method);

  return 1;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &roiaware_pool3d_gpu, "roiaware pool3d forward (CUDA)");
  m.def("backward", &roiaware_pool3d_gpu_backward,
        "roiaware pool3d backward (CUDA)");
  m.def("points_in_boxes_gpu", &points_in_boxes_gpu,
        "points_in_boxes_gpu forward (CUDA)");
  m.def("points_in_boxes_cpu", &points_in_boxes_cpu,
        "points_in_boxes_cpu forward (CPU)");
}
@@ -26,9 +26,10 @@ namespace spconv {
// torch.jit's doc says only support int64, so we need to convert to int32.

template <typename T>
torch::Tensor fusedIndiceConvBatchNorm(
    torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
    int64_t _inverse, int64_t _subM) {
  bool subM = _subM != 0;
  bool inverse = _inverse != 0;
  auto device = features.device().type();
@@ -37,13 +38,16 @@ torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor fil
  auto numInPlanes = features.size(1);
  auto numOutPlanes = filters.size(ndim + 1);
  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
  auto indicePairMaxSizeIter =
      std::max_element(indicePairNumCpu.data_ptr<int>(),
                       indicePairNumCpu.data_ptr<int>() + kernelVolume);
  int indicePairMaxOffset =
      indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
  int indicePairMaxSize = *indicePairMaxSizeIter;

  /*if (_subM){
    std::vector<int> indicePairNumVec(indicePairNumCpu.data_ptr<int>(),
                                      indicePairNumCpu.data_ptr<int>() + kernelVolume);
    indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);

    auto indicePairVecMaxSizeIter = std::max_element(
@@ -56,8 +60,10 @@ torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor fil
  // auto indicePairOptions =
  //     torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());

  torch::Tensor output =
      torch::zeros({numActOut, numOutPlanes}, options).copy_(bias);
  torch::Tensor inputBuffer =
      torch::zeros({indicePairMaxSize, numInPlanes}, options);
  torch::Tensor outputBuffer =
      torch::zeros({indicePairMaxSize, numOutPlanes}, options);
  filters = filters.view({-1, numInPlanes, numOutPlanes});
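indicePairMaxOffset and indicePairMaxSize above come from running std::max_element directly over the int32 buffer behind indiceNum; the offset is plain pointer arithmetic. A tensor-free sketch of the same idiom, with made-up numbers:

#include <algorithm>
#include <cstdio>

int main() {
  // Stand-in for indicePairNumCpu.data_ptr<int>() with kernelVolume = 5.
  int indicePairNum[5] = {4, 9, 2, 9, 1};
  int kernelVolume = 5;
  const int *begin = indicePairNum;
  const int *maxIter = std::max_element(begin, begin + kernelVolume);
  int maxOffset = maxIter - begin;  // index of the busiest kernel offset -> 1
  int maxSize = *maxIter;           // its pair count -> 9
  printf("offset=%d size=%d\n", maxOffset, maxSize);
  return 0;
}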
@@ -69,33 +75,34 @@ torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor fil
  double totalGEMMTime = 0;
  double totalSAddTime = 0;
  for (int i = 0; i < kernelVolume; ++i) {
    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
      continue;
    }
    // auto timer = spconv::CudaContextTimer<>();
    auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr<T>(),
                                             {nHot, numOutPlanes}, options);
    auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr<T>(),
                                            {nHot, numInPlanes}, options);

    if (device == torch::kCPU) {
      functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
      gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
                 tv::torch2tv<const T>(features),
                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                 nHot);
    } else {
      functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
      gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
                 tv::torch2tv<const T>(features),
                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                 nHot);
      TV_CHECK_CUDA_ERR();
      /* slower than SparseGatherFunctor, may due to int->long conversion
      auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
      auto indicePairBlob = torch::from_blob(indicePairLong.data_ptr<long>(),
                                             {nHot}, indicePairOptions);
      torch::index_select_out(inputBufferBlob, features, 0, indicePairBlob);*/
    }
    // totalGatherTime += timer.report() / 1000.0;
    torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
@@ -105,14 +112,14 @@ torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor fil
      functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
      scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
                  tv::torch2tv<const T>(outputBuffer),
                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
                  nHot, true);
    } else {
      functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
      scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
                  tv::torch2tv<const T>(outputBuffer),
                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
                  nHot, true);
      TV_CHECK_CUDA_ERR();
    }
    // totalSAddTime += timer.report() / 1000.0;
...
@@ -34,7 +34,7 @@ torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs,
  torch::Tensor output = torch::zeros({numAct, numInPlanes}, options);
  double totalTime = 0;
  for (int i = 0; i < kernelVolume; ++i) {
    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
    if (nHot <= 0) {
      continue;
    }
@@ -60,7 +60,8 @@ torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs,
template <typename T>
torch::Tensor indiceMaxPoolBackward(torch::Tensor features,
                                    torch::Tensor outFeatures,
                                    torch::Tensor outGrad,
                                    torch::Tensor indicePairs,
                                    torch::Tensor indiceNum) {
  auto device = features.device().type();
  auto numInPlanes = features.size(1);
@@ -70,7 +71,7 @@ torch::Tensor indiceMaxPoolBackward(torch::Tensor features,
  torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
  auto kernelVolume = indicePairs.size(0);
  for (int i = 0; i < kernelVolume; ++i) {
    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
    if (nHot <= 0) {
      continue;
    }
...
@@ -25,8 +25,8 @@
namespace spconv {
// torch.jit's doc says only support int64, so we need to convert to int32.
template <unsigned NDim>
std::vector<torch::Tensor> getIndicePair(
    torch::Tensor indices, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
@@ -67,8 +67,8 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
  tv::SimpleVector<int, NDim> stride32;
  tv::SimpleVector<int, NDim> padding32;
  tv::SimpleVector<int, NDim> dilation32;
  auto indicePairUnique = torch::full(
      {indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
      torch::dtype(torch::kInt32).device(indices.device()));
  for (int i = 0; i < NDim; ++i) {
    outSpatialShape32.push_back(outSpatialShape[i]);
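The *32 copies exist because, as the comment above notes, torch.jit only exposes int64 arguments while the functors want 32-bit values, so each spatial vector is down-converted element by element. A minimal sketch of that conversion loop, using std::vector as a hypothetical stand-in for tv::SimpleVector:

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const unsigned NDim = 3;
  std::vector<int64_t> outSpatialShape = {41, 1600, 1408};  // example shape only
  std::vector<int> outSpatialShape32;  // stand-in for tv::SimpleVector<int, NDim>
  for (unsigned i = 0; i < NDim; ++i) {
    // Same push_back pattern as the loop above, narrowing int64 -> int32.
    outSpatialShape32.push_back(static_cast<int>(outSpatialShape[i]));
  }
  printf("%d %d %d\n", outSpatialShape32[0], outSpatialShape32[1],
         outSpatialShape32[2]);
  return 0;
}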
@@ -88,16 +88,18 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
      auto getIndicePairFtor =
          functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
          tv::CPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
          dilation32, outSpatialShape32, transpose);
    } else {
      auto getIndicePairFtor =
          functor::CreateSubMIndicePairFunctor<tv::GPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
          tv::TorchGPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
          dilation32, outSpatialShape32, transpose);
    }
    return {indices, indicePairs, indiceNum};
  } else {
@@ -105,19 +107,21 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
        torch::zeros({numAct * kernelVolume, coorDim + 1},
                     torch::dtype(torch::kInt32).device(indices.device()));
    if (indices.device().type() == torch::kCPU) {
      auto getIndicePairFtor =
          functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
          tv::CPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
          kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
          transpose);
    } else {
      auto getIndicePairFtorP1 =
          functor::CreateConvIndicePairFunctorP1<tv::GPU, int, int, NDim>();
      auto getIndicePairFtorP2 =
          functor::CreateConvIndicePairFunctorP2<tv::GPU, int, int, NDim>();
      numActOut = getIndicePairFtorP1(
          tv::TorchGPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
          tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
@@ -137,8 +141,8 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
}

template <unsigned NDim>
std::vector<torch::Tensor> getIndicePairPreGrid(
    torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
@@ -177,8 +181,8 @@ getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batch
  tv::SimpleVector<int, NDim> stride32;
  tv::SimpleVector<int, NDim> padding32;
  tv::SimpleVector<int, NDim> dilation32;
  auto indicePairUnique = torch::full(
      {indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
      torch::dtype(torch::kInt32).device(indices.device()));
  for (int i = 0; i < NDim; ++i) {
    outSpatialShape32.push_back(outSpatialShape[i]);
@@ -198,17 +202,19 @@ getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batch
      auto getIndicePairFtor =
          functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
          tv::CPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
          dilation32, outSpatialShape32, transpose);
      gridOut.fill_(-1);
    } else {
      auto getIndicePairFtor =
          functor::CreateSubMIndicePairFunctor<tv::GPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
          tv::TorchGPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
          dilation32, outSpatialShape32, transpose, true);
    }
    return {indices, indicePairs, indiceNum};
  } else {
@@ -216,20 +222,22 @@ getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batch
        torch::zeros({numAct * kernelVolume, coorDim + 1},
                     torch::dtype(torch::kInt32).device(indices.device()));
    if (indices.device().type() == torch::kCPU) {
      auto getIndicePairFtor =
          functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
          tv::CPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
          kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
          transpose, true);
      gridOut.fill_(-1);
    } else {
      auto getIndicePairFtorP1 =
          functor::CreateConvIndicePairFunctorP1<tv::GPU, int, int, NDim>();
      auto getIndicePairFtorP2 =
          functor::CreateConvIndicePairFunctorP2<tv::GPU, int, int, NDim>();
      numActOut = getIndicePairFtorP1(
          tv::TorchGPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
          tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
@@ -241,15 +249,14 @@ getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batch
          tv::TorchGPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
          tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose,
          true);
      }
    }
    return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
  }
}
template <typename T> template <typename T>
torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters, torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs, torch::Tensor indiceNum, torch::Tensor indicePairs, torch::Tensor indiceNum,
...@@ -262,13 +269,16 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters, ...@@ -262,13 +269,16 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
auto numInPlanes = features.size(1); auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1); auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU}); auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairMaxSizeIter = std::max_element( auto indicePairMaxSizeIter =
indicePairNumCpu.data<int>(), indicePairNumCpu.data<int>() + kernelVolume); std::max_element(indicePairNumCpu.data_ptr<int>(),
int indicePairMaxOffset = indicePairMaxSizeIter - indicePairNumCpu.data<int>(); indicePairNumCpu.data_ptr<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
int indicePairMaxSize = *indicePairMaxSizeIter; int indicePairMaxSize = *indicePairMaxSizeIter;
/*if (_subM){ /*if (_subM){
std::vector<int> indicePairNumVec(indicePairNumCpu.data<int>(), indicePairNumCpu.data<int>() + kernelVolume); std::vector<int> indicePairNumVec(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset); indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
auto indicePairVecMaxSizeIter = std::max_element( auto indicePairVecMaxSizeIter = std::max_element(
...@@ -282,7 +292,8 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters, ...@@ -282,7 +292,8 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
// torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device()); // torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options); torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
torch::Tensor inputBuffer = torch::zeros({indicePairMaxSize, numInPlanes}, options); torch::Tensor inputBuffer =
torch::zeros({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer = torch::Tensor outputBuffer =
torch::zeros({indicePairMaxSize, numOutPlanes}, options); torch::zeros({indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes}); filters = filters.view({-1, numInPlanes, numOutPlanes});
...@@ -294,33 +305,34 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters, ...@@ -294,33 +305,34 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
double totalGEMMTime = 0; double totalGEMMTime = 0;
double totalSAddTime = 0; double totalSAddTime = 0;
for (int i = 0; i < kernelVolume; ++i) { for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data<int>()[i]; auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) { if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue; continue;
} }
// auto timer = spconv::CudaContextTimer<>(); // auto timer = spconv::CudaContextTimer<>();
auto outputBufferBlob = auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr<T>(),
torch::from_blob(outputBuffer.data<T>(), {nHot, numOutPlanes}, options); {nHot, numOutPlanes}, options);
auto inputBufferBlob = auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr<T>(),
torch::from_blob(inputBuffer.data<T>(), {nHot, numInPlanes}, options); {nHot, numInPlanes}, options);
if (device == torch::kCPU) { if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor; functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer), gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features), tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot); tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
} else { } else {
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor; functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer), gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features), tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot); tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
/* slower than SparseGatherFunctor, may be due to int->long conversion /* slower than SparseGatherFunctor, may be due to int->long conversion
auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64); auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
auto indicePairBlob = torch::from_blob(indicePairLong.data<long>(), {nHot}, auto indicePairBlob = torch::from_blob(indicePairLong.data_ptr<long>(),
indicePairOptions); {nHot}, indicePairOptions); torch::index_select_out(inputBufferBlob,
torch::index_select_out(inputBufferBlob, features, 0, features, 0, indicePairBlob);*/
indicePairBlob);*/
} }
// totalGatherTime += timer.report() / 1000.0; // totalGatherTime += timer.report() / 1000.0;
torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]); torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
...@@ -330,14 +342,14 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters, ...@@ -330,14 +342,14 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor; functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
scatterFtor(tv::CPU(), tv::torch2tv<T>(output), scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBuffer), tv::torch2tv<const T>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot, tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
true); nHot, true);
} else { } else {
functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor; functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output), scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBuffer), tv::torch2tv<const T>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot, tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
true); nHot, true);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
// totalSAddTime += timer.report() / 1000.0; // totalSAddTime += timer.report() / 1000.0;
...@@ -349,9 +361,11 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters, ...@@ -349,9 +361,11 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
} }
template <typename T> template <typename T>
std::vector<torch::Tensor> std::vector<torch::Tensor> indiceConvBackward(torch::Tensor features,
indiceConvBackward(torch::Tensor features, torch::Tensor filters, torch::Tensor filters,
torch::Tensor outGrad, torch::Tensor indicePairs, torch::Tensor indiceNum, torch::Tensor outGrad,
torch::Tensor indicePairs,
torch::Tensor indiceNum,
int64_t _inverse, int64_t _subM) { int64_t _inverse, int64_t _subM) {
bool subM = _subM != 0; bool subM = _subM != 0;
bool inverse = _inverse != 0; bool inverse = _inverse != 0;
...@@ -362,16 +376,19 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters, ...@@ -362,16 +376,19 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
auto numInPlanes = features.size(1); auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1); auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU}); auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairMaxSizeIter = std::max_element( auto indicePairMaxSizeIter =
indicePairNumCpu.data<int>(), indicePairNumCpu.data<int>() + kernelVolume); std::max_element(indicePairNumCpu.data_ptr<int>(),
int indicePairMaxOffset = indicePairMaxSizeIter - indicePairNumCpu.data<int>(); indicePairNumCpu.data_ptr<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
int indicePairMaxSize = *indicePairMaxSizeIter; int indicePairMaxSize = *indicePairMaxSizeIter;
auto options = auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device()); torch::TensorOptions().dtype(features.dtype()).device(features.device());
auto filterShape = filters.sizes(); auto filterShape = filters.sizes();
torch::Tensor inputGrad = torch::zeros(features.sizes(), options); torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
torch::Tensor filtersGrad = torch::zeros(filterShape, options); torch::Tensor filtersGrad = torch::zeros(filterShape, options);
torch::Tensor inputBuffer = torch::zeros({indicePairMaxSize, numInPlanes}, options); torch::Tensor inputBuffer =
torch::zeros({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer = torch::Tensor outputBuffer =
torch::zeros({indicePairMaxSize, numOutPlanes}, options); torch::zeros({indicePairMaxSize, numOutPlanes}, options);
...@@ -383,7 +400,7 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters, ...@@ -383,7 +400,7 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t()); torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());
} }
for (int i = 0; i < kernelVolume; ++i) { for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data<int>()[i]; auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) { if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue; continue;
} }
...@@ -392,27 +409,31 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters, ...@@ -392,27 +409,31 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtorOut; functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtorOut;
gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer), gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features), tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot); tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
gatherFtorOut(tv::CPU(), tv::torch2tv<T>(outputBuffer), gatherFtorOut(tv::CPU(), tv::torch2tv<T>(outputBuffer),
tv::torch2tv<const T>(outGrad), tv::torch2tv<const T>(outGrad),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot); tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot);
} else { } else {
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor; functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtorOut; functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtorOut;
gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer), gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features), tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot); tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
gatherFtorOut(tv::TorchGPU(), tv::torch2tv<T>(outputBuffer), gatherFtorOut(tv::TorchGPU(), tv::torch2tv<T>(outputBuffer),
tv::torch2tv<const T>(outGrad), tv::torch2tv<const T>(outGrad),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot); tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
auto filterGradSub = filtersGrad[i]; auto filterGradSub = filtersGrad[i];
auto outputBufferBlob = auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr<T>(),
torch::from_blob(outputBuffer.data<T>(), {nHot, numOutPlanes}, options); {nHot, numOutPlanes}, options);
auto inputBufferBlob = auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr<T>(),
torch::from_blob(inputBuffer.data<T>(), {nHot, numInPlanes}, options); {nHot, numInPlanes}, options);
torch::mm_out(filterGradSub, inputBufferBlob.t(), outputBufferBlob); torch::mm_out(filterGradSub, inputBufferBlob.t(), outputBufferBlob);
torch::mm_out(inputBufferBlob, outputBufferBlob, filters[i].t()); torch::mm_out(inputBufferBlob, outputBufferBlob, filters[i].t());
...@@ -420,12 +441,14 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters, ...@@ -420,12 +441,14 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor; functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
scatterFtor(tv::CPU(), tv::torch2tv<T>(inputGrad), scatterFtor(tv::CPU(), tv::torch2tv<T>(inputGrad),
tv::torch2tv<const T>(inputBuffer), tv::torch2tv<const T>(inputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot); tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
} else { } else {
functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor; functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(inputGrad), scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(inputGrad),
tv::torch2tv<const T>(inputBuffer), tv::torch2tv<const T>(inputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot); tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
} }
...@@ -433,9 +456,12 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters, ...@@ -433,9 +456,12 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
} }
template <typename T> template <typename T>
torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor filters, torch::Tensor indiceConvDevelopDontUse(torch::Tensor features,
torch::Tensor indicePairs, torch::Tensor indiceNum, torch::Tensor filters,
int64_t numActOut, int64_t _inverse, int64_t _subM) { torch::Tensor indicePairs,
torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse,
int64_t _subM) {
bool subM = _subM != 0; bool subM = _subM != 0;
bool inverse = _inverse != 0; bool inverse = _inverse != 0;
...@@ -446,15 +472,19 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil ...@@ -446,15 +472,19 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil
auto numOutPlanes = filters.size(ndim + 1); auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU}); auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto totalActsTen = indicePairNumCpu.sum(); auto totalActsTen = indicePairNumCpu.sum();
auto totalActs = indicePairNumCpu.data<int>()[0]; auto totalActs = indicePairNumCpu.data_ptr<int>()[0];
auto indicePairMaxSizeIter = std::max_element( auto indicePairMaxSizeIter =
indicePairNumCpu.data<int>(), indicePairNumCpu.data<int>() + kernelVolume); std::max_element(indicePairNumCpu.data_ptr<int>(),
int indicePairMaxOffset = indicePairMaxSizeIter - indicePairNumCpu.data<int>(); indicePairNumCpu.data_ptr<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
int indicePairMaxSize = *indicePairMaxSizeIter; int indicePairMaxSize = *indicePairMaxSizeIter;
std::vector<int> indicePairNumVec(indicePairNumCpu.data<int>(), std::vector<int> indicePairNumVec(
indicePairNumCpu.data<int>() + kernelVolume); indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset); indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
int subRuleMaxSize = *std::max_element(indicePairNumVec.begin(), indicePairNumVec.end()); int subRuleMaxSize =
*std::max_element(indicePairNumVec.begin(), indicePairNumVec.end());
if (subM) { if (subM) {
indicePairMaxSize = subRuleMaxSize; indicePairMaxSize = subRuleMaxSize;
} }
...@@ -470,7 +500,7 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil ...@@ -470,7 +500,7 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil
torch::Tensor outputBuffer = torch::Tensor outputBuffer =
torch::zeros({kernelVolume, indicePairMaxSize, numOutPlanes}, options); torch::zeros({kernelVolume, indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes}); filters = filters.view({-1, numInPlanes, numOutPlanes});
std::cout << "create time " << timer.report()/1000.0 << std::endl; std::cout << "create time " << timer.report() / 1000.0 << std::endl;
if (subM) { // the center index of subm conv doesn't need gather and scatter if (subM) { // the center index of subm conv doesn't need gather and scatter
// add. // add.
torch::mm_out(output, features, filters[indicePairMaxOffset]); torch::mm_out(output, features, filters[indicePairMaxOffset]);
...@@ -480,43 +510,44 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil ...@@ -480,43 +510,44 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil
double totalSAddTime = 0; double totalSAddTime = 0;
// auto timer = spconv::CudaContextTimer<>(); // auto timer = spconv::CudaContextTimer<>();
for (int i = 0; i < kernelVolume; ++i) { for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data<int>()[i]; auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) { if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue; continue;
} }
// //
auto outputBufferBlob = torch::from_blob(outputBuffer[i].data<T>(), auto outputBufferBlob = torch::from_blob(outputBuffer[i].data_ptr<T>(),
{nHot, numOutPlanes}, options); {nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(inputBuffer[i].data<T>(), auto inputBufferBlob = torch::from_blob(inputBuffer[i].data_ptr<T>(),
{nHot, numInPlanes}, options); {nHot, numInPlanes}, options);
if (device == torch::kCPU) { if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor; functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBufferBlob), gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBufferBlob),
tv::torch2tv<const T>(features), tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot); tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
} else { } else {
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor; functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBufferBlob), gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBufferBlob),
tv::torch2tv<const T>(features), tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot); tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
// } // }
// for (int i = 0; i < kernelVolume; ++i) { // for (int i = 0; i < kernelVolume; ++i) {
// totalGatherTime += timer.report() / 1000.0; // totalGatherTime += timer.report() / 1000.0;
// auto outputBufferBlob = torch::from_blob(outputBuffer[i].data<T>(), // auto outputBufferBlob = torch::from_blob(outputBuffer[i].data_ptr<T>(),
// {nHot, numOutPlanes}, options); // {nHot, numOutPlanes}, options);
} }
// totalGatherTime += timer.report() / 1000.0; // totalGatherTime += timer.report() / 1000.0;
for (int i = 0; i < kernelVolume; ++i) { for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data<int>()[i]; auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) { if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue; continue;
} }
auto outputBufferBlob = torch::from_blob(outputBuffer[i].data<T>(), auto outputBufferBlob = torch::from_blob(outputBuffer[i].data_ptr<T>(),
{nHot, numOutPlanes}, options); {nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(inputBuffer[i].data<T>(), auto inputBufferBlob = torch::from_blob(inputBuffer[i].data_ptr<T>(),
{nHot, numInPlanes}, options); {nHot, numInPlanes}, options);
torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]); torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
...@@ -524,27 +555,27 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil ...@@ -524,27 +555,27 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil
// totalGEMMTime += timer.report() / 1000.0; // totalGEMMTime += timer.report() / 1000.0;
// totalGEMMTime += timer.report() / 1000.0; // totalGEMMTime += timer.report() / 1000.0;
for (int i = 0; i < kernelVolume; ++i) { for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data<int>()[i]; auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) { if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue; continue;
} }
auto outputBufferBlob = torch::from_blob(outputBuffer[i].data<T>(), auto outputBufferBlob = torch::from_blob(outputBuffer[i].data_ptr<T>(),
{nHot, numOutPlanes}, options); {nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(inputBuffer[i].data<T>(), auto inputBufferBlob = torch::from_blob(inputBuffer[i].data_ptr<T>(),
{nHot, numInPlanes}, options); {nHot, numInPlanes}, options);
if (device == torch::kCPU) { if (device == torch::kCPU) {
functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor; functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
scatterFtor(tv::CPU(), tv::torch2tv<T>(output), scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBufferBlob), tv::torch2tv<const T>(outputBufferBlob),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot, tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
true); nHot, true);
} else { } else {
functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor; functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output), scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBufferBlob), tv::torch2tv<const T>(outputBufferBlob),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot, tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
true); nHot, true);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
// totalSAddTime += timer.report() / 1000.0; // totalSAddTime += timer.report() / 1000.0;
......
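For reference, the indiceConv functions in the hunks above all follow a gather-GEMM-scatter pattern: for every kernel offset, the input features selected by the indice pairs are gathered into a dense buffer, multiplied with that offset's filter slice, and scatter-added into the output features. A minimal libtorch sketch of the same idea, using plain index_select/index_add_ instead of the SparseGather/SparseScatterAdd functors (the [kernelVolume, 2, maxPairs] layout of indicePairs and the pre-viewed filter shape are assumptions for the sketch, not guaranteed by this file):

#include <torch/torch.h>

// Gather-GEMM-scatter sketch of sparse convolution.
// features:    [numActIn, numInPlanes]
// filters:     [kernelVolume, numInPlanes, numOutPlanes] (already viewed)
// indicePairs: [kernelVolume, 2, maxPairs] (row 0: input idx, row 1: output idx)
// indiceNum:   [kernelVolume] number of valid pairs per kernel offset
torch::Tensor indice_conv_sketch(const torch::Tensor& features,
                                 const torch::Tensor& filters,
                                 const torch::Tensor& indicePairs,
                                 const torch::Tensor& indiceNum,
                                 int64_t numActOut) {
  auto numOutPlanes = filters.size(2);
  auto output = torch::zeros({numActOut, numOutPlanes}, features.options());
  auto pairNumCpu = indiceNum.to(torch::kCPU);
  for (int64_t i = 0; i < filters.size(0); ++i) {
    int64_t nHot = pairNumCpu[i].item<int64_t>();
    if (nHot <= 0) continue;
    auto inIdx = indicePairs[i][0].slice(0, 0, nHot).to(torch::kLong);
    auto outIdx = indicePairs[i][1].slice(0, 0, nHot).to(torch::kLong);
    auto gathered = features.index_select(0, inIdx);  // gather
    auto partial = torch::mm(gathered, filters[i]);   // GEMM
    output.index_add_(0, outIdx, partial);            // scatter-add
  }
  return output;
}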
...@@ -13,20 +13,21 @@ ...@@ -13,20 +13,21 @@
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include <tensorview/tensorview.h>
#include <torch/script.h>
#include <ATen/ATen.h> #include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAContext.h>
#include <tensorview/tensorview.h>
#include <torch/script.h>
namespace tv { namespace tv {
struct TorchGPU: public tv::GPU { struct TorchGPU : public tv::GPU {
virtual cudaStream_t getStream() const override { virtual cudaStream_t getStream() const override {
return at::cuda::getCurrentCUDAStream(); return at::cuda::getCurrentCUDAStream();
} }
}; };
template <typename T> void check_torch_dtype(const torch::Tensor &tensor) { template <typename T>
void check_torch_dtype(const torch::Tensor &tensor) {
switch (tensor.type().scalarType()) { switch (tensor.type().scalarType()) {
case at::ScalarType::Double: { case at::ScalarType::Double: {
auto val = std::is_same<std::remove_const_t<T>, double>::value; auto val = std::is_same<std::remove_const_t<T>, double>::value;
...@@ -65,6 +66,6 @@ tv::TensorView<T> torch2tv(const torch::Tensor &tensor) { ...@@ -65,6 +66,6 @@ tv::TensorView<T> torch2tv(const torch::Tensor &tensor) {
for (auto i : tensor.sizes()) { for (auto i : tensor.sizes()) {
shape.push_back(i); shape.push_back(i);
} }
return tv::TensorView<T>(tensor.data<std::remove_const_t<T>>(), shape); return tv::TensorView<T>(tensor.data_ptr<std::remove_const_t<T>>(), shape);
} }
} // namespace tv } // namespace tv
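The torch2tv helper above now reads the raw pointer through tensor.data_ptr<T>() instead of the deprecated tensor.data<T>(), which is the main source of the compiler warnings this commit suppresses. A small standalone illustration of the replacement (tensor contents and shape are made up for the example):

#include <torch/torch.h>
#include <iostream>

int main() {
  torch::Tensor t = torch::arange(6, torch::dtype(torch::kFloat32)).view({2, 3});
  // Deprecated: t.data<float>() emits a warning in recent PyTorch.
  // Preferred: data_ptr<T>() returns the same typed raw pointer.
  const float* p = t.data_ptr<float>();
  for (auto s : t.sizes()) std::cout << s << " ";    // shape: 2 3
  std::cout << "\nfirst element: " << p[0] << "\n";  // 0
  return 0;
}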
#include <torch/extension.h>
#include <ATen/TensorUtils.h> #include <ATen/TensorUtils.h>
#include <torch/extension.h>
// #include "voxelization.h" // #include "voxelization.h"
namespace { namespace {
template <typename T_int> template <typename T_int>
void determin_max_points_kernel(torch::TensorAccessor<T_int,2> coor, void determin_max_points_kernel(
torch::TensorAccessor<T_int,1> point_to_voxelidx, torch::TensorAccessor<T_int, 2> coor,
torch::TensorAccessor<T_int,1> num_points_per_voxel, torch::TensorAccessor<T_int, 1> point_to_voxelidx,
torch::TensorAccessor<T_int,3> coor_to_voxelidx, torch::TensorAccessor<T_int, 1> num_points_per_voxel,
int& voxel_num, torch::TensorAccessor<T_int, 3> coor_to_voxelidx, int& voxel_num,
int& max_points, int& max_points, const int num_points) {
const int num_points
) {
int voxelidx, num; int voxelidx, num;
for (int i = 0; i < num_points; ++i) { for (int i = 0; i < num_points; ++i) {
if (coor[i][0] == -1) if (coor[i][0] == -1) continue;
continue;
voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]]; voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];
// record voxel // record voxel
...@@ -35,25 +29,21 @@ void determin_max_points_kernel(torch::TensorAccessor<T_int,2> coor, ...@@ -35,25 +29,21 @@ void determin_max_points_kernel(torch::TensorAccessor<T_int,2> coor,
num_points_per_voxel[voxelidx] += 1; num_points_per_voxel[voxelidx] += 1;
// update max points per voxel // update max points per voxel
max_points = std::max(max_points, num+1); max_points = std::max(max_points, num + 1);
} }
return; return;
} }
template <typename T, typename T_int> template <typename T, typename T_int>
void scatter_point_to_voxel_kernel( void scatter_point_to_voxel_kernel(
const torch::TensorAccessor<T,2> points, const torch::TensorAccessor<T, 2> points,
torch::TensorAccessor<T_int,2> coor, torch::TensorAccessor<T_int, 2> coor,
torch::TensorAccessor<T_int,1> point_to_voxelidx, torch::TensorAccessor<T_int, 1> point_to_voxelidx,
torch::TensorAccessor<T_int,3> coor_to_voxelidx, torch::TensorAccessor<T_int, 3> coor_to_voxelidx,
torch::TensorAccessor<T,3> voxels, torch::TensorAccessor<T, 3> voxels,
torch::TensorAccessor<T_int,2> voxel_coors, torch::TensorAccessor<T_int, 2> voxel_coors, const int num_features,
const int num_features, const int num_points, const int NDim) {
const int num_points,
const int NDim
){
for (int i = 0; i < num_points; ++i) { for (int i = 0; i < num_points; ++i) {
int num = point_to_voxelidx[i]; int num = point_to_voxelidx[i];
int voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]]; int voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];
...@@ -68,14 +58,11 @@ void scatter_point_to_voxel_kernel( ...@@ -68,14 +58,11 @@ void scatter_point_to_voxel_kernel(
} // namespace } // namespace
namespace voxelization { namespace voxelization {
std::vector<at::Tensor> dynamic_point_to_voxel_cpu( std::vector<at::Tensor> dynamic_point_to_voxel_cpu(
const at::Tensor& points, const at::Tensor& points, const at::Tensor& voxel_mapping,
const at::Tensor& voxel_mapping, const std::vector<float> voxel_size, const std::vector<float> coors_range) {
const std::vector<float> voxel_size,
const std::vector<float> coors_range) {
// current version takes about 0.02s~0.03s for one frame on cpu // current version takes about 0.02s~0.03s for one frame on cpu
// check device // check device
AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor"); AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");
...@@ -86,46 +73,50 @@ std::vector<at::Tensor> dynamic_point_to_voxel_cpu( ...@@ -86,46 +73,50 @@ std::vector<at::Tensor> dynamic_point_to_voxel_cpu(
std::vector<int> grid_size(NDim); std::vector<int> grid_size(NDim);
for (int i = 0; i < NDim; ++i) { for (int i = 0; i < NDim; ++i) {
grid_size[i] = round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]); grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
} }
at::Tensor num_points_per_voxel = at::zeros({num_points,}, voxel_mapping.options()); at::Tensor num_points_per_voxel = at::zeros(
at::Tensor coor_to_voxelidx = -at::ones({grid_size[2], grid_size[1], grid_size[0]}, voxel_mapping.options()); {
at::Tensor point_to_voxelidx = -at::ones({num_points,}, voxel_mapping.options()); num_points,
},
voxel_mapping.options());
at::Tensor coor_to_voxelidx = -at::ones(
{grid_size[2], grid_size[1], grid_size[0]}, voxel_mapping.options());
at::Tensor point_to_voxelidx = -at::ones(
{
num_points,
},
voxel_mapping.options());
int voxel_num = 0; int voxel_num = 0;
int max_points = 0; int max_points = 0;
AT_DISPATCH_ALL_TYPES(voxel_mapping.type(), "determin_max_point", [&] { AT_DISPATCH_ALL_TYPES(voxel_mapping.scalar_type(), "determin_max_point", [&] {
determin_max_points_kernel<scalar_t>( determin_max_points_kernel<scalar_t>(
voxel_mapping.accessor<scalar_t,2>(), voxel_mapping.accessor<scalar_t, 2>(),
point_to_voxelidx.accessor<scalar_t,1>(), point_to_voxelidx.accessor<scalar_t, 1>(),
num_points_per_voxel.accessor<scalar_t,1>(), num_points_per_voxel.accessor<scalar_t, 1>(),
coor_to_voxelidx.accessor<scalar_t,3>(), coor_to_voxelidx.accessor<scalar_t, 3>(), voxel_num, max_points,
voxel_num, num_points);
max_points,
num_points
);
}); });
at::Tensor voxels = at::zeros({voxel_num, max_points, num_features}, points.options()); at::Tensor voxels =
at::Tensor voxel_coors = at::zeros({voxel_num, NDim}, points.options().dtype(at::kInt)); at::zeros({voxel_num, max_points, num_features}, points.options());
at::Tensor voxel_coors =
at::zeros({voxel_num, NDim}, points.options().dtype(at::kInt));
AT_DISPATCH_ALL_TYPES(points.type(), "scatter_point_to_voxel", [&] { AT_DISPATCH_ALL_TYPES(points.scalar_type(), "scatter_point_to_voxel", [&] {
scatter_point_to_voxel_kernel<scalar_t, int>( scatter_point_to_voxel_kernel<scalar_t, int>(
points.accessor<scalar_t,2>(), points.accessor<scalar_t, 2>(), voxel_mapping.accessor<int, 2>(),
voxel_mapping.accessor<int,2>(), point_to_voxelidx.accessor<int, 1>(),
point_to_voxelidx.accessor<int,1>(), coor_to_voxelidx.accessor<int, 3>(), voxels.accessor<scalar_t, 3>(),
coor_to_voxelidx.accessor<int,3>(), voxel_coors.accessor<int, 2>(), num_features, num_points, NDim);
voxels.accessor<scalar_t,3>(),
voxel_coors.accessor<int,2>(),
num_features,
num_points,
NDim
);
}); });
at::Tensor num_points_per_voxel_out = num_points_per_voxel.slice(/*dim=*/0, /*start=*/0, /*end=*/voxel_num); at::Tensor num_points_per_voxel_out =
num_points_per_voxel.slice(/*dim=*/0, /*start=*/0, /*end=*/voxel_num);
return {voxels, voxel_coors, num_points_per_voxel_out}; return {voxels, voxel_coors, num_points_per_voxel_out};
} }
} } // namespace voxelization
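In dynamic_point_to_voxel_cpu above, the grid size per axis is round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]). For a typical KITTI-style setup (the numbers below are only illustrative, not values taken from this file), a point-cloud range of [0, -40, -3, 70.4, 40, 1] with a voxel size of [0.05, 0.05, 0.1] yields a 1408 x 1600 x 40 grid:

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const int NDim = 3;
  // Illustrative values: [x_min, y_min, z_min, x_max, y_max, z_max].
  std::vector<float> coors_range = {0.f, -40.f, -3.f, 70.4f, 40.f, 1.f};
  std::vector<float> voxel_size = {0.05f, 0.05f, 0.1f};
  std::vector<int> grid_size(NDim);
  for (int i = 0; i < NDim; ++i) {
    grid_size[i] = static_cast<int>(
        std::round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]));
  }
  std::printf("grid: %d x %d x %d\n", grid_size[0], grid_size[1], grid_size[2]);
  // Prints: grid: 1408 x 1600 x 40
  return 0;
}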
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
#include <ATen/cuda/CUDAApplyUtils.cuh> #include <ATen/cuda/CUDAApplyUtils.cuh>
#define CHECK_CUDA(x) \ #define CHECK_CUDA(x) \
TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \ #define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \ #define CHECK_INPUT(x) \
...@@ -177,7 +177,7 @@ std::vector<at::Tensor> dynamic_point_to_voxel_forward_gpu( ...@@ -177,7 +177,7 @@ std::vector<at::Tensor> dynamic_point_to_voxel_forward_gpu(
dim3 threads(threadsPerBlock); dim3 threads(threadsPerBlock);
cudaStream_t map_stream = at::cuda::getCurrentCUDAStream(); cudaStream_t map_stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_ALL_TYPES( AT_DISPATCH_ALL_TYPES(
voxel_mapping.type(), "determin_duplicate", ([&] { voxel_mapping.scalar_type(), "determin_duplicate", ([&] {
point_to_voxelidx_kernel<int><<<blocks, threads, 0, map_stream>>>( point_to_voxelidx_kernel<int><<<blocks, threads, 0, map_stream>>>(
voxel_mapping.data_ptr<int>(), point_to_voxelidx.data_ptr<int>(), voxel_mapping.data_ptr<int>(), point_to_voxelidx.data_ptr<int>(),
point_to_pointidx.data_ptr<int>(), num_points, NDim); point_to_pointidx.data_ptr<int>(), num_points, NDim);
...@@ -203,7 +203,7 @@ std::vector<at::Tensor> dynamic_point_to_voxel_forward_gpu( ...@@ -203,7 +203,7 @@ std::vector<at::Tensor> dynamic_point_to_voxel_forward_gpu(
voxel_mapping.options()); // must be zero from the beginning voxel_mapping.options()); // must be zero from the beginning
cudaStream_t logic_stream = at::cuda::getCurrentCUDAStream(); cudaStream_t logic_stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_ALL_TYPES( AT_DISPATCH_ALL_TYPES(
voxel_mapping.type(), "determin_duplicate", ([&] { voxel_mapping.scalar_type(), "determin_duplicate", ([&] {
determin_voxel_num<int><<<1, 1, 0, logic_stream>>>( determin_voxel_num<int><<<1, 1, 0, logic_stream>>>(
voxel_mapping.data_ptr<int>(), num_points_per_voxel.data_ptr<int>(), voxel_mapping.data_ptr<int>(), num_points_per_voxel.data_ptr<int>(),
point_to_voxelidx.data_ptr<int>(), point_to_voxelidx.data_ptr<int>(),
...@@ -228,7 +228,7 @@ std::vector<at::Tensor> dynamic_point_to_voxel_forward_gpu( ...@@ -228,7 +228,7 @@ std::vector<at::Tensor> dynamic_point_to_voxel_forward_gpu(
dim3 cp_threads(threadsPerBlock, 4); dim3 cp_threads(threadsPerBlock, 4);
cudaStream_t cp_stream = at::cuda::getCurrentCUDAStream(); cudaStream_t cp_stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_ALL_TYPES( AT_DISPATCH_ALL_TYPES(
points.type(), "scatter_point_to_voxel", ([&] { points.scalar_type(), "scatter_point_to_voxel", ([&] {
scatter_point_to_voxel_kernel<float, int> scatter_point_to_voxel_kernel<float, int>
<<<blocks, cp_threads, 0, cp_stream>>>( <<<blocks, cp_threads, 0, cp_stream>>>(
points.data_ptr<float>(), voxel_mapping.data_ptr<int>(), points.data_ptr<float>(), voxel_mapping.data_ptr<int>(),
...@@ -265,8 +265,8 @@ void dynamic_point_to_voxel_backward_gpu(at::Tensor& grad_input_points, ...@@ -265,8 +265,8 @@ void dynamic_point_to_voxel_backward_gpu(at::Tensor& grad_input_points,
dim3 blocks(col_blocks); dim3 blocks(col_blocks);
dim3 cp_threads(threadsPerBlock, 4); dim3 cp_threads(threadsPerBlock, 4);
cudaStream_t cp_stream = at::cuda::getCurrentCUDAStream(); cudaStream_t cp_stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_ALL_TYPES(grad_input_points.type(), "scatter_point_to_voxel", AT_DISPATCH_ALL_TYPES(grad_input_points.scalar_type(),
([&] { "scatter_point_to_voxel", ([&] {
map_voxel_to_point_kernel<float, int> map_voxel_to_point_kernel<float, int>
<<<blocks, cp_threads, 0, cp_stream>>>( <<<blocks, cp_threads, 0, cp_stream>>>(
grad_input_points.data_ptr<float>(), grad_input_points.data_ptr<float>(),
......
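The CUDA hunks above also replace tensor.type() with tensor.scalar_type() inside the AT_DISPATCH_* macros, another deprecated-API warning this commit removes. A minimal, self-contained sketch of the dispatch pattern (the reduction below is a trivial stand-in, not one of the voxelization kernels):

#include <ATen/ATen.h>
#include <ATen/Dispatch.h>

// Sum all elements of a CPU tensor, dispatching on its runtime dtype.
double sum_all(const at::Tensor& t) {
  auto c = t.contiguous();
  double total = 0.0;
  AT_DISPATCH_ALL_TYPES(c.scalar_type(), "sum_all", [&] {
    // scalar_t is defined by the macro for the dispatched dtype.
    const scalar_t* data = c.data_ptr<scalar_t>();
    for (int64_t i = 0; i < c.numel(); ++i) {
      total += static_cast<double>(data[i]);
    }
  });
  return total;
}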
...@@ -49,7 +49,7 @@ inline int hard_voxelize(const at::Tensor& points, at::Tensor& voxels, ...@@ -49,7 +49,7 @@ inline int hard_voxelize(const at::Tensor& points, at::Tensor& voxels,
const std::vector<float> coors_range, const std::vector<float> coors_range,
const int max_points, const int max_voxels, const int max_points, const int max_voxels,
const int NDim = 3) { const int NDim = 3) {
if (points.type().is_cuda()) { if (points.device().is_cuda()) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
return hard_voxelize_gpu(points, voxels, coors, num_points_per_voxel, return hard_voxelize_gpu(points, voxels, coors, num_points_per_voxel,
voxel_size, coors_range, max_points, max_voxels, voxel_size, coors_range, max_points, max_voxels,
...@@ -67,7 +67,7 @@ inline void dynamic_voxelize(const at::Tensor& points, at::Tensor& coors, ...@@ -67,7 +67,7 @@ inline void dynamic_voxelize(const at::Tensor& points, at::Tensor& coors,
const std::vector<float> voxel_size, const std::vector<float> voxel_size,
const std::vector<float> coors_range, const std::vector<float> coors_range,
const int NDim = 3) { const int NDim = 3) {
if (points.type().is_cuda()) { if (points.device().is_cuda()) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
return dynamic_voxelize_gpu(points, coors, voxel_size, coors_range, NDim); return dynamic_voxelize_gpu(points, coors, voxel_size, coors_range, NDim);
#else #else
...@@ -80,7 +80,7 @@ inline void dynamic_voxelize(const at::Tensor& points, at::Tensor& coors, ...@@ -80,7 +80,7 @@ inline void dynamic_voxelize(const at::Tensor& points, at::Tensor& coors,
inline std::vector<torch::Tensor> dynamic_point_to_voxel_forward( inline std::vector<torch::Tensor> dynamic_point_to_voxel_forward(
const at::Tensor& points, const at::Tensor& voxel_mapping, const at::Tensor& points, const at::Tensor& voxel_mapping,
const std::vector<float> voxel_size, const std::vector<float> coors_range) { const std::vector<float> voxel_size, const std::vector<float> coors_range) {
if (points.type().is_cuda()) { if (points.device().is_cuda()) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
return dynamic_point_to_voxel_forward_gpu(points, voxel_mapping, voxel_size, return dynamic_point_to_voxel_forward_gpu(points, voxel_mapping, voxel_size,
coors_range); coors_range);
...@@ -95,7 +95,7 @@ inline std::vector<torch::Tensor> dynamic_point_to_voxel_forward( ...@@ -95,7 +95,7 @@ inline std::vector<torch::Tensor> dynamic_point_to_voxel_forward(
inline void dynamic_point_to_voxel_backward( inline void dynamic_point_to_voxel_backward(
at::Tensor& grad_input_points, const at::Tensor& grad_output_voxels, at::Tensor& grad_input_points, const at::Tensor& grad_output_voxels,
const at::Tensor& point_to_voxelidx, const at::Tensor& coor_to_voxelidx) { const at::Tensor& point_to_voxelidx, const at::Tensor& coor_to_voxelidx) {
if (grad_input_points.type().is_cuda()) { if (grad_input_points.device().is_cuda()) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
return dynamic_point_to_voxel_backward_gpu( return dynamic_point_to_voxel_backward_gpu(
grad_input_points, grad_output_voxels, point_to_voxelidx, grad_input_points, grad_output_voxels, point_to_voxelidx,
......
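The dispatch header above routes each call to the CPU or CUDA implementation based on the input tensor's device, using device().is_cuda() in place of the deprecated type().is_cuda(). Reduced to a hypothetical my_op with stand-in backends, the pattern looks like this:

#include <torch/extension.h>

// Stand-in backends; in the real header these are the *_cpu / *_gpu kernels.
at::Tensor my_op_cpu(const at::Tensor& x) { return x * 2; }
#ifdef WITH_CUDA
at::Tensor my_op_gpu(const at::Tensor& x) { return x * 2; }
#endif

inline at::Tensor my_op(const at::Tensor& x) {
  if (x.device().is_cuda()) {
#ifdef WITH_CUDA
    return my_op_gpu(x);
#else
    AT_ERROR("Not compiled with GPU support");
#endif
  }
  return my_op_cpu(x);
}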
#include <torch/extension.h>
#include <ATen/TensorUtils.h> #include <ATen/TensorUtils.h>
#include <torch/extension.h>
// #include "voxelization.h" // #include "voxelization.h"
namespace { namespace {
template <typename T, typename T_int> template <typename T, typename T_int>
void dynamic_voxelize_kernel(const torch::TensorAccessor<T,2> points, void dynamic_voxelize_kernel(const torch::TensorAccessor<T, 2> points,
torch::TensorAccessor<T_int, 2> coors, torch::TensorAccessor<T_int, 2> coors,
const std::vector<float> voxel_size, const std::vector<float> voxel_size,
const std::vector<float> coors_range, const std::vector<float> coors_range,
const std::vector<int> grid_size, const std::vector<int> grid_size,
const int num_points, const int num_points, const int num_features,
const int num_features, const int NDim) {
const int NDim
) {
const int ndim_minus_1 = NDim - 1; const int ndim_minus_1 = NDim - 1;
bool failed = false; bool failed = false;
int coor[NDim]; int coor[NDim];
...@@ -44,56 +40,42 @@ void dynamic_voxelize_kernel(const torch::TensorAccessor<T,2> points, ...@@ -44,56 +40,42 @@ void dynamic_voxelize_kernel(const torch::TensorAccessor<T,2> points,
return; return;
} }
template <typename T, typename T_int> template <typename T, typename T_int>
void hard_voxelize_kernel(const torch::TensorAccessor<T,2> points, void hard_voxelize_kernel(const torch::TensorAccessor<T, 2> points,
torch::TensorAccessor<T,3> voxels, torch::TensorAccessor<T, 3> voxels,
torch::TensorAccessor<T_int,2> coors, torch::TensorAccessor<T_int, 2> coors,
torch::TensorAccessor<T_int,1> num_points_per_voxel, torch::TensorAccessor<T_int, 1> num_points_per_voxel,
torch::TensorAccessor<T_int,3> coor_to_voxelidx, torch::TensorAccessor<T_int, 3> coor_to_voxelidx,
int& voxel_num, int& voxel_num, const std::vector<float> voxel_size,
const std::vector<float> voxel_size,
const std::vector<float> coors_range, const std::vector<float> coors_range,
const std::vector<int> grid_size, const std::vector<int> grid_size,
const int max_points, const int max_points, const int max_voxels,
const int max_voxels, const int num_points, const int num_features,
const int num_points, const int NDim) {
const int num_features,
const int NDim
) {
// declare a temp coors // declare a temp coors
at::Tensor temp_coors = at::zeros({num_points, NDim}, at::TensorOptions().dtype(at::kInt).device(at::kCPU)); at::Tensor temp_coors = at::zeros(
{num_points, NDim}, at::TensorOptions().dtype(at::kInt).device(at::kCPU));
// First use dynamic voxelization to get coors, // First use dynamic voxelization to get coors,
// then check max points/voxels constraints // then check max points/voxels constraints
dynamic_voxelize_kernel<T, int>( dynamic_voxelize_kernel<T, int>(points, temp_coors.accessor<int, 2>(),
points, voxel_size, coors_range, grid_size,
temp_coors.accessor<int,2>(), num_points, num_features, NDim);
voxel_size,
coors_range,
grid_size,
num_points,
num_features,
NDim
);
int voxelidx, num; int voxelidx, num;
auto coor = temp_coors.accessor<int,2>(); auto coor = temp_coors.accessor<int, 2>();
for (int i = 0; i < num_points; ++i) { for (int i = 0; i < num_points; ++i) {
// T_int* coor = temp_coors.data_ptr<int>() + i * NDim; // T_int* coor = temp_coors.data_ptr<int>() + i * NDim;
if (coor[i][0] == -1) if (coor[i][0] == -1) continue;
continue;
voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]]; voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];
// record voxel // record voxel
if (voxelidx == -1) { if (voxelidx == -1) {
voxelidx = voxel_num; voxelidx = voxel_num;
if (max_voxels != -1 && voxel_num >= max_voxels) if (max_voxels != -1 && voxel_num >= max_voxels) break;
break;
voxel_num += 1; voxel_num += 1;
coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]] = voxelidx; coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]] = voxelidx;
...@@ -118,19 +100,14 @@ void hard_voxelize_kernel(const torch::TensorAccessor<T,2> points, ...@@ -118,19 +100,14 @@ void hard_voxelize_kernel(const torch::TensorAccessor<T,2> points,
} // namespace } // namespace
namespace voxelization { namespace voxelization {
int hard_voxelize_cpu( int hard_voxelize_cpu(const at::Tensor& points, at::Tensor& voxels,
const at::Tensor& points, at::Tensor& coors, at::Tensor& num_points_per_voxel,
at::Tensor& voxels,
at::Tensor& coors,
at::Tensor& num_points_per_voxel,
const std::vector<float> voxel_size, const std::vector<float> voxel_size,
const std::vector<float> coors_range, const std::vector<float> coors_range,
const int max_points, const int max_points, const int max_voxels,
const int max_voxels, const int NDim = 3) {
const int NDim=3) {
// current version takes about 0.02s~0.03s for one frame on cpu // current version takes about 0.02s~0.03s for one frame on cpu
// check device // check device
AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor"); AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");
...@@ -140,43 +117,34 @@ int hard_voxelize_cpu( ...@@ -140,43 +117,34 @@ int hard_voxelize_cpu(
const int num_features = points.size(1); const int num_features = points.size(1);
for (int i = 0; i < NDim; ++i) { for (int i = 0; i < NDim; ++i) {
grid_size[i] = round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]); grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
} }
// coors, num_points_per_voxel, coor_to_voxelidx are int Tensor // coors, num_points_per_voxel, coor_to_voxelidx are int Tensor
//printf("cpu coor_to_voxelidx size: [%d, %d, %d]\n", grid_size[2], grid_size[1], grid_size[0]); // printf("cpu coor_to_voxelidx size: [%d, %d, %d]\n", grid_size[2],
at::Tensor coor_to_voxelidx = -at::ones({grid_size[2], grid_size[1], grid_size[0]}, coors.options()); // grid_size[1], grid_size[0]);
at::Tensor coor_to_voxelidx =
-at::ones({grid_size[2], grid_size[1], grid_size[0]}, coors.options());
int voxel_num = 0; int voxel_num = 0;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(points.type(), "hard_voxelize_forward", [&] { AT_DISPATCH_FLOATING_TYPES_AND_HALF(
points.scalar_type(), "hard_voxelize_forward", [&] {
hard_voxelize_kernel<scalar_t, int>( hard_voxelize_kernel<scalar_t, int>(
points.accessor<scalar_t,2>(), points.accessor<scalar_t, 2>(), voxels.accessor<scalar_t, 3>(),
voxels.accessor<scalar_t,3>(), coors.accessor<int, 2>(), num_points_per_voxel.accessor<int, 1>(),
coors.accessor<int,2>(), coor_to_voxelidx.accessor<int, 3>(), voxel_num, voxel_size,
num_points_per_voxel.accessor<int,1>(), coors_range, grid_size, max_points, max_voxels, num_points,
coor_to_voxelidx.accessor<int,3>(), num_features, NDim);
voxel_num,
voxel_size,
coors_range,
grid_size,
max_points,
max_voxels,
num_points,
num_features,
NDim
);
}); });
return voxel_num; return voxel_num;
} }
void dynamic_voxelize_cpu(const at::Tensor& points, at::Tensor& coors,
void dynamic_voxelize_cpu(
const at::Tensor& points,
at::Tensor& coors,
const std::vector<float> voxel_size, const std::vector<float> voxel_size,
const std::vector<float> coors_range, const std::vector<float> coors_range,
const int NDim=3) { const int NDim = 3) {
// check device // check device
AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor"); AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");
...@@ -185,24 +153,19 @@ void dynamic_voxelize_cpu( ...@@ -185,24 +153,19 @@ void dynamic_voxelize_cpu(
const int num_features = points.size(1); const int num_features = points.size(1);
for (int i = 0; i < NDim; ++i) { for (int i = 0; i < NDim; ++i) {
grid_size[i] = round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]); grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
} }
// coors, num_points_per_voxel, coor_to_voxelidx are int Tensor // coors, num_points_per_voxel, coor_to_voxelidx are int Tensor
AT_DISPATCH_FLOATING_TYPES_AND_HALF(points.type(), "hard_voxelize_forward", [&] { AT_DISPATCH_FLOATING_TYPES_AND_HALF(
points.scalar_type(), "hard_voxelize_forward", [&] {
dynamic_voxelize_kernel<scalar_t, int>( dynamic_voxelize_kernel<scalar_t, int>(
points.accessor<scalar_t,2>(), points.accessor<scalar_t, 2>(), coors.accessor<int, 2>(),
coors.accessor<int,2>(), voxel_size, coors_range, grid_size, num_points, num_features, NDim);
voxel_size,
coors_range,
grid_size,
num_points,
num_features,
NDim
);
}); });
return; return;
} }
} } // namespace voxelization
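The CPU voxelization kernels above index tensors through torch::TensorAccessor, which gives typed multi-dimensional indexing over a CPU tensor without manual stride arithmetic. A small standalone example (tensor contents are arbitrary):

#include <torch/torch.h>
#include <iostream>

int main() {
  // 4 points with 3 features (x, y, z) each.
  torch::Tensor points = torch::rand({4, 3});
  auto acc = points.accessor<float, 2>();  // CPU accessor: acc[i][j]
  float max_x = acc[0][0];
  for (int64_t i = 1; i < acc.size(0); ++i) {
    if (acc[i][0] > max_x) max_x = acc[i][0];
  }
  std::cout << "max x: " << max_x << "\n";
  return 0;
}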
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
#include <ATen/cuda/CUDAApplyUtils.cuh> #include <ATen/cuda/CUDAApplyUtils.cuh>
#define CHECK_CUDA(x) \ #define CHECK_CUDA(x) \
TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \ #define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \ #define CHECK_INPUT(x) \
...@@ -219,7 +219,7 @@ int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels, ...@@ -219,7 +219,7 @@ int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels,
// 1. link point to corresponding voxel coors // 1. link point to corresponding voxel coors
AT_DISPATCH_ALL_TYPES( AT_DISPATCH_ALL_TYPES(
points.type(), "hard_voxelize_kernel", ([&] { points.scalar_type(), "hard_voxelize_kernel", ([&] {
dynamic_voxelize_kernel<scalar_t, int> dynamic_voxelize_kernel<scalar_t, int>
<<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>( <<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(
points.contiguous().data_ptr<scalar_t>(), points.contiguous().data_ptr<scalar_t>(),
...@@ -247,7 +247,7 @@ int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels, ...@@ -247,7 +247,7 @@ int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels,
dim3 map_grid(std::min(at::cuda::ATenCeilDiv(num_points, 512), 4096)); dim3 map_grid(std::min(at::cuda::ATenCeilDiv(num_points, 512), 4096));
dim3 map_block(512); dim3 map_block(512);
AT_DISPATCH_ALL_TYPES( AT_DISPATCH_ALL_TYPES(
temp_coors.type(), "determin_duplicate", ([&] { temp_coors.scalar_type(), "determin_duplicate", ([&] {
point_to_voxelidx_kernel<int> point_to_voxelidx_kernel<int>
<<<map_grid, map_block, 0, at::cuda::getCurrentCUDAStream()>>>( <<<map_grid, map_block, 0, at::cuda::getCurrentCUDAStream()>>>(
temp_coors.contiguous().data_ptr<int>(), temp_coors.contiguous().data_ptr<int>(),
...@@ -272,7 +272,7 @@ int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels, ...@@ -272,7 +272,7 @@ int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels,
points.options().dtype(at::kInt)); // must be zero from the beginning points.options().dtype(at::kInt)); // must be zero from the beginning
AT_DISPATCH_ALL_TYPES( AT_DISPATCH_ALL_TYPES(
temp_coors.type(), "determin_duplicate", ([&] { temp_coors.scalar_type(), "determin_duplicate", ([&] {
determin_voxel_num<int><<<1, 1, 0, at::cuda::getCurrentCUDAStream()>>>( determin_voxel_num<int><<<1, 1, 0, at::cuda::getCurrentCUDAStream()>>>(
num_points_per_voxel.contiguous().data_ptr<int>(), num_points_per_voxel.contiguous().data_ptr<int>(),
point_to_voxelidx.contiguous().data_ptr<int>(), point_to_voxelidx.contiguous().data_ptr<int>(),
...@@ -290,7 +290,7 @@ int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels, ...@@ -290,7 +290,7 @@ int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels,
dim3 cp_grid(std::min(at::cuda::ATenCeilDiv(pts_output_size, 512), 4096)); dim3 cp_grid(std::min(at::cuda::ATenCeilDiv(pts_output_size, 512), 4096));
dim3 cp_block(512); dim3 cp_block(512);
AT_DISPATCH_ALL_TYPES( AT_DISPATCH_ALL_TYPES(
points.type(), "assign_point_to_voxel", ([&] { points.scalar_type(), "assign_point_to_voxel", ([&] {
assign_point_to_voxel<float, int> assign_point_to_voxel<float, int>
<<<cp_grid, cp_block, 0, at::cuda::getCurrentCUDAStream()>>>( <<<cp_grid, cp_block, 0, at::cuda::getCurrentCUDAStream()>>>(
pts_output_size, points.contiguous().data_ptr<float>(), pts_output_size, points.contiguous().data_ptr<float>(),
...@@ -308,7 +308,7 @@ int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels, ...@@ -308,7 +308,7 @@ int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels,
std::min(at::cuda::ATenCeilDiv(coors_output_size, 512), 4096)); std::min(at::cuda::ATenCeilDiv(coors_output_size, 512), 4096));
dim3 coors_cp_block(512); dim3 coors_cp_block(512);
AT_DISPATCH_ALL_TYPES( AT_DISPATCH_ALL_TYPES(
points.type(), "assign_point_to_voxel", ([&] { points.scalar_type(), "assign_point_to_voxel", ([&] {
assign_voxel_coors<float, int><<<coors_cp_grid, coors_cp_block, 0, assign_voxel_coors<float, int><<<coors_cp_grid, coors_cp_block, 0,
at::cuda::getCurrentCUDAStream()>>>( at::cuda::getCurrentCUDAStream()>>>(
coors_output_size, temp_coors.contiguous().data_ptr<int>(), coors_output_size, temp_coors.contiguous().data_ptr<int>(),
......
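The CUDA hunks above size their launches with at::cuda::ATenCeilDiv and enqueue kernels on the current ATen stream. A stripped-down sketch of that launch pattern (the kernel body is a placeholder, not one of the voxelization kernels):

#include <algorithm>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>

// Placeholder kernel: element-wise copy of int data.
__global__ void copy_kernel(const int* src, int* dst, int n) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) dst[idx] = src[idx];
}

void launch_copy(const at::Tensor& src, at::Tensor& dst) {
  TORCH_CHECK(src.is_contiguous() && dst.is_contiguous());
  const int n = src.numel();
  dim3 block(512);
  dim3 grid(std::min(at::cuda::ATenCeilDiv(n, 512), 4096));
  copy_kernel<<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(
      src.data_ptr<int>(), dst.data_ptr<int>(), n);
}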