Commit 4eca6606 authored by zhangwenwei

Merge branch 'refactor-anchor_coder' into 'master'

Refactor anchor generator and box coder

See merge request open-mmlab/mmdet.3d!3
parents 7a872356 dda784e5
// Modified from
// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
// Points in boxes gpu
// Written by Shaoshuai Shi
// All Rights Reserved 2019.
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>
#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
#define CHECK_CUDA(x) \
TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
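// A worked example of the DIVUP launch-grid arithmetic (illustrative numbers):
//   DIVUP(1000, 256) = 1000 / 256 + (1000 % 256 > 0) = 3 + 1 = 4 blocks
//   4 blocks * 256 threads = 1024 threads >= 1000 points
// DIVUP rounds up so every point gets a thread; threads whose pt_idx ends up
// >= pts_num simply return inside the kernel.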
// #define DEBUG
__device__ inline void lidar_to_local_coords(float shift_x, float shift_y,
float rz, float &local_x,
float &local_y) {
// rotate by pi/2 + rz to transform LiDAR coordinates into the box-local frame
float rot_angle = rz + M_PI / 2;
float cosa = cos(rot_angle), sina = sin(rot_angle);
local_x = shift_x * cosa + shift_y * (-sina);
local_y = shift_x * sina + shift_y * cosa;
}
__device__ inline int check_pt_in_box3d(const float *pt, const float *box3d,
float &local_x, float &local_y) {
// param pt: (x, y, z)
// param box3d: (cx, cy, cz, w, l, h, rz) in LiDAR coordinate, cz is the
// bottom center
float x = pt[0], y = pt[1], z = pt[2];
float cx = box3d[0], cy = box3d[1], cz = box3d[2];
float w = box3d[3], l = box3d[4], h = box3d[5], rz = box3d[6];
cz += h / 2.0; // shift to the center since cz in box3d is the bottom center
if (fabsf(z - cz) > h / 2.0) return 0;
lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y);
float in_flag = (local_x > -l / 2.0) & (local_x < l / 2.0) &
(local_y > -w / 2.0) & (local_y < w / 2.0);
return in_flag;
}
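// A minimal host-side sketch of the same point-in-box test, useful as a CPU
// reference when checking the device code above; the helper name
// check_pt_in_box3d_cpu is hypothetical and not part of this extension.
inline int check_pt_in_box3d_cpu(const float *pt, const float *box3d) {
  float cz = box3d[2] + box3d[5] / 2.0f;   // bottom center -> geometric center
  if (fabsf(pt[2] - cz) > box3d[5] / 2.0f) return 0;
  float rot_angle = box3d[6] + M_PI / 2;   // same pi/2 + rz rotation as above
  float cosa = cosf(rot_angle), sina = sinf(rot_angle);
  float shift_x = pt[0] - box3d[0], shift_y = pt[1] - box3d[1];
  float local_x = shift_x * cosa - shift_y * sina;
  float local_y = shift_x * sina + shift_y * cosa;
  return (local_x > -box3d[4] / 2.0f) && (local_x < box3d[4] / 2.0f) &&
         (local_y > -box3d[3] / 2.0f) && (local_y < box3d[3] / 2.0f);
}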
__global__ void points_in_boxes_kernel(int batch_size, int boxes_num,
int pts_num, const float *boxes,
const float *pts,
int *box_idx_of_points) {
// params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is
// the bottom center; the boxes do not overlap
// params pts: (B, npoints, 3) [x, y, z] in LiDAR coordinate
// params boxes_idx_of_points: (B, npoints), default -1
int bs_idx = blockIdx.y;
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (bs_idx >= batch_size || pt_idx >= pts_num) return;
boxes += bs_idx * boxes_num * 7;
pts += bs_idx * pts_num * 3 + pt_idx * 3;
box_idx_of_points += bs_idx * pts_num + pt_idx;
float local_x = 0, local_y = 0;
int cur_in_flag = 0;
for (int k = 0; k < boxes_num; k++) {
cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y);
if (cur_in_flag) {
box_idx_of_points[0] = k;
break;
}
}
}
void points_in_boxes_launcher(int batch_size, int boxes_num, int pts_num,
const float *boxes, const float *pts,
int *box_idx_of_points) {
// params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is
// the bottom center; the boxes do not overlap
// params pts: (B, npoints, 3) [x, y, z] in LiDAR coordinate
// params boxes_idx_of_points: (B, npoints), default -1
cudaError_t err;
dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), batch_size);
dim3 threads(THREADS_PER_BLOCK);
points_in_boxes_kernel<<<blocks, threads>>>(batch_size, boxes_num, pts_num,
boxes, pts, box_idx_of_points);
err = cudaGetLastError();
if (cudaSuccess != err) {
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
exit(-1);
}
#ifdef DEBUG
cudaDeviceSynchronize(); // for using printf in kernel function
#endif
}
int points_in_boxes_gpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
at::Tensor box_idx_of_points_tensor) {
// params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is
// the bottom center; the boxes do not overlap
// params pts: (B, npoints, 3) [x, y, z] in LiDAR coordinate
// params boxes_idx_of_points: (B, npoints), default -1
CHECK_INPUT(boxes_tensor);
CHECK_INPUT(pts_tensor);
CHECK_INPUT(box_idx_of_points_tensor);
int batch_size = boxes_tensor.size(0);
int boxes_num = boxes_tensor.size(1);
int pts_num = pts_tensor.size(1);
const float *boxes = boxes_tensor.data_ptr<float>();
const float *pts = pts_tensor.data_ptr<float>();
int *box_idx_of_points = box_idx_of_points_tensor.data_ptr<int>();
points_in_boxes_launcher(batch_size, boxes_num, pts_num, boxes, pts,
box_idx_of_points);
return 1;
}
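// A minimal usage sketch for the wrapper above (shapes and values are
// illustrative; the example function itself is hypothetical and not exported):
void points_in_boxes_example() {
  auto fopts = at::device(at::kCUDA).dtype(at::kFloat);
  at::Tensor boxes = at::rand({2, 10, 7}, fopts);   // (B, N, 7) LiDAR boxes
  at::Tensor pts = at::rand({2, 5000, 3}, fopts);   // (B, npoints, 3) points
  at::Tensor box_idx =
      at::full({2, 5000}, -1, at::device(at::kCUDA).dtype(at::kInt));
  points_in_boxes_gpu(boxes, pts, box_idx);  // per-point box index, or -1
}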
// Modified from
// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
// RoI-aware point cloud feature pooling
// Written by Shaoshuai Shi
// All Rights Reserved 2019.
#include <assert.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>
#define CHECK_CUDA(x) \
TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
void roiaware_pool3d_launcher(int boxes_num, int pts_num, int channels,
int max_pts_each_voxel, int out_x, int out_y,
int out_z, const float *rois, const float *pts,
const float *pts_feature, int *argmax,
int *pts_idx_of_voxels, float *pooled_features,
int pool_method);
void roiaware_pool3d_backward_launcher(int boxes_num, int out_x, int out_y,
int out_z, int channels,
int max_pts_each_voxel,
const int *pts_idx_of_voxels,
const int *argmax, const float *grad_out,
float *grad_in, int pool_method);
int roiaware_pool3d_gpu(at::Tensor rois, at::Tensor pts, at::Tensor pts_feature,
at::Tensor argmax, at::Tensor pts_idx_of_voxels,
at::Tensor pooled_features, int pool_method);
int roiaware_pool3d_gpu_backward(at::Tensor pts_idx_of_voxels,
at::Tensor argmax, at::Tensor grad_out,
at::Tensor grad_in, int pool_method);
int points_in_boxes_cpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
at::Tensor pts_indices_tensor);
int points_in_boxes_gpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
at::Tensor box_idx_of_points_tensor);
int roiaware_pool3d_gpu(at::Tensor rois, at::Tensor pts, at::Tensor pts_feature,
at::Tensor argmax, at::Tensor pts_idx_of_voxels,
at::Tensor pooled_features, int pool_method) {
// params rois: (N, 7) [x, y, z, w, l, h, ry] in LiDAR coordinate
// params pts: (npoints, 3) [x, y, z] in LiDAR coordinate
// params pts_feature: (npoints, C)
// params argmax: (N, out_x, out_y, out_z, C)
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
// params pooled_features: (N, out_x, out_y, out_z, C)
// params pool_method: 0: max_pool 1: avg_pool
CHECK_INPUT(rois);
CHECK_INPUT(pts);
CHECK_INPUT(pts_feature);
CHECK_INPUT(argmax);
CHECK_INPUT(pts_idx_of_voxels);
CHECK_INPUT(pooled_features);
int boxes_num = rois.size(0);
int pts_num = pts.size(0);
int channels = pts_feature.size(1);
int max_pts_each_voxel = pts_idx_of_voxels.size(4); // index 0 is the counter
int out_x = pts_idx_of_voxels.size(1);
int out_y = pts_idx_of_voxels.size(2);
int out_z = pts_idx_of_voxels.size(3);
assert((out_x < 256) && (out_y < 256) &&
(out_z < 256)); // we encode index with 8bit
const float *rois_data = rois.data_ptr<float>();
const float *pts_data = pts.data_ptr<float>();
const float *pts_feature_data = pts_feature.data_ptr<float>();
int *argmax_data = argmax.data_ptr<int>();
int *pts_idx_of_voxels_data = pts_idx_of_voxels.data_ptr<int>();
float *pooled_features_data = pooled_features.data_ptr<float>();
roiaware_pool3d_launcher(
boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z,
rois_data, pts_data, pts_feature_data, argmax_data,
pts_idx_of_voxels_data, pooled_features_data, pool_method);
return 1;
}
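// A minimal shape sketch for the forward wrapper above (all sizes are
// illustrative; the example function itself is hypothetical, and buffer
// initialization in the real caller may differ):
void roiaware_pool3d_example() {
  int N = 4, npoints = 2048, C = 16, out = 12, max_pts_each_voxel = 128;
  auto fopts = at::device(at::kCUDA).dtype(at::kFloat);
  auto iopts = at::device(at::kCUDA).dtype(at::kInt);
  at::Tensor rois = at::rand({N, 7}, fopts);               // (N, 7) LiDAR boxes
  at::Tensor pts = at::rand({npoints, 3}, fopts);
  at::Tensor pts_feature = at::rand({npoints, C}, fopts);
  at::Tensor argmax = at::zeros({N, out, out, out, C}, iopts);
  at::Tensor pts_idx = at::zeros({N, out, out, out, max_pts_each_voxel}, iopts);
  at::Tensor pooled = at::zeros({N, out, out, out, C}, fopts);
  // pool_method: 0 = max pool, 1 = avg pool
  roiaware_pool3d_gpu(rois, pts, pts_feature, argmax, pts_idx, pooled, 0);
}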
int roiaware_pool3d_gpu_backward(at::Tensor pts_idx_of_voxels,
at::Tensor argmax, at::Tensor grad_out,
at::Tensor grad_in, int pool_method) {
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
// params argmax: (N, out_x, out_y, out_z, C)
// params grad_out: (N, out_x, out_y, out_z, C)
// params grad_in: (npoints, C), return value
// params pool_method: 0: max_pool 1: avg_pool
CHECK_INPUT(pts_idx_of_voxels);
CHECK_INPUT(argmax);
CHECK_INPUT(grad_out);
CHECK_INPUT(grad_in);
int boxes_num = pts_idx_of_voxels.size(0);
int out_x = pts_idx_of_voxels.size(1);
int out_y = pts_idx_of_voxels.size(2);
int out_z = pts_idx_of_voxels.size(3);
int max_pts_each_voxel = pts_idx_of_voxels.size(4); // index 0 is the counter
int channels = grad_out.size(4);
const int *pts_idx_of_voxels_data = pts_idx_of_voxels.data_ptr<int>();
const int *argmax_data = argmax.data_ptr<int>();
const float *grad_out_data = grad_out.data_ptr<float>();
float *grad_in_data = grad_in.data_ptr<float>();
roiaware_pool3d_backward_launcher(boxes_num, out_x, out_y, out_z, channels,
max_pts_each_voxel, pts_idx_of_voxels_data,
argmax_data, grad_out_data, grad_in_data,
pool_method);
return 1;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &roiaware_pool3d_gpu, "roiaware pool3d forward (CUDA)");
m.def("backward", &roiaware_pool3d_gpu_backward,
"roiaware pool3d backward (CUDA)");
m.def("points_in_boxes_gpu", &points_in_boxes_gpu,
"points_in_boxes_gpu forward (CUDA)");
m.def("points_in_boxes_cpu", &points_in_boxes_cpu,
"points_in_boxes_cpu forward (CPU)");
}
@@ -26,9 +26,10 @@ namespace spconv {
// torch.jit's doc says it only supports int64, so we need to convert to int32.
template <typename T>
torch::Tensor fusedIndiceConvBatchNorm(
torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM) {
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
@@ -37,13 +38,16 @@ torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor fil
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairMaxSizeIter =
std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
int indicePairMaxSize = *indicePairMaxSizeIter;
/*if (_subM){
std::vector<int> indicePairNumVec(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
auto indicePairVecMaxSizeIter = std::max_element(
@@ -56,46 +60,49 @@ torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor fil
// auto indicePairOptions =
// torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
torch::Tensor output =
torch::zeros({numActOut, numOutPlanes}, options).copy_(bias);
torch::Tensor inputBuffer =
torch::zeros({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
if (subM) { // the center index of subm conv don't need gather and scatter
// add.
torch::mm_out(output, features, filters[indicePairMaxOffset]);
}
double totalGatherTime = 0;
double totalGEMMTime = 0;
double totalSAddTime = 0;
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
// auto timer = spconv::CudaContextTimer<>();
auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr<T>(),
{nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr<T>(),
{nHot, numInPlanes}, options);
if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
} else {
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
/* slower than SparseGatherFunctor, may due to int->long conversion
auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
auto indicePairBlob = torch::from_blob(indicePairLong.data_ptr<long>(),
{nHot}, indicePairOptions); torch::index_select_out(inputBufferBlob,
features, 0, indicePairBlob);*/
}
// totalGatherTime += timer.report() / 1000.0;
torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
@@ -105,14 +112,14 @@ torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor fil
functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot, true);
} else {
functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot, true);
TV_CHECK_CUDA_ERR();
}
// totalSAddTime += timer.report() / 1000.0;
@@ -122,6 +129,6 @@ torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor fil
// std::cout << "scatteradd time " << totalSAddTime << std::endl;
return output;
}
} // namespace spconv
#endif
@@ -24,7 +24,7 @@
namespace spconv {
template <typename T>
torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t numAct) {
auto device = features.device().type();
auto kernelVolume = indicePairs.size(0);
auto numInPlanes = features.size(1);
@@ -34,7 +34,7 @@ torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs,
torch::Tensor output = torch::zeros({numAct, numInPlanes}, options);
double totalTime = 0;
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0) {
continue;
}
@@ -59,18 +59,19 @@ torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs,
template <typename T>
torch::Tensor indiceMaxPoolBackward(torch::Tensor features,
torch::Tensor outFeatures,
torch::Tensor outGrad,
torch::Tensor indicePairs,
torch::Tensor indiceNum) {
auto device = features.device().type();
auto numInPlanes = features.size(1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
auto kernelVolume = indicePairs.size(0);
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0) {
continue;
}
@@ -92,6 +93,6 @@ torch::Tensor indiceMaxPoolBackward(torch::Tensor features,
return inputGrad;
}
} // namespace spconv
#endif
@@ -25,17 +25,17 @@
namespace spconv {
// torch.jit's doc says it only supports int64, so we need to convert to int32.
template <unsigned NDim>
std::vector<torch::Tensor> getIndicePair(
torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
// auto timer = spconv::CudaContextTimer<>();
bool subM = _subM != 0;
bool transpose = _transpose != 0;
auto numAct = indices.size(0);
auto coorDim = indices.size(1) - 1; // batchIdx + xyz
TV_ASSERT_RT_ERR(NDim == coorDim, "error");
TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, "error");
TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, "error");
@@ -54,7 +54,7 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
}
torch::Tensor indicePairs =
torch::full({kernelVolume, 2, numAct}, -1,
torch::dtype(torch::kInt32).device(indices.device()));
torch::Tensor indiceNum = torch::zeros(
{kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
torch::Tensor gridOut =
@@ -67,9 +67,9 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
tv::SimpleVector<int, NDim> stride32;
tv::SimpleVector<int, NDim> padding32;
tv::SimpleVector<int, NDim> dilation32;
auto indicePairUnique = torch::full(
{indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
torch::dtype(torch::kInt32).device(indices.device()));
for (int i = 0; i < NDim; ++i) {
outSpatialShape32.push_back(outSpatialShape[i]);
kernelSize32.push_back(kernelSize[i]);
@@ -88,40 +88,44 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
dilation32, outSpatialShape32, transpose);
} else {
auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::GPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
dilation32, outSpatialShape32, transpose);
}
return {indices, indicePairs, indiceNum};
} else {
torch::Tensor outInds =
torch::zeros({numAct * kernelVolume, coorDim + 1},
torch::dtype(torch::kInt32).device(indices.device()));
if (indices.device().type() == torch::kCPU) {
auto getIndicePairFtor =
functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
transpose);
} else {
auto getIndicePairFtorP1 =
functor::CreateConvIndicePairFunctorP1<tv::GPU, int, int, NDim>();
auto getIndicePairFtorP2 =
functor::CreateConvIndicePairFunctorP2<tv::GPU, int, int, NDim>();
numActOut = getIndicePairFtorP1(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
padding32, dilation32, outSpatialShape32, transpose);
if (numActOut > 0) {
auto res = torch::_unique(indicePairUnique);
indicePairUnique = std::get<0>(res);
@@ -137,17 +141,17 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
}
template <unsigned NDim>
std::vector<torch::Tensor> getIndicePairPreGrid(
torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
// auto timer = spconv::CudaContextTimer<>();
bool subM = _subM != 0;
bool transpose = _transpose != 0;
auto numAct = indices.size(0);
auto coorDim = indices.size(1) - 1; // batchIdx + xyz
TV_ASSERT_RT_ERR(NDim == coorDim, "error");
TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, "error");
TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, "error");
@@ -167,7 +171,7 @@ getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batch
TV_ASSERT_INVALID_ARG(gridOut.numel() >= outputVolume * batchSize, "error");
torch::Tensor indicePairs =
torch::full({kernelVolume, 2, numAct}, -1,
torch::dtype(torch::kInt32).device(indices.device()));
torch::Tensor indiceNum = torch::zeros(
{kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
// std::cout << "full time " << timer.report() / 1000.0 << std::endl;
@@ -177,9 +181,9 @@ getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batch
tv::SimpleVector<int, NDim> stride32;
tv::SimpleVector<int, NDim> padding32;
tv::SimpleVector<int, NDim> dilation32;
auto indicePairUnique = torch::full(
{indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
torch::dtype(torch::kInt32).device(indices.device()));
for (int i = 0; i < NDim; ++i) {
outSpatialShape32.push_back(outSpatialShape[i]);
kernelSize32.push_back(kernelSize[i]);
@@ -198,42 +202,46 @@ getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batch
auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
dilation32, outSpatialShape32, transpose);
gridOut.fill_(-1);
} else {
auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::GPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
dilation32, outSpatialShape32, transpose, true);
}
return {indices, indicePairs, indiceNum};
} else {
torch::Tensor outInds =
torch::zeros({numAct * kernelVolume, coorDim + 1},
torch::dtype(torch::kInt32).device(indices.device()));
if (indices.device().type() == torch::kCPU) {
auto getIndicePairFtor =
functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
transpose, true);
gridOut.fill_(-1);
} else {
auto getIndicePairFtorP1 =
functor::CreateConvIndicePairFunctorP1<tv::GPU, int, int, NDim>();
auto getIndicePairFtorP2 =
functor::CreateConvIndicePairFunctorP2<tv::GPU, int, int, NDim>();
numActOut = getIndicePairFtorP1(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
padding32, dilation32, outSpatialShape32, transpose);
if (numActOut > 0) {
auto res = torch::_unique(indicePairUnique);
indicePairUnique = std::get<0>(res);
@@ -241,19 +249,18 @@ getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batch
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose,
true);
}
}
return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
}
}
template <typename T>
torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs, torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse, int64_t _subM) {
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
@@ -262,13 +269,16 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairMaxSizeIter =
std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
int indicePairMaxSize = *indicePairMaxSizeIter;
/*if (_subM){
std::vector<int> indicePairNumVec(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
auto indicePairVecMaxSizeIter = std::max_element(
@@ -282,45 +292,47 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
// torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
torch::Tensor inputBuffer =
torch::zeros({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
if (subM) { // the center index of subm conv don't need gather and scatter
// add.
torch::mm_out(output, features, filters[indicePairMaxOffset]);
}
double totalGatherTime = 0;
double totalGEMMTime = 0;
double totalSAddTime = 0;
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
// auto timer = spconv::CudaContextTimer<>();
auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr<T>(),
{nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr<T>(),
{nHot, numInPlanes}, options);
if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
} else {
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
/* slower than SparseGatherFunctor, may due to int->long conversion
auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
auto indicePairBlob = torch::from_blob(indicePairLong.data_ptr<long>(),
{nHot}, indicePairOptions); torch::index_select_out(inputBufferBlob,
features, 0, indicePairBlob);*/
}
// totalGatherTime += timer.report() / 1000.0;
torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
@@ -330,14 +342,14 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot, true);
} else {
functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot, true);
TV_CHECK_CUDA_ERR();
}
// totalSAddTime += timer.report() / 1000.0;
@@ -349,10 +361,12 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
}
template <typename T>
std::vector<torch::Tensor> indiceConvBackward(torch::Tensor features,
torch::Tensor filters,
torch::Tensor outGrad,
torch::Tensor indicePairs,
torch::Tensor indiceNum,
int64_t _inverse, int64_t _subM) {
bool subM = _subM != 0;
bool inverse = _inverse != 0;
@@ -362,16 +376,19 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairMaxSizeIter =
std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
int indicePairMaxSize = *indicePairMaxSizeIter;
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
auto filterShape = filters.sizes();
torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
torch::Tensor filtersGrad = torch::zeros(filterShape, options);
torch::Tensor inputBuffer =
torch::zeros({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({indicePairMaxSize, numOutPlanes}, options);
@@ -383,7 +400,7 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());
}
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
@@ -392,27 +409,31 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtorOut;
gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
gatherFtorOut(tv::CPU(), tv::torch2tv<T>(outputBuffer),
tv::torch2tv<const T>(outGrad),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot);
} else {
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtorOut;
gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
gatherFtorOut(tv::TorchGPU(), tv::torch2tv<T>(outputBuffer),
tv::torch2tv<const T>(outGrad),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot);
TV_CHECK_CUDA_ERR();
}
auto filterGradSub = filtersGrad[i];
auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr<T>(),
{nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr<T>(),
{nHot, numInPlanes}, options);
torch::mm_out(filterGradSub, inputBufferBlob.t(), outputBufferBlob);
torch::mm_out(inputBufferBlob, outputBufferBlob, filters[i].t());
@@ -420,12 +441,14 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
scatterFtor(tv::CPU(), tv::torch2tv<T>(inputGrad),
tv::torch2tv<const T>(inputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
} else {
functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(inputGrad),
tv::torch2tv<const T>(inputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
}
}
@@ -433,9 +456,12 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
}
template <typename T>
torch::Tensor indiceConvDevelopDontUse(torch::Tensor features,
torch::Tensor filters,
torch::Tensor indicePairs,
torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse,
int64_t _subM) {
bool subM = _subM != 0;
bool inverse = _inverse != 0;
@@ -446,15 +472,19 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto totalActsTen = indicePairNumCpu.sum();
auto totalActs = indicePairNumCpu.data_ptr<int>()[0];
auto indicePairMaxSizeIter =
std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
int indicePairMaxSize = *indicePairMaxSizeIter;
std::vector<int> indicePairNumVec(
indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
int subRuleMaxSize =
*std::max_element(indicePairNumVec.begin(), indicePairNumVec.end());
if (subM) {
indicePairMaxSize = subRuleMaxSize;
}
@@ -470,9 +500,9 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil
torch::Tensor outputBuffer =
torch::zeros({kernelVolume, indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
std::cout << "create time " << timer.report() / 1000.0 << std::endl;
if (subM) { // the center index of subm conv don't need gather and scatter
// add.
torch::mm_out(output, features, filters[indicePairMaxOffset]);
}
double totalGatherTime = 0;
@@ -480,43 +510,44 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil
double totalSAddTime = 0;
// auto timer = spconv::CudaContextTimer<>();
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
//
auto outputBufferBlob = torch::from_blob(outputBuffer[i].data_ptr<T>(),
{nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(inputBuffer[i].data_ptr<T>(),
{nHot, numInPlanes}, options);
if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBufferBlob),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
} else {
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBufferBlob),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
}
// }
// for (int i = 0; i < kernelVolume; ++i) {
// totalGatherTime += timer.report() / 1000.0;
// auto outputBufferBlob = torch::from_blob(outputBuffer[i].data_ptr<T>(),
// {nHot, numOutPlanes}, options);
}
// totalGatherTime += timer.report() / 1000.0;
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
auto outputBufferBlob = torch::from_blob(outputBuffer[i].data_ptr<T>(),
{nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(inputBuffer[i].data_ptr<T>(),
{nHot, numInPlanes}, options);
torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
@@ -524,27 +555,27 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil
// totalGEMMTime += timer.report() / 1000.0;
// totalGEMMTime += timer.report() / 1000.0;
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
auto outputBufferBlob = torch::from_blob(outputBuffer[i].data_ptr<T>(),
{nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(inputBuffer[i].data_ptr<T>(),
{nHot, numInPlanes}, options);
if (device == torch::kCPU) {
functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBufferBlob),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot, true);
} else {
functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBufferBlob),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot, true);
TV_CHECK_CUDA_ERR();
}
// totalSAddTime += timer.report() / 1000.0;
@@ -556,6 +587,6 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil
return output;
}
} // namespace spconv
#endif
@@ -13,48 +13,49 @@
// limitations under the License.
#pragma once
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <tensorview/tensorview.h>
#include <torch/script.h>
namespace tv {
struct TorchGPU : public tv::GPU {
virtual cudaStream_t getStream() const override {
return at::cuda::getCurrentCUDAStream();
}
};
template <typename T>
void check_torch_dtype(const torch::Tensor &tensor) {
switch (tensor.type().scalarType()) {
case at::ScalarType::Double: {
auto val = std::is_same<std::remove_const_t<T>, double>::value;
TV_ASSERT_RT_ERR(val, "error");
break;
}
case at::ScalarType::Float: {
auto val = std::is_same<std::remove_const_t<T>, float>::value;
TV_ASSERT_RT_ERR(val, "error");
break;
}
case at::ScalarType::Int: {
auto val = std::is_same<std::remove_const_t<T>, int>::value;
TV_ASSERT_RT_ERR(val, "error");
break;
}
case at::ScalarType::Half: {
auto val = std::is_same<std::remove_const_t<T>, at::Half>::value;
TV_ASSERT_RT_ERR(val, "error");
break;
}
case at::ScalarType::Long: {
auto val = std::is_same<std::remove_const_t<T>, long>::value;
TV_ASSERT_RT_ERR(val, "error");
break;
}
default:
TV_ASSERT_RT_ERR(false, "error");
}
}
@@ -65,6 +66,6 @@ tv::TensorView<T> torch2tv(const torch::Tensor &tensor) {
for (auto i : tensor.sizes()) {
shape.push_back(i);
}
return tv::TensorView<T>(tensor.data_ptr<std::remove_const_t<T>>(), shape);
}
} // namespace tv
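// A short sketch of how the helpers above are typically combined (the tensor
// name `features` is hypothetical):
//   tv::check_torch_dtype<float>(features);             // dtype must match T
//   tv::TensorView<float> view = tv::torch2tv<float>(features);
//   // functors then take `view` plus tv::CPU() or tv::TorchGPU(); the latter
//   // runs kernels on the current ATen CUDA stream via getStream().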
#include <ATen/TensorUtils.h>
#include <torch/extension.h>
// #include "voxelization.h"
namespace {
template <typename T_int>
void determin_max_points_kernel(
    torch::TensorAccessor<T_int, 2> coor,
    torch::TensorAccessor<T_int, 1> point_to_voxelidx,
    torch::TensorAccessor<T_int, 1> num_points_per_voxel,
    torch::TensorAccessor<T_int, 3> coor_to_voxelidx, int& voxel_num,
    int& max_points, const int num_points) {
  int voxelidx, num;
  for (int i = 0; i < num_points; ++i) {
    if (coor[i][0] == -1) continue;
    voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];
    // record voxel
    if (voxelidx == -1) {
      voxelidx = voxel_num;
      voxel_num += 1;
      coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]] = voxelidx;
    }
    // put points into voxel
    num = num_points_per_voxel[voxelidx];
    point_to_voxelidx[i] = num;
    num_points_per_voxel[voxelidx] += 1;
    // update max points per voxel
    max_points = std::max(max_points, num + 1);
  }
  return;
}
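// A small worked trace of the pass above, assuming three points whose voxel
// coordinates come out as (0,0,0), (0,0,0) and (1,0,0):
//   point 0: coor unseen -> voxelidx 0, voxel_num = 1, slot 0, max_points = 1
//   point 1: coor seen   -> voxelidx 0, slot 1, max_points = 2
//   point 2: coor unseen -> voxelidx 1, voxel_num = 2, slot 0
// The caller below allocates `voxels` as (voxel_num, max_points, num_features)
// from these two counters before scatter_point_to_voxel_kernel fills it in.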
template <typename T, typename T_int>
void scatter_point_to_voxel_kernel(
    const torch::TensorAccessor<T, 2> points,
    torch::TensorAccessor<T_int, 2> coor,
    torch::TensorAccessor<T_int, 1> point_to_voxelidx,
    torch::TensorAccessor<T_int, 3> coor_to_voxelidx,
    torch::TensorAccessor<T, 3> voxels,
    torch::TensorAccessor<T_int, 2> voxel_coors, const int num_features,
    const int num_points, const int NDim) {
  for (int i = 0; i < num_points; ++i) {
    int num = point_to_voxelidx[i];
    int voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];
    for (int k = 0; k < num_features; ++k) {
      voxels[voxelidx][num][k] = points[i][k];
    }
    for (int k = 0; k < NDim; ++k) {
      voxel_coors[voxelidx][k] = coor[i][k];
    }
  }
}
} // namespace
} // namespace
namespace voxelization {
std::vector<at::Tensor> dynamic_point_to_voxel_cpu(
const at::Tensor& points,
const at::Tensor& voxel_mapping,
const std::vector<float> voxel_size,
const std::vector<float> coors_range) {
// current version takes about 0.02s-0.03s for one frame on cpu
// check device
AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");
const int NDim = voxel_mapping.size(1);
const int num_points = points.size(0);
const int num_features = points.size(1);
std::vector<int> grid_size(NDim);
for (int i = 0; i < NDim; ++i) {
grid_size[i] = round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
at::Tensor num_points_per_voxel = at::zeros({num_points,}, voxel_mapping.options());
at::Tensor coor_to_voxelidx = -at::ones({grid_size[2], grid_size[1], grid_size[0]}, voxel_mapping.options());
at::Tensor point_to_voxelidx = -at::ones({num_points,}, voxel_mapping.options());
int voxel_num = 0;
int max_points = 0;
AT_DISPATCH_ALL_TYPES(voxel_mapping.type(), "determin_max_point", [&] {
determin_max_points_kernel<scalar_t>(
voxel_mapping.accessor<scalar_t,2>(),
point_to_voxelidx.accessor<scalar_t,1>(),
num_points_per_voxel.accessor<scalar_t,1>(),
coor_to_voxelidx.accessor<scalar_t,3>(),
voxel_num,
max_points,
num_points
);
});
at::Tensor voxels = at::zeros({voxel_num, max_points, num_features}, points.options());
at::Tensor voxel_coors = at::zeros({voxel_num, NDim}, points.options().dtype(at::kInt));
AT_DISPATCH_ALL_TYPES(points.type(), "scatter_point_to_voxel", [&] {
scatter_point_to_voxel_kernel<scalar_t, int>(
points.accessor<scalar_t,2>(),
voxel_mapping.accessor<int,2>(),
point_to_voxelidx.accessor<int,1>(),
coor_to_voxelidx.accessor<int,3>(),
voxels.accessor<scalar_t,3>(),
voxel_coors.accessor<int,2>(),
num_features,
num_points,
NDim
);
});
at::Tensor num_points_per_voxel_out = num_points_per_voxel.slice(/*dim=*/0, /*start=*/0, /*end=*/voxel_num);
return {voxels, voxel_coors, num_points_per_voxel_out};
const at::Tensor& points, const at::Tensor& voxel_mapping,
const std::vector<float> voxel_size, const std::vector<float> coors_range) {
// current version takes about 0.02s-0.03s for one frame on cpu
// check device
AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");
const int NDim = voxel_mapping.size(1);
const int num_points = points.size(0);
const int num_features = points.size(1);
std::vector<int> grid_size(NDim);
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
at::Tensor num_points_per_voxel = at::zeros(
{
num_points,
},
voxel_mapping.options());
at::Tensor coor_to_voxelidx = -at::ones(
{grid_size[2], grid_size[1], grid_size[0]}, voxel_mapping.options());
at::Tensor point_to_voxelidx = -at::ones(
{
num_points,
},
voxel_mapping.options());
int voxel_num = 0;
int max_points = 0;
AT_DISPATCH_ALL_TYPES(voxel_mapping.scalar_type(), "determin_max_point", [&] {
determin_max_points_kernel<scalar_t>(
voxel_mapping.accessor<scalar_t, 2>(),
point_to_voxelidx.accessor<scalar_t, 1>(),
num_points_per_voxel.accessor<scalar_t, 1>(),
coor_to_voxelidx.accessor<scalar_t, 3>(), voxel_num, max_points,
num_points);
});
at::Tensor voxels =
at::zeros({voxel_num, max_points, num_features}, points.options());
at::Tensor voxel_coors =
at::zeros({voxel_num, NDim}, points.options().dtype(at::kInt));
AT_DISPATCH_ALL_TYPES(points.scalar_type(), "scatter_point_to_voxel", [&] {
scatter_point_to_voxel_kernel<scalar_t, int>(
points.accessor<scalar_t, 2>(), voxel_mapping.accessor<int, 2>(),
point_to_voxelidx.accessor<int, 1>(),
coor_to_voxelidx.accessor<int, 3>(), voxels.accessor<scalar_t, 3>(),
voxel_coors.accessor<int, 2>(), num_features, num_points, NDim);
});
at::Tensor num_points_per_voxel_out =
num_points_per_voxel.slice(/*dim=*/0, /*start=*/0, /*end=*/voxel_num);
return {voxels, voxel_coors, num_points_per_voxel_out};
}
}
} // namespace voxelization
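As a quick sanity check on the grid_size arithmetic above and on the shapes returned by dynamic_point_to_voxel_cpu; the concrete voxel_size / coors_range values are only an example, not taken from this merge request.

voxel_size = [0.2, 0.2, 8.0]
coors_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]  # x_min, y_min, z_min, x_max, y_max, z_max
NDim = 3
grid_size = [round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i])
             for i in range(NDim)]
print(grid_size)  # [512, 512, 1]
# Returned tensors:
#   voxels               -> (voxel_num, max_points, num_features)
#   voxel_coors          -> (voxel_num, NDim)
#   num_points_per_voxel -> (voxel_num,)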
......@@ -6,7 +6,7 @@
#include <ATen/cuda/CUDAApplyUtils.cuh>
#define CHECK_CUDA(x) \
TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor")
TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
......@@ -177,7 +177,7 @@ std::vector<at::Tensor> dynamic_point_to_voxel_forward_gpu(
dim3 threads(threadsPerBlock);
cudaStream_t map_stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_ALL_TYPES(
voxel_mapping.type(), "determin_duplicate", ([&] {
voxel_mapping.scalar_type(), "determin_duplicate", ([&] {
point_to_voxelidx_kernel<int><<<blocks, threads, 0, map_stream>>>(
voxel_mapping.data_ptr<int>(), point_to_voxelidx.data_ptr<int>(),
point_to_pointidx.data_ptr<int>(), num_points, NDim);
......@@ -203,7 +203,7 @@ std::vector<at::Tensor> dynamic_point_to_voxel_forward_gpu(
voxel_mapping.options()); // must be zero from the beginning
cudaStream_t logic_stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_ALL_TYPES(
voxel_mapping.type(), "determin_duplicate", ([&] {
voxel_mapping.scalar_type(), "determin_duplicate", ([&] {
determin_voxel_num<int><<<1, 1, 0, logic_stream>>>(
voxel_mapping.data_ptr<int>(), num_points_per_voxel.data_ptr<int>(),
point_to_voxelidx.data_ptr<int>(),
......@@ -228,7 +228,7 @@ std::vector<at::Tensor> dynamic_point_to_voxel_forward_gpu(
dim3 cp_threads(threadsPerBlock, 4);
cudaStream_t cp_stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_ALL_TYPES(
points.type(), "scatter_point_to_voxel", ([&] {
points.scalar_type(), "scatter_point_to_voxel", ([&] {
scatter_point_to_voxel_kernel<float, int>
<<<blocks, cp_threads, 0, cp_stream>>>(
points.data_ptr<float>(), voxel_mapping.data_ptr<int>(),
......@@ -265,8 +265,8 @@ void dynamic_point_to_voxel_backward_gpu(at::Tensor& grad_input_points,
dim3 blocks(col_blocks);
dim3 cp_threads(threadsPerBlock, 4);
cudaStream_t cp_stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_ALL_TYPES(grad_input_points.type(), "scatter_point_to_voxel",
([&] {
AT_DISPATCH_ALL_TYPES(grad_input_points.scalar_type(),
"scatter_point_to_voxel", ([&] {
map_voxel_to_point_kernel<float, int>
<<<blocks, cp_threads, 0, cp_stream>>>(
grad_input_points.data_ptr<float>(),
......
......@@ -49,7 +49,7 @@ inline int hard_voxelize(const at::Tensor& points, at::Tensor& voxels,
const std::vector<float> coors_range,
const int max_points, const int max_voxels,
const int NDim = 3) {
if (points.type().is_cuda()) {
if (points.device().is_cuda()) {
#ifdef WITH_CUDA
return hard_voxelize_gpu(points, voxels, coors, num_points_per_voxel,
voxel_size, coors_range, max_points, max_voxels,
......@@ -67,7 +67,7 @@ inline void dynamic_voxelize(const at::Tensor& points, at::Tensor& coors,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int NDim = 3) {
if (points.type().is_cuda()) {
if (points.device().is_cuda()) {
#ifdef WITH_CUDA
return dynamic_voxelize_gpu(points, coors, voxel_size, coors_range, NDim);
#else
......@@ -80,7 +80,7 @@ inline void dynamic_voxelize(const at::Tensor& points, at::Tensor& coors,
inline std::vector<torch::Tensor> dynamic_point_to_voxel_forward(
const at::Tensor& points, const at::Tensor& voxel_mapping,
const std::vector<float> voxel_size, const std::vector<float> coors_range) {
if (points.type().is_cuda()) {
if (points.device().is_cuda()) {
#ifdef WITH_CUDA
return dynamic_point_to_voxel_forward_gpu(points, voxel_mapping, voxel_size,
coors_range);
......@@ -95,7 +95,7 @@ inline std::vector<torch::Tensor> dynamic_point_to_voxel_forward(
inline void dynamic_point_to_voxel_backward(
at::Tensor& grad_input_points, const at::Tensor& grad_output_voxels,
const at::Tensor& point_to_voxelidx, const at::Tensor& coor_to_voxelidx) {
if (grad_input_points.type().is_cuda()) {
if (grad_input_points.device().is_cuda()) {
#ifdef WITH_CUDA
return dynamic_point_to_voxel_backward_gpu(
grad_input_points, grad_output_voxels, point_to_voxelidx,
......
#include <torch/extension.h>
#include <ATen/TensorUtils.h>
#include <torch/extension.h>
// #include "voxelization.h"
namespace {
template <typename T, typename T_int>
void dynamic_voxelize_kernel(const torch::TensorAccessor<T,2> points,
void dynamic_voxelize_kernel(const torch::TensorAccessor<T, 2> points,
torch::TensorAccessor<T_int, 2> coors,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const std::vector<int> grid_size,
const int num_points,
const int num_features,
const int NDim
) {
const int num_points, const int num_features,
const int NDim) {
const int ndim_minus_1 = NDim - 1;
bool failed = false;
int coor[NDim];
......@@ -44,56 +40,42 @@ void dynamic_voxelize_kernel(const torch::TensorAccessor<T,2> points,
return;
}
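The body of dynamic_voxelize_kernel is collapsed in this diff; the sketch below is only a plausible per-point reading, consistent with how coor_to_voxelidx is indexed below ({grid_size[2], grid_size[1], grid_size[0]}, i.e. coordinates stored as (z, y, x)), not the verbatim kernel.

import math

def point_to_coor(p, voxel_size, coors_range, grid_size):
    """Return the integer voxel coordinate of point p, or [-1, -1, -1] if it is out of range."""
    coor = []
    for j in range(3):
        c = math.floor((p[j] - coors_range[j]) / voxel_size[j])
        if c < 0 or c >= grid_size[j]:
            return [-1, -1, -1]
        coor.append(c)
    return coor[::-1]  # reversed so that coor[0] indexes the z dimension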
template <typename T, typename T_int>
void hard_voxelize_kernel(const torch::TensorAccessor<T,2> points,
torch::TensorAccessor<T,3> voxels,
torch::TensorAccessor<T_int,2> coors,
torch::TensorAccessor<T_int,1> num_points_per_voxel,
torch::TensorAccessor<T_int,3> coor_to_voxelidx,
int& voxel_num,
const std::vector<float> voxel_size,
void hard_voxelize_kernel(const torch::TensorAccessor<T, 2> points,
torch::TensorAccessor<T, 3> voxels,
torch::TensorAccessor<T_int, 2> coors,
torch::TensorAccessor<T_int, 1> num_points_per_voxel,
torch::TensorAccessor<T_int, 3> coor_to_voxelidx,
int& voxel_num, const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const std::vector<int> grid_size,
const int max_points,
const int max_voxels,
const int num_points,
const int num_features,
const int NDim
) {
const int max_points, const int max_voxels,
const int num_points, const int num_features,
const int NDim) {
// declare a temp coors
at::Tensor temp_coors = at::zeros({num_points, NDim}, at::TensorOptions().dtype(at::kInt).device(at::kCPU));
at::Tensor temp_coors = at::zeros(
{num_points, NDim}, at::TensorOptions().dtype(at::kInt).device(at::kCPU));
// First use dynamic voxelization to get coors,
// then check max points/voxels constraints
dynamic_voxelize_kernel<T, int>(
points,
temp_coors.accessor<int,2>(),
voxel_size,
coors_range,
grid_size,
num_points,
num_features,
NDim
);
dynamic_voxelize_kernel<T, int>(points, temp_coors.accessor<int, 2>(),
voxel_size, coors_range, grid_size,
num_points, num_features, NDim);
int voxelidx, num;
auto coor = temp_coors.accessor<int,2>();
auto coor = temp_coors.accessor<int, 2>();
for (int i = 0; i < num_points; ++i) {
// T_int* coor = temp_coors.data_ptr<int>() + i * NDim;
if (coor[i][0] == -1)
continue;
if (coor[i][0] == -1) continue;
voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];
// record voxel
if (voxelidx == -1) {
voxelidx = voxel_num;
if (max_voxels != -1 && voxel_num >= max_voxels)
break;
if (max_voxels != -1 && voxel_num >= max_voxels) break;
voxel_num += 1;
coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]] = voxelidx;
......@@ -116,93 +98,74 @@ void hard_voxelize_kernel(const torch::TensorAccessor<T,2> points,
return;
}
} // namespace
} // namespace
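The part of hard_voxelize_kernel hidden by the collapsed hunk fills voxels and coors while enforcing the max_points / max_voxels caps. Roughly, and only as an illustrative Python sketch (assuming, as in standard hard voxelization, that points beyond max_points in a voxel are simply dropped):

def hard_voxelize(points, coor, max_points, max_voxels):
    """points: (N, C) features; coor: (N, 3) voxel coordinates with -1 marking dropped points."""
    coor_to_voxelidx = {}
    voxels, voxel_coors, num_points_per_voxel = [], [], []
    for pt, c in zip(points, coor):
        if c[0] == -1:
            continue
        key = tuple(c)
        if key not in coor_to_voxelidx:
            if max_voxels != -1 and len(voxels) >= max_voxels:
                break  # mirrors the `break` in the C++ loop above
            coor_to_voxelidx[key] = len(voxels)
            voxels.append([])
            voxel_coors.append(list(c))
            num_points_per_voxel.append(0)
        idx = coor_to_voxelidx[key]
        if num_points_per_voxel[idx] < max_points:
            voxels[idx].append(pt)
            num_points_per_voxel[idx] += 1
    return voxels, voxel_coors, num_points_per_voxel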
namespace voxelization {
int hard_voxelize_cpu(
const at::Tensor& points,
at::Tensor& voxels,
at::Tensor& coors,
at::Tensor& num_points_per_voxel,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int max_points,
const int max_voxels,
const int NDim=3) {
// current version takes about 0.02s-0.03s for one frame on cpu
// check device
AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");
std::vector<int> grid_size(NDim);
const int num_points = points.size(0);
const int num_features = points.size(1);
for (int i = 0; i < NDim; ++i) {
grid_size[i] = round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int hard_voxelize_cpu(const at::Tensor& points, at::Tensor& voxels,
at::Tensor& coors, at::Tensor& num_points_per_voxel,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int max_points, const int max_voxels,
const int NDim = 3) {
// current version takes about 0.02s-0.03s for one frame on cpu
// check device
AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");
std::vector<int> grid_size(NDim);
const int num_points = points.size(0);
const int num_features = points.size(1);
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
// coors, num_points_per_voxel, coor_to_voxelidx are int Tensor
//printf("cpu coor_to_voxelidx size: [%d, %d, %d]\n", grid_size[2], grid_size[1], grid_size[0]);
at::Tensor coor_to_voxelidx = -at::ones({grid_size[2], grid_size[1], grid_size[0]}, coors.options());
// coors, num_points_per_voxel, coor_to_voxelidx are int Tensor
// printf("cpu coor_to_voxelidx size: [%d, %d, %d]\n", grid_size[2],
// grid_size[1], grid_size[0]);
at::Tensor coor_to_voxelidx =
-at::ones({grid_size[2], grid_size[1], grid_size[0]}, coors.options());
int voxel_num = 0;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(points.type(), "hard_voxelize_forward", [&] {
int voxel_num = 0;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
points.scalar_type(), "hard_voxelize_forward", [&] {
hard_voxelize_kernel<scalar_t, int>(
points.accessor<scalar_t,2>(),
voxels.accessor<scalar_t,3>(),
coors.accessor<int,2>(),
num_points_per_voxel.accessor<int,1>(),
coor_to_voxelidx.accessor<int,3>(),
voxel_num,
voxel_size,
coors_range,
grid_size,
max_points,
max_voxels,
num_points,
num_features,
NDim
);
});
return voxel_num;
points.accessor<scalar_t, 2>(), voxels.accessor<scalar_t, 3>(),
coors.accessor<int, 2>(), num_points_per_voxel.accessor<int, 1>(),
coor_to_voxelidx.accessor<int, 3>(), voxel_num, voxel_size,
coors_range, grid_size, max_points, max_voxels, num_points,
num_features, NDim);
});
return voxel_num;
}
void dynamic_voxelize_cpu(const at::Tensor& points, at::Tensor& coors,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int NDim = 3) {
// check device
AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");
void dynamic_voxelize_cpu(
const at::Tensor& points,
at::Tensor& coors,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int NDim=3) {
// check device
AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");
std::vector<int> grid_size(NDim);
const int num_points = points.size(0);
const int num_features = points.size(1);
std::vector<int> grid_size(NDim);
const int num_points = points.size(0);
const int num_features = points.size(1);
for (int i = 0; i < NDim; ++i) {
grid_size[i] = round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
// coors, num_points_per_voxel, coor_to_voxelidx are int Tensor
AT_DISPATCH_FLOATING_TYPES_AND_HALF(points.type(), "hard_voxelize_forward", [&] {
// coors, num_points_per_voxel, coor_to_voxelidx are int Tensor
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
points.scalar_type(), "hard_voxelize_forward", [&] {
dynamic_voxelize_kernel<scalar_t, int>(
points.accessor<scalar_t,2>(),
coors.accessor<int,2>(),
voxel_size,
coors_range,
grid_size,
num_points,
num_features,
NDim
);
});
return;
}
points.accessor<scalar_t, 2>(), coors.accessor<int, 2>(),
voxel_size, coors_range, grid_size, num_points, num_features, NDim);
});
return;
}
} // namespace voxelization
......@@ -6,7 +6,7 @@
#include <ATen/cuda/CUDAApplyUtils.cuh>
#define CHECK_CUDA(x) \
TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor")
TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
......@@ -219,7 +219,7 @@ int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels,
// 1. link point to corresponding voxel coors
AT_DISPATCH_ALL_TYPES(
points.type(), "hard_voxelize_kernel", ([&] {
points.scalar_type(), "hard_voxelize_kernel", ([&] {
dynamic_voxelize_kernel<scalar_t, int>
<<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(
points.contiguous().data_ptr<scalar_t>(),
......@@ -247,7 +247,7 @@ int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels,
dim3 map_grid(std::min(at::cuda::ATenCeilDiv(num_points, 512), 4096));
dim3 map_block(512);
AT_DISPATCH_ALL_TYPES(
temp_coors.type(), "determin_duplicate", ([&] {
temp_coors.scalar_type(), "determin_duplicate", ([&] {
point_to_voxelidx_kernel<int>
<<<map_grid, map_block, 0, at::cuda::getCurrentCUDAStream()>>>(
temp_coors.contiguous().data_ptr<int>(),
......@@ -272,7 +272,7 @@ int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels,
points.options().dtype(at::kInt)); // must be zero from the beginning
AT_DISPATCH_ALL_TYPES(
temp_coors.type(), "determin_duplicate", ([&] {
temp_coors.scalar_type(), "determin_duplicate", ([&] {
determin_voxel_num<int><<<1, 1, 0, at::cuda::getCurrentCUDAStream()>>>(
num_points_per_voxel.contiguous().data_ptr<int>(),
point_to_voxelidx.contiguous().data_ptr<int>(),
......@@ -290,7 +290,7 @@ int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels,
dim3 cp_grid(std::min(at::cuda::ATenCeilDiv(pts_output_size, 512), 4096));
dim3 cp_block(512);
AT_DISPATCH_ALL_TYPES(
points.type(), "assign_point_to_voxel", ([&] {
points.scalar_type(), "assign_point_to_voxel", ([&] {
assign_point_to_voxel<float, int>
<<<cp_grid, cp_block, 0, at::cuda::getCurrentCUDAStream()>>>(
pts_output_size, points.contiguous().data_ptr<float>(),
......@@ -308,7 +308,7 @@ int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels,
std::min(at::cuda::ATenCeilDiv(coors_output_size, 512), 4096));
dim3 coors_cp_block(512);
AT_DISPATCH_ALL_TYPES(
points.type(), "assign_point_to_voxel", ([&] {
points.scalar_type(), "assign_point_to_voxel", ([&] {
assign_voxel_coors<float, int><<<coors_cp_grid, coors_cp_block, 0,
at::cuda::getCurrentCUDAStream()>>>(
coors_output_size, temp_coors.contiguous().data_ptr<int>(),
......
from mmdet.utils import (Registry, build_from_cfg, get_model_complexity_info,
get_root_logger, print_log)
from mmcv.utils import Registry, build_from_cfg
from mmdet.utils import get_model_complexity_info, get_root_logger, print_log
from .collect_env import collect_env
__all__ = [
......
"""
CommandLine:
pytest tests/test_anchor.py
xdoctest tests/test_anchor.py zero
"""
import torch
def test_aligned_anchor_generator():
from mmdet3d.core.anchor import build_anchor_generator
if torch.cuda.is_available():
device = 'cuda'
else:
device = 'cpu'
anchor_generator_cfg = dict(
type='AlignedAnchor3DRangeGenerator',
ranges=[[-51.2, -51.2, -1.80, 51.2, 51.2, -1.80]],
strides=[1, 2, 4],
sizes=[
[0.8660, 2.5981, 1.], # 1.5/sqrt(3)
[0.5774, 1.7321, 1.], # 1/sqrt(3)
[1., 1., 1.],
[0.4, 0.4, 1],
],
custom_values=[0, 0],
rotations=[0, 1.57],
size_per_range=False,
reshape_out=True)
featmap_sizes = [(256, 256), (128, 128), (64, 64)]
anchor_generator = build_anchor_generator(anchor_generator_cfg)
assert anchor_generator.num_base_anchors == 8
# check base anchors
expected_grid_anchors = [
torch.tensor([[
-51.0000, -51.0000, -1.8000, 0.8660, 2.5981, 1.0000, 0.0000,
0.0000, 0.0000
],
[
-51.0000, -51.0000, -1.8000, 0.4000, 0.4000, 1.0000,
1.5700, 0.0000, 0.0000
],
[
-50.6000, -51.0000, -1.8000, 0.4000, 0.4000, 1.0000,
0.0000, 0.0000, 0.0000
],
[
-50.2000, -51.0000, -1.8000, 1.0000, 1.0000, 1.0000,
1.5700, 0.0000, 0.0000
],
[
-49.8000, -51.0000, -1.8000, 1.0000, 1.0000, 1.0000,
0.0000, 0.0000, 0.0000
],
[
-49.4000, -51.0000, -1.8000, 0.5774, 1.7321, 1.0000,
1.5700, 0.0000, 0.0000
],
[
-49.0000, -51.0000, -1.8000, 0.5774, 1.7321, 1.0000,
0.0000, 0.0000, 0.0000
],
[
-48.6000, -51.0000, -1.8000, 0.8660, 2.5981, 1.0000,
1.5700, 0.0000, 0.0000
]],
device=device),
torch.tensor([[
-50.8000, -50.8000, -1.8000, 1.7320, 5.1962, 2.0000, 0.0000,
0.0000, 0.0000
],
[
-50.8000, -50.8000, -1.8000, 0.8000, 0.8000, 2.0000,
1.5700, 0.0000, 0.0000
],
[
-50.0000, -50.8000, -1.8000, 0.8000, 0.8000, 2.0000,
0.0000, 0.0000, 0.0000
],
[
-49.2000, -50.8000, -1.8000, 2.0000, 2.0000, 2.0000,
1.5700, 0.0000, 0.0000
],
[
-48.4000, -50.8000, -1.8000, 2.0000, 2.0000, 2.0000,
0.0000, 0.0000, 0.0000
],
[
-47.6000, -50.8000, -1.8000, 1.1548, 3.4642, 2.0000,
1.5700, 0.0000, 0.0000
],
[
-46.8000, -50.8000, -1.8000, 1.1548, 3.4642, 2.0000,
0.0000, 0.0000, 0.0000
],
[
-46.0000, -50.8000, -1.8000, 1.7320, 5.1962, 2.0000,
1.5700, 0.0000, 0.0000
]],
device=device),
torch.tensor([[
-50.4000, -50.4000, -1.8000, 3.4640, 10.3924, 4.0000, 0.0000,
0.0000, 0.0000
],
[
-50.4000, -50.4000, -1.8000, 1.6000, 1.6000, 4.0000,
1.5700, 0.0000, 0.0000
],
[
-48.8000, -50.4000, -1.8000, 1.6000, 1.6000, 4.0000,
0.0000, 0.0000, 0.0000
],
[
-47.2000, -50.4000, -1.8000, 4.0000, 4.0000, 4.0000,
1.5700, 0.0000, 0.0000
],
[
-45.6000, -50.4000, -1.8000, 4.0000, 4.0000, 4.0000,
0.0000, 0.0000, 0.0000
],
[
-44.0000, -50.4000, -1.8000, 2.3096, 6.9284, 4.0000,
1.5700, 0.0000, 0.0000
],
[
-42.4000, -50.4000, -1.8000, 2.3096, 6.9284, 4.0000,
0.0000, 0.0000, 0.0000
],
[
-40.8000, -50.4000, -1.8000, 3.4640, 10.3924, 4.0000,
1.5700, 0.0000, 0.0000
]],
device=device)
]
multi_level_anchors = anchor_generator.grid_anchors(
featmap_sizes, device=device)
expected_multi_level_shapes = [
torch.Size([524288, 9]),
torch.Size([131072, 9]),
torch.Size([32768, 9])
]
for i, single_level_anchor in enumerate(multi_level_anchors):
assert single_level_anchor.shape == expected_multi_level_shapes[i]
# slice with [:56:7] so the check covers all 8 (len(sizes) * len(rotations))
# base anchors across 8 different locations
assert single_level_anchor[:56:7].allclose(expected_grid_anchors[i])
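The expected flattened shapes follow directly from each feature-map size and the 8 base anchors (4 sizes x 2 rotations); a quick check:

featmap_sizes = [(256, 256), (128, 128), (64, 64)]
num_base_anchors = 4 * 2  # len(sizes) * len(rotations)
print([h * w * num_base_anchors for h, w in featmap_sizes])
# [524288, 131072, 32768]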
......@@ -70,6 +70,34 @@ def test_config_build_detector():
# _check_bbox_head(head_config, detector.bbox_head)
def test_config_build_pipeline():
"""
Test that the train and test data pipelines defined in the configs can be built.
"""
from mmcv import Config
from mmdet3d.datasets.pipelines import Compose
config_dpath = _get_config_directory()
print('Found config_dpath = {!r}'.format(config_dpath))
# Other configs need a database sampler.
config_names = [
'nus/hv_pointpillars_secfpn_sbn-all_4x8_20e_nus-3d.py',
]
print('Using {} config files'.format(len(config_names)))
for config_fname in config_names:
config_fpath = join(config_dpath, config_fname)
config_mod = Config.fromfile(config_fpath)
# build train_pipeline
train_pipeline = Compose(config_mod.train_pipeline)
test_pipeline = Compose(config_mod.test_pipeline)
assert train_pipeline is not None
assert test_pipeline is not None
def test_config_data_pipeline():
"""
Test whether the data pipeline is valid and can process corner cases.
......@@ -77,7 +105,7 @@ def test_config_data_pipeline():
xdoctest -m tests/test_config.py test_config_build_data_pipeline
"""
from mmcv import Config
from mmdet.datasets.pipelines import Compose
from mmdet3d.datasets.pipelines import Compose
import numpy as np
config_dpath = _get_config_directory()
......
#!/usr/bin/env bash
set -x
export PYTHONPATH=`pwd`:$PYTHONPATH
PARTITION=$1
JOB_NAME=$2
......@@ -20,4 +19,4 @@ srun -p ${PARTITION} \
--ntasks-per-node=${GPUS_PER_NODE} \
--kill-on-bad-exit=1 \
${SRUN_ARGS} \
python -u tools/train.py ${CONFIG} --work_dir=${WORK_DIR} --launcher="slurm" ${PY_ARGS}
python -u tools/train.py ${CONFIG} --work-dir=${WORK_DIR} --launcher="slurm" ${PY_ARGS}
......@@ -11,10 +11,11 @@ from mmcv import Config
from mmcv.runner import init_dist
from mmdet3d import __version__
from mmdet3d.apis import train_detector
from mmdet3d.datasets import build_dataset
from mmdet3d.models import build_detector
from mmdet3d.utils import collect_env
from mmdet.apis import get_root_logger, set_random_seed, train_detector
from mmdet.apis import get_root_logger, set_random_seed
def parse_args():
......@@ -27,12 +28,18 @@ def parse_args():
'--validate',
action='store_true',
help='whether to evaluate the checkpoint during training')
parser.add_argument(
group_gpus = parser.add_mutually_exclusive_group()
group_gpus.add_argument(
'--gpus',
type=int,
default=1,
help='number of gpus to use '
'(only applicable to non-distributed training)')
group_gpus.add_argument(
'--gpu-ids',
type=int,
nargs='+',
help='ids of gpus to use '
'(only applicable to non-distributed training)')
parser.add_argument('--seed', type=int, default=0, help='random seed')
parser.add_argument(
'--deterministic',
......@@ -73,11 +80,14 @@ def main():
osp.splitext(osp.basename(args.config))[0])
if args.resume_from is not None:
cfg.resume_from = args.resume_from
cfg.gpus = args.gpus
if args.gpu_ids is not None:
cfg.gpu_ids = args.gpu_ids
else:
cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus)
if args.autoscale_lr:
# apply the linear scaling rule (https://arxiv.org/abs/1706.02677)
cfg.optimizer['lr'] = cfg.optimizer['lr'] * cfg.gpus / 8
cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8
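# The base lr in the config is tuned for 8 GPUs; e.g. a base lr of 0.001 with
# 4 visible GPUs gives an effective lr of 0.001 * 4 / 8 = 0.0005.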
# init distributed env first, since logger depends on the dist info.
if args.launcher == 'none':
......