Commit 868c5fab authored by zhangwenwei's avatar zhangwenwei
Browse files

Merge branch 'fix-cuda-file' into 'master'

Fix cuda file

See merge request open-mmlab/mmdet.3d!24
parents a8e0f664 a9c2ecb5
...@@ -93,7 +93,8 @@ class_names = ['Car'] ...@@ -93,7 +93,8 @@ class_names = ['Car']
img_norm_cfg = dict( img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
input_modality = dict( input_modality = dict(
use_lidar=True, use_lidar=False,
use_lidar_reduced=True,
use_depth=False, use_depth=False,
use_lidar_intensity=True, use_lidar_intensity=True,
use_camera=False, use_camera=False,
......
...@@ -113,7 +113,8 @@ class_names = ['Pedestrian', 'Cyclist', 'Car'] ...@@ -113,7 +113,8 @@ class_names = ['Pedestrian', 'Cyclist', 'Car']
img_norm_cfg = dict( img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
input_modality = dict( input_modality = dict(
use_lidar=True, use_lidar=False,
use_lidar_reduced=True,
use_depth=False, use_depth=False,
use_lidar_intensity=True, use_lidar_intensity=True,
use_camera=True, use_camera=True,
......
...@@ -91,7 +91,8 @@ class_names = ['Car'] ...@@ -91,7 +91,8 @@ class_names = ['Car']
img_norm_cfg = dict( img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
input_modality = dict( input_modality = dict(
use_lidar=True, use_lidar=False,
use_lidar_reduced=True,
use_depth=False, use_depth=False,
use_lidar_intensity=True, use_lidar_intensity=True,
use_camera=True, use_camera=True,
......
...@@ -90,7 +90,8 @@ class_names = ['Car'] ...@@ -90,7 +90,8 @@ class_names = ['Car']
img_norm_cfg = dict( img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
input_modality = dict( input_modality = dict(
use_lidar=True, use_lidar=False,
use_lidar_reduced=True,
use_depth=False, use_depth=False,
use_lidar_intensity=True, use_lidar_intensity=True,
use_camera=False, use_camera=False,
......
...@@ -89,7 +89,8 @@ class_names = ['Car'] ...@@ -89,7 +89,8 @@ class_names = ['Car']
img_norm_cfg = dict( img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
input_modality = dict( input_modality = dict(
use_lidar=True, use_lidar=False,
use_lidar_reduced=True,
use_depth=False, use_depth=False,
use_lidar_intensity=True, use_lidar_intensity=True,
use_camera=False, use_camera=False,
......
...@@ -184,6 +184,8 @@ class KittiDataset(torch_data.Dataset): ...@@ -184,6 +184,8 @@ class KittiDataset(torch_data.Dataset):
if self.modality['use_depth'] and self.modality['use_lidar']: if self.modality['use_depth'] and self.modality['use_lidar']:
points = self.get_lidar_depth_reduced(sample_idx) points = self.get_lidar_depth_reduced(sample_idx)
elif self.modality['use_lidar']: elif self.modality['use_lidar']:
points = self.get_lidar(sample_idx)
elif self.modality['use_lidar_reduced']:
points = self.get_lidar_reduced(sample_idx) points = self.get_lidar_reduced(sample_idx)
elif self.modality['use_depth']: elif self.modality['use_depth']:
points = self.get_pure_depth_reduced(sample_idx) points = self.get_pure_depth_reduced(sample_idx)
...@@ -238,8 +240,6 @@ class KittiDataset(torch_data.Dataset): ...@@ -238,8 +240,6 @@ class KittiDataset(torch_data.Dataset):
axis=1).astype(np.float32) axis=1).astype(np.float32)
difficulty = annos['difficulty'] difficulty = annos['difficulty']
# this change gt_bboxes_3d to velodyne coordinates # this change gt_bboxes_3d to velodyne coordinates
import pdb
pdb.set_trace()
gt_bboxes_3d = box_np_ops.box_camera_to_lidar(gt_bboxes_3d, rect, gt_bboxes_3d = box_np_ops.box_camera_to_lidar(gt_bboxes_3d, rect,
Trv2c) Trv2c)
# only center format is allowed. so we need to convert # only center format is allowed. so we need to convert
......
import warnings
import numba import numba
import numpy as np import numpy as np
from numba.errors import NumbaPerformanceWarning
from mmdet3d.core.bbox import box_np_ops from mmdet3d.core.bbox import box_np_ops
warnings.filterwarnings("ignore", category=NumbaPerformanceWarning)
@numba.njit @numba.njit
def _rotation_box2d_jit_(corners, angle, rot_mat_T): def _rotation_box2d_jit_(corners, angle, rot_mat_T):
......
...@@ -7,8 +7,8 @@ ...@@ -7,8 +7,8 @@
#include <assert.h> #include <assert.h>
#include <math.h> #include <math.h>
#include <stdio.h> #include <stdio.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h> #include <torch/serialize/tensor.h>
#include <torch/types.h>
#define THREADS_PER_BLOCK 256 #define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) #define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
......
//Modified from // Modified from
//https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu // https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
//RoI-aware point cloud feature pooling // RoI-aware point cloud feature pooling
//Written by Shaoshuai Shi // Written by Shaoshuai Shi
//All Rights Reserved 2019. // All Rights Reserved 2019.
#include <torch/serialize/tensor.h>
#include <torch/extension.h>
#include <assert.h> #include <assert.h>
#include <math.h> #include <math.h>
#include <stdio.h> #include <stdio.h>
#include <torch/serialize/tensor.h>
#include <torch/types.h>
#define THREADS_PER_BLOCK 256 #define THREADS_PER_BLOCK 256
#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) #define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
// #define DEBUG // #define DEBUG
__device__ inline void lidar_to_local_coords(float shift_x, float shift_y,
__device__ inline void lidar_to_local_coords(float shift_x, float shift_y, float rz, float &local_x, float &local_y){ float rz, float &local_x,
// should rotate pi/2 + alpha to translate LiDAR to local float &local_y) {
float rot_angle = rz + M_PI / 2; // should rotate pi/2 + alpha to translate LiDAR to local
float cosa = cos(rot_angle), sina = sin(rot_angle); float rot_angle = rz + M_PI / 2;
local_x = shift_x * cosa + shift_y * (-sina); float cosa = cos(rot_angle), sina = sin(rot_angle);
local_y = shift_x * sina + shift_y * cosa; local_x = shift_x * cosa + shift_y * (-sina);
local_y = shift_x * sina + shift_y * cosa;
} }
__device__ inline int check_pt_in_box3d(const float *pt, const float *box3d,
__device__ inline int check_pt_in_box3d(const float *pt, const float *box3d, float &local_x, float &local_y){ float &local_x, float &local_y) {
// param pt: (x, y, z) // param pt: (x, y, z)
// param box3d: (cx, cy, cz, w, l, h, rz) in LiDAR coordinate, cz in the bottom center // param box3d: (cx, cy, cz, w, l, h, rz) in LiDAR coordinate, cz in the
float x = pt[0], y = pt[1], z = pt[2]; // bottom center
float cx = box3d[0], cy = box3d[1], cz = box3d[2]; float x = pt[0], y = pt[1], z = pt[2];
float w = box3d[3], l = box3d[4], h = box3d[5], rz = box3d[6]; float cx = box3d[0], cy = box3d[1], cz = box3d[2];
cz += h / 2.0; // shift to the center since cz in box3d is the bottom center float w = box3d[3], l = box3d[4], h = box3d[5], rz = box3d[6];
cz += h / 2.0; // shift to the center since cz in box3d is the bottom center
if (fabsf(z - cz) > h / 2.0) return 0;
lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y); if (fabsf(z - cz) > h / 2.0) return 0;
float in_flag = (local_x > -l / 2.0) & (local_x < l / 2.0) & (local_y > -w / 2.0) & (local_y < w / 2.0); lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y);
return in_flag; float in_flag = (local_x > -l / 2.0) & (local_x < l / 2.0) &
(local_y > -w / 2.0) & (local_y < w / 2.0);
return in_flag;
} }
__global__ void generate_pts_mask_for_box3d(int boxes_num, int pts_num,
__global__ void generate_pts_mask_for_box3d(int boxes_num, int pts_num, int out_x, int out_y, int out_z, int out_x, int out_y, int out_z,
const float *rois, const float *pts, int *pts_mask){ const float *rois, const float *pts,
// params rois: (N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate int *pts_mask) {
// params pts: (npoints, 3) [x, y, z] // params rois: (N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate
// params pts_mask: (N, npoints): -1 means point doesnot in this box, otherwise: encode (x_idxs, y_idxs, z_idxs) by binary bit // params pts: (npoints, 3) [x, y, z]
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; // params pts_mask: (N, npoints): -1 means point doesnot in this box,
int box_idx = blockIdx.y; // otherwise: encode (x_idxs, y_idxs, z_idxs) by binary bit
if (pt_idx >= pts_num || box_idx >= boxes_num) return; int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
int box_idx = blockIdx.y;
pts += pt_idx * 3; if (pt_idx >= pts_num || box_idx >= boxes_num) return;
rois += box_idx * 7;
pts_mask += box_idx * pts_num + pt_idx; pts += pt_idx * 3;
rois += box_idx * 7;
float local_x = 0, local_y = 0; pts_mask += box_idx * pts_num + pt_idx;
int cur_in_flag = check_pt_in_box3d(pts, rois, local_x, local_y);
float local_x = 0, local_y = 0;
pts_mask[0] = -1; int cur_in_flag = check_pt_in_box3d(pts, rois, local_x, local_y);
if (cur_in_flag > 0){
float local_z = pts[2] - rois[2]; pts_mask[0] = -1;
float w = rois[3], l = rois[4], h = rois[5]; if (cur_in_flag > 0) {
float local_z = pts[2] - rois[2];
float x_res = l / out_x; float w = rois[3], l = rois[4], h = rois[5];
float y_res = w / out_y;
float z_res = h / out_z; float x_res = l / out_x;
float y_res = w / out_y;
unsigned int x_idx = int((local_x + l / 2) / x_res); float z_res = h / out_z;
unsigned int y_idx = int((local_y + w / 2) / y_res);
unsigned int z_idx = int(local_z / z_res); unsigned int x_idx = int((local_x + l / 2) / x_res);
unsigned int y_idx = int((local_y + w / 2) / y_res);
x_idx = min(max(x_idx, 0), out_x - 1); unsigned int z_idx = int(local_z / z_res);
y_idx = min(max(y_idx, 0), out_y - 1);
z_idx = min(max(z_idx, 0), out_z - 1); x_idx = min(max(x_idx, 0), out_x - 1);
y_idx = min(max(y_idx, 0), out_y - 1);
unsigned int idx_encoding = (x_idx << 16) + (y_idx << 8) + z_idx; z_idx = min(max(z_idx, 0), out_z - 1);
unsigned int idx_encoding = (x_idx << 16) + (y_idx << 8) + z_idx;
#ifdef DEBUG #ifdef DEBUG
printf("mask: pts_%d(%.3f, %.3f, %.3f), local(%.3f, %.3f, %.3f), idx(%d, %d, %d), res(%.3f, %.3f, %.3f), idx_encoding=%x\n", printf(
pt_idx, pts[0], pts[1], pts[2], local_x, local_y, local_z, x_idx, y_idx, z_idx, x_res, y_res, z_res, idx_encoding); "mask: pts_%d(%.3f, %.3f, %.3f), local(%.3f, %.3f, %.3f), idx(%d, %d, "
"%d), res(%.3f, %.3f, %.3f), idx_encoding=%x\n",
pt_idx, pts[0], pts[1], pts[2], local_x, local_y, local_z, x_idx, y_idx,
z_idx, x_res, y_res, z_res, idx_encoding);
#endif #endif
pts_mask[0] = idx_encoding; pts_mask[0] = idx_encoding;
} }
} }
__global__ void collect_inside_pts_for_box3d(int boxes_num, int pts_num,
__global__ void collect_inside_pts_for_box3d(int boxes_num, int pts_num, int max_pts_each_voxel, int max_pts_each_voxel, int out_x,
int out_x, int out_y, int out_z, const int *pts_mask, int *pts_idx_of_voxels){ int out_y, int out_z,
// params pts_mask: (N, npoints) 0 or 1 const int *pts_mask,
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel) int *pts_idx_of_voxels) {
// params pts_mask: (N, npoints) 0 or 1
int box_idx = blockIdx.x * blockDim.x + threadIdx.x; // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
if (box_idx >= boxes_num) return;
int box_idx = blockIdx.x * blockDim.x + threadIdx.x;
int max_num_pts = max_pts_each_voxel - 1; // index 0 is the counter if (box_idx >= boxes_num) return;
pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel;
int max_num_pts = max_pts_each_voxel - 1; // index 0 is the counter
for (int k = 0; k < pts_num; k++){ pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel;
if (pts_mask[box_idx * pts_num + k] != -1){
unsigned int idx_encoding = pts_mask[box_idx * pts_num + k]; for (int k = 0; k < pts_num; k++) {
unsigned int x_idx = (idx_encoding >> 16) & 0xFF; if (pts_mask[box_idx * pts_num + k] != -1) {
unsigned int y_idx = (idx_encoding >> 8) & 0xFF; unsigned int idx_encoding = pts_mask[box_idx * pts_num + k];
unsigned int z_idx = idx_encoding & 0xFF; unsigned int x_idx = (idx_encoding >> 16) & 0xFF;
unsigned int base_offset = x_idx * out_y * out_z * max_pts_each_voxel + y_idx * out_z * max_pts_each_voxel + z_idx * max_pts_each_voxel; unsigned int y_idx = (idx_encoding >> 8) & 0xFF;
unsigned int cnt = pts_idx_of_voxels[base_offset]; unsigned int z_idx = idx_encoding & 0xFF;
if (cnt < max_num_pts){ unsigned int base_offset = x_idx * out_y * out_z * max_pts_each_voxel +
pts_idx_of_voxels[base_offset + cnt + 1] = k; y_idx * out_z * max_pts_each_voxel +
pts_idx_of_voxels[base_offset]++; z_idx * max_pts_each_voxel;
} unsigned int cnt = pts_idx_of_voxels[base_offset];
if (cnt < max_num_pts) {
pts_idx_of_voxels[base_offset + cnt + 1] = k;
pts_idx_of_voxels[base_offset]++;
}
#ifdef DEBUG #ifdef DEBUG
printf("collect: pts_%d, idx(%d, %d, %d), idx_encoding=%x\n", printf("collect: pts_%d, idx(%d, %d, %d), idx_encoding=%x\n", k, x_idx,
k, x_idx, y_idx, z_idx, idx_encoding); y_idx, z_idx, idx_encoding);
#endif #endif
}
} }
}
} }
__global__ void roiaware_maxpool3d(int boxes_num, int pts_num, int channels,
__global__ void roiaware_maxpool3d(int boxes_num, int pts_num, int channels, int max_pts_each_voxel, int out_x, int max_pts_each_voxel, int out_x, int out_y,
int out_y, int out_z, const float *pts_feature, const int *pts_idx_of_voxels, float *pooled_features, int *argmax){ int out_z, const float *pts_feature,
// params pts_feature: (npoints, C) const int *pts_idx_of_voxels,
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel), index 0 is the counter float *pooled_features, int *argmax) {
// params pooled_features: (N, out_x, out_y, out_z, C) // params pts_feature: (npoints, C)
// params argmax: (N, out_x, out_y, out_z, C) // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel),
// index 0 is the counter params pooled_features: (N, out_x, out_y, out_z, C)
int box_idx = blockIdx.z; // params argmax: (N, out_x, out_y, out_z, C)
int channel_idx = blockIdx.y;
int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x; int box_idx = blockIdx.z;
int channel_idx = blockIdx.y;
int x_idx = voxel_idx_flat / (out_y * out_z); int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
int z_idx = voxel_idx_flat % out_z; int x_idx = voxel_idx_flat / (out_y * out_z);
if (box_idx >= boxes_num || channel_idx >= channels|| x_idx >= out_x || y_idx >= out_y || z_idx >= out_z) return; int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
int z_idx = voxel_idx_flat % out_z;
if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
y_idx >= out_y || z_idx >= out_z)
return;
#ifdef DEBUG #ifdef DEBUG
printf("src pts_idx_of_voxels: (%p, ), argmax: %p\n", pts_idx_of_voxels, argmax); printf("src pts_idx_of_voxels: (%p, ), argmax: %p\n", pts_idx_of_voxels,
argmax);
#endif #endif
int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + offset_base * max_pts_each_voxel; pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
pooled_features += box_idx * out_x * out_y * out_z * channels + offset_base * channels + channel_idx; offset_base * max_pts_each_voxel;
argmax += box_idx * out_x * out_y * out_z * channels + offset_base * channels + channel_idx; pooled_features += box_idx * out_x * out_y * out_z * channels +
offset_base * channels + channel_idx;
argmax += box_idx * out_x * out_y * out_z * channels +
offset_base * channels + channel_idx;
int argmax_idx = -1; int argmax_idx = -1;
float max_val = -1e50; float max_val = -1e50;
int total_pts = pts_idx_of_voxels[0]; int total_pts = pts_idx_of_voxels[0];
for (int k = 1; k <= total_pts; k++){ for (int k = 1; k <= total_pts; k++) {
if (pts_feature[pts_idx_of_voxels[k] * channels + channel_idx] > max_val){ if (pts_feature[pts_idx_of_voxels[k] * channels + channel_idx] > max_val) {
max_val = pts_feature[pts_idx_of_voxels[k] * channels + channel_idx]; max_val = pts_feature[pts_idx_of_voxels[k] * channels + channel_idx];
argmax_idx = pts_idx_of_voxels[k]; argmax_idx = pts_idx_of_voxels[k];
}
} }
}
if (argmax_idx != -1){ if (argmax_idx != -1) {
pooled_features[0] = max_val; pooled_features[0] = max_val;
} }
argmax[0] = argmax_idx; argmax[0] = argmax_idx;
#ifdef DEBUG #ifdef DEBUG
printf("channel_%d idx(%d, %d, %d), argmax_idx=(%d, %.3f), total=%d, after pts_idx: %p, argmax: (%p, %d)\n", printf(
channel_idx, x_idx, y_idx, z_idx, argmax_idx, max_val, total_pts, pts_idx_of_voxels, argmax, argmax_idx); "channel_%d idx(%d, %d, %d), argmax_idx=(%d, %.3f), total=%d, after "
"pts_idx: %p, argmax: (%p, %d)\n",
channel_idx, x_idx, y_idx, z_idx, argmax_idx, max_val, total_pts,
pts_idx_of_voxels, argmax, argmax_idx);
#endif #endif
} }
__global__ void roiaware_avgpool3d(int boxes_num, int pts_num, int channels,
__global__ void roiaware_avgpool3d(int boxes_num, int pts_num, int channels, int max_pts_each_voxel, int out_x, int max_pts_each_voxel, int out_x, int out_y,
int out_y, int out_z, const float *pts_feature, const int *pts_idx_of_voxels, float *pooled_features){ int out_z, const float *pts_feature,
// params pts_feature: (npoints, C) const int *pts_idx_of_voxels,
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel), index 0 is the counter float *pooled_features) {
// params pooled_features: (N, out_x, out_y, out_z, C) // params pts_feature: (npoints, C)
// params argmax: (N, out_x, out_y, out_z, C) // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel),
// index 0 is the counter params pooled_features: (N, out_x, out_y, out_z, C)
int box_idx = blockIdx.z; // params argmax: (N, out_x, out_y, out_z, C)
int channel_idx = blockIdx.y;
int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x; int box_idx = blockIdx.z;
int channel_idx = blockIdx.y;
int x_idx = voxel_idx_flat / (out_y * out_z); int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
int z_idx = voxel_idx_flat % out_z; int x_idx = voxel_idx_flat / (out_y * out_z);
if (box_idx >= boxes_num || channel_idx >= channels|| x_idx >= out_x || y_idx >= out_y || z_idx >= out_z) return; int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
int z_idx = voxel_idx_flat % out_z;
int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + offset_base * max_pts_each_voxel; y_idx >= out_y || z_idx >= out_z)
pooled_features += box_idx * out_x * out_y * out_z * channels + offset_base * channels + channel_idx; return;
float sum_val = 0; int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
int total_pts = pts_idx_of_voxels[0]; pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
offset_base * max_pts_each_voxel;
for (int k = 1; k <= total_pts; k++){ pooled_features += box_idx * out_x * out_y * out_z * channels +
sum_val += pts_feature[pts_idx_of_voxels[k] * channels + channel_idx]; offset_base * channels + channel_idx;
}
float sum_val = 0;
if (total_pts > 0){ int total_pts = pts_idx_of_voxels[0];
pooled_features[0] = sum_val / total_pts;
} for (int k = 1; k <= total_pts; k++) {
sum_val += pts_feature[pts_idx_of_voxels[k] * channels + channel_idx];
}
if (total_pts > 0) {
pooled_features[0] = sum_val / total_pts;
}
} }
void roiaware_pool3d_launcher(int boxes_num, int pts_num, int channels,
int max_pts_each_voxel, int out_x, int out_y,
void roiaware_pool3d_launcher(int boxes_num, int pts_num, int channels, int max_pts_each_voxel, int out_x, int out_y, int out_z, int out_z, const float *rois, const float *pts,
const float *rois, const float *pts, const float *pts_feature, int *argmax, int *pts_idx_of_voxels, float *pooled_features, int pool_method){ const float *pts_feature, int *argmax,
// params rois: (N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate int *pts_idx_of_voxels, float *pooled_features,
// params pts: (npoints, 3) [x, y, z] in LiDAR coordinate int pool_method) {
// params pts_feature: (npoints, C) // params rois: (N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate
// params argmax: (N, out_x, out_y, out_z, C) // params pts: (npoints, 3) [x, y, z] in LiDAR coordinate
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel) // params pts_feature: (npoints, C)
// params pooled_features: (N, out_x, out_y, out_z, C) // params argmax: (N, out_x, out_y, out_z, C)
// params pool_method: 0: max_pool 1: avg_pool // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
// params pooled_features: (N, out_x, out_y, out_z, C)
int *pts_mask = NULL; // params pool_method: 0: max_pool 1: avg_pool
cudaMalloc(&pts_mask, boxes_num * pts_num * sizeof(int)); // (N, M)
cudaMemset(pts_mask, -1, boxes_num * pts_num * sizeof(int)); int *pts_mask = NULL;
cudaMalloc(&pts_mask, boxes_num * pts_num * sizeof(int)); // (N, M)
dim3 blocks_mask(DIVUP(pts_num, THREADS_PER_BLOCK), boxes_num); cudaMemset(pts_mask, -1, boxes_num * pts_num * sizeof(int));
dim3 threads(THREADS_PER_BLOCK);
generate_pts_mask_for_box3d<<<blocks_mask, threads>>>(boxes_num, pts_num, out_x, out_y, out_z, rois, pts, pts_mask); dim3 blocks_mask(DIVUP(pts_num, THREADS_PER_BLOCK), boxes_num);
dim3 threads(THREADS_PER_BLOCK);
// TODO: Merge the collect and pool functions, SS generate_pts_mask_for_box3d<<<blocks_mask, threads>>>(
boxes_num, pts_num, out_x, out_y, out_z, rois, pts, pts_mask);
dim3 blocks_collect(DIVUP(boxes_num, THREADS_PER_BLOCK));
collect_inside_pts_for_box3d<<<blocks_collect, threads>>>(boxes_num, pts_num, max_pts_each_voxel, // TODO: Merge the collect and pool functions, SS
out_x, out_y, out_z, pts_mask, pts_idx_of_voxels);
dim3 blocks_collect(DIVUP(boxes_num, THREADS_PER_BLOCK));
dim3 blocks_pool(DIVUP(out_x * out_y * out_z, THREADS_PER_BLOCK), channels, boxes_num); collect_inside_pts_for_box3d<<<blocks_collect, threads>>>(
if (pool_method == 0){ boxes_num, pts_num, max_pts_each_voxel, out_x, out_y, out_z, pts_mask,
roiaware_maxpool3d<<<blocks_pool, threads>>>(boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z, pts_idx_of_voxels);
pts_feature, pts_idx_of_voxels, pooled_features, argmax);
} dim3 blocks_pool(DIVUP(out_x * out_y * out_z, THREADS_PER_BLOCK), channels,
else if (pool_method == 1){ boxes_num);
roiaware_avgpool3d<<<blocks_pool, threads>>>(boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z, if (pool_method == 0) {
pts_feature, pts_idx_of_voxels, pooled_features); roiaware_maxpool3d<<<blocks_pool, threads>>>(
} boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z,
pts_feature, pts_idx_of_voxels, pooled_features, argmax);
} else if (pool_method == 1) {
cudaFree(pts_mask); roiaware_avgpool3d<<<blocks_pool, threads>>>(
boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z,
pts_feature, pts_idx_of_voxels, pooled_features);
}
cudaFree(pts_mask);
#ifdef DEBUG #ifdef DEBUG
cudaDeviceSynchronize(); // for using printf in kernel function cudaDeviceSynchronize(); // for using printf in kernel function
#endif #endif
} }
__global__ void roiaware_maxpool3d_backward(int boxes_num, int channels,
__global__ void roiaware_maxpool3d_backward(int boxes_num, int channels, int out_x, int out_y, int out_z, int out_x, int out_y, int out_z,
const int *argmax, const float *grad_out, float *grad_in){ const int *argmax,
// params argmax: (N, out_x, out_y, out_z, C) const float *grad_out,
// params grad_out: (N, out_x, out_y, out_z, C) float *grad_in) {
// params grad_in: (npoints, C), return value // params argmax: (N, out_x, out_y, out_z, C)
// params grad_out: (N, out_x, out_y, out_z, C)
int box_idx = blockIdx.z; // params grad_in: (npoints, C), return value
int channel_idx = blockIdx.y;
int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x; int box_idx = blockIdx.z;
int channel_idx = blockIdx.y;
int x_idx = voxel_idx_flat / (out_y * out_z); int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
int z_idx = voxel_idx_flat % out_z; int x_idx = voxel_idx_flat / (out_y * out_z);
if (box_idx >= boxes_num || channel_idx >= channels|| x_idx >= out_x || y_idx >= out_y || z_idx >= out_z) return; int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
int z_idx = voxel_idx_flat % out_z;
int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
argmax += box_idx * out_x * out_y * out_z * channels + offset_base * channels + channel_idx; y_idx >= out_y || z_idx >= out_z)
grad_out += box_idx * out_x * out_y * out_z * channels + offset_base * channels + channel_idx; return;
if (argmax[0] == -1) return; int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
argmax += box_idx * out_x * out_y * out_z * channels +
atomicAdd(grad_in + argmax[0] * channels + channel_idx, grad_out[0] * 1); offset_base * channels + channel_idx;
grad_out += box_idx * out_x * out_y * out_z * channels +
offset_base * channels + channel_idx;
if (argmax[0] == -1) return;
atomicAdd(grad_in + argmax[0] * channels + channel_idx, grad_out[0] * 1);
} }
__global__ void roiaware_avgpool3d_backward(int boxes_num, int channels,
__global__ void roiaware_avgpool3d_backward(int boxes_num, int channels, int out_x, int out_y, int out_z, int out_x, int out_y, int out_z,
int max_pts_each_voxel, const int *pts_idx_of_voxels, const float *grad_out, float *grad_in){ int max_pts_each_voxel,
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel) const int *pts_idx_of_voxels,
// params grad_out: (N, out_x, out_y, out_z, C) const float *grad_out,
// params grad_in: (npoints, C), return value float *grad_in) {
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
int box_idx = blockIdx.z; // params grad_out: (N, out_x, out_y, out_z, C)
int channel_idx = blockIdx.y; // params grad_in: (npoints, C), return value
int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
int box_idx = blockIdx.z;
int x_idx = voxel_idx_flat / (out_y * out_z); int channel_idx = blockIdx.y;
int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
int z_idx = voxel_idx_flat % out_z;
if (box_idx >= boxes_num || channel_idx >= channels|| x_idx >= out_x || y_idx >= out_y || z_idx >= out_z) return; int x_idx = voxel_idx_flat / (out_y * out_z);
int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; int z_idx = voxel_idx_flat % out_z;
pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + offset_base * max_pts_each_voxel; if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
grad_out += box_idx * out_x * out_y * out_z * channels + offset_base * channels + channel_idx; y_idx >= out_y || z_idx >= out_z)
return;
int total_pts = pts_idx_of_voxels[0]; int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
float cur_grad = 1 / fmaxf(float(total_pts), 1.0); pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
for (int k = 1; k <= total_pts; k++){ offset_base * max_pts_each_voxel;
atomicAdd(grad_in + pts_idx_of_voxels[k] * channels + channel_idx, grad_out[0] * cur_grad); grad_out += box_idx * out_x * out_y * out_z * channels +
} offset_base * channels + channel_idx;
int total_pts = pts_idx_of_voxels[0];
float cur_grad = 1 / fmaxf(float(total_pts), 1.0);
for (int k = 1; k <= total_pts; k++) {
atomicAdd(grad_in + pts_idx_of_voxels[k] * channels + channel_idx,
grad_out[0] * cur_grad);
}
} }
void roiaware_pool3d_backward_launcher(int boxes_num, int out_x, int out_y,
int out_z, int channels,
void roiaware_pool3d_backward_launcher(int boxes_num, int out_x, int out_y, int out_z, int channels, int max_pts_each_voxel, int max_pts_each_voxel,
const int *pts_idx_of_voxels, const int *argmax, const float *grad_out, float *grad_in, int pool_method){ const int *pts_idx_of_voxels,
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel) const int *argmax, const float *grad_out,
// params argmax: (N, out_x, out_y, out_z, C) float *grad_in, int pool_method) {
// params grad_out: (N, out_x, out_y, out_z, C) // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
// params grad_in: (npoints, C), return value // params argmax: (N, out_x, out_y, out_z, C)
// params pool_method: 0: max_pool, 1: avg_pool // params grad_out: (N, out_x, out_y, out_z, C)
// params grad_in: (npoints, C), return value
dim3 blocks(DIVUP(out_x * out_y * out_z, THREADS_PER_BLOCK), channels, boxes_num); // params pool_method: 0: max_pool, 1: avg_pool
dim3 threads(THREADS_PER_BLOCK);
if (pool_method == 0){ dim3 blocks(DIVUP(out_x * out_y * out_z, THREADS_PER_BLOCK), channels,
roiaware_maxpool3d_backward<<<blocks, threads>>>( boxes_num);
boxes_num, channels, out_x, out_y, out_z, argmax, grad_out, grad_in dim3 threads(THREADS_PER_BLOCK);
); if (pool_method == 0) {
} roiaware_maxpool3d_backward<<<blocks, threads>>>(
else if (pool_method == 1){ boxes_num, channels, out_x, out_y, out_z, argmax, grad_out, grad_in);
roiaware_avgpool3d_backward<<<blocks, threads>>>( } else if (pool_method == 1) {
boxes_num, channels, out_x, out_y, out_z, max_pts_each_voxel, pts_idx_of_voxels, grad_out, grad_in roiaware_avgpool3d_backward<<<blocks, threads>>>(
); boxes_num, channels, out_x, out_y, out_z, max_pts_each_voxel,
} pts_idx_of_voxels, grad_out, grad_in);
}
} }
from mmcv.cnn import build_norm_layer from mmcv.cnn import build_norm_layer
from torch import nn from torch import nn
import mmdet3d.ops.spconv as spconv
from mmdet.models.backbones.resnet import BasicBlock, Bottleneck from mmdet.models.backbones.resnet import BasicBlock, Bottleneck
from . import spconv
def conv3x3(in_planes, out_planes, stride=1, indice_key=None): def conv3x3(in_planes, out_planes, stride=1, indice_key=None):
......
#!/usr/bin/env bash
CONFIG=$1
CHECKPOINT=$2
GPUS=$3
PORT=${PORT:-29500}
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
$(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4}
#!/usr/bin/env bash #!/usr/bin/env bash
PYTHON=${PYTHON:-"python"}
CONFIG=$1 CONFIG=$1
GPUS=$2 GPUS=$2
PORT=${PORT:-29500}
$PYTHON -m torch.distributed.launch --nproc_per_node=$GPUS \ PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
$(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3}
#!/usr/bin/env bash #!/usr/bin/env bash
set -x set -x
export PYTHONPATH=`pwd`:$PYTHONPATH
PARTITION=$1 PARTITION=$1
JOB_NAME=$2 JOB_NAME=$2
...@@ -9,14 +8,17 @@ CONFIG=$3 ...@@ -9,14 +8,17 @@ CONFIG=$3
CHECKPOINT=$4 CHECKPOINT=$4
GPUS=${GPUS:-8} GPUS=${GPUS:-8}
GPUS_PER_NODE=${GPUS_PER_NODE:-8} GPUS_PER_NODE=${GPUS_PER_NODE:-8}
CPUS_PER_TASK=${CPUS_PER_TASK:-5}
PY_ARGS=${@:5} PY_ARGS=${@:5}
SRUN_ARGS=${SRUN_ARGS:-""} SRUN_ARGS=${SRUN_ARGS:-""}
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
srun -p ${PARTITION} \ srun -p ${PARTITION} \
--job-name=${JOB_NAME} \ --job-name=${JOB_NAME} \
--gres=gpu:${GPUS_PER_NODE} \ --gres=gpu:${GPUS_PER_NODE} \
--ntasks=${GPUS} \ --ntasks=${GPUS} \
--ntasks-per-node=${GPUS_PER_NODE} \ --ntasks-per-node=${GPUS_PER_NODE} \
--cpus-per-task=${CPUS_PER_TASK} \
--kill-on-bad-exit=1 \ --kill-on-bad-exit=1 \
${SRUN_ARGS} \ ${SRUN_ARGS} \
python -u tools/test.py ${CONFIG} ${CHECKPOINT} --launcher="slurm" ${PY_ARGS} python -u tools/test.py ${CONFIG} ${CHECKPOINT} --launcher="slurm" ${PY_ARGS}
...@@ -8,15 +8,17 @@ CONFIG=$3 ...@@ -8,15 +8,17 @@ CONFIG=$3
WORK_DIR=$4 WORK_DIR=$4
GPUS=${GPUS:-8} GPUS=${GPUS:-8}
GPUS_PER_NODE=${GPUS_PER_NODE:-8} GPUS_PER_NODE=${GPUS_PER_NODE:-8}
CPUS_PER_TASK=${CPUS_PER_TASK:-5}
SRUN_ARGS=${SRUN_ARGS:-""} SRUN_ARGS=${SRUN_ARGS:-""}
PY_ARGS=${PY_ARGS:-"--validate"} PY_ARGS=${@:5}
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
srun -p ${PARTITION} \ srun -p ${PARTITION} \
--job-name=${JOB_NAME} \ --job-name=${JOB_NAME} \
--gres=gpu:${GPUS_PER_NODE} \ --gres=gpu:${GPUS_PER_NODE} \
--ntasks=${GPUS} \ --ntasks=${GPUS} \
--ntasks-per-node=${GPUS_PER_NODE} \ --ntasks-per-node=${GPUS_PER_NODE} \
--cpus-per-task=${CPUS_PER_TASK} \
--kill-on-bad-exit=1 \ --kill-on-bad-exit=1 \
${SRUN_ARGS} \ ${SRUN_ARGS} \
python -u tools/train.py ${CONFIG} --work-dir=${WORK_DIR} --launcher="slurm" ${PY_ARGS} python -u tools/train.py ${CONFIG} --work-dir=${WORK_DIR} --launcher="slurm" ${PY_ARGS}
...@@ -8,7 +8,7 @@ import time ...@@ -8,7 +8,7 @@ import time
import mmcv import mmcv
import torch import torch
from mmcv import Config from mmcv import Config, DictAction
from mmcv.runner import init_dist from mmcv.runner import init_dist
from mmdet3d import __version__ from mmdet3d import __version__
...@@ -26,9 +26,9 @@ def parse_args(): ...@@ -26,9 +26,9 @@ def parse_args():
parser.add_argument( parser.add_argument(
'--resume-from', help='the checkpoint file to resume from') '--resume-from', help='the checkpoint file to resume from')
parser.add_argument( parser.add_argument(
'--validate', '--no-validate',
action='store_true', action='store_true',
help='whether to evaluate the checkpoint during training') help='whether not to evaluate the checkpoint during training')
group_gpus = parser.add_mutually_exclusive_group() group_gpus = parser.add_mutually_exclusive_group()
group_gpus.add_argument( group_gpus.add_argument(
'--gpus', '--gpus',
...@@ -46,6 +46,8 @@ def parse_args(): ...@@ -46,6 +46,8 @@ def parse_args():
'--deterministic', '--deterministic',
action='store_true', action='store_true',
help='whether to set deterministic options for CUDNN backend.') help='whether to set deterministic options for CUDNN backend.')
parser.add_argument(
'--options', nargs='+', action=DictAction, help='arguments in dict')
parser.add_argument( parser.add_argument(
'--launcher', '--launcher',
choices=['none', 'pytorch', 'slurm', 'mpi'], choices=['none', 'pytorch', 'slurm', 'mpi'],
...@@ -67,6 +69,9 @@ def main(): ...@@ -67,6 +69,9 @@ def main():
args = parse_args() args = parse_args()
cfg = Config.fromfile(args.config) cfg = Config.fromfile(args.config)
if args.options is not None:
cfg.merge_from_dict(args.options)
# set cudnn_benchmark # set cudnn_benchmark
if cfg.get('cudnn_benchmark', False): if cfg.get('cudnn_benchmark', False):
torch.backends.cudnn.benchmark = True torch.backends.cudnn.benchmark = True
...@@ -101,7 +106,7 @@ def main(): ...@@ -101,7 +106,7 @@ def main():
mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir)) mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
# init the logger before other steps # init the logger before other steps
timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
log_file = osp.join(cfg.work_dir, '{}.log'.format(timestamp)) log_file = osp.join(cfg.work_dir, f'{timestamp}.log')
logger = get_root_logger(log_file=log_file, log_level=cfg.log_level) logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)
# add a logging filter # add a logging filter
...@@ -113,28 +118,27 @@ def main(): ...@@ -113,28 +118,27 @@ def main():
meta = dict() meta = dict()
# log env info # log env info
env_info_dict = collect_env() env_info_dict = collect_env()
env_info = '\n'.join([('{}: {}'.format(k, v)) env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()])
for k, v in env_info_dict.items()])
dash_line = '-' * 60 + '\n' dash_line = '-' * 60 + '\n'
logger.info('Environment info:\n' + dash_line + env_info + '\n' + logger.info('Environment info:\n' + dash_line + env_info + '\n' +
dash_line) dash_line)
meta['env_info'] = env_info meta['env_info'] = env_info
# log some basic info # log some basic info
logger.info('Distributed training: {}'.format(distributed)) logger.info(f'Distributed training: {distributed}')
logger.info('Config:\n{}'.format(cfg.text)) logger.info(f'Config:\n{cfg.pretty_text}')
# set random seeds # set random seeds
if args.seed is not None: if args.seed is not None:
logger.info('Set random seed to {}, deterministic: {}'.format( logger.info(f'Set random seed to {args.seed}, '
args.seed, args.deterministic)) f'deterministic: {args.deterministic}')
set_random_seed(args.seed, deterministic=args.deterministic) set_random_seed(args.seed, deterministic=args.deterministic)
cfg.seed = args.seed cfg.seed = args.seed
meta['seed'] = args.seed meta['seed'] = args.seed
model = build_detector( model = build_detector(
cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg) cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg)
logger.info('Model:\n{}'.format(model)) logger.info(f'Model:\n{model}')
datasets = [build_dataset(cfg.data.train)] datasets = [build_dataset(cfg.data.train)]
if len(cfg.workflow) == 2: if len(cfg.workflow) == 2:
val_dataset = copy.deepcopy(cfg.data.val) val_dataset = copy.deepcopy(cfg.data.val)
...@@ -145,7 +149,7 @@ def main(): ...@@ -145,7 +149,7 @@ def main():
# checkpoints as meta data # checkpoints as meta data
cfg.checkpoint_config.meta = dict( cfg.checkpoint_config.meta = dict(
mmdet_version=__version__, mmdet_version=__version__,
config=cfg.text, config=cfg.pretty_text,
CLASSES=datasets[0].CLASSES) CLASSES=datasets[0].CLASSES)
# add an attribute for visualization convenience # add an attribute for visualization convenience
model.CLASSES = datasets[0].CLASSES model.CLASSES = datasets[0].CLASSES
...@@ -154,7 +158,7 @@ def main(): ...@@ -154,7 +158,7 @@ def main():
datasets, datasets,
cfg, cfg,
distributed=distributed, distributed=distributed,
validate=args.validate, validate=(not args.no_validate),
timestamp=timestamp, timestamp=timestamp,
meta=meta) meta=meta)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment