Merge branch 'fix-cuda-file' into 'master'

Fix cuda file See merge request open-mmlab/mmdet.3d!24

Merge branch 'fix-cuda-file' into 'master'
Fix cuda file See merge request open-mmlab/mmdet.3d!24
868c5fab · zhangwenwei · a8e0f664 · a9c2ecb5 · 868c5fab · 868c5fab
Commit 868c5fab authored May 11, 2020 by zhangwenwei
15 changed files
--- a/configs/kitti/dv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py
+++ b/configs/kitti/dv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py
@@ -93,7 +93,8 @@ class_names = ['Car']
 img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
 input_modality = dict(
-    use_lidar=True,
+    use_lidar=False,
+    use_lidar_reduced=True,
    use_depth=False,
    use_lidar_intensity=True,
    use_camera=False,

--- a/configs/kitti/dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class.py
+++ b/configs/kitti/dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class.py
@@ -113,7 +113,8 @@ class_names = ['Pedestrian', 'Cyclist', 'Car']
 img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
 input_modality = dict(
-    use_lidar=True,
+    use_lidar=False,
+    use_lidar_reduced=True,
    use_depth=False,
    use_lidar_intensity=True,
    use_camera=True,

--- a/configs/kitti/dv_second_secfpn_6x8_80e_kitti-3d-car.py
+++ b/configs/kitti/dv_second_secfpn_6x8_80e_kitti-3d-car.py
@@ -91,7 +91,8 @@ class_names = ['Car']
 img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
 input_modality = dict(
-    use_lidar=True,
+    use_lidar=False,
+    use_lidar_reduced=True,
    use_depth=False,
    use_lidar_intensity=True,
    use_camera=True,

--- a/configs/kitti/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py
+++ b/configs/kitti/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py
@@ -90,7 +90,8 @@ class_names = ['Car']
 img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
 input_modality = dict(
-    use_lidar=True,
+    use_lidar=False,
+    use_lidar_reduced=True,
    use_depth=False,
    use_lidar_intensity=True,
    use_camera=False,

--- a/configs/kitti/hv_second_secfpn_6x8_80e_kitti-3d-car.py
+++ b/configs/kitti/hv_second_secfpn_6x8_80e_kitti-3d-car.py
@@ -89,7 +89,8 @@ class_names = ['Car']
 img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
 input_modality = dict(
-    use_lidar=True,
+    use_lidar=False,
+    use_lidar_reduced=True,
    use_depth=False,
    use_lidar_intensity=True,
    use_camera=False,

--- a/mmdet3d/datasets/kitti_dataset.py
+++ b/mmdet3d/datasets/kitti_dataset.py
@@ -184,6 +184,8 @@ class KittiDataset(torch_data.Dataset):
        if self.modality['use_depth'] and self.modality['use_lidar']:
            points = self.get_lidar_depth_reduced(sample_idx)
        elif self.modality['use_lidar']:
+            points = self.get_lidar(sample_idx)
+        elif self.modality['use_lidar_reduced']:
            points = self.get_lidar_reduced(sample_idx)
        elif self.modality['use_depth']:
            points = self.get_pure_depth_reduced(sample_idx)
@@ -238,8 +240,6 @@ class KittiDataset(torch_data.Dataset):
                                      axis=1).astype(np.float32)
        difficulty = annos['difficulty']
        # this change gt_bboxes_3d to velodyne coordinates
-        import pdb
-        pdb.set_trace()
        gt_bboxes_3d = box_np_ops.box_camera_to_lidar(gt_bboxes_3d, rect,
                                                      Trv2c)
        # only center format is allowed. so we need to convert

--- a/mmdet3d/datasets/pipelines/data_augment_utils.py
+++ b/mmdet3d/datasets/pipelines/data_augment_utils.py
+import warnings
+
 import numba
 import numpy as np
+from numba.errors import NumbaPerformanceWarning

 from mmdet3d.core.bbox import box_np_ops

+warnings.filterwarnings("ignore", category=NumbaPerformanceWarning)
+

 @numba.njit
 def _rotation_box2d_jit_(corners, angle, rot_mat_T):

--- a/mmdet3d/ops/roiaware_pool3d/src/points_in_boxes_cuda.cu
+++ b/mmdet3d/ops/roiaware_pool3d/src/points_in_boxes_cuda.cu
@@ -7,8 +7,8 @@
 #include <assert.h>
 #include <math.h>
 #include <stdio.h>
-#include <torch/extension.h>
 #include <torch/serialize/tensor.h>
+#include <torch/types.h>

 #define THREADS_PER_BLOCK 256
 #define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

--- a/mmdet3d/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
+++ b/mmdet3d/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
-//Modified from
-//https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
-//RoI-aware point cloud feature pooling
-//Written by Shaoshuai Shi
-//All Rights Reserved 2019.
+// Modified from
+// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
+// RoI-aware point cloud feature pooling
+// Written by Shaoshuai Shi
+// All Rights Reserved 2019.

-
-#include <torch/serialize/tensor.h>
-#include <torch/extension.h>
 #include <assert.h>
-
 #include <math.h>
 #include <stdio.h>
+#include <torch/serialize/tensor.h>
+#include <torch/types.h>

 #define THREADS_PER_BLOCK 256
-#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
+#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

 // #define DEBUG

-
-__device__ inline void lidar_to_local_coords(float shift_x, float shift_y, float rz, float &local_x, float &local_y){
-    // should rotate pi/2 + alpha to translate LiDAR to local
-    float rot_angle = rz + M_PI / 2;
-    float cosa = cos(rot_angle), sina = sin(rot_angle);
-    local_x = shift_x * cosa + shift_y * (-sina);
-    local_y = shift_x * sina + shift_y * cosa;
+__device__ inline void lidar_to_local_coords(float shift_x, float shift_y,
+                                             float rz, float &local_x,
+                                             float &local_y) {
+  // should rotate pi/2 + alpha to translate LiDAR to local
+  float rot_angle = rz + M_PI / 2;
+  float cosa = cos(rot_angle), sina = sin(rot_angle);
+  local_x = shift_x * cosa + shift_y * (-sina);
+  local_y = shift_x * sina + shift_y * cosa;
 }

-
-__device__ inline int check_pt_in_box3d(const float *pt, const float *box3d, float &local_x, float &local_y){
-    // param pt: (x, y, z)
-    // param box3d: (cx, cy, cz, w, l, h, rz) in LiDAR coordinate, cz in the bottom center
-    float x = pt[0], y = pt[1], z = pt[2];
-    float cx = box3d[0], cy = box3d[1], cz = box3d[2];
-    float w = box3d[3], l = box3d[4], h = box3d[5], rz = box3d[6];
-    cz += h / 2.0;  // shift to the center since cz in box3d is the bottom center
-
-    if (fabsf(z - cz) > h / 2.0) return 0;
-    lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y);
-    float in_flag = (local_x > -l / 2.0) & (local_x < l / 2.0) & (local_y > -w / 2.0) & (local_y < w / 2.0);
-    return in_flag;
+__device__ inline int check_pt_in_box3d(const float *pt, const float *box3d,
+                                        float &local_x, float &local_y) {
+  // param pt: (x, y, z)
+  // param box3d: (cx, cy, cz, w, l, h, rz) in LiDAR coordinate, cz in the
+  // bottom center
+  float x = pt[0], y = pt[1], z = pt[2];
+  float cx = box3d[0], cy = box3d[1], cz = box3d[2];
+  float w = box3d[3], l = box3d[4], h = box3d[5], rz = box3d[6];
+  cz += h / 2.0;  // shift to the center since cz in box3d is the bottom center
+
+  if (fabsf(z - cz) > h / 2.0) return 0;
+  lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y);
+  float in_flag = (local_x > -l / 2.0) & (local_x < l / 2.0) &
+                  (local_y > -w / 2.0) & (local_y < w / 2.0);
+  return in_flag;
 }

-
-__global__ void generate_pts_mask_for_box3d(int boxes_num, int pts_num, int out_x, int out_y, int out_z,
-    const float *rois, const float *pts, int *pts_mask){
-    // params rois: (N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate
-    // params pts: (npoints, 3) [x, y, z]
-    // params pts_mask: (N, npoints): -1 means point doesnot in this box, otherwise: encode (x_idxs, y_idxs, z_idxs) by binary bit
-    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
-    int box_idx = blockIdx.y;
-    if (pt_idx >= pts_num || box_idx >= boxes_num) return;
-
-    pts += pt_idx * 3;
-    rois += box_idx * 7;
-    pts_mask += box_idx * pts_num + pt_idx;
-
-    float local_x = 0, local_y = 0;
-    int cur_in_flag = check_pt_in_box3d(pts, rois, local_x, local_y);
-
-    pts_mask[0] = -1;
-    if (cur_in_flag > 0){
-        float local_z = pts[2] - rois[2];
-        float w = rois[3], l = rois[4], h = rois[5];
-
-        float x_res = l / out_x;
-        float y_res = w / out_y;
-        float z_res = h / out_z;
-
-        unsigned int x_idx = int((local_x + l / 2) / x_res);
-        unsigned int y_idx = int((local_y + w / 2) / y_res);
-        unsigned int z_idx = int(local_z / z_res);
-
-        x_idx = min(max(x_idx, 0), out_x - 1);
-        y_idx = min(max(y_idx, 0), out_y - 1);
-        z_idx = min(max(z_idx, 0), out_z - 1);
-
-        unsigned int idx_encoding = (x_idx << 16) + (y_idx << 8) + z_idx;
+__global__ void generate_pts_mask_for_box3d(int boxes_num, int pts_num,
+                                            int out_x, int out_y, int out_z,
+                                            const float *rois, const float *pts,
+                                            int *pts_mask) {
+  // params rois: (N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate
+  // params pts: (npoints, 3) [x, y, z]
+  // params pts_mask: (N, npoints): -1 means point doesnot in this box,
+  // otherwise: encode (x_idxs, y_idxs, z_idxs) by binary bit
+  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  int box_idx = blockIdx.y;
+  if (pt_idx >= pts_num || box_idx >= boxes_num) return;
+
+  pts += pt_idx * 3;
+  rois += box_idx * 7;
+  pts_mask += box_idx * pts_num + pt_idx;
+
+  float local_x = 0, local_y = 0;
+  int cur_in_flag = check_pt_in_box3d(pts, rois, local_x, local_y);
+
+  pts_mask[0] = -1;
+  if (cur_in_flag > 0) {
+    float local_z = pts[2] - rois[2];
+    float w = rois[3], l = rois[4], h = rois[5];
+
+    float x_res = l / out_x;
+    float y_res = w / out_y;
+    float z_res = h / out_z;
+
+    unsigned int x_idx = int((local_x + l / 2) / x_res);
+    unsigned int y_idx = int((local_y + w / 2) / y_res);
+    unsigned int z_idx = int(local_z / z_res);
+
+    x_idx = min(max(x_idx, 0), out_x - 1);
+    y_idx = min(max(y_idx, 0), out_y - 1);
+    z_idx = min(max(z_idx, 0), out_z - 1);
+
+    unsigned int idx_encoding = (x_idx << 16) + (y_idx << 8) + z_idx;
 #ifdef DEBUG
-        printf("mask: pts_%d(%.3f, %.3f, %.3f), local(%.3f, %.3f, %.3f), idx(%d, %d, %d), res(%.3f, %.3f, %.3f), idx_encoding=%x\n",
-            pt_idx, pts[0], pts[1], pts[2], local_x, local_y, local_z, x_idx, y_idx, z_idx, x_res, y_res, z_res, idx_encoding);
+    printf(
+        "mask: pts_%d(%.3f, %.3f, %.3f), local(%.3f, %.3f, %.3f), idx(%d, %d, "
+        "%d), res(%.3f, %.3f, %.3f), idx_encoding=%x\n",
+        pt_idx, pts[0], pts[1], pts[2], local_x, local_y, local_z, x_idx, y_idx,
+        z_idx, x_res, y_res, z_res, idx_encoding);
 #endif

-        pts_mask[0] = idx_encoding;
-    }
+    pts_mask[0] = idx_encoding;
+  }
 }

-
-__global__ void collect_inside_pts_for_box3d(int boxes_num, int pts_num, int max_pts_each_voxel,
-    int out_x, int out_y, int out_z, const int *pts_mask, int *pts_idx_of_voxels){
-    // params pts_mask: (N, npoints)  0 or 1
-    // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
-
-    int box_idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (box_idx >= boxes_num) return;
-
-    int max_num_pts = max_pts_each_voxel - 1;  // index 0 is the counter
-    pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel;
-
-    for (int k = 0; k < pts_num; k++){
-        if (pts_mask[box_idx * pts_num + k] != -1){
-            unsigned int idx_encoding = pts_mask[box_idx * pts_num + k];
-            unsigned int x_idx = (idx_encoding >> 16) & 0xFF;
-            unsigned int y_idx = (idx_encoding >> 8) & 0xFF;
-            unsigned int z_idx = idx_encoding & 0xFF;
-            unsigned int base_offset = x_idx * out_y * out_z * max_pts_each_voxel + y_idx * out_z * max_pts_each_voxel + z_idx * max_pts_each_voxel;
-            unsigned int cnt = pts_idx_of_voxels[base_offset];
-            if (cnt < max_num_pts){
-                pts_idx_of_voxels[base_offset + cnt + 1] = k;
-                pts_idx_of_voxels[base_offset]++;
-            }
+__global__ void collect_inside_pts_for_box3d(int boxes_num, int pts_num,
+                                             int max_pts_each_voxel, int out_x,
+                                             int out_y, int out_z,
+                                             const int *pts_mask,
+                                             int *pts_idx_of_voxels) {
+  // params pts_mask: (N, npoints)  0 or 1
+  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
+
+  int box_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (box_idx >= boxes_num) return;
+
+  int max_num_pts = max_pts_each_voxel - 1;  // index 0 is the counter
+  pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel;
+
+  for (int k = 0; k < pts_num; k++) {
+    if (pts_mask[box_idx * pts_num + k] != -1) {
+      unsigned int idx_encoding = pts_mask[box_idx * pts_num + k];
+      unsigned int x_idx = (idx_encoding >> 16) & 0xFF;
+      unsigned int y_idx = (idx_encoding >> 8) & 0xFF;
+      unsigned int z_idx = idx_encoding & 0xFF;
+      unsigned int base_offset = x_idx * out_y * out_z * max_pts_each_voxel +
+                                 y_idx * out_z * max_pts_each_voxel +
+                                 z_idx * max_pts_each_voxel;
+      unsigned int cnt = pts_idx_of_voxels[base_offset];
+      if (cnt < max_num_pts) {
+        pts_idx_of_voxels[base_offset + cnt + 1] = k;
+        pts_idx_of_voxels[base_offset]++;
+      }
 #ifdef DEBUG
-        printf("collect: pts_%d, idx(%d, %d, %d), idx_encoding=%x\n",
-            k, x_idx, y_idx, z_idx, idx_encoding);
+      printf("collect: pts_%d, idx(%d, %d, %d), idx_encoding=%x\n", k, x_idx,
+             y_idx, z_idx, idx_encoding);
 #endif
-
-        }
    }
+  }
 }

-
-__global__ void roiaware_maxpool3d(int boxes_num, int pts_num, int channels, int max_pts_each_voxel, int out_x,
-    int out_y, int out_z, const float *pts_feature, const int *pts_idx_of_voxels, float *pooled_features, int *argmax){
-    // params pts_feature: (npoints, C)
-    // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel), index 0 is the counter
-    // params pooled_features: (N, out_x, out_y, out_z, C)
-    // params argmax: (N, out_x, out_y, out_z, C)
-
-    int box_idx = blockIdx.z;
-    int channel_idx = blockIdx.y;
-    int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
-
-    int x_idx = voxel_idx_flat / (out_y * out_z);
-    int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
-    int z_idx = voxel_idx_flat % out_z;
-    if (box_idx >= boxes_num || channel_idx >= channels|| x_idx >= out_x || y_idx >= out_y || z_idx >= out_z) return;
+__global__ void roiaware_maxpool3d(int boxes_num, int pts_num, int channels,
+                                   int max_pts_each_voxel, int out_x, int out_y,
+                                   int out_z, const float *pts_feature,
+                                   const int *pts_idx_of_voxels,
+                                   float *pooled_features, int *argmax) {
+  // params pts_feature: (npoints, C)
+  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel),
+  // index 0 is the counter params pooled_features: (N, out_x, out_y, out_z, C)
+  // params argmax: (N, out_x, out_y, out_z, C)
+
+  int box_idx = blockIdx.z;
+  int channel_idx = blockIdx.y;
+  int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
+
+  int x_idx = voxel_idx_flat / (out_y * out_z);
+  int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
+  int z_idx = voxel_idx_flat % out_z;
+  if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
+      y_idx >= out_y || z_idx >= out_z)
+    return;

 #ifdef DEBUG
-    printf("src pts_idx_of_voxels: (%p, ), argmax: %p\n", pts_idx_of_voxels, argmax);
+  printf("src pts_idx_of_voxels: (%p, ), argmax: %p\n", pts_idx_of_voxels,
+         argmax);
 #endif

-    int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
-    pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + offset_base * max_pts_each_voxel;
-    pooled_features += box_idx * out_x * out_y * out_z * channels + offset_base * channels + channel_idx;
-    argmax += box_idx * out_x * out_y * out_z * channels + offset_base * channels + channel_idx;
+  int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
+  pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
+                       offset_base * max_pts_each_voxel;
+  pooled_features += box_idx * out_x * out_y * out_z * channels +
+                     offset_base * channels + channel_idx;
+  argmax += box_idx * out_x * out_y * out_z * channels +
+            offset_base * channels + channel_idx;

-    int argmax_idx = -1;
-    float max_val = -1e50;
+  int argmax_idx = -1;
+  float max_val = -1e50;

-    int total_pts = pts_idx_of_voxels[0];
+  int total_pts = pts_idx_of_voxels[0];

-    for (int k = 1; k <= total_pts; k++){
-        if (pts_feature[pts_idx_of_voxels[k] * channels + channel_idx] > max_val){
-            max_val = pts_feature[pts_idx_of_voxels[k] * channels + channel_idx];
-            argmax_idx = pts_idx_of_voxels[k];
-        }
+  for (int k = 1; k <= total_pts; k++) {
+    if (pts_feature[pts_idx_of_voxels[k] * channels + channel_idx] > max_val) {
+      max_val = pts_feature[pts_idx_of_voxels[k] * channels + channel_idx];
+      argmax_idx = pts_idx_of_voxels[k];
    }
+  }

-    if (argmax_idx != -1){
-        pooled_features[0] = max_val;
-    }
-    argmax[0] = argmax_idx;
+  if (argmax_idx != -1) {
+    pooled_features[0] = max_val;
+  }
+  argmax[0] = argmax_idx;

 #ifdef DEBUG
-    printf("channel_%d idx(%d, %d, %d), argmax_idx=(%d, %.3f), total=%d, after pts_idx: %p, argmax: (%p, %d)\n",
-        channel_idx, x_idx, y_idx, z_idx, argmax_idx, max_val, total_pts, pts_idx_of_voxels, argmax, argmax_idx);
+  printf(
+      "channel_%d idx(%d, %d, %d), argmax_idx=(%d, %.3f), total=%d, after "
+      "pts_idx: %p, argmax: (%p, %d)\n",
+      channel_idx, x_idx, y_idx, z_idx, argmax_idx, max_val, total_pts,
+      pts_idx_of_voxels, argmax, argmax_idx);
 #endif
 }

-
-__global__ void roiaware_avgpool3d(int boxes_num, int pts_num, int channels, int max_pts_each_voxel, int out_x,
-    int out_y, int out_z, const float *pts_feature, const int *pts_idx_of_voxels, float *pooled_features){
-    // params pts_feature: (npoints, C)
-    // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel), index 0 is the counter
-    // params pooled_features: (N, out_x, out_y, out_z, C)
-    // params argmax: (N, out_x, out_y, out_z, C)
-
-    int box_idx = blockIdx.z;
-    int channel_idx = blockIdx.y;
-    int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
-
-    int x_idx = voxel_idx_flat / (out_y * out_z);
-    int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
-    int z_idx = voxel_idx_flat % out_z;
-    if (box_idx >= boxes_num || channel_idx >= channels|| x_idx >= out_x || y_idx >= out_y || z_idx >= out_z) return;
-
-    int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
-    pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + offset_base * max_pts_each_voxel;
-    pooled_features += box_idx * out_x * out_y * out_z * channels + offset_base * channels + channel_idx;
-
-    float sum_val = 0;
-    int total_pts = pts_idx_of_voxels[0];
-
-    for (int k = 1; k <= total_pts; k++){
-        sum_val += pts_feature[pts_idx_of_voxels[k] * channels + channel_idx];
-    }
-
-    if (total_pts > 0){
-        pooled_features[0] = sum_val / total_pts;
-    }
+__global__ void roiaware_avgpool3d(int boxes_num, int pts_num, int channels,
+                                   int max_pts_each_voxel, int out_x, int out_y,
+                                   int out_z, const float *pts_feature,
+                                   const int *pts_idx_of_voxels,
+                                   float *pooled_features) {
+  // params pts_feature: (npoints, C)
+  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel),
+  // index 0 is the counter params pooled_features: (N, out_x, out_y, out_z, C)
+  // params argmax: (N, out_x, out_y, out_z, C)
+
+  int box_idx = blockIdx.z;
+  int channel_idx = blockIdx.y;
+  int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
+
+  int x_idx = voxel_idx_flat / (out_y * out_z);
+  int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
+  int z_idx = voxel_idx_flat % out_z;
+  if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
+      y_idx >= out_y || z_idx >= out_z)
+    return;
+
+  int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
+  pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
+                       offset_base * max_pts_each_voxel;
+  pooled_features += box_idx * out_x * out_y * out_z * channels +
+                     offset_base * channels + channel_idx;
+
+  float sum_val = 0;
+  int total_pts = pts_idx_of_voxels[0];
+
+  for (int k = 1; k <= total_pts; k++) {
+    sum_val += pts_feature[pts_idx_of_voxels[k] * channels + channel_idx];
+  }
+
+  if (total_pts > 0) {
+    pooled_features[0] = sum_val / total_pts;
+  }
 }

-
-
-void roiaware_pool3d_launcher(int boxes_num, int pts_num, int channels, int max_pts_each_voxel, int out_x, int out_y, int out_z,
-    const float *rois, const float *pts, const float *pts_feature, int *argmax, int *pts_idx_of_voxels, float *pooled_features, int pool_method){
-    // params rois: (N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate
-    // params pts: (npoints, 3) [x, y, z] in LiDAR coordinate
-    // params pts_feature: (npoints, C)
-    // params argmax: (N, out_x, out_y, out_z, C)
-    // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
-    // params pooled_features: (N, out_x, out_y, out_z, C)
-    // params pool_method: 0: max_pool 1: avg_pool
-
-    int *pts_mask = NULL;
-    cudaMalloc(&pts_mask, boxes_num * pts_num * sizeof(int));  // (N, M)
-    cudaMemset(pts_mask, -1, boxes_num * pts_num * sizeof(int));
-
-    dim3 blocks_mask(DIVUP(pts_num, THREADS_PER_BLOCK), boxes_num);
-    dim3 threads(THREADS_PER_BLOCK);
-    generate_pts_mask_for_box3d<<<blocks_mask, threads>>>(boxes_num, pts_num, out_x, out_y, out_z, rois, pts, pts_mask);
-
-    // TODO: Merge the collect and pool functions, SS
-
-    dim3 blocks_collect(DIVUP(boxes_num, THREADS_PER_BLOCK));
-    collect_inside_pts_for_box3d<<<blocks_collect, threads>>>(boxes_num, pts_num, max_pts_each_voxel,
-        out_x, out_y, out_z, pts_mask, pts_idx_of_voxels);
-
-    dim3 blocks_pool(DIVUP(out_x * out_y * out_z, THREADS_PER_BLOCK), channels, boxes_num);
-    if (pool_method == 0){
-        roiaware_maxpool3d<<<blocks_pool, threads>>>(boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z,
-            pts_feature, pts_idx_of_voxels, pooled_features, argmax);
-    }
-    else if (pool_method == 1){
-        roiaware_avgpool3d<<<blocks_pool, threads>>>(boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z,
-            pts_feature, pts_idx_of_voxels, pooled_features);
-    }
-
-
-    cudaFree(pts_mask);
+void roiaware_pool3d_launcher(int boxes_num, int pts_num, int channels,
+                              int max_pts_each_voxel, int out_x, int out_y,
+                              int out_z, const float *rois, const float *pts,
+                              const float *pts_feature, int *argmax,
+                              int *pts_idx_of_voxels, float *pooled_features,
+                              int pool_method) {
+  // params rois: (N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate
+  // params pts: (npoints, 3) [x, y, z] in LiDAR coordinate
+  // params pts_feature: (npoints, C)
+  // params argmax: (N, out_x, out_y, out_z, C)
+  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
+  // params pooled_features: (N, out_x, out_y, out_z, C)
+  // params pool_method: 0: max_pool 1: avg_pool
+
+  int *pts_mask = NULL;
+  cudaMalloc(&pts_mask, boxes_num * pts_num * sizeof(int));  // (N, M)
+  cudaMemset(pts_mask, -1, boxes_num * pts_num * sizeof(int));
+
+  dim3 blocks_mask(DIVUP(pts_num, THREADS_PER_BLOCK), boxes_num);
+  dim3 threads(THREADS_PER_BLOCK);
+  generate_pts_mask_for_box3d<<<blocks_mask, threads>>>(
+      boxes_num, pts_num, out_x, out_y, out_z, rois, pts, pts_mask);
+
+  // TODO: Merge the collect and pool functions, SS
+
+  dim3 blocks_collect(DIVUP(boxes_num, THREADS_PER_BLOCK));
+  collect_inside_pts_for_box3d<<<blocks_collect, threads>>>(
+      boxes_num, pts_num, max_pts_each_voxel, out_x, out_y, out_z, pts_mask,
+      pts_idx_of_voxels);
+
+  dim3 blocks_pool(DIVUP(out_x * out_y * out_z, THREADS_PER_BLOCK), channels,
+                   boxes_num);
+  if (pool_method == 0) {
+    roiaware_maxpool3d<<<blocks_pool, threads>>>(
+        boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z,
+        pts_feature, pts_idx_of_voxels, pooled_features, argmax);
+  } else if (pool_method == 1) {
+    roiaware_avgpool3d<<<blocks_pool, threads>>>(
+        boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z,
+        pts_feature, pts_idx_of_voxels, pooled_features);
+  }
+
+  cudaFree(pts_mask);

 #ifdef DEBUG
-    cudaDeviceSynchronize();  // for using printf in kernel function
+  cudaDeviceSynchronize();  // for using printf in kernel function
 #endif
 }

-
-__global__ void roiaware_maxpool3d_backward(int boxes_num, int channels, int out_x, int out_y, int out_z,
-    const int *argmax, const float *grad_out, float *grad_in){
-    // params argmax: (N, out_x, out_y, out_z, C)
-    // params grad_out: (N, out_x, out_y, out_z, C)
-    // params grad_in: (npoints, C), return value
-
-    int box_idx = blockIdx.z;
-    int channel_idx = blockIdx.y;
-    int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
-
-    int x_idx = voxel_idx_flat / (out_y * out_z);
-    int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
-    int z_idx = voxel_idx_flat % out_z;
-    if (box_idx >= boxes_num || channel_idx >= channels|| x_idx >= out_x || y_idx >= out_y || z_idx >= out_z) return;
-
-    int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
-    argmax += box_idx * out_x * out_y * out_z * channels + offset_base * channels + channel_idx;
-    grad_out += box_idx * out_x * out_y * out_z * channels + offset_base * channels + channel_idx;
-
-    if (argmax[0] == -1) return;
-
-    atomicAdd(grad_in + argmax[0] * channels + channel_idx, grad_out[0] * 1);
+__global__ void roiaware_maxpool3d_backward(int boxes_num, int channels,
+                                            int out_x, int out_y, int out_z,
+                                            const int *argmax,
+                                            const float *grad_out,
+                                            float *grad_in) {
+  // params argmax: (N, out_x, out_y, out_z, C)
+  // params grad_out: (N, out_x, out_y, out_z, C)
+  // params grad_in: (npoints, C), return value
+
+  int box_idx = blockIdx.z;
+  int channel_idx = blockIdx.y;
+  int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
+
+  int x_idx = voxel_idx_flat / (out_y * out_z);
+  int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
+  int z_idx = voxel_idx_flat % out_z;
+  if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
+      y_idx >= out_y || z_idx >= out_z)
+    return;
+
+  int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
+  argmax += box_idx * out_x * out_y * out_z * channels +
+            offset_base * channels + channel_idx;
+  grad_out += box_idx * out_x * out_y * out_z * channels +
+              offset_base * channels + channel_idx;
+
+  if (argmax[0] == -1) return;
+
+  atomicAdd(grad_in + argmax[0] * channels + channel_idx, grad_out[0] * 1);
 }

-
-__global__ void roiaware_avgpool3d_backward(int boxes_num, int channels, int out_x, int out_y, int out_z,
-    int max_pts_each_voxel, const int *pts_idx_of_voxels, const float *grad_out, float *grad_in){
-    // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
-    // params grad_out: (N, out_x, out_y, out_z, C)
-    // params grad_in: (npoints, C), return value
-
-    int box_idx = blockIdx.z;
-    int channel_idx = blockIdx.y;
-    int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
-
-    int x_idx = voxel_idx_flat / (out_y * out_z);
-    int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
-    int z_idx = voxel_idx_flat % out_z;
-    if (box_idx >= boxes_num || channel_idx >= channels|| x_idx >= out_x || y_idx >= out_y || z_idx >= out_z) return;
-
-    int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
-    pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + offset_base * max_pts_each_voxel;
-    grad_out += box_idx * out_x * out_y * out_z * channels + offset_base * channels + channel_idx;
-
-
-    int total_pts = pts_idx_of_voxels[0];
-    float cur_grad = 1 / fmaxf(float(total_pts), 1.0);
-    for (int k = 1; k <= total_pts; k++){
-        atomicAdd(grad_in + pts_idx_of_voxels[k] * channels + channel_idx, grad_out[0] * cur_grad);
-    }
+__global__ void roiaware_avgpool3d_backward(int boxes_num, int channels,
+                                            int out_x, int out_y, int out_z,
+                                            int max_pts_each_voxel,
+                                            const int *pts_idx_of_voxels,
+                                            const float *grad_out,
+                                            float *grad_in) {
+  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
+  // params grad_out: (N, out_x, out_y, out_z, C)
+  // params grad_in: (npoints, C), return value
+
+  int box_idx = blockIdx.z;
+  int channel_idx = blockIdx.y;
+  int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
+
+  int x_idx = voxel_idx_flat / (out_y * out_z);
+  int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
+  int z_idx = voxel_idx_flat % out_z;
+  if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
+      y_idx >= out_y || z_idx >= out_z)
+    return;
+
+  int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
+  pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
+                       offset_base * max_pts_each_voxel;
+  grad_out += box_idx * out_x * out_y * out_z * channels +
+              offset_base * channels + channel_idx;
+
+  int total_pts = pts_idx_of_voxels[0];
+  float cur_grad = 1 / fmaxf(float(total_pts), 1.0);
+  for (int k = 1; k <= total_pts; k++) {
+    atomicAdd(grad_in + pts_idx_of_voxels[k] * channels + channel_idx,
+              grad_out[0] * cur_grad);
+  }
 }

-
-
-void roiaware_pool3d_backward_launcher(int boxes_num, int out_x, int out_y, int out_z, int channels, int max_pts_each_voxel,
-    const int *pts_idx_of_voxels, const int *argmax, const float *grad_out, float *grad_in, int pool_method){
-    // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
-    // params argmax: (N, out_x, out_y, out_z, C)
-    // params grad_out: (N, out_x, out_y, out_z, C)
-    // params grad_in: (npoints, C), return value
-    // params pool_method: 0: max_pool, 1: avg_pool
-
-    dim3 blocks(DIVUP(out_x * out_y * out_z, THREADS_PER_BLOCK), channels, boxes_num);
-    dim3 threads(THREADS_PER_BLOCK);
-    if (pool_method == 0){
-        roiaware_maxpool3d_backward<<<blocks, threads>>>(
-            boxes_num, channels, out_x, out_y, out_z, argmax, grad_out, grad_in
-        );
-    }
-    else if (pool_method == 1){
-        roiaware_avgpool3d_backward<<<blocks, threads>>>(
-            boxes_num, channels, out_x, out_y, out_z, max_pts_each_voxel, pts_idx_of_voxels, grad_out, grad_in
-        );
-    }
-
+void roiaware_pool3d_backward_launcher(int boxes_num, int out_x, int out_y,
+                                       int out_z, int channels,
+                                       int max_pts_each_voxel,
+                                       const int *pts_idx_of_voxels,
+                                       const int *argmax, const float *grad_out,
+                                       float *grad_in, int pool_method) {
+  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
+  // params argmax: (N, out_x, out_y, out_z, C)
+  // params grad_out: (N, out_x, out_y, out_z, C)
+  // params grad_in: (npoints, C), return value
+  // params pool_method: 0: max_pool, 1: avg_pool
+
+  dim3 blocks(DIVUP(out_x * out_y * out_z, THREADS_PER_BLOCK), channels,
+              boxes_num);
+  dim3 threads(THREADS_PER_BLOCK);
+  if (pool_method == 0) {
+    roiaware_maxpool3d_backward<<<blocks, threads>>>(
+        boxes_num, channels, out_x, out_y, out_z, argmax, grad_out, grad_in);
+  } else if (pool_method == 1) {
+    roiaware_avgpool3d_backward<<<blocks, threads>>>(
+        boxes_num, channels, out_x, out_y, out_z, max_pts_each_voxel,
+        pts_idx_of_voxels, grad_out, grad_in);
+  }
 }
--- a/mmdet3d/ops/sparse_block.py
+++ b/mmdet3d/ops/sparse_block.py
 from mmcv.cnn import build_norm_layer
 from torch import nn

-import mmdet3d.ops.spconv as spconv
 from mmdet.models.backbones.resnet import BasicBlock, Bottleneck
+from . import spconv


 def conv3x3(in_planes, out_planes, stride=1, indice_key=None):

--- a/tools/dist_test.sh
+++ b/tools/dist_test.sh
+#!/usr/bin/env bash
+
+CONFIG=$1
+CHECKPOINT=$2
+GPUS=$3
+PORT=${PORT:-29500}
+
+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
+python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
+    $(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4}
--- a/tools/dist_train.sh
+++ b/tools/dist_train.sh
 #!/usr/bin/env bash

-PYTHON=${PYTHON:-"python"}
-
 CONFIG=$1
 GPUS=$2
+PORT=${PORT:-29500}

-$PYTHON -m torch.distributed.launch --nproc_per_node=$GPUS \
+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
+python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
    $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3}
--- a/tools/slurm_test.sh
+++ b/tools/slurm_test.sh
 #!/usr/bin/env bash

 set -x
-export PYTHONPATH=`pwd`:$PYTHONPATH

 PARTITION=$1
 JOB_NAME=$2
@@ -9,14 +8,17 @@ CONFIG=$3
 CHECKPOINT=$4
 GPUS=${GPUS:-8}
 GPUS_PER_NODE=${GPUS_PER_NODE:-8}
+CPUS_PER_TASK=${CPUS_PER_TASK:-5}
 PY_ARGS=${@:5}
 SRUN_ARGS=${SRUN_ARGS:-""}

+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
 srun -p ${PARTITION} \
    --job-name=${JOB_NAME} \
    --gres=gpu:${GPUS_PER_NODE} \
    --ntasks=${GPUS} \
    --ntasks-per-node=${GPUS_PER_NODE} \
+    --cpus-per-task=${CPUS_PER_TASK} \
    --kill-on-bad-exit=1 \
    ${SRUN_ARGS} \
    python -u tools/test.py ${CONFIG} ${CHECKPOINT} --launcher="slurm" ${PY_ARGS}
--- a/tools/slurm_train.sh
+++ b/tools/slurm_train.sh
@@ -8,15 +8,17 @@ CONFIG=$3
 WORK_DIR=$4
 GPUS=${GPUS:-8}
 GPUS_PER_NODE=${GPUS_PER_NODE:-8}
+CPUS_PER_TASK=${CPUS_PER_TASK:-5}
 SRUN_ARGS=${SRUN_ARGS:-""}
-PY_ARGS=${PY_ARGS:-"--validate"}
-
+PY_ARGS=${@:5}

+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
 srun -p ${PARTITION} \
    --job-name=${JOB_NAME} \
    --gres=gpu:${GPUS_PER_NODE} \
    --ntasks=${GPUS} \
    --ntasks-per-node=${GPUS_PER_NODE} \
+    --cpus-per-task=${CPUS_PER_TASK} \
    --kill-on-bad-exit=1 \
    ${SRUN_ARGS} \
    python -u tools/train.py ${CONFIG} --work-dir=${WORK_DIR} --launcher="slurm" ${PY_ARGS}
--- a/tools/train.py
+++ b/tools/train.py
@@ -8,7 +8,7 @@ import time

 import mmcv
 import torch
-from mmcv import Config
+from mmcv import Config, DictAction
 from mmcv.runner import init_dist

 from mmdet3d import __version__
@@ -26,9 +26,9 @@ def parse_args():
    parser.add_argument(
        '--resume-from', help='the checkpoint file to resume from')
    parser.add_argument(
-        '--validate',
+        '--no-validate',
        action='store_true',
-        help='whether to evaluate the checkpoint during training')
+        help='whether not to evaluate the checkpoint during training')
    group_gpus = parser.add_mutually_exclusive_group()
    group_gpus.add_argument(
        '--gpus',
@@ -46,6 +46,8 @@ def parse_args():
        '--deterministic',
        action='store_true',
        help='whether to set deterministic options for CUDNN backend.')
+    parser.add_argument(
+        '--options', nargs='+', action=DictAction, help='arguments in dict')
    parser.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
@@ -67,6 +69,9 @@ def main():
    args = parse_args()

    cfg = Config.fromfile(args.config)
+    if args.options is not None:
+        cfg.merge_from_dict(args.options)
+
    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True
@@ -101,7 +106,7 @@ def main():
    mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
    # init the logger before other steps
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
-    log_file = osp.join(cfg.work_dir, '{}.log'.format(timestamp))
+    log_file = osp.join(cfg.work_dir, f'{timestamp}.log')
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)

    # add a logging filter
@@ -113,28 +118,27 @@ def main():
    meta = dict()
    # log env info
    env_info_dict = collect_env()
-    env_info = '\n'.join([('{}: {}'.format(k, v))
-                          for k, v in env_info_dict.items()])
+    env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()])
    dash_line = '-' * 60 + '\n'
    logger.info('Environment info:\n' + dash_line + env_info + '\n' +
                dash_line)
    meta['env_info'] = env_info

    # log some basic info
-    logger.info('Distributed training: {}'.format(distributed))
-    logger.info('Config:\n{}'.format(cfg.text))
+    logger.info(f'Distributed training: {distributed}')
+    logger.info(f'Config:\n{cfg.pretty_text}')

    # set random seeds
    if args.seed is not None:
-        logger.info('Set random seed to {}, deterministic: {}'.format(
-            args.seed, args.deterministic))
+        logger.info(f'Set random seed to {args.seed}, '
+                    f'deterministic: {args.deterministic}')
        set_random_seed(args.seed, deterministic=args.deterministic)
    cfg.seed = args.seed
    meta['seed'] = args.seed

    model = build_detector(
        cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg)
-    logger.info('Model:\n{}'.format(model))
+    logger.info(f'Model:\n{model}')
    datasets = [build_dataset(cfg.data.train)]
    if len(cfg.workflow) == 2:
        val_dataset = copy.deepcopy(cfg.data.val)
@@ -145,7 +149,7 @@ def main():
        # checkpoints as meta data
        cfg.checkpoint_config.meta = dict(
            mmdet_version=__version__,
-            config=cfg.text,
+            config=cfg.pretty_text,
            CLASSES=datasets[0].CLASSES)
    # add an attribute for visualization convenience
    model.CLASSES = datasets[0].CLASSES
@@ -154,7 +158,7 @@ def main():
        datasets,
        cfg,
        distributed=distributed,
-        validate=args.validate,
+        validate=(not args.no_validate),
        timestamp=timestamp,
        meta=meta)