Merge pull request #192 from sshaoshuai/master

Release OpenPCDet v0.3.0

Merge pull request #192 from sshaoshuai/master
Release OpenPCDet v0.3.0
32567b04 · Shaoshuai Shi · GitHub · 853b759b · 04e0d4f0 · 32567b04
Unverified Commit 32567b04 authored Jul 30, 2020 by Shaoshuai Shi Committed by GitHub Jul 30, 2020
20 changed files
--- a/pcdet/ops/pointnet2/pointnet2_stack/src/sampling_gpu.h
+++ b/pcdet/ops/pointnet2/pointnet2_stack/src/sampling_gpu.h
@@ -10,6 +10,6 @@ int furthest_point_sampling_wrapper(int b, int n, int m,
    at::Tensor points_tensor, at::Tensor temp_tensor, at::Tensor idx_tensor);

 void furthest_point_sampling_kernel_launcher(int b, int n, int m,
-    const float *dataset, float *temp, int *idxs, cudaStream_t stream);
+    const float *dataset, float *temp, int *idxs);

 #endif
--- a/pcdet/ops/roiaware_pool3d/roiaware_pool3d_utils.py
+++ b/pcdet/ops/roiaware_pool3d/roiaware_pool3d_utils.py
 import torch
 import torch.nn as nn
 from torch.autograd import Function
+
 from ...utils import common_utils
 from . import roiaware_pool3d_cuda


--- a/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d.cpp
+++ b/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d.cpp
@@ -11,9 +11,9 @@ All Rights Reserved 2019-2020.
 #include <assert.h>


-#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
-#define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x, " must be contiguous ")
-#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
+//#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
+//#define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x, " must be contiguous ")
+//#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)


 void roiaware_pool3d_launcher(int boxes_num, int pts_num, int channels, int max_pts_each_voxel,
@@ -36,12 +36,12 @@ int roiaware_pool3d_gpu(at::Tensor rois, at::Tensor pts, at::Tensor pts_feature,
    // params pooled_features: (N, out_x, out_y, out_z, C)
    // params pool_method: 0: max_pool 1: avg_pool

-    CHECK_INPUT(rois);
-    CHECK_INPUT(pts);
-    CHECK_INPUT(pts_feature);
-    CHECK_INPUT(argmax);
-    CHECK_INPUT(pts_idx_of_voxels);
-    CHECK_INPUT(pooled_features);
+//    CHECK_INPUT(rois);
+//    CHECK_INPUT(pts);
+//    CHECK_INPUT(pts_feature);
+//    CHECK_INPUT(argmax);
+//    CHECK_INPUT(pts_idx_of_voxels);
+//    CHECK_INPUT(pooled_features);

    int boxes_num = rois.size(0);
    int pts_num = pts.size(0);
@@ -72,10 +72,10 @@ int roiaware_pool3d_gpu_backward(at::Tensor pts_idx_of_voxels, at::Tensor argmax
    // params grad_in: (npoints, C), return value
    // params pool_method: 0: max_pool 1: avg_pool

-    CHECK_INPUT(pts_idx_of_voxels);
-    CHECK_INPUT(argmax);
-    CHECK_INPUT(grad_out);
-    CHECK_INPUT(grad_in);
+//    CHECK_INPUT(pts_idx_of_voxels);
+//    CHECK_INPUT(argmax);
+//    CHECK_INPUT(grad_out);
+//    CHECK_INPUT(grad_in);

    int boxes_num = pts_idx_of_voxels.size(0);
    int out_x = pts_idx_of_voxels.size(1);
@@ -100,9 +100,9 @@ int points_in_boxes_gpu(at::Tensor boxes_tensor, at::Tensor pts_tensor, at::Tens
    // params pts: (B, npoints, 3) [x, y, z]
    // params boxes_idx_of_points: (B, npoints), default -1

-    CHECK_INPUT(boxes_tensor);
-    CHECK_INPUT(pts_tensor);
-    CHECK_INPUT(box_idx_of_points_tensor);
+//    CHECK_INPUT(boxes_tensor);
+//    CHECK_INPUT(pts_tensor);
+//    CHECK_INPUT(box_idx_of_points_tensor);

    int batch_size = boxes_tensor.size(0);
    int boxes_num = boxes_tensor.size(1);
@@ -145,9 +145,9 @@ int points_in_boxes_cpu(at::Tensor boxes_tensor, at::Tensor pts_tensor, at::Tens
    // params pts: (num_points, 3) [x, y, z]
    // params pts_indices: (N, num_points)

-    CHECK_CONTIGUOUS(boxes_tensor);
-    CHECK_CONTIGUOUS(pts_tensor);
-    CHECK_CONTIGUOUS(pts_indices_tensor);
+//    CHECK_CONTIGUOUS(boxes_tensor);
+//    CHECK_CONTIGUOUS(pts_tensor);
+//    CHECK_CONTIGUOUS(pts_indices_tensor);

    int boxes_num = boxes_tensor.size(0);
    int pts_num = pts_tensor.size(0);

--- a/pcdet/ops/roipoint_pool3d/roipoint_pool3d_utils.py
+++ b/pcdet/ops/roipoint_pool3d/roipoint_pool3d_utils.py
+import torch
+import torch.nn as nn
+from torch.autograd import Function
+
+from ...utils import box_utils
+from . import roipoint_pool3d_cuda
+
+
+class RoIPointPool3d(nn.Module):
+    """Thin nn.Module wrapper that stores the pooling settings and calls RoIPointPool3dFunction."""
+
+    def __init__(self, num_sampled_points=512, pool_extra_width=1.0):
+        """
+        Args:
+            num_sampled_points: number of points kept per box (size of dim 2 of the pooled output)
+            pool_extra_width: value forwarded to box_utils.enlarge_box3d before pooling
+        """
+        super().__init__()
+        self.num_sampled_points = num_sampled_points
+        self.pool_extra_width = pool_extra_width
+
+    def forward(self, points, point_features, boxes3d):
+        """
+        Args:
+            points: (B, N, 3)
+            point_features: (B, N, C)
+            boxes3d: (B, M, 7), [x, y, z, dx, dy, dz, heading]
+
+        Returns:
+            pooled_features: (B, M, num_sampled_points, 3 + C)
+            pooled_empty_flag: (B, M)
+        """
+        return RoIPointPool3dFunction.apply(
+            points, point_features, boxes3d, self.pool_extra_width, self.num_sampled_points
+        )
+
+
+class RoIPointPool3dFunction(Function):
+    """Autograd Function around the roipoint_pool3d_cuda extension; only the forward pass is implemented."""
+
+    @staticmethod
+    def forward(ctx, points, point_features, boxes3d, pool_extra_width, num_sampled_points=512):
+        """
+        Args:
+            ctx: autograd context (not used here)
+            points: (B, N, 3)
+            point_features: (B, N, C)
+            boxes3d: (B, num_boxes, 7), [x, y, z, dx, dy, dz, heading]
+            pool_extra_width: passed to box_utils.enlarge_box3d together with the flattened boxes
+            num_sampled_points: number of points kept per box
+
+        Returns:
+            pooled_features: (B, num_boxes, num_sampled_points, 3 + C)
+            pooled_empty_flag: (B, num_boxes), int tensor
+        """
+        assert points.shape.__len__() == 3 and points.shape[2] == 3
+        batch_size, boxes_num, feature_len = points.shape[0], boxes3d.shape[1], point_features.shape[2]
+        # Boxes are flattened to (B * num_boxes, 7) for the helper and reshaped back afterwards.
+        pooled_boxes3d = box_utils.enlarge_box3d(boxes3d.view(-1, 7), pool_extra_width).view(batch_size, -1, 7)
+
+        # Both outputs start as zeros on the same device/dtype as point_features;
+        # the flag tensor is then converted to int.
+        pooled_features = point_features.new_zeros((batch_size, boxes_num, num_sampled_points, 3 + feature_len))
+        pooled_empty_flag = point_features.new_zeros((batch_size, boxes_num)).int()
+
+        # The extension receives the two output tensors as arguments; its return value is ignored.
+        roipoint_pool3d_cuda.forward(
+            points.contiguous(), pooled_boxes3d.contiguous(),
+            point_features.contiguous(), pooled_features, pooled_empty_flag
+        )
+
+        return pooled_features, pooled_empty_flag
+
+    @staticmethod
+    def backward(ctx, grad_out):
+        # No gradient is defined for this op.
+        raise NotImplementedError
+
+
+if __name__ == '__main__':
+    pass
--- a/pcdet/ops/roipoint_pool3d/src/roipoint_pool3d.cpp
+++ b/pcdet/ops/roipoint_pool3d/src/roipoint_pool3d.cpp
+#include <torch/serialize/tensor.h>
+#include <torch/extension.h>
+
+// Input validation helpers. On failure each one prints the offending expression plus
+// file/line to stderr and terminates the whole process with exit(-1).
+// CHECK_CUDA: the tensor must live on a CUDA device.
+#define CHECK_CUDA(x) do { \
+  if (!x.type().is_cuda()) { \
+    fprintf(stderr, "%s must be CUDA tensor at %s:%d\n", #x, __FILE__, __LINE__); \
+    exit(-1); \
+  } \
+} while (0)
+// CHECK_CONTIGUOUS: the tensor must be contiguous in memory.
+#define CHECK_CONTIGUOUS(x) do { \
+  if (!x.is_contiguous()) { \
+    fprintf(stderr, "%s must be contiguous tensor at %s:%d\n", #x, __FILE__, __LINE__); \
+    exit(-1); \
+  } \
+} while (0)
+// CHECK_INPUT: both checks above; expands to two statements, so use it as a full statement.
+#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
+
+
+// Forward declaration of the launcher; a function with this signature is defined in roipoint_pool3d_kernel.cu.
+void roipool3dLauncher(int batch_size, int pts_num, int boxes_num, int feature_in_len, int sampled_pts_num,
+                       const float *xyz, const float *boxes3d, const float *pts_feature, float *pooled_features, int *pooled_empty_flag);
+
+
+int roipool3d_gpu(at::Tensor xyz, at::Tensor boxes3d, at::Tensor pts_feature, at::Tensor pooled_features, at::Tensor pooled_empty_flag){
+    // Validates the five tensors, reads the problem sizes from their shapes and hands
+    // raw data pointers to roipool3dLauncher. Always returns 1.
+    // params xyz: (B, N, 3)
+    // params boxes3d: (B, M, 7)
+    // params pts_feature: (B, N, C)
+    // params pooled_features: (B, M, sampled_pts_num, 3+C), written by the launcher
+    // params pooled_empty_flag: (B, M), written by the launcher
+    CHECK_INPUT(xyz);
+    CHECK_INPUT(boxes3d);
+    CHECK_INPUT(pts_feature);
+    CHECK_INPUT(pooled_features);
+    CHECK_INPUT(pooled_empty_flag);
+
+    int batch_size = xyz.size(0);
+    int pts_num = xyz.size(1);
+    int boxes_num = boxes3d.size(1);
+    int feature_in_len = pts_feature.size(2);
+    // The number of points kept per box is taken from the output tensor's shape.
+    int sampled_pts_num = pooled_features.size(2);
+
+
+    // Typed accessors: the first four tensors are read as float, the flag tensor as int.
+    const float * xyz_data = xyz.data<float>();
+    const float * boxes3d_data = boxes3d.data<float>();
+    const float * pts_feature_data = pts_feature.data<float>();
+    float * pooled_features_data = pooled_features.data<float>();
+    int * pooled_empty_flag_data = pooled_empty_flag.data<int>();
+
+    roipool3dLauncher(batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num,
+                       xyz_data, boxes3d_data, pts_feature_data, pooled_features_data, pooled_empty_flag_data);
+
+
+
+    return 1;
+}
+
+
+// Python binding: exposes roipool3d_gpu under the name "forward" in the extension module.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+    m.def("forward", &roipool3d_gpu, "roipool3d forward (CUDA)");
+}
+
--- a/pcdet/ops/roipoint_pool3d/src/roipoint_pool3d_kernel.cu
+++ b/pcdet/ops/roipoint_pool3d/src/roipoint_pool3d_kernel.cu
+/*
+Point cloud feature pooling
+Written by Shaoshuai Shi
+All Rights Reserved 2018.
+*/
+
+#include <math.h>
+#include <stdio.h>
+
+#define THREADS_PER_BLOCK 256
+#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
+// #define DEBUG
+
+
+// Rotates the 2D offset (shift_x, shift_y) by the angle -rot_angle and writes the
+// result to local_x / local_y (output references).
+__device__ inline void lidar_to_local_coords(float shift_x, float shift_y, float rot_angle, float &local_x, float &local_y){
+    float cosa = cos(-rot_angle), sina = sin(-rot_angle);
+    local_x = shift_x * cosa + shift_y * (-sina);
+    local_y = shift_x * sina + shift_y * cosa;
+}
+
+
+__device__ inline int check_pt_in_box3d(const float *pt, const float *box3d, float &local_x, float &local_y){
+    // Tests whether a point lies inside a rotated 3D box. Returns 1 if inside, 0 otherwise.
+    // param pt: (x, y, z)
+    // param box3d: [x, y, z, dx, dy, dz, heading] (x, y, z) is the box center
+    // param local_x, local_y: outputs, the point's x/y offset from the box center rotated by
+    //                         -heading; only written when the z test below passes
+
+    const float MARGIN = 1e-5;
+    float x = pt[0], y = pt[1], z = pt[2];
+    float cx = box3d[0], cy = box3d[1], cz = box3d[2];
+    float dx = box3d[3], dy = box3d[4], dz = box3d[5], rz = box3d[6];
+
+    // Height test first (no MARGIN applied on z); early-out avoids the rotation.
+    if (fabsf(z - cz) > dz / 2.0) return 0;
+    lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y);
+    // Both planar tests use half the box extent plus MARGIN; the 0/1 result is held in a
+    // float and converted back to int by the return.
+    float in_flag = (fabs(local_x) < dx / 2.0 + MARGIN) & (fabs(local_y) < dy / 2.0 + MARGIN);
+    return in_flag;
+}
+
+
+__global__ void assign_pts_to_box3d(int batch_size, int pts_num, int boxes_num, const float *xyz, const float *boxes3d, int *pts_assign){
+    // One thread per (batch, box, point) triple: writes whether the point lies in the box.
+    // params xyz: (B, N, 3)
+    // params boxes3d: (B, M, 7)
+    // params pts_assign: (B, N, M), output: 1 if point n is inside box m, else 0
+    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int box_idx = blockIdx.y;
+    int bs_idx = blockIdx.z;
+
+    // Threads past the end of any dimension have nothing to do.
+    if (pt_idx >= pts_num || box_idx >= boxes_num || bs_idx >= batch_size){
+        return;
+    }
+    int assign_idx = bs_idx * pts_num * boxes_num + pt_idx * boxes_num + box_idx;
+    pts_assign[assign_idx] = 0;
+
+    int box_offset = bs_idx * boxes_num * 7 + box_idx * 7;
+    int pt_offset = bs_idx * pts_num * 3 + pt_idx * 3;
+
+
+    // local_x / local_y are required by the helper's signature but not used afterwards.
+    float local_x = 0, local_y = 0;
+    int cur_in_flag = check_pt_in_box3d(xyz + pt_offset, boxes3d + box_offset, local_x, local_y);
+    pts_assign[assign_idx] = cur_in_flag;
+    // printf("bs=%d, pt=%d, in=%d\n", bs_idx, pt_idx, pts_assign[bs_idx * pts_num + pt_idx]);
+}
+
+
+__global__ void get_pooled_idx(int batch_size, int pts_num, int boxes_num, int sampled_pts_num,
+                               const int *pts_assign, int *pts_idx, int *pooled_empty_flag){
+    // One thread per (batch, box): collects the indices of the points assigned to that box.
+    // params pts_assign: (B, N, M), non-zero where point n is inside box m
+    // params pts_idx: (B, M, sampled_pts_num), output point indices
+    // params pooled_empty_flag: (B, M), set to 1 for boxes with no points; never written
+    //                           otherwise, so it keeps whatever value the caller put there
+
+    int boxes_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (boxes_idx >= boxes_num){
+        return;
+    }
+
+    // The batch index comes straight from the grid and is not bounds-checked here.
+    int bs_idx = blockIdx.y;
+
+    // Scan points in index order and keep the first sampled_pts_num hits.
+    int cnt = 0;
+    for (int k = 0; k < pts_num; k++){
+        if (pts_assign[bs_idx * pts_num * boxes_num + k * boxes_num + boxes_idx]){
+            if (cnt < sampled_pts_num){
+                pts_idx[bs_idx * boxes_num * sampled_pts_num + boxes_idx * sampled_pts_num + cnt] = k;
+                cnt++;
+            }
+            else break;
+        }
+    }
+
+    if (cnt == 0){
+        // No point in this box: pts_idx entries for it are left unwritten.
+        pooled_empty_flag[bs_idx * boxes_num + boxes_idx] = 1;
+    }
+    else if (cnt < sampled_pts_num){
+        // duplicate same points for sampling: slot k repeats the entry at k % cnt
+        for (int k = cnt; k < sampled_pts_num; k++){
+            int duplicate_idx = k % cnt;
+            int base_offset = bs_idx * boxes_num * sampled_pts_num + boxes_idx * sampled_pts_num;
+            pts_idx[base_offset + k] = pts_idx[base_offset + duplicate_idx];
+        }
+    }
+}
+
+
+__global__ void roipool3d_forward(int batch_size, int pts_num, int boxes_num, int feature_in_len, int sampled_pts_num,
+                                   const float *xyz, const int *pts_idx, const float *pts_feature,
+                                   float *pooled_features, int *pooled_empty_flag){
+    // One thread per (batch, box, sample slot): copies the selected point's coordinates
+    // and features into the output row for that slot.
+    // params xyz: (B, N, 3)
+    // params pts_idx: (B, M, sampled_pts_num)
+    // params pts_feature: (B, N, C)
+    // params pooled_features: (B, M, sampled_pts_num, 3+C)
+    // params pooled_empty_flag: (B, M)
+
+    int sample_pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int box_idx = blockIdx.y;
+    int bs_idx = blockIdx.z;
+
+    if (sample_pt_idx >= sampled_pts_num || box_idx >= boxes_num || bs_idx >= batch_size){
+        return;
+    }
+
+    // Boxes flagged as empty are skipped; their output rows are not written by this kernel.
+    if (pooled_empty_flag[bs_idx * boxes_num + box_idx]){
+        return;
+    }
+
+    // temp_idx is the flat (batch, box, slot) index; it addresses both pts_idx and,
+    // scaled by the row width 3 + C, the output row.
+    int temp_idx = bs_idx * boxes_num * sampled_pts_num + box_idx * sampled_pts_num + sample_pt_idx;
+    int src_pt_idx = pts_idx[temp_idx];
+    int dst_feature_offset = temp_idx * (3 + feature_in_len);
+
+    // First three output channels: the point's xyz.
+    for (int j = 0; j < 3; j++)
+        pooled_features[dst_feature_offset + j] = xyz[bs_idx * pts_num * 3 + src_pt_idx * 3 + j];
+
+    // Remaining C channels: the point's features.
+    int src_feature_offset = bs_idx * pts_num * feature_in_len + src_pt_idx * feature_in_len;
+    for (int j = 0; j < feature_in_len; j++)
+        pooled_features[dst_feature_offset + 3 + j] = pts_feature[src_feature_offset + j];
+}
+
+
+void roipool3dLauncher(int batch_size, int pts_num, int boxes_num, int feature_in_len, int sampled_pts_num,
+                       const float *xyz, const float *boxes3d, const float *pts_feature, float *pooled_features, int *pooled_empty_flag){
+    // Host-side driver for RoI point pooling. Launches three kernels in order:
+    //   1. assign_pts_to_box3d: fills a (B, N, M) 0/1 table of point-in-box tests,
+    //   2. get_pooled_idx:      picks up to sampled_pts_num point indices per box and flags empty boxes,
+    //   3. roipool3d_forward:   copies xyz + features of the picked points into pooled_features.
+    // params xyz: (B, N, 3)
+    // params boxes3d: (B, M, 7)
+    // params pts_feature: (B, N, C)
+    // params pooled_features: (B, M, sampled_pts_num, 3+C), output
+    // params pooled_empty_flag: (B, M), output (only ever set to 1 here)
+
+    // A grid with a zero-sized dimension is an invalid launch configuration. With no
+    // batches or no boxes both outputs are empty, so there is nothing to do.
+    if (batch_size <= 0 || boxes_num <= 0){
+        return;
+    }
+
+    // Compute byte counts in size_t so the product cannot overflow int for large B * N * M.
+    size_t assign_bytes = (size_t)batch_size * pts_num * boxes_num * sizeof(int);  // (batch_size, N, M)
+    size_t idx_bytes = (size_t)batch_size * boxes_num * sampled_pts_num * sizeof(int);  // (batch_size, M, sampled_pts_num)
+
+    int *pts_assign = NULL;
+    int *pts_idx = NULL;
+    if (cudaMalloc(&pts_assign, assign_bytes) != cudaSuccess || cudaMalloc(&pts_idx, idx_bytes) != cudaSuccess){
+        // Never launch kernels on unallocated scratch buffers.
+        fprintf(stderr, "roipool3dLauncher: cudaMalloc failed for scratch buffers\n");
+        cudaFree(pts_assign);
+        cudaFree(pts_idx);
+        return;
+    }
+
+    dim3 threads(THREADS_PER_BLOCK);
+
+    // With no points the assignment table is empty and get_pooled_idx never reads it.
+    if (pts_num > 0){
+        dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), boxes_num, batch_size);  // blockIdx.x(col), blockIdx.y(row)
+        assign_pts_to_box3d<<<blocks, threads>>>(batch_size, pts_num, boxes_num, xyz, boxes3d, pts_assign);
+    }
+
+    dim3 blocks2(DIVUP(boxes_num, THREADS_PER_BLOCK), batch_size);  // blockIdx.x(col), blockIdx.y(row)
+    get_pooled_idx<<<blocks2, threads>>>(batch_size, pts_num, boxes_num, sampled_pts_num, pts_assign, pts_idx, pooled_empty_flag);
+
+    // With zero sample slots per box there is no output row to fill.
+    if (sampled_pts_num > 0){
+        dim3 blocks_pool(DIVUP(sampled_pts_num, THREADS_PER_BLOCK), boxes_num, batch_size);
+        roipool3d_forward<<<blocks_pool, threads>>>(batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num,
+                                                          xyz, pts_idx, pts_feature, pooled_features, pooled_empty_flag);
+    }
+
+    cudaFree(pts_assign);
+    cudaFree(pts_idx);
+
+#ifdef DEBUG
+    cudaDeviceSynchronize();  // for using printf in kernel function
+#endif
+}
\ No newline at end of file
--- a/pcdet/utils/box_coder_utils.py
+++ b/pcdet/utils/box_coder_utils.py
+import numpy as np
 import torch


 class ResidualCoder(object):
-    def __init__(self, code_size=7, **kwargs):
+    def __init__(self, code_size=7, encode_angle_by_sincos=False, **kwargs):
        super().__init__()
        self.code_size = code_size
+        self.encode_angle_by_sincos = encode_angle_by_sincos
+        if self.encode_angle_by_sincos:
+            self.code_size += 1

-    @staticmethod
-    def encode_torch(boxes, anchors):
+    def encode_torch(self, boxes, anchors):
        """
        Args:
            boxes: (N, 7 + C) [x, y, z, dx, dy, dz, heading, ...]
-            anchors: (N, 7 + C) [x, y, z, dx, dy, dz, heading, ...]
+            anchors: (N, 7 + C) [x, y, z, dx, dy, dz, heading or *[cos, sin], ...]

        Returns:

@@ -29,23 +32,30 @@ class ResidualCoder(object):
        dxt = torch.log(dxg / dxa)
        dyt = torch.log(dyg / dya)
        dzt = torch.log(dzg / dza)
-        rt = rg - ra
+        if self.encode_angle_by_sincos:
+            rt_cos = torch.cos(rg) - torch.cos(ra)
+            rt_sin = torch.sin(rg) - torch.sin(ra)
+            rts = [rt_cos, rt_sin]
+        else:
+            rts = [rg - ra]

        cts = [g - a for g, a in zip(cgs, cas)]
-        return torch.cat([xt, yt, zt, dxt, dyt, dzt, rt, *cts], dim=-1)
+        return torch.cat([xt, yt, zt, dxt, dyt, dzt, *rts, *cts], dim=-1)

-    @staticmethod
-    def decode_torch(box_encodings, anchors):
+    def decode_torch(self, box_encodings, anchors):
        """
        Args:
-            box_encodings: (B, N, 7 + C) or (N, 7 + C) [x, y, z, dx, dy, dz, heading, ...]
+            box_encodings: (B, N, 7 + C) or (N, 7 + C) [x, y, z, dx, dy, dz, heading or *[cos, sin], ...]
            anchors: (B, N, 7 + C) or (N, 7 + C) [x, y, z, dx, dy, dz, heading, ...]

        Returns:

        """
        xa, ya, za, dxa, dya, dza, ra, *cas = torch.split(anchors, 1, dim=-1)
+        if not self.encode_angle_by_sincos:
            xt, yt, zt, dxt, dyt, dzt, rt, *cts = torch.split(box_encodings, 1, dim=-1)
+        else:
+            xt, yt, zt, dxt, dyt, dzt, cost, sint, *cts = torch.split(box_encodings, 1, dim=-1)

        diagonal = torch.sqrt(dxa ** 2 + dya ** 2)
        xg = xt * diagonal + xa
@@ -55,6 +65,12 @@ class ResidualCoder(object):
        dxg = torch.exp(dxt) * dxa
        dyg = torch.exp(dyt) * dya
        dzg = torch.exp(dzt) * dza
+
+        if self.encode_angle_by_sincos:
+            rg_cos = cost + torch.cos(ra)
+            rg_sin = sint + torch.sin(ra)
+            rg = torch.atan2(rg_sin, rg_cos)
+        else:
            rg = rt + ra

        cgs = [t + a for t, a in zip(cts, cas)]
@@ -123,3 +139,84 @@ class PreviousResidualRoIDecoder(object):

        cgs = [t + a for t, a in zip(cts, cas)]
        return torch.cat([xg, yg, zg, dxg, dyg, dzg, rg, *cgs], dim=-1)
+
+
+class PointResidualCoder(object):
+    """Encodes boxes relative to points (and optionally per-class mean sizes), with heading as (cos, sin)."""
+
+    def __init__(self, code_size=8, use_mean_size=True, **kwargs):
+        """
+        Args:
+            code_size: stored as self.code_size
+            use_mean_size: if True, offsets and sizes are normalized by per-class mean sizes
+            **kwargs: must contain 'mean_size' when use_mean_size is True
+        """
+        super().__init__()
+        self.code_size = code_size
+        self.use_mean_size = use_mean_size
+        if self.use_mean_size:
+            # The mean-size table is moved to the GPU as float; every entry must be positive.
+            self.mean_size = torch.from_numpy(np.array(kwargs['mean_size'])).cuda().float()
+            assert self.mean_size.min() > 0
+
+    def encode_torch(self, gt_boxes, points, gt_classes=None):
+        """
+        Args:
+            gt_boxes: (N, 7 + C) [x, y, z, dx, dy, dz, heading, ...]
+                NOTE: columns 3:6 are clamped to >= 1e-5 in place, i.e. the caller's tensor is modified.
+            points: (N, 3) [x, y, z]
+            gt_classes: (N) [1, num_classes]; used only when use_mean_size is True
+        Returns:
+            box_coding: (N, 8 + C)
+        """
+        # Keep sizes strictly positive so the logs below are finite.
+        gt_boxes[:, 3:6] = torch.clamp_min(gt_boxes[:, 3:6], min=1e-5)
+
+        xg, yg, zg, dxg, dyg, dzg, rg, *cgs = torch.split(gt_boxes, 1, dim=-1)
+        xa, ya, za = torch.split(points, 1, dim=-1)
+
+        if self.use_mean_size:
+            assert gt_classes.max() <= self.mean_size.shape[0]
+            # Class ids are shifted by one to index the rows of mean_size.
+            point_anchor_size = self.mean_size[gt_classes - 1]
+            dxa, dya, dza = torch.split(point_anchor_size, 1, dim=-1)
+            diagonal = torch.sqrt(dxa ** 2 + dya ** 2)
+            # x/y offsets are scaled by the mean-size diagonal, z by the mean height.
+            xt = (xg - xa) / diagonal
+            yt = (yg - ya) / diagonal
+            zt = (zg - za) / dza
+            dxt = torch.log(dxg / dxa)
+            dyt = torch.log(dyg / dya)
+            dzt = torch.log(dzg / dza)
+        else:
+            # Without mean sizes: raw offsets and log of the absolute sizes.
+            xt = (xg - xa)
+            yt = (yg - ya)
+            zt = (zg - za)
+            dxt = torch.log(dxg)
+            dyt = torch.log(dyg)
+            dzt = torch.log(dzg)
+
+        # Extra columns beyond the 7 box values are passed through unchanged.
+        cts = [g for g in cgs]
+        return torch.cat([xt, yt, zt, dxt, dyt, dzt, torch.cos(rg), torch.sin(rg), *cts], dim=-1)
+
+    def decode_torch(self, box_encodings, points, pred_classes=None):
+        """
+        Args:
+            box_encodings: (N, 8 + C) [x, y, z, dx, dy, dz, cos, sin, ...]
+            points: [x, y, z]
+            pred_classes: (N) [1, num_classes]; used only when use_mean_size is True
+        Returns:
+            boxes: [x, y, z, dx, dy, dz, heading, ...], one column fewer than box_encodings
+        """
+        xt, yt, zt, dxt, dyt, dzt, cost, sint, *cts = torch.split(box_encodings, 1, dim=-1)
+        xa, ya, za = torch.split(points, 1, dim=-1)
+
+        if self.use_mean_size:
+            assert pred_classes.max() <= self.mean_size.shape[0]
+            # Class ids are shifted by one to index the rows of mean_size.
+            point_anchor_size = self.mean_size[pred_classes - 1]
+            dxa, dya, dza = torch.split(point_anchor_size, 1, dim=-1)
+            diagonal = torch.sqrt(dxa ** 2 + dya ** 2)
+            xg = xt * diagonal + xa
+            yg = yt * diagonal + ya
+            zg = zt * dza + za
+
+            dxg = torch.exp(dxt) * dxa
+            dyg = torch.exp(dyt) * dya
+            dzg = torch.exp(dzt) * dza
+        else:
+            xg = xt + xa
+            yg = yt + ya
+            zg = zt + za
+            dxg, dyg, dzg = torch.split(torch.exp(box_encodings[..., 3:6]), 1, dim=-1)
+
+        # Heading is recovered from the (sin, cos) pair.
+        rg = torch.atan2(sint, cost)
+
+        # Extra columns are passed through unchanged.
+        cgs = [t for t in cts]
+        return torch.cat([xg, yg, zg, dxg, dyg, dzg, rg, *cgs], dim=-1)
--- a/pcdet/utils/box_utils.py
+++ b/pcdet/utils/box_utils.py
 import numpy as np
+import scipy
 import torch
-from . import common_utils
-from ..ops.roiaware_pool3d import roiaware_pool3d_utils
 from scipy.spatial import Delaunay
-import scipy
+
+from ..ops.roiaware_pool3d import roiaware_pool3d_utils
+from . import common_utils
+

 def in_hull(p, hull):
    """
@@ -283,4 +285,3 @@ def boxes3d_nearest_bev_iou(boxes_a, boxes_b):
    boxes_bev_b = boxes3d_lidar_to_aligned_bev_boxes(boxes_b)

    return boxes_iou_normal(boxes_bev_a, boxes_bev_b)
-
--- a/pcdet/utils/common_utils.py
+++ b/pcdet/utils/common_utils.py
-import numpy as np
-import torch
-import random
 import logging
 import os
-import torch.multiprocessing as mp
-import torch.distributed as dist
-import subprocess
 import pickle
+import random
 import shutil
+import subprocess
+
+import numpy as np
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp


 def check_numpy_to_torch(x):
@@ -110,11 +111,10 @@ def keep_arrays_by_name(gt_names, used_classes):
    return inds


-def init_dist_slurm(batch_size, tcp_port, local_rank, backend='nccl'):
+def init_dist_slurm(tcp_port, local_rank, backend='nccl'):
    """
    modified from https://github.com/open-mmlab/mmdetection
    Args:
-        batch_size:
        tcp_port:
        backend:

@@ -134,13 +134,11 @@ def init_dist_slurm(batch_size, tcp_port, local_rank, backend='nccl'):
    dist.init_process_group(backend=backend)

    total_gpus = dist.get_world_size()
-    assert batch_size % total_gpus == 0, 'Batch size should be matched with GPUS: (%d, %d)' % (batch_size, total_gpus)
-    batch_size_each_gpu = batch_size // total_gpus
    rank = dist.get_rank()
-    return batch_size_each_gpu, rank
+    return total_gpus, rank


-def init_dist_pytorch(batch_size, tcp_port, local_rank, backend='nccl'):
+def init_dist_pytorch(tcp_port, local_rank, backend='nccl'):
    if mp.get_start_method(allow_none=True) is None:
        mp.set_start_method('spawn')

@@ -152,10 +150,9 @@ def init_dist_pytorch(batch_size, tcp_port, local_rank, backend='nccl'):
        rank=local_rank,
        world_size=num_gpus
    )
-    assert batch_size % num_gpus == 0, 'Batch size should be matched with GPUS: (%d, %d)' % (batch_size, num_gpus)
-    batch_size_each_gpu = batch_size // num_gpus
    rank = dist.get_rank()
-    return batch_size_each_gpu, rank
+    return num_gpus, rank
+

 def get_dist_info():
    if torch.__version__ < '1.0':
@@ -173,6 +170,7 @@ def get_dist_info():
        world_size = 1
    return rank, world_size

+
 def merge_results_dist(result_part, size, tmpdir):
    rank, world_size = get_dist_info()
    os.makedirs(tmpdir, exist_ok=True)

--- a/pcdet/utils/loss_utils.py
+++ b/pcdet/utils/loss_utils.py
@@ -2,6 +2,7 @@ import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+
 from . import box_utils


@@ -118,6 +119,8 @@ class WeightedSmoothL1Loss(nn.Module):
            loss: (B, #anchors) float tensor.
                Weighted smooth l1 loss without reduction.
        """
+        target = torch.where(torch.isnan(target), input, target)  # ignore nan targets
+
        diff = input - target
        # code-wise weighting
        if self.code_weights is not None:
@@ -133,6 +136,48 @@ class WeightedSmoothL1Loss(nn.Module):
        return loss


+class WeightedL1Loss(nn.Module):
+    def __init__(self, code_weights: list = None):
+        """
+        Args:
+            code_weights: (#codes) float list if not None.
+                Code-wise weights.
+        """
+        super(WeightedL1Loss, self).__init__()
+        if code_weights is not None:
+            self.code_weights = np.array(code_weights, dtype=np.float32)
+            self.code_weights = torch.from_numpy(self.code_weights).cuda()
+
+    def forward(self, input: torch.Tensor, target: torch.Tensor, weights: torch.Tensor = None):
+        """
+        Args:
+            input: (B, #anchors, #codes) float tensor.
+                Ecoded predicted locations of objects.
+            target: (B, #anchors, #codes) float tensor.
+                Regression targets.
+            weights: (B, #anchors) float tensor if not None.
+
+        Returns:
+            loss: (B, #anchors) float tensor.
+                Weighted smooth l1 loss without reduction.
+        """
+        target = torch.where(torch.isnan(target), input, target)  # ignore nan targets
+
+        diff = input - target
+        # code-wise weighting
+        if self.code_weights is not None:
+            diff = diff * self.code_weights.view(1, 1, -1)
+
+        loss = torch.abs(diff)
+
+        # anchor-wise weighting
+        if weights is not None:
+            assert weights.shape[0] == loss.shape[0] and weights.shape[1] == loss.shape[1]
+            loss = loss * weights.unsqueeze(-1)
+
+        return loss
+
+
 class WeightedCrossEntropyLoss(nn.Module):
    """
    Transform input to fit the fomation of PyTorch offical cross entropy loss

--- a/pcdet/utils/object3d_kitti.py
+++ b/pcdet/utils/object3d_kitti.py
@@ -81,4 +81,3 @@ class Object3d(object):
                       self.box2d[2], self.box2d[3], self.h, self.w, self.l, self.loc[0], self.loc[1], self.loc[2],
                       self.ry)
        return kitti_str
-
--- a/setup.py
+++ b/setup.py
 import os
-from setuptools import setup, find_packages
 import subprocess
+
+from setuptools import find_packages, setup
 from torch.utils.cpp_extension import BuildExtension, CUDAExtension


@@ -27,7 +28,7 @@ def write_version_to_file(version, target_file):


 if __name__ == '__main__':
-    version = '0.2.0+%s' % get_git_commit_number()
+    version = '0.3.0+%s' % get_git_commit_number()
    write_version_to_file(version, 'pcdet/version.py')

    setup(
@@ -37,7 +38,7 @@ if __name__ == '__main__':
        install_requires=[
            'numpy',
            'torch>=1.1',
-            'spconv==1.0',
+            'spconv',
            'numba',
            'tensorboardX',
            'easydict',
@@ -67,6 +68,14 @@ if __name__ == '__main__':
                    'src/roiaware_pool3d_kernel.cu',
                ]
            ),
+            make_cuda_ext(
+                name='roipoint_pool3d_cuda',
+                module='pcdet.ops.roipoint_pool3d',
+                sources=[
+                    'src/roipoint_pool3d.cpp',
+                    'src/roipoint_pool3d_kernel.cu',
+                ]
+            ),
            make_cuda_ext(
                name='pointnet2_stack_cuda',
                module='pcdet.ops.pointnet2.pointnet2_stack',
@@ -78,8 +87,25 @@ if __name__ == '__main__':
                    'src/group_points_gpu.cu',
                    'src/sampling.cpp',
                    'src/sampling_gpu.cu', 
+                    'src/interpolate.cpp', 
+                    'src/interpolate_gpu.cu',
+                ],
+            ),
+            make_cuda_ext(
+                name='pointnet2_batch_cuda',
+                module='pcdet.ops.pointnet2.pointnet2_batch',
+                sources=[
+                    'src/pointnet2_api.cpp',
+                    'src/ball_query.cpp',
+                    'src/ball_query_gpu.cu',
+                    'src/group_points.cpp',
+                    'src/group_points_gpu.cu',
+                    'src/interpolate.cpp',
+                    'src/interpolate_gpu.cu',
+                    'src/sampling.cpp',
+                    'src/sampling_gpu.cu',
+
                ],
            ),
        ],
    )
-
--- a/tools/cfgs/dataset_configs/nuscenes_dataset.yaml
+++ b/tools/cfgs/dataset_configs/nuscenes_dataset.yaml
+DATASET: 'NuScenesDataset'
+DATA_PATH: '../data/nuscenes'
+
+VERSION: 'v1.0-trainval'
+MAX_SWEEPS: 10
+PRED_VELOCITY: True
+SET_NAN_VELOCITY_TO_ZEROS: True
+FILTER_MIN_POINTS_IN_GT: 1
+
+DATA_SPLIT: {
+    'train': train,
+    'test': val
+}
+
+INFO_PATH: {
+    'train': [nuscenes_infos_10sweeps_train.pkl],
+    'test': [nuscenes_infos_10sweeps_val.pkl],
+}
+
+POINT_CLOUD_RANGE: [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+
+BALANCED_RESAMPLING: True 
+
+DATA_AUGMENTOR:
+    DISABLE_AUG_LIST: ['placeholder']
+    AUG_CONFIG_LIST:
+        - NAME: gt_sampling
+          DB_INFO_PATH:
+              - nuscenes_dbinfos_10sweeps_withvelo.pkl
+          PREPARE: {
+             filter_by_min_points: [
+                 'car:5','truck:5', 'construction_vehicle:5', 'bus:5', 'trailer:5',
+                 'barrier:5', 'motorcycle:5', 'bicycle:5', 'pedestrian:5', 'traffic_cone:5'
+             ],
+          }
+
+          SAMPLE_GROUPS: [
+              'car:2','truck:3', 'construction_vehicle:7', 'bus:4', 'trailer:6',
+              'barrier:2', 'motorcycle:6', 'bicycle:6', 'pedestrian:2', 'traffic_cone:2'
+          ]
+
+          NUM_POINT_FEATURES: 5
+          DATABASE_WITH_FAKELIDAR: False
+          REMOVE_EXTRA_WIDTH: [0.0, 0.0, 0.0]
+          LIMIT_WHOLE_SCENE: True
+
+        - NAME: random_world_flip
+          ALONG_AXIS_LIST: ['x', 'y']
+
+        - NAME: random_world_rotation
+          WORLD_ROT_ANGLE: [-0.3925, 0.3925]
+
+        - NAME: random_world_scaling
+          WORLD_SCALE_RANGE: [0.95, 1.05]
+
+
+POINT_FEATURE_ENCODING: {
+    encoding_type: absolute_coordinates_encoding,
+    used_feature_list: ['x', 'y', 'z', 'intensity', 'timestamp'],
+    src_feature_list: ['x', 'y', 'z', 'intensity', 'timestamp'],
+}
+
+
+DATA_PROCESSOR:
+    - NAME: mask_points_and_boxes_outside_range
+      REMOVE_OUTSIDE_BOXES: True
+
+    - NAME: shuffle_points
+      SHUFFLE_ENABLED: {
+        'train': True,
+        'test': True
+      }
+
+    - NAME: transform_points_to_voxels
+      VOXEL_SIZE: [0.1, 0.1, 0.2]
+      MAX_POINTS_PER_VOXEL: 10
+      MAX_NUMBER_OF_VOXELS: {
+        'train': 60000,
+        'test': 60000
+      }
--- a/tools/cfgs/kitti_models/PartA2.yaml
+++ b/tools/cfgs/kitti_models/PartA2.yaml
@@ -170,6 +170,9 @@ MODEL:


 OPTIMIZATION:
+    BATCH_SIZE_PER_GPU: 4
+    NUM_EPOCHS: 80
+
    OPTIMIZER: adam_onecycle
    LR: 0.01
    WEIGHT_DECAY: 0.01

--- a/tools/cfgs/kitti_models/PartA2_free.yaml
+++ b/tools/cfgs/kitti_models/PartA2_free.yaml
+CLASS_NAMES: ['Car', 'Pedestrian', 'Cyclist']  # three classes; MODEL.POINT_HEAD mean_size below also has three rows
+
+DATA_CONFIG:
+    _BASE_CONFIG_: cfgs/dataset_configs/kitti_dataset.yaml  # only the base file is referenced; this config overrides no dataset keys
+
+
+MODEL:
+    NAME: PointRCNN  # NOTE(review): detector NAME is PointRCNN although this file is PartA2_free.yaml and ROI_HEAD is PartA2FCHead -- confirm this is intended
+
+    VFE:
+        NAME: MeanVFE
+
+    BACKBONE_3D:
+        NAME: UNetV2
+        RETURN_ENCODED_TENSOR: False
+
+    POINT_HEAD:
+        NAME: PointIntraPartOffsetHead
+        CLS_FC: [128, 128]  # CLS_FC, PART_FC and REG_FC all use the same two widths
+        PART_FC: [128, 128]
+        REG_FC: [128, 128]
+        CLASS_AGNOSTIC: False
+        USE_POINT_FEATURES_BEFORE_FUSION: False
+        TARGET_CONFIG:
+            GT_EXTRA_WIDTH: [0.2, 0.2, 0.2]
+            BOX_CODER: PointResidualCoder
+            BOX_CODER_CONFIG: {
+                'use_mean_size': True,
+                'mean_size': [  # three rows, matching the three CLASS_NAMES; presumably one size triple per class in that order -- TODO confirm
+                    [3.9, 1.6, 1.56],
+                    [0.8, 0.6, 1.73],
+                    [1.76, 0.6, 1.73]
+                ]
+            }
+
+        LOSS_CONFIG:
+            LOSS_REG: WeightedSmoothL1Loss
+            LOSS_WEIGHTS: {
+                'point_cls_weight': 1.0,
+                'point_box_weight': 1.0,
+                'point_part_weight': 1.0,
+                'code_weights': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]  # eight entries here; ROI_HEAD code_weights has seven
+            }
+
+    ROI_HEAD:
+        NAME: PartA2FCHead
+        CLASS_AGNOSTIC: True  # POINT_HEAD above sets CLASS_AGNOSTIC: False
+
+        SHARED_FC: [256, 256, 256]
+        CLS_FC: [256, 256]
+        REG_FC: [256, 256]
+        DP_RATIO: 0.3
+        DISABLE_PART: True
+        SEG_MASK_SCORE_THRESH: 0.0
+
+        NMS_CONFIG:  # TRAIN and TEST differ only in NMS_POST_MAXSIZE and NMS_THRESH
+            TRAIN:
+                NMS_TYPE: nms_gpu
+                MULTI_CLASSES_NMS: False
+                NMS_PRE_MAXSIZE: 9000
+                NMS_POST_MAXSIZE: 512
+                NMS_THRESH: 0.8
+            TEST:
+                NMS_TYPE: nms_gpu
+                MULTI_CLASSES_NMS: False
+                NMS_PRE_MAXSIZE: 9000
+                NMS_POST_MAXSIZE: 100
+                NMS_THRESH: 0.85
+
+        ROI_AWARE_POOL:
+            POOL_SIZE: 12
+            NUM_FEATURES: 128
+            MAX_POINTS_PER_VOXEL: 128
+
+        TARGET_CONFIG:
+            BOX_CODER: ResidualCoder
+            ROI_PER_IMAGE: 128
+            FG_RATIO: 0.5
+
+            SAMPLE_ROI_BY_EACH_CLASS: True
+            CLS_SCORE_TYPE: roi_iou
+
+            CLS_FG_THRESH: 0.75  # thresholds are ordered CLS_BG_THRESH_LO (0.1) < CLS_BG_THRESH (0.25) < CLS_FG_THRESH (0.75)
+            CLS_BG_THRESH: 0.25
+            CLS_BG_THRESH_LO: 0.1
+            HARD_BG_RATIO: 0.8
+
+            REG_FG_THRESH: 0.65
+
+        LOSS_CONFIG:
+            CLS_LOSS: BinaryCrossEntropy
+            REG_LOSS: smooth-l1
+            CORNER_LOSS_REGULARIZATION: True
+            LOSS_WEIGHTS: {
+                'rcnn_cls_weight': 1.0,
+                'rcnn_reg_weight': 1.0,
+                'rcnn_corner_weight': 1.0,
+                'code_weights': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]  # seven entries; POINT_HEAD code_weights has eight
+            }
+
+    POST_PROCESSING:
+        RECALL_THRESH_LIST: [0.3, 0.5, 0.7]
+        SCORE_THRESH: 0.1
+        OUTPUT_RAW_SCORE: False
+
+        EVAL_METRIC: kitti
+
+        NMS_CONFIG:
+            MULTI_CLASSES_NMS: False
+            NMS_TYPE: nms_gpu
+            NMS_THRESH: 0.1  # far below the ROI_HEAD NMS thresholds above (0.8 train / 0.85 test)
+            NMS_PRE_MAXSIZE: 4096
+            NMS_POST_MAXSIZE: 500
+
+
+OPTIMIZATION:
+    BATCH_SIZE_PER_GPU: 4
+    NUM_EPOCHS: 80
+
+    OPTIMIZER: adam_onecycle
+    LR: 0.003
+    WEIGHT_DECAY: 0.01
+    MOMENTUM: 0.9
+
+    MOMS: [0.95, 0.85]
+    PCT_START: 0.4
+    DIV_FACTOR: 10
+    DECAY_STEP_LIST: [35, 45]  # NOTE(review): whether step decay applies when OPTIMIZER is adam_onecycle is not visible here -- confirm in the scheduler builder
+    LR_DECAY: 0.1
+    LR_CLIP: 0.0000001  # i.e. 1e-7
+
+    LR_WARMUP: False  # WARMUP_EPOCH below is presumably unused while this is False -- TODO confirm
+    WARMUP_EPOCH: 1
+
+    GRAD_NORM_CLIP: 10
--- a/tools/cfgs/kitti_models/pointpillar.yaml
+++ b/tools/cfgs/kitti_models/pointpillar.yaml
@@ -143,6 +143,9 @@ MODEL:


 OPTIMIZATION:
+    BATCH_SIZE_PER_GPU: 4
+    NUM_EPOCHS: 80
+
    OPTIMIZER: adam_onecycle
    LR: 0.003
    WEIGHT_DECAY: 0.01

--- a/tools/cfgs/kitti_models/pointrcnn.yaml
+++ b/tools/cfgs/kitti_models/pointrcnn.yaml
+CLASS_NAMES: ['Car', 'Pedestrian', 'Cyclist']  # three classes; MODEL.POINT_HEAD mean_size below also has three rows
+
+DATA_CONFIG:
+    _BASE_CONFIG_: cfgs/dataset_configs/kitti_dataset.yaml
+
+    DATA_PROCESSOR:  # defined here alongside the base config; three steps, none of which is a voxelization step
+        -   NAME: mask_points_and_boxes_outside_range
+            REMOVE_OUTSIDE_BOXES: True
+
+        -   NAME: sample_points
+            NUM_POINTS: {  # same point count for both split keys
+                'train': 16384,
+                'test': 16384
+            }
+
+        -   NAME: shuffle_points
+            SHUFFLE_ENABLED: {  # shuffling is on for 'train' only
+                'train': True,
+                'test': False
+            }
+
+MODEL:
+    NAME: PointRCNN
+
+    BACKBONE_3D:
+        NAME: PointNet2MSG
+        SA_CONFIG:  # NPOINTS, RADIUS, NSAMPLE and MLPS each have four entries; RADIUS/NSAMPLE/MLPS entries are pairs
+            NPOINTS: [4096, 1024, 256, 64]  # each entry is a quarter of the previous one
+            RADIUS: [[0.1, 0.5], [0.5, 1.0], [1.0, 2.0], [2.0, 4.0]]
+            NSAMPLE: [[16, 32], [16, 32], [16, 32], [16, 32]]
+            MLPS: [[[16, 16, 32], [32, 32, 64]],
+                   [[64, 64, 128], [64, 96, 128]],
+                   [[128, 196, 256], [128, 196, 256]],
+                   [[256, 256, 512], [256, 384, 512]]]
+        FP_MLPS: [[128, 128], [256, 256], [512, 512], [512, 512]]  # four entries, same count as the SA_CONFIG lists
+
+    POINT_HEAD:
+        NAME: PointHeadBox
+        CLS_FC: [256, 256]
+        REG_FC: [256, 256]
+        CLASS_AGNOSTIC: False
+        USE_POINT_FEATURES_BEFORE_FUSION: False
+        TARGET_CONFIG:
+            GT_EXTRA_WIDTH: [0.2, 0.2, 0.2]
+            BOX_CODER: PointResidualCoder
+            BOX_CODER_CONFIG: {
+                'use_mean_size': True,
+                'mean_size': [  # three rows, matching the three CLASS_NAMES; presumably one size triple per class in that order -- TODO confirm
+                    [3.9, 1.6, 1.56],
+                    [0.8, 0.6, 1.73],
+                    [1.76, 0.6, 1.73]
+                ]
+            }
+
+        LOSS_CONFIG:
+            LOSS_REG: WeightedSmoothL1Loss
+            LOSS_WEIGHTS: {
+                'point_cls_weight': 1.0,
+                'point_box_weight': 1.0,
+                'code_weights': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]  # eight entries here; ROI_HEAD code_weights has seven
+            }
+
+    ROI_HEAD:
+        NAME: PointRCNNHead
+        CLASS_AGNOSTIC: True  # POINT_HEAD above sets CLASS_AGNOSTIC: False
+
+        ROI_POINT_POOL:
+            POOL_EXTRA_WIDTH: [0.0, 0.0, 0.0]
+            NUM_SAMPLED_POINTS: 512
+            DEPTH_NORMALIZER: 70.0
+
+        XYZ_UP_LAYER: [128, 128]
+        CLS_FC: [256, 256]
+        REG_FC: [256, 256]
+        DP_RATIO: 0.0
+        USE_BN: False
+
+        SA_CONFIG:  # three entries per key, single-valued (not pairs as in BACKBONE_3D.SA_CONFIG)
+            NPOINTS: [128, 32, -1]  # NOTE(review): -1 looks like a sentinel rather than a count -- confirm its meaning in the head implementation
+            RADIUS: [0.2, 0.4, 100]
+            NSAMPLE: [16, 16, 16]
+            MLPS: [[128, 128, 128],
+                   [128, 128, 256],
+                   [256, 256, 512]]
+
+        NMS_CONFIG:  # TRAIN and TEST differ only in NMS_POST_MAXSIZE and NMS_THRESH
+            TRAIN:
+                NMS_TYPE: nms_gpu
+                MULTI_CLASSES_NMS: False
+                NMS_PRE_MAXSIZE: 9000
+                NMS_POST_MAXSIZE: 512
+                NMS_THRESH: 0.8
+            TEST:
+                NMS_TYPE: nms_gpu
+                MULTI_CLASSES_NMS: False
+                NMS_PRE_MAXSIZE: 9000
+                NMS_POST_MAXSIZE: 100
+                NMS_THRESH: 0.85
+
+        TARGET_CONFIG:
+            BOX_CODER: ResidualCoder
+            ROI_PER_IMAGE: 128
+            FG_RATIO: 0.5
+
+            SAMPLE_ROI_BY_EACH_CLASS: True
+            CLS_SCORE_TYPE: cls
+
+            CLS_FG_THRESH: 0.6  # thresholds are ordered CLS_BG_THRESH_LO (0.1) < CLS_BG_THRESH (0.45) < CLS_FG_THRESH (0.6)
+            CLS_BG_THRESH: 0.45
+            CLS_BG_THRESH_LO: 0.1
+            HARD_BG_RATIO: 0.8
+
+            REG_FG_THRESH: 0.55
+
+        LOSS_CONFIG:
+            CLS_LOSS: BinaryCrossEntropy
+            REG_LOSS: smooth-l1
+            CORNER_LOSS_REGULARIZATION: True
+            LOSS_WEIGHTS: {
+                'rcnn_cls_weight': 1.0,
+                'rcnn_reg_weight': 1.0,
+                'rcnn_corner_weight': 1.0,
+                'code_weights': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]  # seven entries; POINT_HEAD code_weights has eight
+            }
+
+    POST_PROCESSING:
+        RECALL_THRESH_LIST: [0.3, 0.5, 0.7]
+        SCORE_THRESH: 0.1
+        OUTPUT_RAW_SCORE: False
+
+        EVAL_METRIC: kitti
+
+        NMS_CONFIG:
+            MULTI_CLASSES_NMS: False
+            NMS_TYPE: nms_gpu
+            NMS_THRESH: 0.1  # far below the ROI_HEAD NMS thresholds above (0.8 train / 0.85 test)
+            NMS_PRE_MAXSIZE: 4096
+            NMS_POST_MAXSIZE: 500
+
+
+OPTIMIZATION:
+    BATCH_SIZE_PER_GPU: 2
+    NUM_EPOCHS: 80
+
+    OPTIMIZER: adam_onecycle
+    LR: 0.01
+    WEIGHT_DECAY: 0.01
+    MOMENTUM: 0.9
+
+    MOMS: [0.95, 0.85]
+    PCT_START: 0.4
+    DIV_FACTOR: 10
+    DECAY_STEP_LIST: [35, 45]  # NOTE(review): whether step decay applies when OPTIMIZER is adam_onecycle is not visible here -- confirm in the scheduler builder
+    LR_DECAY: 0.1
+    LR_CLIP: 0.0000001  # i.e. 1e-7
+
+    LR_WARMUP: False  # WARMUP_EPOCH below is presumably unused while this is False -- TODO confirm
+    WARMUP_EPOCH: 1
+
+    GRAD_NORM_CLIP: 10
--- a/tools/cfgs/kitti_models/pointrcnn_iou.yaml
+++ b/tools/cfgs/kitti_models/pointrcnn_iou.yaml
+CLASS_NAMES: ['Car', 'Pedestrian', 'Cyclist']  # three classes; MODEL.POINT_HEAD mean_size below also has three rows
+
+DATA_CONFIG:
+    _BASE_CONFIG_: cfgs/dataset_configs/kitti_dataset.yaml
+
+    DATA_PROCESSOR:  # defined here alongside the base config; three steps, none of which is a voxelization step
+        -   NAME: mask_points_and_boxes_outside_range
+            REMOVE_OUTSIDE_BOXES: True
+
+        -   NAME: sample_points
+            NUM_POINTS: {  # same point count for both split keys
+                'train': 16384,
+                'test': 16384
+            }
+
+        -   NAME: shuffle_points
+            SHUFFLE_ENABLED: {  # shuffling is on for 'train' only
+                'train': True,
+                'test': False
+            }
+
+MODEL:
+    NAME: PointRCNN
+
+    BACKBONE_3D:
+        NAME: PointNet2MSG
+        SA_CONFIG:  # NPOINTS, RADIUS, NSAMPLE and MLPS each have four entries; RADIUS/NSAMPLE/MLPS entries are pairs
+            NPOINTS: [4096, 1024, 256, 64]  # each entry is a quarter of the previous one
+            RADIUS: [[0.1, 0.5], [0.5, 1.0], [1.0, 2.0], [2.0, 4.0]]
+            NSAMPLE: [[16, 32], [16, 32], [16, 32], [16, 32]]
+            MLPS: [[[16, 16, 32], [32, 32, 64]],
+                   [[64, 64, 128], [64, 96, 128]],
+                   [[128, 196, 256], [128, 196, 256]],
+                   [[256, 256, 512], [256, 384, 512]]]
+        FP_MLPS: [[128, 128], [256, 256], [512, 512], [512, 512]]  # four entries, same count as the SA_CONFIG lists
+
+    POINT_HEAD:
+        NAME: PointHeadBox
+        CLS_FC: [256, 256]
+        REG_FC: [256, 256]
+        CLASS_AGNOSTIC: False
+        USE_POINT_FEATURES_BEFORE_FUSION: False
+        TARGET_CONFIG:
+            GT_EXTRA_WIDTH: [0.2, 0.2, 0.2]
+            BOX_CODER: PointResidualCoder
+            BOX_CODER_CONFIG: {
+                'use_mean_size': True,
+                'mean_size': [  # three rows, matching the three CLASS_NAMES; presumably one size triple per class in that order -- TODO confirm
+                    [3.9, 1.6, 1.56],
+                    [0.8, 0.6, 1.73],
+                    [1.76, 0.6, 1.73]
+                ]
+            }
+
+        LOSS_CONFIG:
+            LOSS_REG: WeightedSmoothL1Loss
+            LOSS_WEIGHTS: {
+                'point_cls_weight': 1.0,
+                'point_box_weight': 1.0,
+                'code_weights': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]  # eight entries here; ROI_HEAD code_weights has seven
+            }
+
+    ROI_HEAD:
+        NAME: PointRCNNHead
+        CLASS_AGNOSTIC: True  # POINT_HEAD above sets CLASS_AGNOSTIC: False
+
+        ROI_POINT_POOL:
+            POOL_EXTRA_WIDTH: [0.0, 0.0, 0.0]
+            NUM_SAMPLED_POINTS: 512
+            DEPTH_NORMALIZER: 70.0
+
+        XYZ_UP_LAYER: [128, 128]
+        CLS_FC: [256, 256]
+        REG_FC: [256, 256]
+        DP_RATIO: 0.0
+        USE_BN: False
+
+        SA_CONFIG:  # three entries per key, single-valued (not pairs as in BACKBONE_3D.SA_CONFIG)
+            NPOINTS: [128, 32, -1]  # NOTE(review): -1 looks like a sentinel rather than a count -- confirm its meaning in the head implementation
+            RADIUS: [0.2, 0.4, 100]
+            NSAMPLE: [16, 16, 16]
+            MLPS: [[128, 128, 128],
+                   [128, 128, 256],
+                   [256, 256, 512]]
+
+        NMS_CONFIG:  # TRAIN and TEST differ only in NMS_POST_MAXSIZE and NMS_THRESH
+            TRAIN:
+                NMS_TYPE: nms_gpu
+                MULTI_CLASSES_NMS: False
+                NMS_PRE_MAXSIZE: 9000
+                NMS_POST_MAXSIZE: 512
+                NMS_THRESH: 0.8
+            TEST:
+                NMS_TYPE: nms_gpu
+                MULTI_CLASSES_NMS: False
+                NMS_PRE_MAXSIZE: 9000
+                NMS_POST_MAXSIZE: 100
+                NMS_THRESH: 0.85
+
+        TARGET_CONFIG:
+            BOX_CODER: ResidualCoder
+            ROI_PER_IMAGE: 128
+            FG_RATIO: 0.5
+
+            SAMPLE_ROI_BY_EACH_CLASS: True
+            CLS_SCORE_TYPE: roi_iou  # differs from pointrcnn.yaml in this commit, which uses 'cls'
+
+            CLS_FG_THRESH: 0.7  # pointrcnn.yaml in this commit uses 0.6
+            CLS_BG_THRESH: 0.25  # pointrcnn.yaml in this commit uses 0.45
+            CLS_BG_THRESH_LO: 0.1
+            HARD_BG_RATIO: 0.8
+
+            REG_FG_THRESH: 0.55
+
+        LOSS_CONFIG:
+            CLS_LOSS: BinaryCrossEntropy
+            REG_LOSS: smooth-l1
+            CORNER_LOSS_REGULARIZATION: True
+            LOSS_WEIGHTS: {
+                'rcnn_cls_weight': 1.0,
+                'rcnn_reg_weight': 1.0,
+                'rcnn_corner_weight': 1.0,
+                'code_weights': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]  # seven entries; POINT_HEAD code_weights has eight
+            }
+
+    POST_PROCESSING:
+        RECALL_THRESH_LIST: [0.3, 0.5, 0.7]
+        SCORE_THRESH: 0.1
+        OUTPUT_RAW_SCORE: False
+
+        EVAL_METRIC: kitti
+
+        NMS_CONFIG:
+            MULTI_CLASSES_NMS: False
+            NMS_TYPE: nms_gpu
+            NMS_THRESH: 0.1  # far below the ROI_HEAD NMS thresholds above (0.8 train / 0.85 test)
+            NMS_PRE_MAXSIZE: 4096
+            NMS_POST_MAXSIZE: 500
+
+
+OPTIMIZATION:
+    BATCH_SIZE_PER_GPU: 3  # pointrcnn.yaml in this commit uses 2; the remaining OPTIMIZATION values are the same
+    NUM_EPOCHS: 80
+
+    OPTIMIZER: adam_onecycle
+    LR: 0.01
+    WEIGHT_DECAY: 0.01
+    MOMENTUM: 0.9
+
+    MOMS: [0.95, 0.85]
+    PCT_START: 0.4
+    DIV_FACTOR: 10
+    DECAY_STEP_LIST: [35, 45]  # NOTE(review): whether step decay applies when OPTIMIZER is adam_onecycle is not visible here -- confirm in the scheduler builder
+    LR_DECAY: 0.1
+    LR_CLIP: 0.0000001  # i.e. 1e-7
+
+    LR_WARMUP: False  # WARMUP_EPOCH below is presumably unused while this is False -- TODO confirm
+    WARMUP_EPOCH: 1
+
+    GRAD_NORM_CLIP: 10
--- a/tools/cfgs/kitti_models/pv_rcnn.yaml
+++ b/tools/cfgs/kitti_models/pv_rcnn.yaml
@@ -228,6 +228,9 @@ MODEL:


 OPTIMIZATION:
+    BATCH_SIZE_PER_GPU: 2
+    NUM_EPOCHS: 80
+
    OPTIMIZER: adam_onecycle
    LR: 0.01
    WEIGHT_DECAY: 0.01

--- a/tools/cfgs/kitti_models/second.yaml
+++ b/tools/cfgs/kitti_models/second.yaml
@@ -100,6 +100,9 @@ MODEL:


 OPTIMIZATION:
+    BATCH_SIZE_PER_GPU: 4
+    NUM_EPOCHS: 80
+
    OPTIMIZER: adam_onecycle
    LR: 0.003
    WEIGHT_DECAY: 0.01