add iou3d_nms cuda ops and roiaware_pool3d cuda ops in the unified normative coordinate

0f73c62c · Shaoshuai Shi · fea61d34 · 0f73c62c · 0f73c62c · 0f73c62c
Commit 0f73c62c authored Jun 23, 2020 by Shaoshuai Shi
10 changed files
--- a/pcdet/ops/iou3d_nms/iou3d_nms_utils.py
+++ b/pcdet/ops/iou3d_nms/iou3d_nms_utils.py
+"""
+3D IoU Calculation and Rotated NMS
+Written by Shaoshuai Shi
+All Rights Reserved 2019-2020.
+"""
+import torch
+from . import iou3d_nms_cuda
+from ...utils import common_utils
+
+
+def boxes_bev_iou_cpu(boxes_a, boxes_b):
+    """
+    Args:
+        boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
+        boxes_b: (N, 7) [x, y, z, dx, dy, dz, heading]
+
+    Returns:
+
+    """
+    boxes_a, is_numpy = common_utils.check_numpy_to_torch(boxes_a)
+    boxes_b, is_numpy = common_utils.check_numpy_to_torch(boxes_b)
+    assert not (boxes_a.is_cuda or boxes_b.is_cuda), 'Only support CPU tensors'
+    assert boxes_a.shape[1] == 7 and boxes_b.shape[1] == 7
+    ans_iou = boxes_a.new_zeros(torch.Size((boxes_a.shape[0], boxes_b.shape[0])))
+    iou3d_nms_cuda.boxes_iou_bev_cpu(boxes_a.contiguous(), boxes_b.contiguous(), ans_iou)
+
+    return ans_iou.numpy() if is_numpy else ans_iou
+
+
+def boxes_iou_bev(boxes_a, boxes_b):
+    """
+    Args:
+        boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
+        boxes_b: (N, 7) [x, y, z, dx, dy, dz, heading]
+
+    Returns:
+        ans_iou: (N, M)
+    """
+    assert boxes_a.shape[1] == boxes_b.shape[1] == 7
+    ans_iou = torch.cuda.FloatTensor(torch.Size((boxes_a.shape[0], boxes_b.shape[0]))).zero_()
+
+    iou3d_nms_cuda.boxes_iou_bev_gpu(boxes_a.contiguous(), boxes_b.contiguous(), ans_iou)
+
+    return ans_iou
+
+
+def boxes_iou3d_gpu(boxes_a, boxes_b):
+    """
+    Args:
+        boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
+        boxes_b: (N, 7) [x, y, z, dx, dy, dz, heading]
+
+    Returns:
+        ans_iou: (N, M)
+    """
+    assert boxes_a.shape[1] == boxes_b.shape[1] == 7
+
+    # height overlap
+    boxes_a_height_max = (boxes_a[:, 2] + boxes_a[:, 5] / 2).view(-1, 1)
+    boxes_a_height_min = (boxes_a[:, 2] - boxes_a[:, 5] / 2).view(-1, 1)
+    boxes_b_height_max = (boxes_b[:, 2] + boxes_b[:, 5] / 2).view(1, -1)
+    boxes_b_height_min = (boxes_b[:, 2] - boxes_b[:, 5] / 2).view(1, -1)
+
+    # bev overlap
+    overlaps_bev = torch.cuda.FloatTensor(torch.Size((boxes_a.shape[0], boxes_b.shape[0]))).zero_()  # (N, M)
+    iou3d_nms_cuda.boxes_overlap_bev_gpu(boxes_a.contiguous(), boxes_b.contiguous(), overlaps_bev)
+
+    max_of_min = torch.max(boxes_a_height_min, boxes_b_height_min)
+    min_of_max = torch.min(boxes_a_height_max, boxes_b_height_max)
+    overlaps_h = torch.clamp(min_of_max - max_of_min, min=0)
+
+    # 3d iou
+    overlaps_3d = overlaps_bev * overlaps_h
+
+    vol_a = (boxes_a[:, 3] * boxes_a[:, 4] * boxes_a[:, 5]).view(-1, 1)
+    vol_b = (boxes_b[:, 3] * boxes_b[:, 4] * boxes_b[:, 5]).view(1, -1)
+
+    iou3d = overlaps_3d / torch.clamp(vol_a + vol_b - overlaps_3d, min=1e-6)
+
+    return iou3d
+
+
+def nms_gpu(boxes, scores, thresh, pre_maxsize=None, **kwargs):
+    """
+    :param boxes: (N, 7) [x, y, z, dx, dy, dz, heading]
+    :param scores: (N)
+    :param thresh:
+    :return:
+    """
+    assert boxes.shape[1] == 7
+    order = scores.sort(0, descending=True)[1]
+    if pre_maxsize is not None:
+        order = order[:pre_maxsize]
+
+    boxes = boxes[order].contiguous()
+    keep = torch.LongTensor(boxes.size(0))
+    num_out = iou3d_nms_cuda.nms_gpu(boxes, keep, thresh)
+    return order[keep[:num_out].cuda()].contiguous(), None
+
+
+def nms_normal_gpu(boxes, scores, thresh, **kwargs):
+    """
+    :param boxes: (N, 7) [x, y, z, dx, dy, dz, heading]
+    :param scores: (N)
+    :param thresh:
+    :return:
+    """
+    assert boxes.shape[0] == 7
+    order = scores.sort(0, descending=True)[1]
+
+    boxes = boxes[order].contiguous()
+
+    keep = torch.LongTensor(boxes.size(0))
+    num_out = iou3d_nms_cuda.nms_normal_gpu(boxes, keep, thresh)
+    return order[keep[:num_out].cuda()].contiguous(), None
--- a/pcdet/ops/iou3d_nms/src/iou3d_cpu.cpp
+++ b/pcdet/ops/iou3d_nms/src/iou3d_cpu.cpp
+/*
+3D Rotated IoU Calculation (CPU)
+Written by Shaoshuai Shi
+All Rights Reserved 2020.
+*/
+
+#include <stdio.h>
+#include <math.h>
+#include <torch/serialize/tensor.h>
+#include <torch/extension.h>
+#include <vector>
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include "iou3d_cpu.h"
+
+#define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x, " must be contiguous ")
+
+inline float min(float a, float b){
+    return a > b ? b : a;
+}
+
+inline float max(float a, float b){
+    return a > b ? a : b;
+}
+
+const float EPS = 1e-8;
+struct Point {
+    float x, y;
+    __device__ Point() {}
+    __device__ Point(double _x, double _y){
+        x = _x, y = _y;
+    }
+
+    __device__ void set(float _x, float _y){
+        x = _x; y = _y;
+    }
+
+    __device__ Point operator +(const Point &b)const{
+        return Point(x + b.x, y + b.y);
+    }
+
+    __device__ Point operator -(const Point &b)const{
+        return Point(x - b.x, y - b.y);
+    }
+};
+
+inline float cross(const Point &a, const Point &b){
+    return a.x * b.y - a.y * b.x;
+}
+
+inline float cross(const Point &p1, const Point &p2, const Point &p0){
+    return (p1.x - p0.x) * (p2.y - p0.y) - (p2.x - p0.x) * (p1.y - p0.y);
+}
+
+inline int check_rect_cross(const Point &p1, const Point &p2, const Point &q1, const Point &q2){
+    int ret = min(p1.x,p2.x) <= max(q1.x,q2.x)  &&
+              min(q1.x,q2.x) <= max(p1.x,p2.x) &&
+              min(p1.y,p2.y) <= max(q1.y,q2.y) &&
+              min(q1.y,q2.y) <= max(p1.y,p2.y);
+    return ret;
+}
+
+inline int check_in_box2d(const float *box, const Point &p){
+    //params: (7) [x, y, z, dx, dy, dz, heading]
+    const float MARGIN = 1e-2;
+
+    float center_x = box[0], center_y = box[1];
+    float angle_cos = cos(-box[6]), angle_sin = sin(-box[6]);  // rotate the point in the opposite direction of box
+    float rot_x = (p.x - center_x) * angle_cos + (p.y - center_y) * (-angle_sin);
+    float rot_y = (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos;
+
+    return (fabs(rot_x) < box[3] / 2 + MARGIN && fabs(rot_y) < box[4] / 2 + MARGIN);
+}
+
+inline int intersection(const Point &p1, const Point &p0, const Point &q1, const Point &q0, Point &ans){
+    // fast exclusion
+    if (check_rect_cross(p0, p1, q0, q1) == 0) return 0;
+
+    // check cross standing
+    float s1 = cross(q0, p1, p0);
+    float s2 = cross(p1, q1, p0);
+    float s3 = cross(p0, q1, q0);
+    float s4 = cross(q1, p1, q0);
+
+    if (!(s1 * s2 > 0 && s3 * s4 > 0)) return 0;
+
+    // calculate intersection of two lines
+    float s5 = cross(q1, p1, p0);
+    if(fabs(s5 - s1) > EPS){
+        ans.x = (s5 * q0.x - s1 * q1.x) / (s5 - s1);
+        ans.y = (s5 * q0.y - s1 * q1.y) / (s5 - s1);
+
+    }
+    else{
+        float a0 = p0.y - p1.y, b0 = p1.x - p0.x, c0 = p0.x * p1.y - p1.x * p0.y;
+        float a1 = q0.y - q1.y, b1 = q1.x - q0.x, c1 = q0.x * q1.y - q1.x * q0.y;
+        float D = a0 * b1 - a1 * b0;
+
+        ans.x = (b0 * c1 - b1 * c0) / D;
+        ans.y = (a1 * c0 - a0 * c1) / D;
+    }
+
+    return 1;
+}
+
+inline void rotate_around_center(const Point &center, const float angle_cos, const float angle_sin, Point &p){
+    float new_x = (p.x - center.x) * angle_cos + (p.y - center.y) * (-angle_sin) + center.x;
+    float new_y = (p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y;
+    p.set(new_x, new_y);
+}
+
+inline int point_cmp(const Point &a, const Point &b, const Point &center){
+    return atan2(a.y - center.y, a.x - center.x) > atan2(b.y - center.y, b.x - center.x);
+}
+
+inline float box_overlap(const float *box_a, const float *box_b){
+    // params: box_a (7) [x, y, z, dx, dy, dz, heading]
+    // params: box_b (7) [x, y, z, dx, dy, dz, heading]
+
+//    float a_x1 = box_a[0], a_y1 = box_a[1], a_x2 = box_a[2], a_y2 = box_a[3], a_angle = box_a[4];
+//    float b_x1 = box_b[0], b_y1 = box_b[1], b_x2 = box_b[2], b_y2 = box_b[3], b_angle = box_b[4];
+    float a_angle = box_a[6], b_angle = box_b[6];
+    float a_dx_half = box_a[3] / 2, b_dx_half = box_b[3] / 2, a_dy_half = box_a[4] / 2, b_dy_half = box_b[4] / 2;
+    float a_x1 = box_a[0] - a_dx_half, a_y1 = box_a[1] - a_dy_half;
+    float a_x2 = box_a[0] + a_dx_half, a_y2 = box_a[1] + a_dy_half;
+    float b_x1 = box_b[0] - b_dx_half, b_y1 = box_b[1] - b_dy_half;
+    float b_x2 = box_b[0] + b_dx_half, b_y2 = box_b[1] + b_dy_half;
+
+    Point center_a(box_a[0], box_a[1]);
+    Point center_b(box_b[0], box_b[1]);
+
+    Point box_a_corners[5];
+    box_a_corners[0].set(a_x1, a_y1);
+    box_a_corners[1].set(a_x2, a_y1);
+    box_a_corners[2].set(a_x2, a_y2);
+    box_a_corners[3].set(a_x1, a_y2);
+
+    Point box_b_corners[5];
+    box_b_corners[0].set(b_x1, b_y1);
+    box_b_corners[1].set(b_x2, b_y1);
+    box_b_corners[2].set(b_x2, b_y2);
+    box_b_corners[3].set(b_x1, b_y2);
+
+    // get oriented corners
+    float a_angle_cos = cos(a_angle), a_angle_sin = sin(a_angle);
+    float b_angle_cos = cos(b_angle), b_angle_sin = sin(b_angle);
+
+    for (int k = 0; k < 4; k++){
+        rotate_around_center(center_a, a_angle_cos, a_angle_sin, box_a_corners[k]);
+        rotate_around_center(center_b, b_angle_cos, b_angle_sin, box_b_corners[k]);
+    }
+
+    box_a_corners[4] = box_a_corners[0];
+    box_b_corners[4] = box_b_corners[0];
+
+    // get intersection of lines
+    Point cross_points[16];
+    Point poly_center;
+    int cnt = 0, flag = 0;
+
+    poly_center.set(0, 0);
+    for (int i = 0; i < 4; i++){
+        for (int j = 0; j < 4; j++){
+            flag = intersection(box_a_corners[i + 1], box_a_corners[i], box_b_corners[j + 1], box_b_corners[j], cross_points[cnt]);
+            if (flag){
+                poly_center = poly_center + cross_points[cnt];
+                cnt++;
+            }
+        }
+    }
+
+    // check corners
+    for (int k = 0; k < 4; k++){
+        if (check_in_box2d(box_a, box_b_corners[k])){
+            poly_center = poly_center + box_b_corners[k];
+            cross_points[cnt] = box_b_corners[k];
+            cnt++;
+        }
+        if (check_in_box2d(box_b, box_a_corners[k])){
+            poly_center = poly_center + box_a_corners[k];
+            cross_points[cnt] = box_a_corners[k];
+            cnt++;
+        }
+    }
+
+    poly_center.x /= cnt;
+    poly_center.y /= cnt;
+
+    // sort the points of polygon
+    Point temp;
+    for (int j = 0; j < cnt - 1; j++){
+        for (int i = 0; i < cnt - j - 1; i++){
+            if (point_cmp(cross_points[i], cross_points[i + 1], poly_center)){
+                temp = cross_points[i];
+                cross_points[i] = cross_points[i + 1];
+                cross_points[i + 1] = temp;
+            }
+        }
+    }
+
+    // get the overlap areas
+    float area = 0;
+    for (int k = 0; k < cnt - 1; k++){
+        area += cross(cross_points[k] - cross_points[0], cross_points[k + 1] - cross_points[0]);
+    }
+
+    return fabs(area) / 2.0;
+}
+
+inline float iou_bev(const float *box_a, const float *box_b){
+    // params: box_a (7) [x, y, z, dx, dy, dz, heading]
+    // params: box_b (7) [x, y, z, dx, dy, dz, heading]
+    float sa = box_a[3] * box_a[4];
+    float sb = box_b[3] * box_b[4];
+    float s_overlap = box_overlap(box_a, box_b);
+    return s_overlap / fmaxf(sa + sb - s_overlap, EPS);
+}
+
+
+int boxes_iou_bev_cpu(at::Tensor boxes_a_tensor, at::Tensor boxes_b_tensor, at::Tensor ans_iou_tensor){
+    // params boxes_a_tensor: (N, 7) [x, y, z, dx, dy, dz, heading]
+    // params boxes_b_tensor: (M, 7) [x, y, z, dx, dy, dz, heading]
+    // params ans_iou_tensor: (N, M)
+
+    CHECK_CONTIGUOUS(boxes_a_tensor);
+    CHECK_CONTIGUOUS(boxes_b_tensor);
+
+    int num_boxes_a = boxes_a_tensor.size(0);
+    int num_boxes_b = boxes_b_tensor.size(0);
+    const float *boxes_a = boxes_a_tensor.data<float>();
+    const float *boxes_b = boxes_b_tensor.data<float>();
+    float *ans_iou = ans_iou_tensor.data<float>();
+
+    for (int i = 0; i < num_boxes_a; i++){
+        for (int j = 0; j < num_boxes_b; j++){
+            ans_iou[i * num_boxes_b + j] = iou_bev(boxes_a + i * 7, boxes_b + j * 7);
+        }
+    }
+    return 1;
+}
--- a/pcdet/ops/iou3d_nms/src/iou3d_cpu.h
+++ b/pcdet/ops/iou3d_nms/src/iou3d_cpu.h
+#ifndef IOU3D_CPU_H
+#define IOU3D_CPU_H
+
+#include <torch/serialize/tensor.h>
+#include <vector>
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+
+int boxes_iou_bev_cpu(at::Tensor boxes_a_tensor, at::Tensor boxes_b_tensor, at::Tensor ans_iou_tensor);
+
+#endif
--- a/pcdet/ops/iou3d_nms/src/iou3d_nms.cpp
+++ b/pcdet/ops/iou3d_nms/src/iou3d_nms.cpp
+/*
+3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others)
+Written by Shaoshuai Shi
+All Rights Reserved 2019-2020.
+*/
+
+#include <torch/serialize/tensor.h>
+#include <torch/extension.h>
+#include <vector>
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include "iou3d_nms.h"
+
+#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
+#define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x, " must be contiguous ")
+#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
+
+#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
+
+#define CHECK_ERROR(ans) { gpuAssert((ans), __FILE__, __LINE__); }
+inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
+{
+   if (code != cudaSuccess)
+   {
+      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
+      if (abort) exit(code);
+   }
+}
+
+const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;
+
+
+void boxesoverlapLauncher(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_overlap);
+void boxesioubevLauncher(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_iou);
+void nmsLauncher(const float *boxes, unsigned long long * mask, int boxes_num, float nms_overlap_thresh);
+void nmsNormalLauncher(const float *boxes, unsigned long long * mask, int boxes_num, float nms_overlap_thresh);
+
+
+int boxes_overlap_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b, at::Tensor ans_overlap){
+    // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
+    // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]
+    // params ans_overlap: (N, M)
+
+    CHECK_INPUT(boxes_a);
+    CHECK_INPUT(boxes_b);
+    CHECK_INPUT(ans_overlap);
+
+    int num_a = boxes_a.size(0);
+    int num_b = boxes_b.size(0);
+
+    const float * boxes_a_data = boxes_a.data<float>();
+    const float * boxes_b_data = boxes_b.data<float>();
+    float * ans_overlap_data = ans_overlap.data<float>();
+
+    boxesoverlapLauncher(num_a, boxes_a_data, num_b, boxes_b_data, ans_overlap_data);
+
+    return 1;
+}
+
+int boxes_iou_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b, at::Tensor ans_iou){
+    // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
+    // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]
+    // params ans_overlap: (N, M)
+    CHECK_INPUT(boxes_a);
+    CHECK_INPUT(boxes_b);
+    CHECK_INPUT(ans_iou);
+
+    int num_a = boxes_a.size(0);
+    int num_b = boxes_b.size(0);
+
+    const float * boxes_a_data = boxes_a.data<float>();
+    const float * boxes_b_data = boxes_b.data<float>();
+    float * ans_iou_data = ans_iou.data<float>();
+
+    boxesioubevLauncher(num_a, boxes_a_data, num_b, boxes_b_data, ans_iou_data);
+
+    return 1;
+}
+
+int nms_gpu(at::Tensor boxes, at::Tensor keep, float nms_overlap_thresh){
+    // params boxes: (N, 7) [x, y, z, dx, dy, dz, heading]
+    // params keep: (N)
+
+    CHECK_INPUT(boxes);
+    CHECK_CONTIGUOUS(keep);
+
+    int boxes_num = boxes.size(0);
+    const float * boxes_data = boxes.data<float>();
+    long * keep_data = keep.data<long>();
+
+    const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
+
+    unsigned long long *mask_data = NULL;
+    CHECK_ERROR(cudaMalloc((void**)&mask_data, boxes_num * col_blocks * sizeof(unsigned long long)));
+    nmsLauncher(boxes_data, mask_data, boxes_num, nms_overlap_thresh);
+
+    // unsigned long long mask_cpu[boxes_num * col_blocks];
+    // unsigned long long *mask_cpu = new unsigned long long [boxes_num * col_blocks];
+    std::vector<unsigned long long> mask_cpu(boxes_num * col_blocks);
+
+//    printf("boxes_num=%d, col_blocks=%d\n", boxes_num, col_blocks);
+    CHECK_ERROR(cudaMemcpy(&mask_cpu[0], mask_data, boxes_num * col_blocks * sizeof(unsigned long long),
+                           cudaMemcpyDeviceToHost));
+
+    cudaFree(mask_data);
+
+    unsigned long long remv_cpu[col_blocks];
+    memset(remv_cpu, 0, col_blocks * sizeof(unsigned long long));
+
+    int num_to_keep = 0;
+
+    for (int i = 0; i < boxes_num; i++){
+        int nblock = i / THREADS_PER_BLOCK_NMS;
+        int inblock = i % THREADS_PER_BLOCK_NMS;
+
+        if (!(remv_cpu[nblock] & (1ULL << inblock))){
+            keep_data[num_to_keep++] = i;
+            unsigned long long *p = &mask_cpu[0] + i * col_blocks;
+            for (int j = nblock; j < col_blocks; j++){
+                remv_cpu[j] |= p[j];
+            }
+        }
+    }
+    if ( cudaSuccess != cudaGetLastError() ) printf( "Error!\n" );
+
+    return num_to_keep;
+}
+
+
+int nms_normal_gpu(at::Tensor boxes, at::Tensor keep, float nms_overlap_thresh){
+    // params boxes: (N, 7) [x, y, z, dx, dy, dz, heading]
+    // params keep: (N)
+
+    CHECK_INPUT(boxes);
+    CHECK_CONTIGUOUS(keep);
+
+    int boxes_num = boxes.size(0);
+    const float * boxes_data = boxes.data<float>();
+    long * keep_data = keep.data<long>();
+
+    const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
+
+    unsigned long long *mask_data = NULL;
+    CHECK_ERROR(cudaMalloc((void**)&mask_data, boxes_num * col_blocks * sizeof(unsigned long long)));
+    nmsNormalLauncher(boxes_data, mask_data, boxes_num, nms_overlap_thresh);
+
+    // unsigned long long mask_cpu[boxes_num * col_blocks];
+    // unsigned long long *mask_cpu = new unsigned long long [boxes_num * col_blocks];
+    std::vector<unsigned long long> mask_cpu(boxes_num * col_blocks);
+
+//    printf("boxes_num=%d, col_blocks=%d\n", boxes_num, col_blocks);
+    CHECK_ERROR(cudaMemcpy(&mask_cpu[0], mask_data, boxes_num * col_blocks * sizeof(unsigned long long),
+                           cudaMemcpyDeviceToHost));
+
+    cudaFree(mask_data);
+
+    unsigned long long remv_cpu[col_blocks];
+    memset(remv_cpu, 0, col_blocks * sizeof(unsigned long long));
+
+    int num_to_keep = 0;
+
+    for (int i = 0; i < boxes_num; i++){
+        int nblock = i / THREADS_PER_BLOCK_NMS;
+        int inblock = i % THREADS_PER_BLOCK_NMS;
+
+        if (!(remv_cpu[nblock] & (1ULL << inblock))){
+            keep_data[num_to_keep++] = i;
+            unsigned long long *p = &mask_cpu[0] + i * col_blocks;
+            for (int j = nblock; j < col_blocks; j++){
+                remv_cpu[j] |= p[j];
+            }
+        }
+    }
+    if ( cudaSuccess != cudaGetLastError() ) printf( "Error!\n" );
+
+    return num_to_keep;
+}
+
+
--- a/pcdet/ops/iou3d_nms/src/iou3d_nms.h
+++ b/pcdet/ops/iou3d_nms/src/iou3d_nms.h
+#ifndef IOU3D_NMS_H
+#define IOU3D_NMS_H
+
+#include <torch/serialize/tensor.h>
+#include <vector>
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+
+int boxes_overlap_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b, at::Tensor ans_overlap);
+int boxes_iou_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b, at::Tensor ans_iou);
+int nms_gpu(at::Tensor boxes, at::Tensor keep, float nms_overlap_thresh);
+int nms_normal_gpu(at::Tensor boxes, at::Tensor keep, float nms_overlap_thresh);
+
+#endif
--- a/pcdet/ops/iou3d_nms/src/iou3d_nms_api.cpp
+++ b/pcdet/ops/iou3d_nms/src/iou3d_nms_api.cpp
+#include <torch/serialize/tensor.h>
+#include <torch/extension.h>
+#include <vector>
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+
+#include "iou3d_cpu.h"
+#include "iou3d_nms.h"
+
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+	m.def("boxes_overlap_bev_gpu", &boxes_overlap_bev_gpu, "oriented boxes overlap");
+	m.def("boxes_iou_bev_gpu", &boxes_iou_bev_gpu, "oriented boxes iou");
+	m.def("nms_gpu", &nms_gpu, "oriented nms gpu");
+	m.def("nms_normal_gpu", &nms_normal_gpu, "nms gpu");
+	m.def("boxes_iou_bev_cpu", &boxes_iou_bev_cpu, "oriented boxes iou");
+}
--- a/pcdet/ops/iou3d_nms/src/iou3d_nms_kernel.cu
+++ b/pcdet/ops/iou3d_nms/src/iou3d_nms_kernel.cu
+/*
+3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others)
+Written by Shaoshuai Shi
+All Rights Reserved 2019-2020.
+*/
+
+
+#include <stdio.h>
+#define THREADS_PER_BLOCK 16
+#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
+
+// #define DEBUG
+const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;
+const float EPS = 1e-8;
+struct Point {
+    float x, y;
+    __device__ Point() {}
+    __device__ Point(double _x, double _y){
+        x = _x, y = _y;
+    }
+
+    __device__ void set(float _x, float _y){
+        x = _x; y = _y;
+    }
+
+    __device__ Point operator +(const Point &b)const{
+        return Point(x + b.x, y + b.y);
+    }
+
+    __device__ Point operator -(const Point &b)const{
+        return Point(x - b.x, y - b.y);
+    }
+};
+
+__device__ inline float cross(const Point &a, const Point &b){
+    return a.x * b.y - a.y * b.x;
+}
+
+__device__ inline float cross(const Point &p1, const Point &p2, const Point &p0){
+    return (p1.x - p0.x) * (p2.y - p0.y) - (p2.x - p0.x) * (p1.y - p0.y);
+}
+
+__device__ int check_rect_cross(const Point &p1, const Point &p2, const Point &q1, const Point &q2){
+    int ret = min(p1.x,p2.x) <= max(q1.x,q2.x)  &&
+              min(q1.x,q2.x) <= max(p1.x,p2.x) &&
+              min(p1.y,p2.y) <= max(q1.y,q2.y) &&
+              min(q1.y,q2.y) <= max(p1.y,p2.y);
+    return ret;
+}
+
+__device__ inline int check_in_box2d(const float *box, const Point &p){
+    //params: (7) [x, y, z, dx, dy, dz, heading]
+    const float MARGIN = 1e-2;
+
+    float center_x = box[0], center_y = box[1];
+    float angle_cos = cos(-box[6]), angle_sin = sin(-box[6]);  // rotate the point in the opposite direction of box
+    float rot_x = (p.x - center_x) * angle_cos + (p.y - center_y) * (-angle_sin);
+    float rot_y = (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos;
+
+    return (fabs(rot_x) < box[3] / 2 + MARGIN && fabs(rot_y) < box[4] / 2 + MARGIN);
+}
+
+__device__ inline int intersection(const Point &p1, const Point &p0, const Point &q1, const Point &q0, Point &ans){
+    // fast exclusion
+    if (check_rect_cross(p0, p1, q0, q1) == 0) return 0;
+
+    // check cross standing
+    float s1 = cross(q0, p1, p0);
+    float s2 = cross(p1, q1, p0);
+    float s3 = cross(p0, q1, q0);
+    float s4 = cross(q1, p1, q0);
+
+    if (!(s1 * s2 > 0 && s3 * s4 > 0)) return 0;
+
+    // calculate intersection of two lines
+    float s5 = cross(q1, p1, p0);
+    if(fabs(s5 - s1) > EPS){
+        ans.x = (s5 * q0.x - s1 * q1.x) / (s5 - s1);
+        ans.y = (s5 * q0.y - s1 * q1.y) / (s5 - s1);
+
+    }
+    else{
+        float a0 = p0.y - p1.y, b0 = p1.x - p0.x, c0 = p0.x * p1.y - p1.x * p0.y;
+        float a1 = q0.y - q1.y, b1 = q1.x - q0.x, c1 = q0.x * q1.y - q1.x * q0.y;
+        float D = a0 * b1 - a1 * b0;
+
+        ans.x = (b0 * c1 - b1 * c0) / D;
+        ans.y = (a1 * c0 - a0 * c1) / D;
+    }
+
+    return 1;
+}
+
+__device__ inline void rotate_around_center(const Point &center, const float angle_cos, const float angle_sin, Point &p){
+    float new_x = (p.x - center.x) * angle_cos + (p.y - center.y) * (-angle_sin) + center.x;
+    float new_y = (p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y;
+    p.set(new_x, new_y);
+}
+
+__device__ inline int point_cmp(const Point &a, const Point &b, const Point &center){
+    return atan2(a.y - center.y, a.x - center.x) > atan2(b.y - center.y, b.x - center.x);
+}
+
+__device__ inline float box_overlap(const float *box_a, const float *box_b){
+    // params box_a: [x, y, z, dx, dy, dz, heading]
+    // params box_b: [x, y, z, dx, dy, dz, heading]
+
+    float a_angle = box_a[6], b_angle = box_b[6];
+    float a_dx_half = box_a[3] / 2, b_dx_half = box_b[3] / 2, a_dy_half = box_a[4] / 2, b_dy_half = box_b[4] / 2;
+    float a_x1 = box_a[0] - a_dx_half, a_y1 = box_a[1] - a_dy_half;
+    float a_x2 = box_a[0] + a_dx_half, a_y2 = box_a[1] + a_dy_half;
+    float b_x1 = box_b[0] - b_dx_half, b_y1 = box_b[1] - b_dy_half;
+    float b_x2 = box_b[0] + b_dx_half, b_y2 = box_b[1] + b_dy_half;
+
+    Point center_a(box_a[0], box_a[1]);
+    Point center_b(box_b[0], box_b[1]);
+
+#ifdef DEBUG
+    printf("a: (%.3f, %.3f, %.3f, %.3f, %.3f), b: (%.3f, %.3f, %.3f, %.3f, %.3f)\n", a_x1, a_y1, a_x2, a_y2, a_angle,
+           b_x1, b_y1, b_x2, b_y2, b_angle);
+    printf("center a: (%.3f, %.3f), b: (%.3f, %.3f)\n", center_a.x, center_a.y, center_b.x, center_b.y);
+#endif
+
+    Point box_a_corners[5];
+    box_a_corners[0].set(a_x1, a_y1);
+    box_a_corners[1].set(a_x2, a_y1);
+    box_a_corners[2].set(a_x2, a_y2);
+    box_a_corners[3].set(a_x1, a_y2);
+
+    Point box_b_corners[5];
+    box_b_corners[0].set(b_x1, b_y1);
+    box_b_corners[1].set(b_x2, b_y1);
+    box_b_corners[2].set(b_x2, b_y2);
+    box_b_corners[3].set(b_x1, b_y2);
+
+    // get oriented corners
+    float a_angle_cos = cos(a_angle), a_angle_sin = sin(a_angle);
+    float b_angle_cos = cos(b_angle), b_angle_sin = sin(b_angle);
+
+    for (int k = 0; k < 4; k++){
+#ifdef DEBUG
+        printf("before corner %d: a(%.3f, %.3f), b(%.3f, %.3f) \n", k, box_a_corners[k].x, box_a_corners[k].y, box_b_corners[k].x, box_b_corners[k].y);
+#endif
+        rotate_around_center(center_a, a_angle_cos, a_angle_sin, box_a_corners[k]);
+        rotate_around_center(center_b, b_angle_cos, b_angle_sin, box_b_corners[k]);
+#ifdef DEBUG
+        printf("corner %d: a(%.3f, %.3f), b(%.3f, %.3f) \n", k, box_a_corners[k].x, box_a_corners[k].y, box_b_corners[k].x, box_b_corners[k].y);
+#endif
+    }
+
+    box_a_corners[4] = box_a_corners[0];
+    box_b_corners[4] = box_b_corners[0];
+
+    // get intersection of lines
+    Point cross_points[16];
+    Point poly_center;
+    int cnt = 0, flag = 0;
+
+    poly_center.set(0, 0);
+    for (int i = 0; i < 4; i++){
+        for (int j = 0; j < 4; j++){
+            flag = intersection(box_a_corners[i + 1], box_a_corners[i], box_b_corners[j + 1], box_b_corners[j], cross_points[cnt]);
+            if (flag){
+                poly_center = poly_center + cross_points[cnt];
+                cnt++;
+#ifdef DEBUG
+                printf("Cross points (%.3f, %.3f): a(%.3f, %.3f)->(%.3f, %.3f), b(%.3f, %.3f)->(%.3f, %.3f) \n",
+                    cross_points[cnt - 1].x, cross_points[cnt - 1].y,
+                    box_a_corners[i].x, box_a_corners[i].y, box_a_corners[i + 1].x, box_a_corners[i + 1].y,
+                    box_b_corners[i].x, box_b_corners[i].y, box_b_corners[i + 1].x, box_b_corners[i + 1].y);
+#endif
+            }
+        }
+    }
+
+    // check corners
+    for (int k = 0; k < 4; k++){
+        if (check_in_box2d(box_a, box_b_corners[k])){
+            poly_center = poly_center + box_b_corners[k];
+            cross_points[cnt] = box_b_corners[k];
+            cnt++;
+#ifdef DEBUG
+                printf("b corners in a: corner_b(%.3f, %.3f)", cross_points[cnt - 1].x, cross_points[cnt - 1].y);
+#endif
+        }
+        if (check_in_box2d(box_b, box_a_corners[k])){
+            poly_center = poly_center + box_a_corners[k];
+            cross_points[cnt] = box_a_corners[k];
+            cnt++;
+#ifdef DEBUG
+                printf("a corners in b: corner_a(%.3f, %.3f)", cross_points[cnt - 1].x, cross_points[cnt - 1].y);
+#endif
+        }
+    }
+
+    poly_center.x /= cnt;
+    poly_center.y /= cnt;
+
+    // sort the points of polygon
+    Point temp;
+    for (int j = 0; j < cnt - 1; j++){
+        for (int i = 0; i < cnt - j - 1; i++){
+            if (point_cmp(cross_points[i], cross_points[i + 1], poly_center)){
+                temp = cross_points[i];
+                cross_points[i] = cross_points[i + 1];
+                cross_points[i + 1] = temp;
+            }
+        }
+    }
+
+#ifdef DEBUG
+    printf("cnt=%d\n", cnt);
+    for (int i = 0; i < cnt; i++){
+        printf("All cross point %d: (%.3f, %.3f)\n", i, cross_points[i].x, cross_points[i].y);
+    }
+#endif
+
+    // get the overlap areas
+    float area = 0;
+    for (int k = 0; k < cnt - 1; k++){
+        area += cross(cross_points[k] - cross_points[0], cross_points[k + 1] - cross_points[0]);
+    }
+
+    return fabs(area) / 2.0;
+}
+
+__device__ inline float iou_bev(const float *box_a, const float *box_b){
+    // params box_a: [x, y, z, dx, dy, dz, heading]
+    // params box_b: [x, y, z, dx, dy, dz, heading]
+    float sa = box_a[3] * box_a[4];
+    float sb = box_b[3] * box_b[4];
+    float s_overlap = box_overlap(box_a, box_b);
+    return s_overlap / fmaxf(sa + sb - s_overlap, EPS);
+}
+
+__global__ void boxes_overlap_kernel(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_overlap){
+    // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
+    // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]
+    const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y;
+    const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
+
+    if (a_idx >= num_a || b_idx >= num_b){
+        return;
+    }
+    const float * cur_box_a = boxes_a + a_idx * 7;
+    const float * cur_box_b = boxes_b + b_idx * 7;
+    float s_overlap = box_overlap(cur_box_a, cur_box_b);
+    ans_overlap[a_idx * num_b + b_idx] = s_overlap;
+}
+
+__global__ void boxes_iou_bev_kernel(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_iou){
+    // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
+    // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]
+    const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y;
+    const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
+
+    if (a_idx >= num_a || b_idx >= num_b){
+        return;
+    }
+
+    const float * cur_box_a = boxes_a + a_idx * 7;
+    const float * cur_box_b = boxes_b + b_idx * 7;
+    float cur_iou_bev = iou_bev(cur_box_a, cur_box_b);
+    ans_iou[a_idx * num_b + b_idx] = cur_iou_bev;
+}
+
+__global__ void nms_kernel(const int boxes_num, const float nms_overlap_thresh,
+                           const float *boxes, unsigned long long *mask){
+    //params: boxes (N, 7) [x, y, z, dx, dy, dz, heading]
+    //params: mask (N, N/THREADS_PER_BLOCK_NMS)
+
+    const int row_start = blockIdx.y;
+    const int col_start = blockIdx.x;
+
+    // if (row_start > col_start) return;
+
+    const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS);
+    const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS);
+
+    __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7];
+
+    if (threadIdx.x < col_size) {
+        block_boxes[threadIdx.x * 7 + 0] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0];
+        block_boxes[threadIdx.x * 7 + 1] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1];
+        block_boxes[threadIdx.x * 7 + 2] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2];
+        block_boxes[threadIdx.x * 7 + 3] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3];
+        block_boxes[threadIdx.x * 7 + 4] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4];
+        block_boxes[threadIdx.x * 7 + 5] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5];
+        block_boxes[threadIdx.x * 7 + 6] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6];
+    }
+    __syncthreads();
+
+    if (threadIdx.x < row_size) {
+        const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
+        const float *cur_box = boxes + cur_box_idx * 7;
+
+        int i = 0;
+        unsigned long long t = 0;
+        int start = 0;
+        if (row_start == col_start) {
+          start = threadIdx.x + 1;
+        }
+        for (i = start; i < col_size; i++) {
+            if (iou_bev(cur_box, block_boxes + i * 7) > nms_overlap_thresh){
+                t |= 1ULL << i;
+            }
+        }
+        const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
+        mask[cur_box_idx * col_blocks + col_start] = t;
+    }
+}
+
+
+__device__ inline float iou_normal(float const * const a, float const * const b) {
+    //params: a: [x, y, z, dx, dy, dz, heading]
+    //params: b: [x, y, z, dx, dy, dz, heading]
+
+    float left = fmaxf(a[0] - a[3] / 2, b[0] - b[3] / 2), right = fminf(a[0] + a[3] / 2, b[0] + b[3] / 2);
+    float top = fmaxf(a[1] - a[4] / 2, b[1] - b[4] / 2), bottom = fminf(a[1] + a[4] / 2, b[1] + b[4] / 2);
+    float width = fmaxf(right - left, 0.f), height = fmaxf(bottom - top, 0.f);
+    float interS = width * height;
+    float Sa = a[3] * a[4];
+    float Sb = b[3] * b[4];
+    return interS / fmaxf(Sa + Sb - interS, EPS);
+}
+
+
+__global__ void nms_normal_kernel(const int boxes_num, const float nms_overlap_thresh,
+                           const float *boxes, unsigned long long *mask){
+    //params: boxes (N, 7) [x, y, z, dx, dy, dz, heading]
+    //params: mask (N, N/THREADS_PER_BLOCK_NMS)
+
+    const int row_start = blockIdx.y;
+    const int col_start = blockIdx.x;
+
+    // if (row_start > col_start) return;
+
+    const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS);
+    const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS);
+
+    __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7];
+
+    if (threadIdx.x < col_size) {
+        block_boxes[threadIdx.x * 7 + 0] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0];
+        block_boxes[threadIdx.x * 7 + 1] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1];
+        block_boxes[threadIdx.x * 7 + 2] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2];
+        block_boxes[threadIdx.x * 7 + 3] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3];
+        block_boxes[threadIdx.x * 7 + 4] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4];
+        block_boxes[threadIdx.x * 7 + 5] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5];
+        block_boxes[threadIdx.x * 7 + 6] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6];
+    }
+    __syncthreads();
+
+    if (threadIdx.x < row_size) {
+        const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
+        const float *cur_box = boxes + cur_box_idx * 7;
+
+        int i = 0;
+        unsigned long long t = 0;
+        int start = 0;
+        if (row_start == col_start) {
+          start = threadIdx.x + 1;
+        }
+        for (i = start; i < col_size; i++) {
+            if (iou_normal(cur_box, block_boxes + i * 7) > nms_overlap_thresh){
+                t |= 1ULL << i;
+            }
+        }
+        const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
+        mask[cur_box_idx * col_blocks + col_start] = t;
+    }
+}
+
+
+
+
+
+void boxesoverlapLauncher(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_overlap){
+
+    dim3 blocks(DIVUP(num_b, THREADS_PER_BLOCK), DIVUP(num_a, THREADS_PER_BLOCK));  // blockIdx.x(col), blockIdx.y(row)
+    dim3 threads(THREADS_PER_BLOCK, THREADS_PER_BLOCK);
+
+    boxes_overlap_kernel<<<blocks, threads>>>(num_a, boxes_a, num_b, boxes_b, ans_overlap);
+#ifdef DEBUG
+    cudaDeviceSynchronize();  // for using printf in kernel function
+#endif
+}
+
+void boxesioubevLauncher(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_iou){
+
+    dim3 blocks(DIVUP(num_b, THREADS_PER_BLOCK), DIVUP(num_a, THREADS_PER_BLOCK));  // blockIdx.x(col), blockIdx.y(row)
+    dim3 threads(THREADS_PER_BLOCK, THREADS_PER_BLOCK);
+
+    boxes_iou_bev_kernel<<<blocks, threads>>>(num_a, boxes_a, num_b, boxes_b, ans_iou);
+#ifdef DEBUG
+    cudaDeviceSynchronize();  // for using printf in kernel function
+#endif
+}
+
+
+void nmsLauncher(const float *boxes, unsigned long long * mask, int boxes_num, float nms_overlap_thresh){
+    dim3 blocks(DIVUP(boxes_num, THREADS_PER_BLOCK_NMS),
+                DIVUP(boxes_num, THREADS_PER_BLOCK_NMS));
+    dim3 threads(THREADS_PER_BLOCK_NMS);
+    nms_kernel<<<blocks, threads>>>(boxes_num, nms_overlap_thresh, boxes, mask);
+}
+
+
+void nmsNormalLauncher(const float *boxes, unsigned long long * mask, int boxes_num, float nms_overlap_thresh){
+    dim3 blocks(DIVUP(boxes_num, THREADS_PER_BLOCK_NMS),
+                DIVUP(boxes_num, THREADS_PER_BLOCK_NMS));
+    dim3 threads(THREADS_PER_BLOCK_NMS);
+    nms_normal_kernel<<<blocks, threads>>>(boxes_num, nms_overlap_thresh, boxes, mask);
+}
--- a/pcdet/ops/roiaware_pool3d/roiaware_pool3d_utils.py
+++ b/pcdet/ops/roiaware_pool3d/roiaware_pool3d_utils.py
+import torch
+import torch.nn as nn
+from torch.autograd import Function
+from ...utils import common_utils
+from . import roiaware_pool3d_cuda
+
+
+def points_in_boxes_cpu(points, boxes):
+    """
+    Args:
+        points: (num_points, 3)
+        boxes: [x, y, z, dx, dy, dz, heading], (x, y, z) is the box center, each box DO NOT overlaps
+    Returns:
+        point_indices: (N, num_points)
+    """
+    assert boxes.shape[1] == 7
+    assert points.shape[1] == 3
+    points, is_numpy = common_utils.check_numpy_to_torch(points)
+    boxes, is_numpy = common_utils.check_numpy_to_torch(boxes)
+
+    point_indices = points.new_zeros((boxes.shape[0], points.shape[0]), dtype=torch.int)
+    roiaware_pool3d_cuda.points_in_boxes_cpu(boxes.float().contiguous(), points.float().contiguous(), point_indices)
+
+    return point_indices.numpy() if is_numpy else point_indices
+
+
+def points_in_boxes_gpu(points, boxes):
+    """
+    :param points: (B, M, 3)
+    :param boxes: (B, T, 7), num_valid_boxes <= T
+    :return box_idxs_of_pts: (B, M), default background = -1
+    """
+    assert boxes.shape[0] == points.shape[0]
+    assert boxes.shape[2] == 7 and points.shape[2] == 3
+    batch_size, num_points, _ = points.shape
+
+    box_idxs_of_pts = points.new_zeros((batch_size, num_points), dtype=torch.int).fill_(-1)
+    roiaware_pool3d_cuda.points_in_boxes_gpu(boxes.contiguous(), points.contiguous(), box_idxs_of_pts)
+
+    return box_idxs_of_pts
+
+
+class RoIAwarePool3d(nn.Module):
+    def __init__(self, out_size, max_pts_each_voxel=128):
+        super().__init__()
+        self.out_size = out_size
+        self.max_pts_each_voxel = max_pts_each_voxel
+
+    def forward(self, rois, pts, pts_feature, pool_method='max'):
+        assert pool_method in ['max', 'avg']
+        return RoIAwarePool3dFunction.apply(rois, pts, pts_feature, self.out_size, self.max_pts_each_voxel, pool_method)
+
+
+class RoIAwarePool3dFunction(Function):
+    @staticmethod
+    def forward(ctx, rois, pts, pts_feature, out_size, max_pts_each_voxel, pool_method):
+        """
+        Args:
+            ctx:
+            rois: (N, 7) [x, y, z, dx, dy, dz, heading] (x, y, z) is the box center
+            pts: (npoints, 3)
+            pts_feature: (npoints, C)
+            out_size: int or tuple, like 7 or (7, 7, 7)
+            max_pts_each_voxel:
+            pool_method: 'max' or 'avg'
+
+        Returns:
+            pooled_features: (N, out_x, out_y, out_z, C)
+        """
+        assert rois.shape[1] == 7 and pts.shape[1] == 3
+        if isinstance(out_size, int):
+            out_x = out_y = out_z = out_size
+        else:
+            assert len(out_size) == 3
+            for k in range(3):
+                assert isinstance(out_size[k], int)
+            out_x, out_y, out_z = out_size
+
+        num_rois = rois.shape[0]
+        num_channels = pts_feature.shape[-1]
+        num_pts = pts.shape[0]
+
+        pooled_features = pts_feature.new_zeros((num_rois, out_x, out_y, out_z, num_channels))
+        argmax = pts_feature.new_zeros((num_rois, out_x, out_y, out_z, num_channels), dtype=torch.int)
+        pts_idx_of_voxels = pts_feature.new_zeros((num_rois, out_x, out_y, out_z, max_pts_each_voxel), dtype=torch.int)
+
+        pool_method_map = {'max': 0, 'avg': 1}
+        pool_method = pool_method_map[pool_method]
+        roiaware_pool3d_cuda.forward(rois, pts, pts_feature, argmax, pts_idx_of_voxels, pooled_features, pool_method)
+
+        ctx.roiaware_pool3d_for_backward = (pts_idx_of_voxels, argmax, pool_method, num_pts, num_channels)
+        return pooled_features
+
+    @staticmethod
+    def backward(ctx, grad_out):
+        """
+        :param grad_out: (N, out_x, out_y, out_z, C)
+        :return:
+            grad_in: (npoints, C)
+        """
+        pts_idx_of_voxels, argmax, pool_method, num_pts, num_channels = ctx.roiaware_pool3d_for_backward
+
+        grad_in = grad_out.new_zeros((num_pts, num_channels))
+        roiaware_pool3d_cuda.backward(pts_idx_of_voxels, argmax, grad_out.contiguous(), grad_in, pool_method)
+
+        return None, None, grad_in, None, None, None
+
+
+if __name__ == '__main__':
+    pass
--- a/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d.cpp
+++ b/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d.cpp
+/*
+RoI-aware point cloud feature pooling
+Reference paper:  https://arxiv.org/abs/1907.03670
+Written by Shaoshuai Shi
+All Rights Reserved 2019-2020.
+*/
+
+
+#include <torch/serialize/tensor.h>
+#include <torch/extension.h>
+#include <assert.h>
+
+
+#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
+#define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x, " must be contiguous ")
+#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
+
+
+void roiaware_pool3d_launcher(int boxes_num, int pts_num, int channels, int max_pts_each_voxel,
+    int out_x, int out_y, int out_z, const float *rois, const float *pts, const float *pts_feature,
+    int *argmax, int *pts_idx_of_voxels, float *pooled_features, int pool_method);
+
+void roiaware_pool3d_backward_launcher(int boxes_num, int out_x, int out_y, int out_z, int channels, int max_pts_each_voxel,
+    const int *pts_idx_of_voxels, const int *argmax, const float *grad_out, float *grad_in, int pool_method);
+
+void points_in_boxes_launcher(int batch_size, int boxes_num, int pts_num, const float *boxes,
+    const float *pts, int *box_idx_of_points);
+
+int roiaware_pool3d_gpu(at::Tensor rois, at::Tensor pts, at::Tensor pts_feature, at::Tensor argmax,
+    at::Tensor pts_idx_of_voxels, at::Tensor pooled_features, int pool_method){
+    // params rois: (N, 7) [x, y, z, dx, dy, dz, heading] (x, y, z) is the box center
+    // params pts: (npoints, 3) [x, y, z]
+    // params pts_feature: (npoints, C)
+    // params argmax: (N, out_x, out_y, out_z, C)
+    // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
+    // params pooled_features: (N, out_x, out_y, out_z, C)
+    // params pool_method: 0: max_pool 1: avg_pool
+
+    CHECK_INPUT(rois);
+    CHECK_INPUT(pts);
+    CHECK_INPUT(pts_feature);
+    CHECK_INPUT(argmax);
+    CHECK_INPUT(pts_idx_of_voxels);
+    CHECK_INPUT(pooled_features);
+
+    int boxes_num = rois.size(0);
+    int pts_num = pts.size(0);
+    int channels = pts_feature.size(1);
+    int max_pts_each_voxel = pts_idx_of_voxels.size(4);  // index 0 is the counter
+    int out_x = pts_idx_of_voxels.size(1);
+    int out_y = pts_idx_of_voxels.size(2);
+    int out_z = pts_idx_of_voxels.size(3);
+    assert ((out_x < 256) && (out_y < 256) && (out_z < 256));  // we encode index with 8bit
+
+    const float *rois_data = rois.data<float>();
+    const float *pts_data = pts.data<float>();
+    const float *pts_feature_data = pts_feature.data<float>();
+    int *argmax_data = argmax.data<int>();
+    int *pts_idx_of_voxels_data = pts_idx_of_voxels.data<int>();
+    float *pooled_features_data = pooled_features.data<float>();
+
+    roiaware_pool3d_launcher(boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z,
+        rois_data, pts_data, pts_feature_data, argmax_data, pts_idx_of_voxels_data, pooled_features_data, pool_method);
+
+    return 1;
+}
+
+int roiaware_pool3d_gpu_backward(at::Tensor pts_idx_of_voxels, at::Tensor argmax, at::Tensor grad_out, at::Tensor grad_in, int pool_method){
+    // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
+    // params argmax: (N, out_x, out_y, out_z, C)
+    // params grad_out: (N, out_x, out_y, out_z, C)
+    // params grad_in: (npoints, C), return value
+    // params pool_method: 0: max_pool 1: avg_pool
+
+    CHECK_INPUT(pts_idx_of_voxels);
+    CHECK_INPUT(argmax);
+    CHECK_INPUT(grad_out);
+    CHECK_INPUT(grad_in);
+
+    int boxes_num = pts_idx_of_voxels.size(0);
+    int out_x = pts_idx_of_voxels.size(1);
+    int out_y = pts_idx_of_voxels.size(2);
+    int out_z = pts_idx_of_voxels.size(3);
+    int max_pts_each_voxel = pts_idx_of_voxels.size(4);  // index 0 is the counter
+    int channels = grad_out.size(4);
+
+    const int *pts_idx_of_voxels_data = pts_idx_of_voxels.data<int>();
+    const int *argmax_data = argmax.data<int>();
+    const float *grad_out_data = grad_out.data<float>();
+    float *grad_in_data = grad_in.data<float>();
+
+    roiaware_pool3d_backward_launcher(boxes_num, out_x, out_y, out_z, channels, max_pts_each_voxel,
+        pts_idx_of_voxels_data, argmax_data, grad_out_data, grad_in_data, pool_method);
+
+    return 1;
+}
+
+int points_in_boxes_gpu(at::Tensor boxes_tensor, at::Tensor pts_tensor, at::Tensor box_idx_of_points_tensor){
+    // params boxes: (B, N, 7) [x, y, z, dx, dy, dz, heading] (x, y, z) is the box center
+    // params pts: (B, npoints, 3) [x, y, z]
+    // params boxes_idx_of_points: (B, npoints), default -1
+
+    CHECK_INPUT(boxes_tensor);
+    CHECK_INPUT(pts_tensor);
+    CHECK_INPUT(box_idx_of_points_tensor);
+
+    int batch_size = boxes_tensor.size(0);
+    int boxes_num = boxes_tensor.size(1);
+    int pts_num = pts_tensor.size(1);
+
+    const float *boxes = boxes_tensor.data<float>();
+    const float *pts = pts_tensor.data<float>();
+    int *box_idx_of_points = box_idx_of_points_tensor.data<int>();
+
+    points_in_boxes_launcher(batch_size, boxes_num, pts_num, boxes, pts, box_idx_of_points);
+
+    return 1;
+}
+
+
+inline void lidar_to_local_coords_cpu(float shift_x, float shift_y, float rot_angle, float &local_x, float &local_y){
+    float cosa = cos(-rot_angle), sina = sin(-rot_angle);
+    local_x = shift_x * cosa + shift_y * (-sina);
+    local_y = shift_x * sina + shift_y * cosa;
+}
+
+
+inline int check_pt_in_box3d_cpu(const float *pt, const float *box3d, float &local_x, float &local_y){
+    // param pt: (x, y, z)
+    // param box3d: [x, y, z, dx, dy, dz, heading], (x, y, z) is the box center
+    const float MARGIN = 1e-2;
+    float x = pt[0], y = pt[1], z = pt[2];
+    float cx = box3d[0], cy = box3d[1], cz = box3d[2];
+    float dx = box3d[3], dy = box3d[4], dz = box3d[5], rz = box3d[6];
+
+    if (fabsf(z - cz) > dz / 2.0) return 0;
+    lidar_to_local_coords_cpu(x - cx, y - cy, rz, local_x, local_y);
+    float in_flag = (fabs(local_x) < dx / 2.0 + MARGIN) & (fabs(local_y) < dy / 2.0 + MARGIN);
+    return in_flag;
+}
+
+
+int points_in_boxes_cpu(at::Tensor boxes_tensor, at::Tensor pts_tensor, at::Tensor pts_indices_tensor){
+    // params boxes: (N, 7) [x, y, z, dx, dy, dz, heading], (x, y, z) is the box center, each box DO NOT overlaps
+    // params pts: (num_points, 3) [x, y, z]
+    // params pts_indices: (N, num_points)
+
+    CHECK_CONTIGUOUS(boxes_tensor);
+    CHECK_CONTIGUOUS(pts_tensor);
+    CHECK_CONTIGUOUS(pts_indices_tensor);
+
+    int boxes_num = boxes_tensor.size(0);
+    int pts_num = pts_tensor.size(0);
+
+    const float *boxes = boxes_tensor.data<float>();
+    const float *pts = pts_tensor.data<float>();
+    int *pts_indices = pts_indices_tensor.data<int>();
+
+    float local_x = 0, local_y = 0;
+    for (int i = 0; i < boxes_num; i++){
+        for (int j = 0; j < pts_num; j++){
+            int cur_in_flag = check_pt_in_box3d_cpu(pts + j * 3, boxes + i * 7, local_x, local_y);
+            pts_indices[i * pts_num + j] = cur_in_flag;
+        }
+    }
+
+    return 1;
+}
+
+
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+    m.def("forward", &roiaware_pool3d_gpu, "roiaware pool3d forward (CUDA)");
+    m.def("backward", &roiaware_pool3d_gpu_backward, "roiaware pool3d backward (CUDA)");
+    m.def("points_in_boxes_gpu", &points_in_boxes_gpu, "points_in_boxes_gpu forward (CUDA)");
+    m.def("points_in_boxes_cpu", &points_in_boxes_cpu, "points_in_boxes_cpu forward (CUDA)");
+}
--- a/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
+++ b/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
+/*
+RoI-aware point cloud feature pooling
+Written by Shaoshuai Shi
+All Rights Reserved 2019-2020.
+*/
+
+
+#include <math.h>
+#include <stdio.h>
+
+#define THREADS_PER_BLOCK 256
+#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
+// #define DEBUG
+
+
+__device__ inline void lidar_to_local_coords(float shift_x, float shift_y, float rot_angle, float &local_x, float &local_y){
+    float cosa = cos(-rot_angle), sina = sin(-rot_angle);
+    local_x = shift_x * cosa + shift_y * (-sina);
+    local_y = shift_x * sina + shift_y * cosa;
+}
+
+
+__device__ inline int check_pt_in_box3d(const float *pt, const float *box3d, float &local_x, float &local_y){
+    // param pt: (x, y, z)
+    // param box3d: [x, y, z, dx, dy, dz, heading] (x, y, z) is the box center
+
+    const float MARGIN = 1e-5;
+    float x = pt[0], y = pt[1], z = pt[2];
+    float cx = box3d[0], cy = box3d[1], cz = box3d[2];
+    float dx = box3d[3], dy = box3d[4], dz = box3d[5], rz = box3d[6];
+
+    if (fabsf(z - cz) > dz / 2.0) return 0;
+    lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y);
+    float in_flag = (fabs(local_x) < dx / 2.0 + MARGIN) & (fabs(local_y) < dy / 2.0 + MARGIN);
+    return in_flag;
+}
+
+
+__global__ void generate_pts_mask_for_box3d(int boxes_num, int pts_num, int out_x, int out_y, int out_z,
+    const float *rois, const float *pts, int *pts_mask){
+    // params rois: [x, y, z, dx, dy, dz, heading] (x, y, z) is the box center
+    // params pts: (npoints, 3) [x, y, z]
+    // params pts_mask: (N, npoints): -1 means point doesnot in this box, otherwise: encode (x_idxs, y_idxs, z_idxs) by binary bit
+    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int box_idx = blockIdx.y;
+    if (pt_idx >= pts_num || box_idx >= boxes_num) return;
+
+    pts += pt_idx * 3;
+    rois += box_idx * 7;
+    pts_mask += box_idx * pts_num + pt_idx;
+
+    float local_x = 0, local_y = 0;
+    int cur_in_flag = check_pt_in_box3d(pts, rois, local_x, local_y);
+
+    pts_mask[0] = -1;
+    if (cur_in_flag > 0){
+        float local_z = pts[2] - rois[2];
+        float dx = rois[3], dy = rois[4], dz = rois[5];
+
+        float x_res = dx / out_x;
+        float y_res = dy / out_y;
+        float z_res = dz / out_z;
+
+        unsigned int x_idx = int((local_x + dx / 2) / x_res);
+        unsigned int y_idx = int((local_y + dy / 2) / y_res);
+        unsigned int z_idx = int((local_z + dz / 2) / z_res);
+
+        x_idx = min(max(x_idx, 0), out_x - 1);
+        y_idx = min(max(y_idx, 0), out_y - 1);
+        z_idx = min(max(z_idx, 0), out_z - 1);
+
+        unsigned int idx_encoding = (x_idx << 16) + (y_idx << 8) + z_idx;
+        pts_mask[0] = idx_encoding;
+    }
+}
+
+
+__global__ void collect_inside_pts_for_box3d(int boxes_num, int pts_num, int max_pts_each_voxel,
+    int out_x, int out_y, int out_z, const int *pts_mask, int *pts_idx_of_voxels){
+    // params pts_mask: (N, npoints)  0 or 1
+    // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
+
+    int box_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (box_idx >= boxes_num) return;
+
+    int max_num_pts = max_pts_each_voxel - 1;  // index 0 is the counter
+    pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel;
+
+    for (int k = 0; k < pts_num; k++){
+        if (pts_mask[box_idx * pts_num + k] != -1){
+            unsigned int idx_encoding = pts_mask[box_idx * pts_num + k];
+            unsigned int x_idx = (idx_encoding >> 16) & 0xFF;
+            unsigned int y_idx = (idx_encoding >> 8) & 0xFF;
+            unsigned int z_idx = idx_encoding & 0xFF;
+            unsigned int base_offset = x_idx * out_y * out_z * max_pts_each_voxel + y_idx * out_z * max_pts_each_voxel + z_idx * max_pts_each_voxel;
+            unsigned int cnt = pts_idx_of_voxels[base_offset];
+            if (cnt < max_num_pts){
+                pts_idx_of_voxels[base_offset + cnt + 1] = k;
+                pts_idx_of_voxels[base_offset]++;
+            }
+#ifdef DEBUG
+        printf("collect: pts_%d, idx(%d, %d, %d), idx_encoding=%x\n",
+            k, x_idx, y_idx, z_idx, idx_encoding);
+#endif
+
+        }
+    }
+}
+
+
+__global__ void roiaware_maxpool3d(int boxes_num, int pts_num, int channels, int max_pts_each_voxel, int out_x,
+    int out_y, int out_z, const float *pts_feature, const int *pts_idx_of_voxels, float *pooled_features, int *argmax){
+    // params pts_feature: (npoints, C)
+    // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel), index 0 is the counter
+    // params pooled_features: (N, out_x, out_y, out_z, C)
+    // params argmax: (N, out_x, out_y, out_z, C)
+
+    int box_idx = blockIdx.z;
+    int channel_idx = blockIdx.y;
+    int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
+
+    int x_idx = voxel_idx_flat / (out_y * out_z);
+    int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
+    int z_idx = voxel_idx_flat % out_z;
+    if (box_idx >= boxes_num || channel_idx >= channels|| x_idx >= out_x || y_idx >= out_y || z_idx >= out_z) return;
+
+#ifdef DEBUG
+    printf("src pts_idx_of_voxels: (%p, ), argmax: %p\n", pts_idx_of_voxels, argmax);
+#endif
+
+    int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
+    pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + offset_base * max_pts_each_voxel;
+    pooled_features += box_idx * out_x * out_y * out_z * channels + offset_base * channels + channel_idx;
+    argmax += box_idx * out_x * out_y * out_z * channels + offset_base * channels + channel_idx;
+
+    int argmax_idx = -1;
+    float max_val = -1e50;
+
+    int total_pts = pts_idx_of_voxels[0];
+
+    for (int k = 1; k <= total_pts; k++){
+        if (pts_feature[pts_idx_of_voxels[k] * channels + channel_idx] > max_val){
+            max_val = pts_feature[pts_idx_of_voxels[k] * channels + channel_idx];
+            argmax_idx = pts_idx_of_voxels[k];
+        }
+    }
+
+    if (argmax_idx != -1){
+        pooled_features[0] = max_val;
+    }
+    argmax[0] = argmax_idx;
+
+#ifdef DEBUG
+    printf("channel_%d idx(%d, %d, %d), argmax_idx=(%d, %.3f), total=%d, after pts_idx: %p, argmax: (%p, %d)\n",
+        channel_idx, x_idx, y_idx, z_idx, argmax_idx, max_val, total_pts, pts_idx_of_voxels, argmax, argmax_idx);
+#endif
+}
+
+
+__global__ void roiaware_avgpool3d(int boxes_num, int pts_num, int channels, int max_pts_each_voxel, int out_x,
+    int out_y, int out_z, const float *pts_feature, const int *pts_idx_of_voxels, float *pooled_features){
+    // params pts_feature: (npoints, C)
+    // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel), index 0 is the counter
+    // params pooled_features: (N, out_x, out_y, out_z, C)
+    // params argmax: (N, out_x, out_y, out_z, C)
+
+    int box_idx = blockIdx.z;
+    int channel_idx = blockIdx.y;
+    int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
+
+    int x_idx = voxel_idx_flat / (out_y * out_z);
+    int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
+    int z_idx = voxel_idx_flat % out_z;
+    if (box_idx >= boxes_num || channel_idx >= channels|| x_idx >= out_x || y_idx >= out_y || z_idx >= out_z) return;
+
+    int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
+    pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + offset_base * max_pts_each_voxel;
+    pooled_features += box_idx * out_x * out_y * out_z * channels + offset_base * channels + channel_idx;
+
+    float sum_val = 0;
+    int total_pts = pts_idx_of_voxels[0];
+
+    for (int k = 1; k <= total_pts; k++){
+        sum_val += pts_feature[pts_idx_of_voxels[k] * channels + channel_idx];
+    }
+
+    if (total_pts > 0){
+        pooled_features[0] = sum_val / total_pts;
+    }
+}
+
+
+void roiaware_pool3d_launcher(int boxes_num, int pts_num, int channels, int max_pts_each_voxel, int out_x, int out_y, int out_z,
+    const float *rois, const float *pts, const float *pts_feature, int *argmax, int *pts_idx_of_voxels, float *pooled_features, int pool_method){
+    // params rois: (N, 7) [x, y, z, dx, dy, dz, heading] (x, y, z) is the box center
+    // params pts: (npoints, 3) [x, y, z]
+    // params pts_feature: (npoints, C)
+    // params argmax: (N, out_x, out_y, out_z, C)
+    // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
+    // params pooled_features: (N, out_x, out_y, out_z, C)
+    // params pool_method: 0: max_pool 1: avg_pool
+
+    int *pts_mask = NULL;
+    cudaMalloc(&pts_mask, boxes_num * pts_num * sizeof(int));  // (N, M)
+    cudaMemset(pts_mask, -1, boxes_num * pts_num * sizeof(int));
+
+    dim3 blocks_mask(DIVUP(pts_num, THREADS_PER_BLOCK), boxes_num);
+    dim3 threads(THREADS_PER_BLOCK);
+    generate_pts_mask_for_box3d<<<blocks_mask, threads>>>(boxes_num, pts_num, out_x, out_y, out_z, rois, pts, pts_mask);
+
+    // TODO: Merge the collect and pool functions, SS
+
+    dim3 blocks_collect(DIVUP(boxes_num, THREADS_PER_BLOCK));
+    collect_inside_pts_for_box3d<<<blocks_collect, threads>>>(boxes_num, pts_num, max_pts_each_voxel,
+        out_x, out_y, out_z, pts_mask, pts_idx_of_voxels);
+
+    dim3 blocks_pool(DIVUP(out_x * out_y * out_z, THREADS_PER_BLOCK), channels, boxes_num);
+    if (pool_method == 0){
+        roiaware_maxpool3d<<<blocks_pool, threads>>>(boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z,
+            pts_feature, pts_idx_of_voxels, pooled_features, argmax);
+    }
+    else if (pool_method == 1){
+        roiaware_avgpool3d<<<blocks_pool, threads>>>(boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z,
+            pts_feature, pts_idx_of_voxels, pooled_features);
+    }
+
+
+    cudaFree(pts_mask);
+
+#ifdef DEBUG
+    cudaDeviceSynchronize();  // for using printf in kernel function
+#endif
+}
+
+
+__global__ void roiaware_maxpool3d_backward(int boxes_num, int channels, int out_x, int out_y, int out_z,
+    const int *argmax, const float *grad_out, float *grad_in){
+    // params argmax: (N, out_x, out_y, out_z, C)
+    // params grad_out: (N, out_x, out_y, out_z, C)
+    // params grad_in: (npoints, C), return value
+
+    int box_idx = blockIdx.z;
+    int channel_idx = blockIdx.y;
+    int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
+
+    int x_idx = voxel_idx_flat / (out_y * out_z);
+    int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
+    int z_idx = voxel_idx_flat % out_z;
+    if (box_idx >= boxes_num || channel_idx >= channels|| x_idx >= out_x || y_idx >= out_y || z_idx >= out_z) return;
+
+    int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
+    argmax += box_idx * out_x * out_y * out_z * channels + offset_base * channels + channel_idx;
+    grad_out += box_idx * out_x * out_y * out_z * channels + offset_base * channels + channel_idx;
+
+    if (argmax[0] == -1) return;
+
+    atomicAdd(grad_in + argmax[0] * channels + channel_idx, grad_out[0] * 1);
+}
+
+
+__global__ void roiaware_avgpool3d_backward(int boxes_num, int channels, int out_x, int out_y, int out_z,
+    int max_pts_each_voxel, const int *pts_idx_of_voxels, const float *grad_out, float *grad_in){
+    // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
+    // params grad_out: (N, out_x, out_y, out_z, C)
+    // params grad_in: (npoints, C), return value
+
+    int box_idx = blockIdx.z;
+    int channel_idx = blockIdx.y;
+    int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
+
+    int x_idx = voxel_idx_flat / (out_y * out_z);
+    int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
+    int z_idx = voxel_idx_flat % out_z;
+    if (box_idx >= boxes_num || channel_idx >= channels|| x_idx >= out_x || y_idx >= out_y || z_idx >= out_z) return;
+
+    int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
+    pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + offset_base * max_pts_each_voxel;
+    grad_out += box_idx * out_x * out_y * out_z * channels + offset_base * channels + channel_idx;
+
+
+    int total_pts = pts_idx_of_voxels[0];
+    float cur_grad = 1 / fmaxf(float(total_pts), 1.0);
+    for (int k = 1; k <= total_pts; k++){
+        atomicAdd(grad_in + pts_idx_of_voxels[k] * channels + channel_idx, grad_out[0] * cur_grad);
+    }
+}
+
+
+void roiaware_pool3d_backward_launcher(int boxes_num, int out_x, int out_y, int out_z, int channels, int max_pts_each_voxel,
+    const int *pts_idx_of_voxels, const int *argmax, const float *grad_out, float *grad_in, int pool_method){
+    // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
+    // params argmax: (N, out_x, out_y, out_z, C)
+    // params grad_out: (N, out_x, out_y, out_z, C)
+    // params grad_in: (npoints, C), return value
+    // params pool_method: 0: max_pool, 1: avg_pool
+
+    dim3 blocks(DIVUP(out_x * out_y * out_z, THREADS_PER_BLOCK), channels, boxes_num);
+    dim3 threads(THREADS_PER_BLOCK);
+    if (pool_method == 0){
+        roiaware_maxpool3d_backward<<<blocks, threads>>>(
+            boxes_num, channels, out_x, out_y, out_z, argmax, grad_out, grad_in
+        );
+    }
+    else if (pool_method == 1){
+        roiaware_avgpool3d_backward<<<blocks, threads>>>(
+            boxes_num, channels, out_x, out_y, out_z, max_pts_each_voxel, pts_idx_of_voxels, grad_out, grad_in
+        );
+    }
+
+}
+
+
+__global__ void points_in_boxes_kernel(int batch_size, int boxes_num, int pts_num, const float *boxes,
+    const float *pts, int *box_idx_of_points){
+    // params boxes: (B, N, 7) [x, y, z, dx, dy, dz, heading] (x, y, z) is the box center
+    // params pts: (B, npoints, 3) [x, y, z] in LiDAR coordinate
+    // params boxes_idx_of_points: (B, npoints), default -1
+
+    int bs_idx = blockIdx.y;
+    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (bs_idx >= batch_size || pt_idx >= pts_num) return;
+
+    boxes += bs_idx * boxes_num * 7;
+    pts += bs_idx * pts_num * 3 + pt_idx * 3;
+    box_idx_of_points += bs_idx * pts_num + pt_idx;
+
+    float local_x = 0, local_y = 0;
+    int cur_in_flag = 0;
+    for (int k = 0; k < boxes_num; k++){
+        cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y);
+        if (cur_in_flag){
+            box_idx_of_points[0] = k;
+            break;
+        }
+    }
+}
+
+
+void points_in_boxes_launcher(int batch_size, int boxes_num, int pts_num, const float *boxes,
+    const float *pts, int *box_idx_of_points){
+    // params boxes: (B, N, 7) [x, y, z, dx, dy, dz, heading] (x, y, z) is the box center
+    // params pts: (B, npoints, 3) [x, y, z]
+    // params boxes_idx_of_points: (B, npoints), default -1
+    cudaError_t err;
+
+    dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), batch_size);
+    dim3 threads(THREADS_PER_BLOCK);
+    points_in_boxes_kernel<<<blocks, threads>>>(batch_size, boxes_num, pts_num, boxes, pts, box_idx_of_points);
+
+    err = cudaGetLastError();
+    if (cudaSuccess != err) {
+        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
+        exit(-1);
+    }
+
+#ifdef DEBUG
+    cudaDeviceSynchronize();  // for using printf in kernel function
+#endif
+}