Indoor boxstructure

b70ecdb5 · wuyuefeng · zhangwenwei · 9fcfd78f · b70ecdb5 · b70ecdb5
Commit b70ecdb5 authored Jun 14, 2020 by wuyuefeng Committed by zhangwenwei Jun 14, 2020
20 changed files
--- a/mmdet3d/core/bbox/box_np_ops.py
+++ b/mmdet3d/core/bbox/box_np_ops.py
@@ -148,7 +148,7 @@ def center_to_corner_box3d(centers,
        dims (float array, shape=[N, 3]): dimensions in kitti label file.
        angles (float array, shape=[N]): rotation_y in kitti label file.
        origin (list or array or float): origin point relate to smallest point.
-            use [0.5, 1.0, 0.5] in camera and [0.5, 0.5, 0] in lidar.
+            use (0.5, 1.0, 0.5) in camera and (0.5, 0.5, 0) in lidar.
        axis (int): rotation axis. 1 for camera and 2 for lidar.
    Returns:
        [type]: [description]

--- a/mmdet3d/core/bbox/box_torch_ops.py
+++ b/mmdet3d/core/bbox/box_torch_ops.py
@@ -74,7 +74,7 @@ def rotation_3d_in_axis(points, angles, axis=0):
 def center_to_corner_box3d(centers,
                           dims,
                           angles,
-                           origin=[0.5, 1.0, 0.5],
+                           origin=(0.5, 1.0, 0.5),
                           axis=1):
    """convert kitti locations, dimensions and angles to corners

@@ -83,7 +83,7 @@ def center_to_corner_box3d(centers,
        dims (float array, shape=[N, 3]): dimensions in kitti label file.
        angles (float array, shape=[N]): rotation_y in kitti label file.
        origin (list or array or float): origin point relate to smallest point.
-            use [0.5, 1.0, 0.5] in camera and [0.5, 0.5, 0] in lidar.
+            use (0.5, 1.0, 0.5) in camera and (0.5, 0.5, 0) in lidar.
        axis (int): rotation axis. 1 for camera and 2 for lidar.
    Returns:
        [type]: [description]

--- a/mmdet3d/core/bbox/coders/partial_bin_based_bbox_coder.py
+++ b/mmdet3d/core/bbox/coders/partial_bin_based_bbox_coder.py
@@ -28,28 +28,28 @@ class PartialBinBasedBBoxCoder(BaseBBoxCoder):
        """Encode ground truth to prediction targets.

        Args:
-            gt_bboxes_3d (Tensor): 3d gt bboxes with shape (n, 7).
-            gt_labels_3d (Tensor): Gt classes.
+            gt_bboxes_3d (BaseInstance3DBoxes): gt bboxes with shape (n, 7).
+            gt_labels_3d (Tensor): gt classes.

        Returns:
            tuple: Targets of center, size and direction.
        """
        # generate center target
-        center_target = gt_bboxes_3d[..., 0:3]
+        center_target = gt_bboxes_3d.gravity_center

        # generate bbox size target
        size_class_target = gt_labels_3d
-        size_res_target = gt_bboxes_3d[..., 3:6] - gt_bboxes_3d.new_tensor(
+        size_res_target = gt_bboxes_3d.dims - gt_bboxes_3d.tensor.new_tensor(
            self.mean_sizes)[size_class_target]

        # generate dir target
-        box_num = gt_bboxes_3d.shape[0]
+        box_num = gt_labels_3d.shape[0]
        if self.with_rot:
            (dir_class_target,
-             dir_res_target) = self.angle2class(gt_bboxes_3d[..., 6])
+             dir_res_target) = self.angle2class(gt_bboxes_3d.yaw)
        else:
            dir_class_target = gt_labels_3d.new_zeros(box_num)
-            dir_res_target = gt_bboxes_3d.new_zeros(box_num)
+            dir_res_target = gt_bboxes_3d.tensor.new_zeros(box_num)

        return (center_target, size_class_target, size_res_target,
                dir_class_target, dir_res_target)

--- a/mmdet3d/core/bbox/iou_calculators/iou3d_calculator.py
+++ b/mmdet3d/core/bbox/iou_calculators/iou3d_calculator.py
@@ -83,8 +83,6 @@ def bbox_overlaps_3d(bboxes1, bboxes2, mode='iou', coordinate='camera'):
    Return:
        iou: (M, N) not support aligned mode currently
    """
-    # TODO: check the input dimension meanings,
-    #  this is inconsistent with that in bbox_overlaps_nearest_3d
    assert bboxes1.size(-1) == bboxes2.size(-1) == 7
    assert coordinate in ['camera', 'lidar']


--- a/mmdet3d/core/bbox/structures/base_box3d.py
+++ b/mmdet3d/core/bbox/structures/base_box3d.py
@@ -12,7 +12,7 @@ class BaseInstance3DBoxes(object):

    Note:
        The box is bottom centered, i.e. the relative position of origin in
-            the box is [0.5, 0.5, 0].
+            the box is (0.5, 0.5, 0).

    Args:
        tensor (torch.Tensor | np.ndarray | list): a Nxbox_dim matrix.
@@ -23,11 +23,11 @@ class BaseInstance3DBoxes(object):
            If False, the value of yaw will be set to 0 as minmax boxes.
            Default to True.
        origin (tuple): The relative position of origin in the box.
-            Default to [0.5, 0.5, 0]. This will guide the box be converted to
-            [0.5, 0.5, 0] mode.
+            Default to (0.5, 0.5, 0). This will guide the box be converted to
+            (0.5, 0.5, 0) mode.
    """

-    def __init__(self, tensor, box_dim=7, with_yaw=True, origin=[0.5, 0.5, 0]):
+    def __init__(self, tensor, box_dim=7, with_yaw=True, origin=(0.5, 0.5, 0)):
        if isinstance(tensor, torch.Tensor):
            device = tensor.device
        else:
@@ -40,18 +40,21 @@ class BaseInstance3DBoxes(object):
                dtype=torch.float32, device=device)
        assert tensor.dim() == 2 and tensor.size(-1) == box_dim, tensor.size()

-        if not with_yaw and tensor.shape[-1] == 6:
+        if tensor.shape[-1] == 6:
+            # If the dimension of boxes is 6, we expand box_dim by padding
+            # 0 as a fake yaw and set with_yaw to False.
            assert box_dim == 6
            fake_rot = tensor.new_zeros(tensor.shape[0], 1)
            tensor = torch.cat((tensor, fake_rot), dim=-1)
            self.box_dim = box_dim + 1
+            self.with_yaw = False
        else:
            self.box_dim = box_dim
-        self.with_yaw = with_yaw
+            self.with_yaw = with_yaw
        self.tensor = tensor

-        if origin != [0.5, 0.5, 0]:
-            dst = self.tensor.new_tensor([0.5, 0.5, 0])
+        if origin != (0.5, 0.5, 0):
+            dst = self.tensor.new_tensor((0.5, 0.5, 0))
            src = self.tensor.new_tensor(origin)
            self.tensor[:, :3] += self.tensor[:, 3:6] * (dst - src)

@@ -121,7 +124,7 @@ class BaseInstance3DBoxes(object):

            The relative position of the centers in different kinds of
            boxes are different, e.g., the relative center of a boxes is
-            [0.5, 1.0, 0.5] in camera and [0.5, 0.5, 0] in lidar.
+            (0.5, 1.0, 0.5) in camera and (0.5, 0.5, 0) in lidar.
            It is recommended to use `bottom_center` or `gravity_center`
            for more clear usage.


--- a/mmdet3d/core/bbox/structures/box_3d_mode.py
+++ b/mmdet3d/core/bbox/structures/box_3d_mode.py
@@ -22,7 +22,7 @@ class Box3DMode(IntEnum):
                       | /
        left y <------ 0

-    The relative coordinate of bottom center in a LiDAR box is [0.5, 0.5, 0],
+    The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0),
    and the yaw is around the z axis, thus the rotation axis=2.

    Coordinates in camera:
@@ -49,7 +49,7 @@ class Box3DMode(IntEnum):
           | /
           0 ------> x right

-    The relative coordinate of bottom center in a DEPTH box is [0.5, 0.5, 0],
+    The relative coordinate of bottom center in a DEPTH box is (0.5, 0.5, 0),
    and the yaw is around the z axis, thus the rotation axis=2.
    """


--- a/mmdet3d/core/bbox/structures/cam_box3d.py
+++ b/mmdet3d/core/bbox/structures/cam_box3d.py
@@ -20,7 +20,7 @@ class CameraInstance3DBoxes(BaseInstance3DBoxes):
             v
        down y

-    The relative coordinate of bottom center in a CAM box is [0.5, 1.0, 0.5],
+    The relative coordinate of bottom center in a CAM box is (0.5, 1.0, 0.5),
    and the yaw is around the y axis, thus the rotation axis=1.
    The yaw is 0 at the positive direction of x axis, and increases from
    the positive direction of x to the positive direction of z.

--- a/mmdet3d/core/bbox/structures/depth_box3d.py
+++ b/mmdet3d/core/bbox/structures/depth_box3d.py
 import numpy as np
 import torch

+from mmdet3d.ops import points_in_boxes_batch
 from .base_box3d import BaseInstance3DBoxes
 from .utils import limit_period, rotation_3d_in_axis

@@ -17,7 +18,7 @@ class DepthInstance3DBoxes(BaseInstance3DBoxes):
                       | /
                       0 ------> x right (yaw=0)

-    The relative coordinate of bottom center in a Depth box is [0.5, 0.5, 0],
+    The relative coordinate of bottom center in a Depth box is (0.5, 0.5, 0),
    and the yaw is around the z axis, thus the rotation axis=2.
    The yaw is 0 at the positive direction of x axis, and increases from
    the positive direction of x to the positive direction of y.
@@ -74,7 +75,7 @@ class DepthInstance3DBoxes(BaseInstance3DBoxes):
                device=dims.device, dtype=dims.dtype)

        corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]]
-        # use relative origin [0.5, 0.5, 0]
+        # use relative origin (0.5, 0.5, 0)
        corners_norm = corners_norm - dims.new_tensor([0.5, 0.5, 0])
        corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3])

@@ -201,3 +202,30 @@ class DepthInstance3DBoxes(BaseInstance3DBoxes):
        from .box_3d_mode import Box3DMode
        return Box3DMode.convert(
            box=self, src=Box3DMode.DEPTH, dst=dst, rt_mat=rt_mat)
+
+    def points_in_boxes(self, points):
+        """Find points that are in boxes (CUDA)
+
+        Args:
+            points (torch.Tensor): [1, M, 3] or [M, 3], [x, y, z]
+                in LiDAR coordinate.
+
+        Returns:
+            torch.Tensor: The box index of each point in, shape is (B, M, T).
+        """
+        from .box_3d_mode import Box3DMode
+
+        # to lidar
+        points_lidar = points.clone()
+        points_lidar = points_lidar[..., [1, 0, 2]]
+        points_lidar[..., 1] *= -1
+        if points.dim() == 2:
+            points_lidar = points_lidar.unsqueeze(0)
+        else:
+            assert points.dim() == 3 and points_lidar.shape[0] == 1
+
+        boxes_lidar = self.convert_to(Box3DMode.LIDAR).tensor
+        boxes_lidar = boxes_lidar.to(points.device).unsqueeze(0)
+        box_idxs_of_pts = points_in_boxes_batch(points_lidar, boxes_lidar)
+
+        return box_idxs_of_pts.squeeze(0)
--- a/mmdet3d/core/bbox/structures/lidar_box3d.py
+++ b/mmdet3d/core/bbox/structures/lidar_box3d.py
@@ -18,7 +18,7 @@ class LiDARInstance3DBoxes(BaseInstance3DBoxes):
                               | /
       (yaw=pi) left y <------ 0

-    The relative coordinate of bottom center in a LiDAR box is [0.5, 0.5, 0],
+    The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0),
    and the yaw is around the z axis, thus the rotation axis=2.
    The yaw is 0 at the negative direction of y axis, and increases from
    the negative direction of y to the positive direction of x.

--- a/mmdet3d/core/evaluation/indoor_eval.py
+++ b/mmdet3d/core/evaluation/indoor_eval.py
@@ -3,47 +3,6 @@ import torch
 from mmcv.utils import print_log
 from terminaltables import AsciiTable

-from mmdet3d.core.bbox.iou_calculators.iou3d_calculator import bbox_overlaps_3d
-
-
-def boxes3d_depth_to_lidar(boxes3d, mid_to_bottom=True):
-    """Boxes3d depth to lidar.
-
-    Flip X-right,Y-forward,Z-up to X-forward,Y-left,Z-up.
-
-    Args:
-        boxes3d (ndarray): (N, 7) [x, y, z, w, l, h, r] in depth coords.
-
-    Return:
-        boxes3d_lidar (ndarray): (N, 7) [x, y, z, l, h, w, r] in LiDAR coords.
-    """
-    boxes3d_lidar = boxes3d.copy()
-    boxes3d_lidar[..., [0, 1, 2, 3, 4, 5]] = boxes3d_lidar[...,
-                                                           [1, 0, 2, 4, 3, 5]]
-    boxes3d_lidar[..., 1] *= -1
-    if mid_to_bottom:
-        boxes3d_lidar[..., 2] -= boxes3d_lidar[..., 5] / 2
-    return boxes3d_lidar
-
-
-def get_iou_gpu(bb1, bb2):
-    """Get IoU.
-
-    Compute IoU of two bounding boxes.
-
-    Args:
-        bb1 (ndarray): [x, y, z, w, l, h, ry] in LiDAR.
-        bb2 (ndarray): [x, y, z, h, w, l, ry] in LiDAR.
-
-    Returns:
-        ans_iou (tensor): The answer of IoU.
-    """
-
-    bb1 = torch.from_numpy(bb1).float().cuda()
-    bb2 = torch.from_numpy(bb2).float().cuda()
-    iou3d = bbox_overlaps_3d(bb1, bb2, mode='iou', coordinate='lidar')
-    return iou3d.cpu().numpy()
-

 def average_precision(recalls, precisions, mode='area'):
    """Calculate average precision (for single or multiple scales).
@@ -61,7 +20,10 @@ def average_precision(recalls, precisions, mode='area'):
    if recalls.ndim == 1:
        recalls = recalls[np.newaxis, :]
        precisions = precisions[np.newaxis, :]
-    assert recalls.shape == precisions.shape and recalls.ndim == 2
+
+    assert recalls.shape == precisions.shape
+    assert recalls.ndim == 2
+
    num_scales = recalls.shape[0]
    ap = np.zeros(num_scales, dtype=np.float32)
    if mode == 'area':
@@ -103,40 +65,42 @@ def eval_det_cls(pred, gt, iou_thr=None):
        float: scalar, average precision.
    """

-    # construct gt objects
-    class_recs = {}  # {img_id: {'bbox': bbox list, 'det': matched list}}
+    # {img_id: {'bbox': box structure, 'det': matched list}}
+    class_recs = {}
    npos = 0
    for img_id in gt.keys():
-        bbox = np.array(gt[img_id])
+        cur_gt_num = len(gt[img_id])
+        if cur_gt_num != 0:
+            gt_cur = torch.zeros([cur_gt_num, 7], dtype=torch.float32)
+            for i in range(cur_gt_num):
+                gt_cur[i] = gt[img_id][i].tensor
+            bbox = gt[img_id][0].new_box(gt_cur)
+        else:
+            bbox = gt[img_id]
        det = [[False] * len(bbox) for i in iou_thr]
        npos += len(bbox)
        class_recs[img_id] = {'bbox': bbox, 'det': det}
-    # pad empty list to all other imgids
-    for img_id in pred.keys():
-        if img_id not in gt:
-            class_recs[img_id] = {'bbox': np.array([]), 'det': []}

    # construct dets
    image_ids = []
    confidence = []
-    BB = []
    ious = []
    for img_id in pred.keys():
        cur_num = len(pred[img_id])
        if cur_num == 0:
            continue
-        BB_cur = np.zeros((cur_num, 7))  # hard code
+        pred_cur = torch.zeros((cur_num, 7), dtype=torch.float32)
        box_idx = 0
        for box, score in pred[img_id]:
            image_ids.append(img_id)
            confidence.append(score)
-            BB.append(box)
-            BB_cur[box_idx] = box
+            pred_cur[box_idx] = box.tensor
            box_idx += 1
-        gt_cur = class_recs[img_id]['bbox'].astype(float)
+        pred_cur = box.new_box(pred_cur)
+        gt_cur = class_recs[img_id]['bbox']
        if len(gt_cur) > 0:
            # calculate iou in each image
-            iou_cur = get_iou_gpu(BB_cur, gt_cur)
+            iou_cur = pred_cur.overlaps(pred_cur, gt_cur)
            for i in range(cur_num):
                ious.append(iou_cur[i])
        else:
@@ -157,12 +121,12 @@ def eval_det_cls(pred, gt, iou_thr=None):
    for d in range(nd):
        R = class_recs[image_ids[d]]
        iou_max = -np.inf
-        BBGT = R['bbox'].astype(float)
+        BBGT = R['bbox']
        cur_iou = ious[d]

-        if BBGT.size > 0:
+        if len(BBGT) > 0:
            # compute overlaps
-            for j in range(BBGT.shape[0]):
+            for j in range(len(BBGT)):
                # iou = get_iou_main(get_iou_func, (bb, BBGT[j,...]))
                iou = cur_iou[j]
                if iou > iou_max:
@@ -194,61 +158,22 @@ def eval_det_cls(pred, gt, iou_thr=None):
    return ret


-def eval_map_recall(det_infos, gt_infos, ovthresh=None):
+def eval_map_recall(pred, gt, ovthresh=None):
    """Evaluate mAP and recall.

    Generic functions to compute precision/recall for object detection
        for multiple classes.

    Args:
-        det_infos (list[dict]): Information of detection results, the dict
-            includes the following keys
-            - labels_3d (Tensor): Labels of boxes.
-            - boxes_3d (Tensor): 3d bboxes.
-            - scores_3d (Tensor): Scores of boxes.
-
-        gt_infos (list[dict]): information of gt results, the dict
-            includes the following keys
-            - labels_3d (Tensor): labels of boxes.
-            - boxes_3d (Tensor): 3d bboxes.
+        pred (dict): Information of detection results,
+            which maps class_id and predictions.
+        gt (dict): information of gt results, which maps class_id and gt.
        ovthresh (list[float]): iou threshold.
            Default: None.

    Return:
        tuple[dict]: dict results of recall, AP, and precision for all classes.
    """
-    pred_all = {}
-    scan_cnt = 0
-    for det_info in det_infos:
-        pred_all[scan_cnt] = det_info
-        scan_cnt += 1
-
-    pred = {}  # map {classname: pred}
-    gt = {}  # map {classname: gt}
-    for img_id in pred_all.keys():
-        for i in range(len(pred_all[img_id]['labels_3d'])):
-            label = pred_all[img_id]['labels_3d'].numpy()[i]
-            bbox = pred_all[img_id]['boxes_3d'].numpy()[i]
-            score = pred_all[img_id]['scores_3d'].numpy()[i]
-            if label not in pred:
-                pred[int(label)] = {}
-            if img_id not in pred[label]:
-                pred[int(label)][img_id] = []
-            if label not in gt:
-                gt[int(label)] = {}
-            if img_id not in gt[label]:
-                gt[int(label)][img_id] = []
-            pred[int(label)][img_id].append((bbox, score))
-
-    for img_id in range(len(gt_infos)):
-        for i in range(len(gt_infos[img_id]['labels_3d'])):
-            label = gt_infos[img_id]['labels_3d'][i]
-            bbox = gt_infos[img_id]['boxes_3d'][i]
-            if label not in gt:
-                gt[label] = {}
-            if img_id not in gt[label]:
-                gt[label][img_id] = []
-            gt[label][img_id].append(bbox)

    ret_values = []
    for classname in gt.keys():
@@ -272,14 +197,24 @@ def eval_map_recall(det_infos, gt_infos, ovthresh=None):
    return recall, precision, ap


-def indoor_eval(gt_annos, dt_annos, metric, label2cat, logger=None):
+def indoor_eval(gt_annos,
+                dt_annos,
+                metric,
+                label2cat,
+                logger=None,
+                box_type_3d=None,
+                box_mode_3d=None):
    """Scannet Evaluation.

    Evaluate the result of the detection.

    Args:
        gt_annos (list[dict]): GT annotations.
-        dt_annos (list[dict]): Detection annotations.
+        dt_annos (list[dict]): Detection annotations. the dict
+            includes the following keys
+            - labels_3d (Tensor): Labels of boxes.
+            - boxes_3d (BaseInstance3DBoxes): 3d bboxes in Depth coordinate.
+            - scores_3d (Tensor): Scores of boxes.
        metric (list[float]): AP IoU thresholds.
        label2cat (dict): {label: cat}.
        logger (logging.Logger | str | None): The way to print the mAP
@@ -288,24 +223,48 @@ def indoor_eval(gt_annos, dt_annos, metric, label2cat, logger=None):
    Return:
        dict: Dict of results.
    """
-    gt_infos = []
-    for gt_anno in gt_annos:
+    assert len(dt_annos) == len(gt_annos)
+    pred = {}  # map {class_id: pred}
+    gt = {}  # map {class_id: gt}
+    for img_id in range(len(dt_annos)):
+        # parse detected annotations
+        det_anno = dt_annos[img_id]
+        for i in range(len(det_anno['labels_3d'])):
+            label = det_anno['labels_3d'].numpy()[i]
+            bbox = det_anno['boxes_3d'].convert_to(box_mode_3d)[i]
+            score = det_anno['scores_3d'].numpy()[i]
+            if label not in pred:
+                pred[int(label)] = {}
+            if img_id not in pred[label]:
+                pred[int(label)][img_id] = []
+            if label not in gt:
+                gt[int(label)] = {}
+            if img_id not in gt[label]:
+                gt[int(label)][img_id] = []
+            pred[int(label)][img_id].append((bbox, score))
+
+        # parse gt annotations
+        gt_anno = gt_annos[img_id]
        if gt_anno['gt_num'] != 0:
-            # convert to lidar coor for evaluation
-            bbox_lidar_bottom = boxes3d_depth_to_lidar(
-                gt_anno['gt_boxes_upright_depth'], mid_to_bottom=True)
-            if bbox_lidar_bottom.shape[-1] == 6:
-                bbox_lidar_bottom = np.pad(bbox_lidar_bottom, ((0, 0), (0, 1)),
-                                           'constant')
-            gt_infos.append(
-                dict(boxes_3d=bbox_lidar_bottom, labels_3d=gt_anno['class']))
+            gt_boxes = box_type_3d(
+                gt_anno['gt_boxes_upright_depth'],
+                box_dim=gt_anno['gt_boxes_upright_depth'].shape[-1],
+                origin=(0.5, 0.5, 0.5)).convert_to(box_mode_3d)
+            labels_3d = gt_anno['class']
        else:
-            gt_infos.append(
-                dict(
-                    boxes_3d=np.array([], dtype=np.float32),
-                    labels_3d=np.array([], dtype=np.int64)))
+            gt_boxes = box_type_3d(np.array([], dtype=np.float32))
+            labels_3d = np.array([], dtype=np.int64)
+
+        for i in range(len(labels_3d)):
+            label = labels_3d[i]
+            bbox = gt_boxes[i]
+            if label not in gt:
+                gt[label] = {}
+            if img_id not in gt[label]:
+                gt[label][img_id] = []
+            gt[label][img_id].append(bbox)

-    rec, prec, ap = eval_map_recall(dt_annos, gt_infos, metric)
+    rec, prec, ap = eval_map_recall(pred, gt, metric)
    ret_dict = dict()
    header = ['classes']
    table_columns = [[label2cat[label]

--- a/mmdet3d/datasets/custom_3d.py
+++ b/mmdet3d/datasets/custom_3d.py
@@ -186,7 +186,13 @@ class Custom3DDataset(Dataset):
        gt_annos = [info['annos'] for info in self.data_infos]
        label2cat = {i: cat_id for i, cat_id in enumerate(self.CLASSES)}
        ret_dict = indoor_eval(
-            gt_annos, results, iou_thr, label2cat, logger=logger)
+            gt_annos,
+            results,
+            iou_thr,
+            label2cat,
+            logger=logger,
+            box_type_3d=self.box_type_3d,
+            box_mode_3d=self.box_mode_3d)

        return ret_dict


--- a/mmdet3d/datasets/nuscenes_dataset.py
+++ b/mmdet3d/datasets/nuscenes_dataset.py
@@ -172,11 +172,11 @@ class NuScenesDataset(Custom3DDataset):
            gt_bboxes_3d = np.concatenate([gt_bboxes_3d, gt_velocity], axis=-1)

        # the nuscenes box center is [0.5, 0.5, 0.5], we keep it
-        # the same as KITTI [0.5, 0.5, 0]
+        # the same as KITTI (0.5, 0.5, 0)
        gt_bboxes_3d = LiDARInstance3DBoxes(
            gt_bboxes_3d,
            box_dim=gt_bboxes_3d.shape[-1],
-            origin=[0.5, 0.5, 0.5]).convert_to(self.box_mode_3d)
+            origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d)

        anns_results = dict(
            gt_bboxes_3d=gt_bboxes_3d,

--- a/mmdet3d/datasets/pipelines/data_augment_utils.py
+++ b/mmdet3d/datasets/pipelines/data_augment_utils.py
@@ -6,7 +6,7 @@ from numba.errors import NumbaPerformanceWarning

 from mmdet3d.core.bbox import box_np_ops

-warnings.filterwarnings("ignore", category=NumbaPerformanceWarning)
+warnings.filterwarnings('ignore', category=NumbaPerformanceWarning)


 @numba.njit
@@ -301,7 +301,7 @@ def noise_per_object_v3_(gt_boxes,
        grot_uppers[..., np.newaxis],
        size=[num_boxes, num_try])

-    origin = [0.5, 0.5, 0]
+    origin = (0.5, 0.5, 0)
    gt_box_corners = box_np_ops.center_to_corner_box3d(
        gt_boxes[:, :3],
        gt_boxes[:, 3:6],

--- a/mmdet3d/datasets/pipelines/indoor_augment.py
+++ b/mmdet3d/datasets/pipelines/indoor_augment.py
@@ -25,21 +25,19 @@ class IndoorFlipData(object):
    def __call__(self, results):
        points = results['points']
        gt_bboxes_3d = results['gt_bboxes_3d']
-        aligned = True if gt_bboxes_3d.shape[1] == 6 else False
        results['flip_yz'] = False
        results['flip_xz'] = False
        if np.random.random() < self.flip_ratio_yz:
            # Flipping along the YZ plane
            points[:, 0] = -1 * points[:, 0]
-            gt_bboxes_3d[:, 0] = -1 * gt_bboxes_3d[:, 0]
-            if not aligned:
-                gt_bboxes_3d[:, 6] = np.pi - gt_bboxes_3d[:, 6]
+            gt_bboxes_3d.flip('horizontal')
            results['flip_yz'] = True

-        if aligned and np.random.random() < self.flip_ratio_xz:
+        if not gt_bboxes_3d.with_yaw and np.random.random(
+        ) < self.flip_ratio_xz:
            # Flipping along the XZ plane
            points[:, 1] = -1 * points[:, 1]
-            gt_bboxes_3d[:, 1] = -1 * gt_bboxes_3d[:, 1]
+            gt_bboxes_3d.flip('vertical')
            results['flip_xz'] = True

        results['points'] = points
@@ -154,57 +152,18 @@ class IndoorGlobalRotScale(object):
        rot_mat = np.array([[c, -s, 0], [s, c, 0], [0, 0, 1]])
        return rot_mat

-    def _rotate_aligned_boxes(self, input_boxes, rot_mat):
-        """Rotate aligned boxes.
-
-        Rotate function for the aligned boxes.
-
-        Args:
-            input_boxes (ndarray): 3D boxes.
-            rot_mat (ndarray): Rotation matrix.
-
-        Returns:
-            rotated_boxes (ndarry): 3D boxes after rotation.
-        """
-        centers, lengths = input_boxes[:, 0:3], input_boxes[:, 3:6]
-        new_centers = np.dot(centers, rot_mat.T)
-
-        dx, dy = lengths[:, 0] / 2.0, lengths[:, 1] / 2.0
-        new_x = np.zeros((dx.shape[0], 4))
-        new_y = np.zeros((dx.shape[0], 4))
-
-        for i, corner in enumerate([(-1, -1), (1, -1), (1, 1), (-1, 1)]):
-            corners = np.zeros((dx.shape[0], 3))
-            corners[:, 0] = corner[0] * dx
-            corners[:, 1] = corner[1] * dy
-            corners = np.dot(corners, rot_mat.T)
-            new_x[:, i] = corners[:, 0]
-            new_y[:, i] = corners[:, 1]
-
-        new_dx = 2.0 * np.max(new_x, 1)
-        new_dy = 2.0 * np.max(new_y, 1)
-        new_lengths = np.stack((new_dx, new_dy, lengths[:, 2]), axis=1)
-
-        return np.concatenate([new_centers, new_lengths], axis=1)
-
    def __call__(self, results):
        points = results['points']
        gt_bboxes_3d = results['gt_bboxes_3d']
-        aligned = True if gt_bboxes_3d.shape[1] == 6 else False

        if self.rot_range is not None:
            assert len(self.rot_range) == 2, \
                f'Expect length of rot range =2, ' \
                f'got {len(self.rot_range)}.'
            rot_angle = np.random.uniform(self.rot_range[0], self.rot_range[1])
-            rot_mat = self._rotz(rot_angle)
-            points[:, :3] = np.dot(points[:, :3], rot_mat.T)
-            if aligned:
-                gt_bboxes_3d = self._rotate_aligned_boxes(
-                    gt_bboxes_3d, rot_mat)
-            else:
-                gt_bboxes_3d[:, :3] = np.dot(gt_bboxes_3d[:, :3], rot_mat.T)
-                gt_bboxes_3d[:, 6] -= rot_angle
+            if gt_bboxes_3d.tensor.shape[0] != 0:
+                gt_bboxes_3d.rotate(rot_angle)
+            points[:, :3] = np.dot(points[:, :3], self._rotz(rot_angle).T)
            results['rot_angle'] = rot_angle

        if self.scale_range is not None:
@@ -216,15 +175,14 @@ class IndoorGlobalRotScale(object):
                                            self.scale_range[1])

            points[:, :3] *= scale_ratio
-            gt_bboxes_3d[:, :3] *= scale_ratio
-            gt_bboxes_3d[:, 3:6] *= scale_ratio
+            gt_bboxes_3d.scale(scale_ratio)
            if self.shift_height:
                points[:, -1] *= scale_ratio

            results['scale_ratio'] = scale_ratio

        results['points'] = points
-        results['gt_bboxes_3d'] = gt_bboxes_3d.astype(np.float32)
+        results['gt_bboxes_3d'] = gt_bboxes_3d
        return results

    def __repr__(self):

--- a/mmdet3d/datasets/pipelines/train_aug.py
+++ b/mmdet3d/datasets/pipelines/train_aug.py
@@ -113,7 +113,7 @@ class ObjectSample(object):
        #         Trv2c = input_dict['Trv2c']
        #         P2 = input_dict['P2']
        if self.sample_2d:
-            img = input_dict['img']  # .astype(np.float32)
+            img = input_dict['img']
            gt_bboxes_2d = input_dict['gt_bboxes']
            # Assume for now 3D & 2D bboxes are the same
            sampled_dict = self.db_sampler.sample_all(
@@ -148,7 +148,7 @@ class ObjectSample(object):
                    [gt_bboxes_2d, sampled_gt_bboxes_2d]).astype(np.float32)

                input_dict['gt_bboxes'] = gt_bboxes_2d
-                input_dict['img'] = sampled_dict['img']  # .astype(np.uint8)
+                input_dict['img'] = sampled_dict['img']

        input_dict['gt_bboxes_3d'] = gt_bboxes_3d
        input_dict['gt_labels_3d'] = gt_labels_3d

--- a/mmdet3d/datasets/scannet_dataset.py
+++ b/mmdet3d/datasets/scannet_dataset.py
@@ -2,6 +2,7 @@ import os.path as osp

 import numpy as np

+from mmdet3d.core.bbox import DepthInstance3DBoxes
 from mmdet.datasets import DATASETS
 from .custom_3d import Custom3DDataset

@@ -43,6 +44,14 @@ class ScanNetDataset(Custom3DDataset):
        else:
            gt_bboxes_3d = np.zeros((0, 6), dtype=np.float32)
            gt_labels_3d = np.zeros((0, ), dtype=np.long)
+
+        # to target box structure
+        gt_bboxes_3d = DepthInstance3DBoxes(
+            gt_bboxes_3d,
+            box_dim=gt_bboxes_3d.shape[-1],
+            with_yaw=False,
+            origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d)
+
        pts_instance_mask_path = osp.join(self.data_root,
                                          info['pts_instance_mask_path'])
        pts_semantic_mask_path = osp.join(self.data_root,

--- a/mmdet3d/datasets/sunrgbd_dataset.py
+++ b/mmdet3d/datasets/sunrgbd_dataset.py
 import numpy as np

+from mmdet3d.core.bbox import DepthInstance3DBoxes
 from mmdet.datasets import DATASETS
 from .custom_3d import Custom3DDataset

@@ -40,6 +41,10 @@ class SUNRGBDDataset(Custom3DDataset):
            gt_bboxes_3d = np.zeros((0, 7), dtype=np.float32)
            gt_labels_3d = np.zeros((0, ), dtype=np.long)

+        # to target box structure
+        gt_bboxes_3d = DepthInstance3DBoxes(
+            gt_bboxes_3d, origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d)
+
        anns_results = dict(
            gt_bboxes_3d=gt_bboxes_3d, gt_labels_3d=gt_labels_3d)
        return anns_results
--- a/mmdet3d/models/dense_heads/train_mixins.py
+++ b/mmdet3d/models/dense_heads/train_mixins.py
@@ -20,7 +20,8 @@ class AnchorTrainMixin(object):

        Args:
            anchor_list (list[list]): Multi level anchors of each image.
-            gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image.
+            gt_bboxes_list (list[BaseInstance3DBoxes]): Ground truth
+                bboxes of each image.
            img_metas (list[dict]): Meta info of each image.

        Returns:

--- a/mmdet3d/models/dense_heads/vote_head.py
+++ b/mmdet3d/models/dense_heads/vote_head.py
@@ -5,14 +5,11 @@ import torch.nn.functional as F
 from mmcv.cnn import ConvModule

 from mmdet3d.core import build_bbox_coder, multi_apply
-from mmdet3d.core.bbox.box_torch_ops import boxes3d_to_corners3d_lidar_torch
-from mmdet3d.core.bbox.transforms import upright_depth_to_lidar_torch
 from mmdet3d.core.post_processing import aligned_3d_nms
 from mmdet3d.models.builder import build_loss
 from mmdet3d.models.losses import chamfer_distance
 from mmdet3d.models.model_utils import VoteModule
-from mmdet3d.ops import (PointSAModule, furthest_point_sample,
-                         points_in_boxes_batch)
+from mmdet3d.ops import PointSAModule, furthest_point_sample
 from mmdet.models import HEADS


@@ -276,7 +273,7 @@ class VoteHead(nn.Module):

        Args:
            points (list[Tensor]): Points of each batch.
-            gt_bboxes_3d (list[Tensor]): gt bboxes of each batch.
+            gt_bboxes_3d (BaseInstance3DBoxes): gt bboxes of each batch.
            gt_labels_3d (list[Tensor]): gt class labels of each batch.
            pts_semantic_mask (None | list[Tensor]): point-wise semantic
                label of each batch.
@@ -293,8 +290,9 @@ class VoteHead(nn.Module):
        gt_num = list()
        for index in range(len(gt_labels_3d)):
            if len(gt_labels_3d[index]) == 0:
-                gt_bboxes_3d[index] = gt_bboxes_3d[index].new_zeros(
-                    1, gt_bboxes_3d[index].shape[-1])
+                fake_box = gt_bboxes_3d[index].tensor.new_zeros(
+                    1, gt_bboxes_3d[index].tensor.shape[-1])
+                gt_bboxes_3d[index] = gt_bboxes_3d[index].new_box(fake_box)
                gt_labels_3d[index] = gt_labels_3d[index].new_zeros(1)
                valid_gt_masks.append(gt_labels_3d[index].new_zeros(1))
                gt_num.append(1)
@@ -359,25 +357,23 @@ class VoteHead(nn.Module):
                           aggregated_points=None):
        assert self.bbox_coder.with_rot or pts_semantic_mask is not None

+        gt_bboxes_3d = gt_bboxes_3d.to(points.device)
+
        # generate votes target
        num_points = points.shape[0]
        if self.bbox_coder.with_rot:
-            points_lidar, gt_bboxes_3d_lidar = upright_depth_to_lidar_torch(
-                points, gt_bboxes_3d, to_bottom_center=True)
            vote_targets = points.new_zeros([num_points, 3 * self.gt_per_seed])
            vote_target_masks = points.new_zeros([num_points],
                                                 dtype=torch.long)
            vote_target_idx = points.new_zeros([num_points], dtype=torch.long)
-
-            box_indices_all = points_in_boxes_batch(
-                points_lidar.unsqueeze(0), gt_bboxes_3d_lidar.unsqueeze(0))[0]
-            for i in range(gt_bboxes_3d.shape[0]):
+            box_indices_all = gt_bboxes_3d.points_in_boxes(points)
+            for i in range(gt_labels_3d.shape[0]):
                box_indices = box_indices_all[:, i]
                indices = torch.nonzero(box_indices).squeeze(-1)
                selected_points = points[indices]
                vote_target_masks[indices] = 1
                vote_targets_tmp = vote_targets[indices]
-                votes = gt_bboxes_3d[i][:3].unsqueeze(
+                votes = gt_bboxes_3d.gravity_center[i].unsqueeze(
                    0) - selected_points[:, :3]

                for j in range(self.gt_per_seed):
@@ -438,7 +434,7 @@ class VoteHead(nn.Module):
        size_class_targets = size_class_targets[assignment]
        size_res_targets = size_res_targets[assignment]

-        one_hot_size_targets = gt_bboxes_3d.new_zeros(
+        one_hot_size_targets = gt_bboxes_3d.tensor.new_zeros(
            (proposal_num, self.num_sizes))
        one_hot_size_targets.scatter_(1, size_class_targets.unsqueeze(-1), 1)
        one_hot_size_targets = one_hot_size_targets.unsqueeze(-1).repeat(
@@ -455,38 +451,43 @@ class VoteHead(nn.Module):
                dir_class_targets, dir_res_targets, center_targets,
                mask_targets.long(), objectness_targets, objectness_masks)

-    def get_bboxes(self, points, bbox_preds, img_meta, rescale=False):
+    def get_bboxes(self, points, bbox_preds, input_meta, rescale=False):
        # decode boxes
        obj_scores = F.softmax(bbox_preds['obj_scores'], dim=-1)[..., -1]
        sem_scores = F.softmax(bbox_preds['sem_scores'], dim=-1)
        bbox_depth = self.bbox_coder.decode(bbox_preds)

-        points_lidar, bbox_lidar = upright_depth_to_lidar_torch(
-            points[..., :3], bbox_depth, to_bottom_center=True)
        batch_size = bbox_depth.shape[0]
-
        results = list()
        for b in range(batch_size):
            bbox_selected, score_selected, labels = self.multiclass_nms_single(
-                obj_scores[b], sem_scores[b], bbox_lidar[b], points_lidar[b])
-            results.append((bbox_selected, score_selected, labels))
+                obj_scores[b], sem_scores[b], bbox_depth[b],
+                points[b, ..., :3], input_meta[b])
+            bbox = input_meta[b]['box_type_3d'](
+                bbox_selected,
+                box_dim=bbox_selected.shape[-1],
+                with_yaw=self.bbox_coder.with_rot)
+            results.append((bbox, score_selected, labels))

        return results

-    def multiclass_nms_single(self, obj_scores, sem_scores, bbox,
-                              points_lidar):
-        box_indices = points_in_boxes_batch(
-            points_lidar.unsqueeze(0), bbox.unsqueeze(0))[0]
-        nonempty_box_mask = box_indices.T.sum(1) > 5
+    def multiclass_nms_single(self, obj_scores, sem_scores, bbox, points,
+                              input_meta):
+        bbox = input_meta['box_type_3d'](
+            bbox,
+            box_dim=bbox.shape[-1],
+            with_yaw=self.bbox_coder.with_rot,
+            origin=(0.5, 0.5, 0.5))
+        box_indices = bbox.points_in_boxes(points)

-        bbox_classes = torch.argmax(sem_scores, -1)
-
-        # boxes3d to aligned boxes
-        corner3d = boxes3d_to_corners3d_lidar_torch(bbox)
+        corner3d = bbox.corners
        minmax_box3d = corner3d.new(torch.Size((corner3d.shape[0], 6)))
        minmax_box3d[:, :3] = torch.min(corner3d, dim=1)[0]
        minmax_box3d[:, 3:] = torch.max(corner3d, dim=1)[0]

+        nonempty_box_mask = box_indices.T.sum(1) > 5
+
+        bbox_classes = torch.argmax(sem_scores, -1)
        nms_selected = aligned_3d_nms(minmax_box3d[nonempty_box_mask],
                                      obj_scores[nonempty_box_mask],
                                      bbox_classes[nonempty_box_mask],
@@ -502,7 +503,7 @@ class VoteHead(nn.Module):
        if self.test_cfg.per_class_proposal:
            bbox_selected, score_selected, labels = [], [], []
            for k in range(sem_scores.shape[-1]):
-                bbox_selected.append(bbox[selected])
+                bbox_selected.append(bbox[selected].tensor)
                score_selected.append(obj_scores[selected] *
                                      sem_scores[selected][:, k])
                labels.append(
@@ -511,7 +512,7 @@ class VoteHead(nn.Module):
            score_selected = torch.cat(score_selected, 0)
            labels = torch.cat(labels, 0)
        else:
-            bbox_selected = bbox[selected]
+            bbox_selected = bbox[selected].tensor
            score_selected = obj_scores[selected]
            labels = bbox_classes[selected]


--- a/mmdet3d/models/detectors/votenet.py
+++ b/mmdet3d/models/detectors/votenet.py
@@ -43,7 +43,7 @@ class VoteNet(SingleStageDetector):
        Args:
            points (list[Tensor]): Points of each batch.
            img_meta (list): Image metas.
-            gt_bboxes_3d (list[Tensor]): gt bboxes of each batch.
+            gt_bboxes_3d (BaseInstance3DBoxes): gt bboxes of each batch.
            gt_labels_3d (list[Tensor]): gt class labels of each batch.
            pts_semantic_mask (None | list[Tensor]): point-wise semantic
                label of each batch.
@@ -86,7 +86,7 @@ class VoteNet(SingleStageDetector):
        Args:
            points (list[Tensor]): Points of each sample.
            img_meta (list): Image metas.
-            gt_bboxes_3d (list[Tensor]): gt bboxes of each sample.
+            gt_bboxes_3d (BaseInstance3DBoxes): gt bboxes of each sample.
            gt_labels_3d (list[Tensor]): gt class labels of each sample.
            pts_semantic_mask (None | list[Tensor]): point-wise semantic
                label of each sample.