Unverified Commit ecd0d06a authored by twang, committed by GitHub

[Feature] Support monocular 3D detection on nuScenes (#392)

* Support nuscenes mono3d json info generation

* Support nuscenes mono3d dataset class

* Support attribute and bbox2d prediction in bbox3dnms and bbox3d2result

* Rename dataset class and add comments to 'attrs'

* Support mono3d related pipelines

* Fix unittest for loading 3D annotations

* Add unit test for nuscenes mono3d dataset

* Rename the sample result file

* Upload sample data for mono3d unit test

* Upload sample data for mono3d unit test

* Upload sample image for unit test

* Delete tests/data/nuscenes/samples/LIDAR_TOP/CAM_BACK_LEFT directory

* Add files via upload

* Remove unnecessary 'f'

* Remove unnecessary \ in arguments

* Remove check for pycocotools version because it is already done in CocoDataset

* Remove unnecessary comma, add TODO and change init of attrs in format_results

* Merge RandomFlip3D and RandomFlipMono3D

* Add pytest to check whether cuda is available in the unit test

* Add visualization TODO

* Remove useless init in loading mono3d images
parent 3a5a2010
@@ -335,21 +335,40 @@ def rotation_points_single_angle(points, angle, axis=0):
return points @ rot_mat_T, rot_mat_T
def points_cam2img(points_3d, proj_mat):
def points_cam2img(points_3d, proj_mat, with_depth=False):
"""Project points in camera coordinates to image coordinates.
Args:
points_3d (np.ndarray): Points in shape (N, 3)
proj_mat (np.ndarray): Transformation matrix between coordinates, with shape (3, 3), (3, 4) or (4, 4).
with_depth (bool): Whether to keep depth in the output.
Returns:
np.ndarray: Points in image coordinates with shape [N, 2], or [N, 3] with the depth appended as the last column when `with_depth` is True.
"""
points_shape = list(points_3d.shape)
points_shape[-1] = 1
assert len(proj_mat.shape) == 2, 'The dimension of the projection'\
f' matrix should be 2 instead of {len(proj_mat.shape)}.'
d1, d2 = proj_mat.shape[:2]
assert (d1 == 3 and d2 == 3) or (d1 == 3 and d2 == 4) or (
d1 == 4 and d2 == 4), 'The shape of the projection matrix'\
f' ({d1}*{d2}) is not supported.'
if d1 == 3:
proj_mat_expanded = np.eye(4, dtype=proj_mat.dtype)
proj_mat_expanded[:d1, :d2] = proj_mat
proj_mat = proj_mat_expanded
points_4 = np.concatenate([points_3d, np.ones(points_shape)], axis=-1)
point_2d = points_4 @ proj_mat.T
point_2d_res = point_2d[..., :2] / point_2d[..., 2:3]
if with_depth:
points_2d_depth = np.concatenate([point_2d_res, point_2d[..., 2:3]],
axis=-1)
return points_2d_depth
return point_2d_res
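
A minimal usage sketch of the extended helper (intrinsic values made up for illustration):

import numpy as np

# Hypothetical pinhole intrinsics: focal length 1000, principal point (800, 450).
K = np.array([[1000., 0., 800.],
              [0., 1000., 450.],
              [0., 0., 1.]])
pts = np.array([[0.5, 0.2, 10.0]])  # one point in camera coordinates, shape (1, 3)
uvd = points_cam2img(pts, K, with_depth=True)
# uvd == [[850., 470., 10.]]: u = fx * x / z + cx, v = fy * y / z + cy, depth = z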
@@ -124,11 +124,11 @@ def points_cam2img(points_3d, proj_mat):
points_num = list(points_3d.shape)[:-1]
points_shape = np.concatenate([points_num, [1]], axis=0).tolist()
assert len(proj_mat.shape) == 2, f'The dimension of the projection'\
f'matrix should be 2 instead of {len(proj_mat.shape)}.'
assert len(proj_mat.shape) == 2, 'The dimension of the projection'\
f' matrix should be 2 instead of {len(proj_mat.shape)}.'
d1, d2 = proj_mat.shape[:2]
assert (d1 == 3 and d2 == 3) or (d1 == 3 and d2 == 4) or (
d1 == 4 and d2 == 4), f'The shape of the projection matrix'\
d1 == 4 and d2 == 4), 'The shape of the projection matrix'\
f' ({d1}*{d2}) is not supported.'
if d1 == 3:
proj_mat_expanded = torch.eye(
@@ -46,13 +46,15 @@ def bbox3d2roi(bbox_list):
return rois
def bbox3d2result(bboxes, scores, labels):
def bbox3d2result(bboxes, scores, labels, attrs=None):
"""Convert detection results to a list of numpy arrays.
Args:
bboxes (torch.Tensor): Bounding boxes with shape of (n, 5).
scores (torch.Tensor): Scores with shape of (n, ).
labels (torch.Tensor): Labels with shape of (n, ).
attrs (torch.Tensor, optional): Attributes with shape of (n, ). \
Defaults to None.
Returns:
dict[str, torch.Tensor]: Bounding box results in cpu mode.
@@ -60,8 +62,14 @@ def bbox3d2result(bboxes, scores, labels):
- boxes_3d (torch.Tensor): 3D boxes.
- scores (torch.Tensor): Prediction scores.
- labels_3d (torch.Tensor): Box labels.
- attrs_3d (torch.Tensor, optional): Box attributes.
"""
return dict(
result_dict = dict(
boxes_3d=bboxes.to('cpu'),
scores_3d=scores.cpu(),
labels_3d=labels.cpu())
if attrs is not None:
result_dict['attrs_3d'] = attrs.cpu()
return result_dict
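
A short sketch of the extended interface (tensor values hypothetical):

import torch
from mmdet3d.core.bbox import CameraInstance3DBoxes

boxes = CameraInstance3DBoxes(torch.rand(2, 9), box_dim=9)  # two detections
scores = torch.tensor([0.9, 0.4])
labels = torch.tensor([0, 7])
attrs = torch.tensor([6, 2])  # attribute indices, e.g. vehicle.parked / pedestrian.moving
res = bbox3d2result(boxes, scores, labels, attrs=attrs)
assert set(res) == {'boxes_3d', 'scores_3d', 'labels_3d', 'attrs_3d'}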
@@ -11,7 +11,9 @@ def box3d_multiclass_nms(mlvl_bboxes,
score_thr,
max_num,
cfg,
mlvl_dir_scores=None):
mlvl_dir_scores=None,
mlvl_attr_scores=None,
mlvl_bboxes2d=None):
"""Multi-class nms for 3D boxes.
Args:
@@ -27,10 +29,15 @@ def box3d_multiclass_nms(mlvl_bboxes,
cfg (dict): Configuration dict of NMS.
mlvl_dir_scores (torch.Tensor, optional): Multi-level scores
of direction classifier. Defaults to None.
mlvl_attr_scores (torch.Tensor, optional): Multi-level scores
of attribute classifier. Defaults to None.
mlvl_bboxes2d (torch.Tensor, optional): Multi-level 2D bounding
boxes. Defaults to None.
Returns:
tuple[torch.Tensor]: Return results after nms, including 3D \
bounding boxes, scores, labels and direction scores.
bounding boxes, scores, labels, direction scores, attribute \
scores (optional) and 2D bounding boxes (optional).
"""
# do multi class nms
# the fg class id range: [0, num_classes-1]
@@ -39,6 +46,8 @@ def box3d_multiclass_nms(mlvl_bboxes,
scores = []
labels = []
dir_scores = []
attr_scores = []
bboxes2d = []
for i in range(0, num_classes):
# get bboxes and scores of this class
cls_inds = mlvl_scores[:, i] > score_thr
@@ -65,6 +74,12 @@ def box3d_multiclass_nms(mlvl_bboxes,
if mlvl_dir_scores is not None:
_mlvl_dir_scores = mlvl_dir_scores[cls_inds]
dir_scores.append(_mlvl_dir_scores[selected])
if mlvl_attr_scores is not None:
_mlvl_attr_scores = mlvl_attr_scores[cls_inds]
attr_scores.append(_mlvl_attr_scores[selected])
if mlvl_bboxes2d is not None:
_mlvl_bboxes2d = mlvl_bboxes2d[cls_inds]
bboxes2d.append(_mlvl_bboxes2d[selected])
if bboxes:
bboxes = torch.cat(bboxes, dim=0)
@@ -72,6 +87,10 @@ def box3d_multiclass_nms(mlvl_bboxes,
labels = torch.cat(labels, dim=0)
if mlvl_dir_scores is not None:
dir_scores = torch.cat(dir_scores, dim=0)
if mlvl_attr_scores is not None:
attr_scores = torch.cat(attr_scores, dim=0)
if mlvl_bboxes2d is not None:
bboxes2d = torch.cat(bboxes2d, dim=0)
if bboxes.shape[0] > max_num:
_, inds = scores.sort(descending=True)
inds = inds[:max_num]
@@ -80,12 +99,31 @@ def box3d_multiclass_nms(mlvl_bboxes,
scores = scores[inds]
if mlvl_dir_scores is not None:
dir_scores = dir_scores[inds]
if mlvl_attr_scores is not None:
attr_scores = attr_scores[inds]
if mlvl_bboxes2d is not None:
bboxes2d = bboxes2d[inds]
else:
bboxes = mlvl_scores.new_zeros((0, mlvl_bboxes.size(-1)))
scores = mlvl_scores.new_zeros((0, ))
labels = mlvl_scores.new_zeros((0, ), dtype=torch.long)
dir_scores = mlvl_scores.new_zeros((0, ))
return bboxes, scores, labels, dir_scores
if mlvl_dir_scores is not None:
dir_scores = mlvl_scores.new_zeros((0, ))
if mlvl_attr_scores is not None:
attr_scores = mlvl_scores.new_zeros((0, ))
if mlvl_bboxes2d is not None:
bboxes2d = mlvl_scores.new_zeros((0, 4))
results = (bboxes, scores, labels)
if mlvl_dir_scores is not None:
results = results + (dir_scores, )
if mlvl_attr_scores is not None:
results = results + (attr_scores, )
if mlvl_bboxes2d is not None:
results = results + (bboxes2d, )
return results
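
The function now returns a variable-length tuple that grows with the optional inputs; a hedged sketch of the calling contract (all inputs hypothetical):

# With every optional input provided, six entries come back, in this order:
outs = box3d_multiclass_nms(
    mlvl_bboxes, mlvl_bboxes_for_nms, mlvl_scores,
    score_thr, max_num, nms_cfg,
    mlvl_dir_scores=dir_scores,
    mlvl_attr_scores=attr_scores,
    mlvl_bboxes2d=bboxes2d)
bboxes, scores, labels, dir_scores, attr_scores, bboxes2d = outs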
def aligned_3d_nms(boxes, scores, classes, thresh):
@@ -5,6 +5,7 @@ from .custom_3d_seg import Custom3DSegDataset
from .kitti_dataset import KittiDataset
from .lyft_dataset import LyftDataset
from .nuscenes_dataset import NuScenesDataset
from .nuscenes_mono_dataset import NuScenesMonoDataset
from .pipelines import (BackgroundPointsFilter, GlobalRotScaleTrans,
IndoorPointSample, LoadAnnotations3D,
LoadPointsFromFile, LoadPointsFromMultiSweeps,
@@ -19,9 +20,9 @@ from .waymo_dataset import WaymoDataset
__all__ = [
'KittiDataset', 'GroupSampler', 'DistributedGroupSampler',
'build_dataloader', 'RepeatFactorDataset', 'DATASETS', 'build_dataset',
'CocoDataset', 'NuScenesDataset', 'LyftDataset', 'ObjectSample',
'RandomFlip3D', 'ObjectNoise', 'GlobalRotScaleTrans', 'PointShuffle',
'ObjectRangeFilter', 'PointsRangeFilter', 'Collect3D',
'CocoDataset', 'NuScenesDataset', 'NuScenesMonoDataset', 'LyftDataset',
'ObjectSample', 'RandomFlip3D', 'ObjectNoise', 'GlobalRotScaleTrans',
'PointShuffle', 'ObjectRangeFilter', 'PointsRangeFilter', 'Collect3D',
'LoadPointsFromFile', 'NormalizePointsColor', 'IndoorPointSample',
'LoadAnnotations3D', 'SUNRGBDDataset', 'ScanNetDataset',
'ScanNetSegDataset', 'SemanticKITTIDataset', 'Custom3DDataset',
import copy
import mmcv
import numpy as np
import pyquaternion
import tempfile
import torch
from nuscenes.utils.data_classes import Box as NuScenesBox
from os import path as osp
from mmdet3d.core import bbox3d2result, box3d_multiclass_nms, xywhr2xyxyr
from mmdet.datasets import DATASETS, CocoDataset
from ..core.bbox import CameraInstance3DBoxes, get_box_type
@DATASETS.register_module()
class NuScenesMonoDataset(CocoDataset):
r"""Monocular 3D detection on NuScenes Dataset.
This class serves as the API for experiments on the NuScenes Dataset.
Please refer to `NuScenes Dataset <https://www.nuscenes.org/download>`_
for data downloading.
Args:
ann_file (str): Path of annotation file.
data_root (str): Path of dataset root.
load_interval (int, optional): Interval of loading the dataset. It is
used to uniformly sample the dataset. Defaults to 1.
with_velocity (bool, optional): Whether include velocity prediction
into the experiments. Defaults to True.
modality (dict, optional): Modality to specify the sensor data used
as input. Defaults to None.
box_type_3d (str, optional): Type of 3D box of this dataset.
Based on the `box_type_3d`, the dataset will encapsulate the box
in its original format and then convert it to `box_type_3d`.
Defaults to 'Camera' in this class. Available options include:
- 'LiDAR': Box in LiDAR coordinates.
- 'Depth': Box in depth coordinates, usually for indoor dataset.
- 'Camera': Box in camera coordinates.
eval_version (str, optional): Configuration version of evaluation.
Defaults to 'detection_cvpr_2019'.
use_valid_flag (bool): Whether to use the `use_valid_flag` key in the
info file as a mask to filter `gt_boxes` and `gt_names`. Defaults to False.
version (str, optional): Dataset version. Defaults to 'v1.0-trainval'.
"""
CLASSES = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle',
'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone',
'barrier')
DefaultAttribute = {
'car': 'vehicle.parked',
'pedestrian': 'pedestrian.moving',
'trailer': 'vehicle.parked',
'truck': 'vehicle.parked',
'bus': 'vehicle.moving',
'motorcycle': 'cycle.without_rider',
'construction_vehicle': 'vehicle.parked',
'bicycle': 'cycle.without_rider',
'barrier': '',
'traffic_cone': '',
}
def __init__(self,
data_root,
load_interval=1,
with_velocity=True,
modality=None,
box_type_3d='Camera',
eval_version='detection_cvpr_2019',
use_valid_flag=False,
version='v1.0-trainval',
**kwargs):
super().__init__(**kwargs)
self.data_root = data_root
self.load_interval = load_interval
self.with_velocity = with_velocity
self.modality = modality
self.box_type_3d, self.box_mode_3d = get_box_type(box_type_3d)
self.eval_version = eval_version
self.use_valid_flag = use_valid_flag
self.bbox_code_size = 9
self.version = version
if self.eval_version is not None:
from nuscenes.eval.detection.config import config_factory
self.eval_detection_configs = config_factory(self.eval_version)
if self.modality is None:
self.modality = dict(
use_camera=True,
use_lidar=False,
use_radar=False,
use_map=False,
use_external=False)
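
A hedged config-style sketch of how the dataset could be wired into a config (paths and pipeline are placeholders; the annotation filename follows the `_mono3d.coco.json` suffix produced by the converter changes below):

data = dict(
    train=dict(
        type='NuScenesMonoDataset',
        data_root='data/nuscenes/',
        ann_file='data/nuscenes/nuscenes_infos_train_mono3d.coco.json',
        img_prefix='data/nuscenes/',
        box_type_3d='Camera',
        pipeline=train_pipeline))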
def pre_pipeline(self, results):
"""Initialization before data preparation.
Args:
results (dict): Dict before data preprocessing.
- img_fields (list): Image fields.
- bbox3d_fields (list): 3D bounding boxes fields.
- pts_mask_fields (list): Mask fields of points.
- pts_seg_fields (list): Mask fields of point segments.
- bbox_fields (list): Fields of bounding boxes.
- mask_fields (list): Fields of masks.
- seg_fields (list): Segment fields.
- box_type_3d (str): 3D box type.
- box_mode_3d (str): 3D box mode.
"""
results['img_prefix'] = self.img_prefix
results['seg_prefix'] = self.seg_prefix
results['proposal_file'] = self.proposal_file
results['img_fields'] = []
results['bbox3d_fields'] = []
results['pts_mask_fields'] = []
results['pts_seg_fields'] = []
results['bbox_fields'] = []
results['mask_fields'] = []
results['seg_fields'] = []
results['box_type_3d'] = self.box_type_3d
results['box_mode_3d'] = self.box_mode_3d
def _parse_ann_info(self, img_info, ann_info):
"""Parse bbox annotation.
Args:
img_info (list[dict]): Image info.
ann_info (list[dict]): Annotation info of an image.
Returns:
dict: A dict containing the following keys: bboxes, labels, \
gt_bboxes_3d, gt_labels_3d, attr_labels, centers2d, \
depths, bboxes_ignore, masks, seg_map
"""
gt_bboxes = []
gt_labels = []
attr_labels = []
gt_bboxes_ignore = []
gt_masks_ann = []
gt_bboxes_cam3d = []
centers2d = []
depths = []
for i, ann in enumerate(ann_info):
if ann.get('ignore', False):
continue
x1, y1, w, h = ann['bbox']
inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0))
inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0))
if inter_w * inter_h == 0:
continue
if ann['area'] <= 0 or w < 1 or h < 1:
continue
if ann['category_id'] not in self.cat_ids:
continue
bbox = [x1, y1, x1 + w, y1 + h]
if ann.get('iscrowd', False):
gt_bboxes_ignore.append(bbox)
else:
gt_bboxes.append(bbox)
gt_labels.append(self.cat2label[ann['category_id']])
attr_labels.append(ann['attribute_id'])
gt_masks_ann.append(ann.get('segmentation', None))
# 3D annotations in camera coordinates
bbox_cam3d = np.array(ann['bbox_cam3d']).reshape(1, -1)
# change orientation to local yaw
bbox_cam3d[0, 6] = -np.arctan2(
bbox_cam3d[0, 0], bbox_cam3d[0, 2]) + bbox_cam3d[0, 6]
velo_cam3d = np.array(ann['velo_cam3d']).reshape(1, 2)
nan_mask = np.isnan(velo_cam3d[:, 0])
velo_cam3d[nan_mask] = [0.0, 0.0]
bbox_cam3d = np.concatenate([bbox_cam3d, velo_cam3d], axis=-1)
gt_bboxes_cam3d.append(bbox_cam3d.squeeze())
# 2.5D annotations in camera coordinates
center2d = ann['center2d'][:2]
depth = ann['center2d'][2]
centers2d.append(center2d)
depths.append(depth)
if gt_bboxes:
gt_bboxes = np.array(gt_bboxes, dtype=np.float32)
gt_labels = np.array(gt_labels, dtype=np.int64)
attr_labels = np.array(attr_labels, dtype=np.int64)
else:
gt_bboxes = np.zeros((0, 4), dtype=np.float32)
gt_labels = np.array([], dtype=np.int64)
attr_labels = np.array([], dtype=np.int64)
if gt_bboxes_cam3d:
gt_bboxes_cam3d = np.array(gt_bboxes_cam3d, dtype=np.float32)
centers2d = np.array(centers2d, dtype=np.float32)
depths = np.array(depths, dtype=np.float32)
else:
gt_bboxes_cam3d = np.zeros((0, self.bbox_code_size),
dtype=np.float32)
centers2d = np.zeros((0, 2), dtype=np.float32)
depths = np.zeros((0), dtype=np.float32)
gt_bboxes_cam3d = CameraInstance3DBoxes(
gt_bboxes_cam3d,
box_dim=gt_bboxes_cam3d.shape[-1],
origin=(0.5, 0.5, 0.5))
gt_labels_3d = copy.deepcopy(gt_labels)
if gt_bboxes_ignore:
gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32)
else:
gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32)
seg_map = img_info['filename'].replace('jpg', 'png')
ann = dict(
bboxes=gt_bboxes,
labels=gt_labels,
gt_bboxes_3d=gt_bboxes_cam3d,
gt_labels_3d=gt_labels_3d,
attr_labels=attr_labels,
centers2d=centers2d,
depths=depths,
bboxes_ignore=gt_bboxes_ignore,
masks=gt_masks_ann,
seg_map=seg_map)
return ann
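
A quick check of the local-yaw conversion above, which offsets the global yaw by the viewing angle -arctan2(x, z) so that the stored orientation is relative to the ray from the camera to the object:

import numpy as np

# Hypothetical object 45 degrees off the optical axis with global yaw 0.
x, z, global_yaw = 5.0, 5.0, 0.0
local_yaw = -np.arctan2(x, z) + global_yaw
assert np.isclose(local_yaw, -np.pi / 4)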
def get_attr_name(self, attr_idx, label_name):
"""Get attribute from predicted index.
This is a workaround to predict attribute when the predicted velocity
is not reliable. We map the predicted attribute index to the one
in the attribute set. If it is consistent with the category, we will
keep it. Otherwise, we will use the default attribute.
Args:
attr_idx (int): Attribute index.
label_name (str): Predicted category name.
Returns:
str: Predicted attribute name.
"""
# TODO: Simplify the variable name
AttrMapping_rev2 = [
'cycle.with_rider', 'cycle.without_rider', 'pedestrian.moving',
'pedestrian.standing', 'pedestrian.sitting_lying_down',
'vehicle.moving', 'vehicle.parked', 'vehicle.stopped', 'None'
]
if label_name in ('car', 'bus', 'truck', 'trailer', 'construction_vehicle'):
    if AttrMapping_rev2[attr_idx] in ('vehicle.moving', 'vehicle.parked',
                                      'vehicle.stopped'):
        return AttrMapping_rev2[attr_idx]
elif label_name == 'pedestrian':
    if AttrMapping_rev2[attr_idx] in ('pedestrian.moving',
                                      'pedestrian.standing',
                                      'pedestrian.sitting_lying_down'):
        return AttrMapping_rev2[attr_idx]
elif label_name in ('bicycle', 'motorcycle'):
    if AttrMapping_rev2[attr_idx] in ('cycle.with_rider',
                                      'cycle.without_rider'):
        return AttrMapping_rev2[attr_idx]
return NuScenesMonoDataset.DefaultAttribute[label_name]
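
For example (with `dataset` an instance of this class), attribute index 5 maps to 'vehicle.moving', which is consistent with 'car' but not with 'traffic_cone':

dataset.get_attr_name(5, 'car')           # -> 'vehicle.moving'
dataset.get_attr_name(5, 'traffic_cone')  # -> '' (the default attribute)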
def _format_bbox(self, results, jsonfile_prefix=None):
"""Convert the results to the standard format.
Args:
results (list[dict]): Testing results of the dataset.
jsonfile_prefix (str): The prefix of the output jsonfile.
You can specify the output directory/filename by
modifying the jsonfile_prefix. Default: None.
Returns:
str: Path of the output json file.
"""
nusc_annos = {}
mapped_class_names = self.CLASSES
print('Start to convert detection format...')
CAM_NUM = 6
for sample_id, det in enumerate(mmcv.track_iter_progress(results)):
if sample_id % CAM_NUM == 0:
boxes_per_frame = []
attrs_per_frame = []
# need to merge results from images of the same sample
annos = []
boxes, attrs = output_to_nusc_box(det)
sample_token = self.data_infos[sample_id]['token']
boxes, attrs = cam_nusc_box_to_global(self.data_infos[sample_id],
boxes, attrs,
mapped_class_names,
self.eval_detection_configs,
self.eval_version)
boxes_per_frame.extend(boxes)
attrs_per_frame.extend(attrs)
# Remove redundant predictions caused by overlap of images
if (sample_id + 1) % CAM_NUM != 0:
continue
boxes = global_nusc_box_to_cam(
self.data_infos[sample_id + 1 - CAM_NUM], boxes_per_frame,
mapped_class_names, self.eval_detection_configs,
self.eval_version)
cam_boxes3d, scores, labels = nusc_box_to_cam_box3d(boxes)
# box nms 3d over 6 images in a frame
# TODO: move this global setting into config
nms_cfg = dict(
use_rotate_nms=True,
nms_across_levels=False,
nms_pre=4096,
nms_thr=0.05,
score_thr=0.01,
min_bbox_size=0,
max_per_frame=500)
from mmcv import Config
nms_cfg = Config(nms_cfg)
cam_boxes3d_for_nms = xywhr2xyxyr(cam_boxes3d.bev)
boxes3d = cam_boxes3d.tensor
# generate attr scores from attr labels
attrs = labels.new_tensor([attr for attr in attrs_per_frame])
boxes3d, scores, labels, attrs = box3d_multiclass_nms(
boxes3d,
cam_boxes3d_for_nms,
scores,
nms_cfg.score_thr,
nms_cfg.max_per_frame,
nms_cfg,
mlvl_attr_scores=attrs)
cam_boxes3d = CameraInstance3DBoxes(boxes3d, box_dim=9)
det = bbox3d2result(cam_boxes3d, scores, labels, attrs)
boxes, attrs = output_to_nusc_box(det)
boxes, attrs = cam_nusc_box_to_global(
self.data_infos[sample_id + 1 - CAM_NUM], boxes, attrs,
mapped_class_names, self.eval_detection_configs,
self.eval_version)
for i, box in enumerate(boxes):
name = mapped_class_names[box.label]
attr = self.get_attr_name(attrs[i], name)
nusc_anno = dict(
sample_token=sample_token,
translation=box.center.tolist(),
size=box.wlh.tolist(),
rotation=box.orientation.elements.tolist(),
velocity=box.velocity[:2].tolist(),
detection_name=name,
detection_score=box.score,
attribute_name=attr)
annos.append(nusc_anno)
# other views results of the same frame should be concatenated
if sample_token in nusc_annos:
nusc_annos[sample_token].extend(annos)
else:
nusc_annos[sample_token] = annos
nusc_submissions = {
'meta': self.modality,
'results': nusc_annos,
}
mmcv.mkdir_or_exist(jsonfile_prefix)
res_path = osp.join(jsonfile_prefix, 'results_nusc.json')
print('Results written to', res_path)
mmcv.dump(nusc_submissions, res_path)
return res_path
def _evaluate_single(self,
result_path,
logger=None,
metric='bbox',
result_name='img_bbox'):
"""Evaluation for a single model in nuScenes protocol.
Args:
result_path (str): Path of the result file.
logger (logging.Logger | str | None): Logger used for printing
related information during evaluation. Default: None.
metric (str): Metric name used for evaluation. Default: 'bbox'.
result_name (str): Result name in the metric prefix.
Default: 'img_bbox'.
Returns:
dict: Dictionary of evaluation details.
"""
from nuscenes import NuScenes
from nuscenes.eval.detection.evaluate import NuScenesEval
output_dir = osp.join(*osp.split(result_path)[:-1])
nusc = NuScenes(
version=self.version, dataroot=self.data_root, verbose=False)
eval_set_map = {
'v1.0-mini': 'mini_val',
'v1.0-trainval': 'val',
}
nusc_eval = NuScenesEval(
nusc,
config=self.eval_detection_configs,
result_path=result_path,
eval_set=eval_set_map[self.version],
output_dir=output_dir,
verbose=False)
nusc_eval.main(render_curves=True)
# record metrics
metrics = mmcv.load(osp.join(output_dir, 'metrics_summary.json'))
detail = dict()
metric_prefix = f'{result_name}_NuScenes'
for name in self.CLASSES:
for k, v in metrics['label_aps'][name].items():
val = float('{:.4f}'.format(v))
detail['{}/{}_AP_dist_{}'.format(metric_prefix, name, k)] = val
for k, v in metrics['label_tp_errors'][name].items():
val = float('{:.4f}'.format(v))
detail['{}/{}_{}'.format(metric_prefix, name, k)] = val
detail['{}/NDS'.format(metric_prefix)] = metrics['nd_score']
detail['{}/mAP'.format(metric_prefix)] = metrics['mean_ap']
return detail
def format_results(self, results, jsonfile_prefix=None, **kwargs):
"""Format the results to json (standard format for COCO evaluation).
Args:
results (list[tuple | numpy.ndarray]): Testing results of the
dataset.
jsonfile_prefix (str | None): The prefix of json files. It includes
the file path and the prefix of filename, e.g., "a/b/prefix".
If not specified, a temp file will be created. Default: None.
Returns:
tuple: (result_files, tmp_dir), result_files is a dict containing \
the json filepaths, tmp_dir is the temporary directory created \
for saving json files when jsonfile_prefix is not specified.
"""
assert isinstance(results, list), 'results must be a list'
assert len(results) == len(self), (
'The length of results is not equal to the dataset len: {} != {}'.
format(len(results), len(self)))
if jsonfile_prefix is None:
tmp_dir = tempfile.TemporaryDirectory()
jsonfile_prefix = osp.join(tmp_dir.name, 'results')
else:
tmp_dir = None
if not isinstance(results[0], dict):
result_files = self._format_bbox(results, jsonfile_prefix)
else:
result_files = dict()
for name in results[0]:
# not evaluate 2D predictions on nuScenes
if '2d' in name:
continue
print(f'\nFormatting bboxes of {name}')
results_ = [out[name] for out in results]
tmp_file_ = osp.join(jsonfile_prefix, name)
result_files.update(
{name: self._format_bbox(results_, tmp_file_)})
return result_files, tmp_dir
def evaluate(self,
results,
metric='bbox',
logger=None,
jsonfile_prefix=None,
result_names=['img_bbox'],
show=False,
out_dir=None):
"""Evaluation in nuScenes protocol.
Args:
results (list[dict]): Testing results of the dataset.
metric (str | list[str]): Metrics to be evaluated.
logger (logging.Logger | str | None): Logger used for printing
related information during evaluation. Default: None.
jsonfile_prefix (str | None): The prefix of json files. It includes
the file path and the prefix of filename, e.g., "a/b/prefix".
If not specified, a temp file will be created. Default: None.
show (bool): Whether to visualize.
Default: False.
out_dir (str): Path to save the visualization results.
Default: None.
Returns:
dict[str, float]: Results of each evaluation metric.
"""
result_files, tmp_dir = self.format_results(results, jsonfile_prefix)
if isinstance(result_files, dict):
results_dict = dict()
for name in result_names:
print('Evaluating bboxes of {}'.format(name))
ret_dict = self._evaluate_single(result_files[name])
results_dict.update(ret_dict)
elif isinstance(result_files, str):
results_dict = self._evaluate_single(result_files)
if tmp_dir is not None:
tmp_dir.cleanup()
if show:
self.show(results, out_dir)
return results_dict
def show(self, results, out_dir):
"""Results visualization.
Args:
results (list[dict]): List of bounding boxes results.
out_dir (str): Output directory of visualization result.
"""
# TODO: support mono3d visualization
pass
def output_to_nusc_box(detection):
"""Convert the output to the box class in the nuScenes.
Args:
detection (dict): Detection results.
- boxes_3d (:obj:`BaseInstance3DBoxes`): Detection bbox.
- scores_3d (torch.Tensor): Detection scores.
- labels_3d (torch.Tensor): Predicted box labels.
- attrs_3d (torch.Tensor, optional): Predicted attributes.
Returns:
tuple (list[:obj:`NuScenesBox`], np.ndarray | None): List of standard NuScenesBoxes and the corresponding attribute labels (None if no attributes are predicted).
"""
box3d = detection['boxes_3d']
scores = detection['scores_3d'].numpy()
labels = detection['labels_3d'].numpy()
attrs = None
if 'attrs_3d' in detection:
attrs = detection['attrs_3d'].numpy()
box_gravity_center = box3d.gravity_center.numpy()
box_dims = box3d.dims.numpy()
box_yaw = box3d.yaw.numpy()
box_list = []
for i in range(len(box3d)):
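# q1 encodes the box yaw; q2 rotates 90 degrees about the camera x-axis so
# that the yaw axis lines up with the vertical axis NuScenesBox expects.
# The velocity lies in the camera x-z (ground) plane, hence the zero middle
# component.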
q1 = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw[i])
q2 = pyquaternion.Quaternion(axis=[1, 0, 0], radians=np.pi / 2)
quat = q2 * q1
velocity = (box3d.tensor[i, 7], 0.0, box3d.tensor[i, 8])
box = NuScenesBox(
box_gravity_center[i],
box_dims[i],
quat,
label=labels[i],
score=scores[i],
velocity=velocity)
box_list.append(box)
return box_list, attrs
def cam_nusc_box_to_global(info,
boxes,
attrs,
classes,
eval_configs,
eval_version='detection_cvpr_2019'):
"""Convert the box from camera to global coordinate.
Args:
info (dict): Info for a specific sample data, including the
calibration information.
boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes.
attrs (list[int]): Attribute labels corresponding to `boxes`.
classes (list[str]): Mapped classes in the evaluation.
eval_configs (object): Evaluation configuration object.
eval_version (str): Evaluation version.
Default: 'detection_cvpr_2019'
Returns:
tuple (list, list): NuScenesBoxes in the global coordinate and the
attribute labels of the boxes kept after range filtering.
"""
box_list = []
attr_list = []
for (box, attr) in zip(boxes, attrs):
# Move box to ego vehicle coord system
box.rotate(pyquaternion.Quaternion(info['cam2ego_rotation']))
box.translate(np.array(info['cam2ego_translation']))
# filter det in ego.
cls_range_map = eval_configs.class_range
radius = np.linalg.norm(box.center[:2], 2)
det_range = cls_range_map[classes[box.label]]
if radius > det_range:
continue
# Move box to global coord system
box.rotate(pyquaternion.Quaternion(info['ego2global_rotation']))
box.translate(np.array(info['ego2global_translation']))
box_list.append(box)
attr_list.append(attr)
return box_list, attr_list
def global_nusc_box_to_cam(info,
boxes,
classes,
eval_configs,
eval_version='detection_cvpr_2019'):
"""Convert the box from global to camera coordinate.
Args:
info (dict): Info for a specific sample data, including the
calibration information.
boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes.
classes (list[str]): Mapped classes in the evaluation.
eval_configs (object): Evaluation configuration object.
eval_version (str): Evaluation version.
Default: 'detection_cvpr_2019'
Returns:
list: List of standard NuScenesBoxes in the camera
coordinate.
"""
box_list = []
for box in boxes:
# Move box to ego vehicle coord system
box.translate(-np.array(info['ego2global_translation']))
box.rotate(
pyquaternion.Quaternion(info['ego2global_rotation']).inverse)
# filter det in ego.
cls_range_map = eval_configs.class_range
radius = np.linalg.norm(box.center[:2], 2)
det_range = cls_range_map[classes[box.label]]
if radius > det_range:
continue
# Move box to camera coord system
box.translate(-np.array(info['cam2ego_translation']))
box.rotate(pyquaternion.Quaternion(info['cam2ego_rotation']).inverse)
box_list.append(box)
return box_list
def nusc_box_to_cam_box3d(boxes):
"""Convert boxes from :obj:`NuScenesBox` to :obj:`CameraInstance3DBoxes`.
Args:
boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes.
Returns:
tuple (:obj:`CameraInstance3DBoxes` | torch.Tensor | torch.Tensor): \
Converted 3D bounding boxes, scores and labels.
"""
locs = torch.Tensor([b.center for b in boxes]).view(-1, 3)
dims = torch.Tensor([b.wlh for b in boxes]).view(-1, 3)
rots = torch.Tensor([b.orientation.yaw_pitch_roll[0]
for b in boxes]).view(-1, 1)
velocity = torch.Tensor([b.velocity[:2] for b in boxes]).view(-1, 2)
boxes_3d = torch.cat([locs, dims, rots, velocity], dim=1).cuda()
cam_boxes3d = CameraInstance3DBoxes(
boxes_3d, box_dim=9, origin=(0.5, 0.5, 0.5))
scores = torch.Tensor([b.score for b in boxes]).cuda()
labels = torch.LongTensor([b.label for b in boxes]).cuda()
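# box3d_multiclass_nms expects per-class scores with a trailing background
# column, so scatter each box score into a one-hot (N, 10 + 1) matrix at its
# predicted label.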
nms_scores = scores.new_zeros(scores.shape[0], 10 + 1)
indices = labels.new_tensor(list(range(scores.shape[0])))
nms_scores[indices, labels] = scores
return cam_boxes3d, nms_scores, labels
@@ -51,7 +51,8 @@ class DefaultFormatBundle(object):
results['img'] = DC(to_tensor(img), stack=True)
for key in [
'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels',
'gt_labels_3d', 'pts_instance_mask', 'pts_semantic_mask'
'gt_labels_3d', 'attr_labels', 'pts_instance_mask',
'pts_semantic_mask', 'centers2d', 'depths'
]:
if key not in results:
continue
@@ -134,11 +135,11 @@ class Collect3D(object):
keys,
meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img',
'pad_shape', 'scale_factor', 'flip',
'pcd_horizontal_flip', 'pcd_vertical_flip',
'box_mode_3d', 'box_type_3d', 'img_norm_cfg',
'rect', 'Trv2c', 'P2', 'pcd_trans', 'sample_idx',
'pcd_scale_factor', 'pcd_rotation', 'pts_filename',
'transformation_3d_flow')):
'cam_intrinsic', 'pcd_horizontal_flip',
'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d',
'img_norm_cfg', 'rect', 'Trv2c', 'P2', 'pcd_trans',
'sample_idx', 'pcd_scale_factor', 'pcd_rotation',
'pts_filename', 'transformation_3d_flow')):
self.keys = keys
self.meta_keys = meta_keys
@@ -222,6 +223,11 @@ class DefaultFormatBundle3D(DefaultFormatBundle):
if 'gt_names_3d' in results:
results['gt_names_3d'] = results['gt_names_3d'][
gt_bboxes_3d_mask]
if 'centers2d' in results:
results['centers2d'] = results['centers2d'][
gt_bboxes_3d_mask]
if 'depths' in results:
results['depths'] = results['depths'][gt_bboxes_3d_mask]
if 'gt_bboxes_mask' in results:
gt_bboxes_mask = results['gt_bboxes_mask']
if 'gt_bboxes' in results:
@@ -230,6 +236,7 @@ class DefaultFormatBundle3D(DefaultFormatBundle):
if self.with_label:
if 'gt_names' in results and len(results['gt_names']) == 0:
results['gt_labels'] = np.array([], dtype=np.int64)
results['attr_labels'] = np.array([], dtype=np.int64)
elif 'gt_names' in results and isinstance(
results['gt_names'][0], list):
# gt_labels might be a list of list in multi-view setting
@@ -3,7 +3,7 @@ import numpy as np
from mmdet3d.core.points import BasePoints, get_points_type
from mmdet.datasets.builder import PIPELINES
from mmdet.datasets.pipelines import LoadAnnotations
from mmdet.datasets.pipelines import LoadAnnotations, LoadImageFromFile
@PIPELINES.register_module()
@@ -65,6 +65,30 @@ class LoadMultiViewImageFromFiles(object):
f"color_type='{self.color_type}')"
@PIPELINES.register_module()
class LoadImageFromFileMono3D(LoadImageFromFile):
"""Load an image from file in monocular 3D object detection. Compared to 2D
detection, additional camera parameters need to be loaded.
Args:
kwargs (dict): Arguments are the same as those in \
:class:`LoadImageFromFile`.
"""
def __call__(self, results):
"""Call functions to load image and get image meta information.
Args:
results (dict): Result dict from :obj:`mmdet.CustomDataset`.
Returns:
dict: The dict contains loaded image and meta information.
"""
super().__call__(results)
results['cam_intrinsic'] = results['img_info']['cam_intrinsic']
return results
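
Sketch of the loader in a mono3D pipeline, mirroring the unit test below; it behaves like `LoadImageFromFile` and additionally exposes the intrinsics, which `Collect3D` can carry through the new 'cam_intrinsic' meta key:

pipeline = [
    dict(type='LoadImageFromFileMono3D'),
    dict(
        type='LoadAnnotations3D',
        with_bbox=True,
        with_label=True,
        with_attr_label=True,
        with_bbox_3d=True,
        with_label_3d=True,
        with_bbox_depth=True),
]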
@PIPELINES.register_module()
class LoadPointsFromMultiSweeps(object):
"""Load points from multiple sweeps.
@@ -426,6 +450,8 @@ class LoadAnnotations3D(LoadAnnotations):
Defaults to True.
with_label_3d (bool, optional): Whether to load 3D labels.
Defaults to True.
with_attr_label (bool, optional): Whether to load attribute label.
Defaults to False.
with_mask_3d (bool, optional): Whether to load 3D instance masks
for points. Defaults to False.
with_seg_3d (bool, optional): Whether to load 3D semantic masks.
@@ -438,6 +464,8 @@ class LoadAnnotations3D(LoadAnnotations):
Defaults to False.
with_seg (bool, optional): Whether to load 2D semantic masks.
Defaults to False.
with_bbox_depth (bool, optional): Whether to load 2.5D boxes.
Defaults to False.
poly2mask (bool, optional): Whether to convert polygon annotations
to bitmasks. Defaults to True.
seg_3d_dtype (dtype, optional): Dtype of 3D semantic masks.
@@ -450,12 +478,14 @@ class LoadAnnotations3D(LoadAnnotations):
def __init__(self,
with_bbox_3d=True,
with_label_3d=True,
with_attr_label=False,
with_mask_3d=False,
with_seg_3d=False,
with_bbox=False,
with_label=False,
with_mask=False,
with_seg=False,
with_bbox_depth=False,
poly2mask=True,
seg_3d_dtype='int',
file_client_args=dict(backend='disk')):
@@ -467,7 +497,9 @@ class LoadAnnotations3D(LoadAnnotations):
poly2mask,
file_client_args=file_client_args)
self.with_bbox_3d = with_bbox_3d
self.with_bbox_depth = with_bbox_depth
self.with_label_3d = with_label_3d
self.with_attr_label = with_attr_label
self.with_mask_3d = with_mask_3d
self.with_seg_3d = with_seg_3d
self.seg_3d_dtype = seg_3d_dtype
@@ -485,6 +517,19 @@ class LoadAnnotations3D(LoadAnnotations):
results['bbox3d_fields'].append('gt_bboxes_3d')
return results
def _load_bboxes_depth(self, results):
"""Private function to load 2.5D bounding box annotations.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict containing loaded 2.5D bounding box annotations.
"""
results['centers2d'] = results['ann_info']['centers2d']
results['depths'] = results['ann_info']['depths']
return results
def _load_labels_3d(self, results):
"""Private function to load label annotations.
@@ -497,6 +542,18 @@ class LoadAnnotations3D(LoadAnnotations):
results['gt_labels_3d'] = results['ann_info']['gt_labels_3d']
return results
def _load_attr_labels(self, results):
"""Private function to load label annotations.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict containing loaded attribute label annotations.
"""
results['attr_labels'] = results['ann_info']['attr_labels']
return results
def _load_masks_3d(self, results):
"""Private function to load 3D mask annotations.
@@ -564,8 +621,14 @@ class LoadAnnotations3D(LoadAnnotations):
results = self._load_bboxes_3d(results)
if results is None:
return None
if self.with_bbox_depth:
results = self._load_bboxes_depth(results)
if results is None:
return None
if self.with_label_3d:
results = self._load_labels_3d(results)
if self.with_attr_label:
results = self._load_attr_labels(results)
if self.with_mask_3d:
results = self._load_masks_3d(results)
if self.with_seg_3d:
@@ -579,11 +642,13 @@ class LoadAnnotations3D(LoadAnnotations):
repr_str = self.__class__.__name__ + '(\n'
repr_str += f'{indent_str}with_bbox_3d={self.with_bbox_3d}, '
repr_str += f'{indent_str}with_label_3d={self.with_label_3d}, '
repr_str += f'{indent_str}with_attr_label={self.with_attr_label}, '
repr_str += f'{indent_str}with_mask_3d={self.with_mask_3d}, '
repr_str += f'{indent_str}with_seg_3d={self.with_seg_3d}, '
repr_str += f'{indent_str}with_bbox={self.with_bbox}, '
repr_str += f'{indent_str}with_label={self.with_label}, '
repr_str += f'{indent_str}with_mask={self.with_mask}, '
repr_str += f'{indent_str}with_seg={self.with_seg}, '
repr_str += f'{indent_str}with_bbox_depth={self.with_bbox_depth}, '
repr_str += f'{indent_str}poly2mask={self.poly2mask})'
return repr_str
@@ -65,8 +65,17 @@ class RandomFlip3D(RandomFlip):
np.array([], dtype=np.float32))
assert len(input_dict['bbox3d_fields']) == 1
for key in input_dict['bbox3d_fields']:
input_dict['points'] = input_dict[key].flip(
direction, points=input_dict['points'])
if 'points' in input_dict:
input_dict['points'] = input_dict[key].flip(
direction, points=input_dict['points'])
else:
input_dict[key].flip(direction)
if 'centers2d' in input_dict:
assert self.sync_2d is True and direction == 'horizontal', \
'Only support sync_2d=True and horizontal flip with images'
w = input_dict['img_shape'][1]
input_dict['centers2d'][..., 0] = \
w - input_dict['centers2d'][..., 0]
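
A quick numeric check of the mirroring rule, using a sample annotation from the test data (image width 1600):

import numpy as np

w = 1600
centers2d = np.array([[1099.391, 544.636]])
centers2d[..., 0] = w - centers2d[..., 0]
# -> [[500.609, 544.636]], matching expected_centers2d in the unit test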
def __call__(self, input_dict):
"""Call function to flip points, values in the ``bbox3d_fields`` and \
{"images": [{"file_name": "samples/CAM_BACK_LEFT/n015-2018-07-18-11-07-57+0800__CAM_BACK_LEFT__1531883530447423.jpg", "id": "86e6806d626b4711a6d0f5015b090116", "token": "e93e98b63d3b40209056d129dc53ceee", "cam2ego_rotation": [0.6924185592174665, -0.7031619420114925, -0.11648342771943819, 0.11203317912370753], "cam2ego_translation": [1.03569100218, 0.484795032713, 1.59097014818], "ego2global_rotation": [-0.7495886280607293, -0.0077695335695504636, 0.00829759813869316, -0.6618063711504101], "ego2global_translation": [1010.1328353833223, 610.8111652918716, 0.0], "cam_intrinsic": [[1256.7414812095406, 0.0, 792.1125740759628], [0.0, 1256.7414812095406, 492.7757465151356], [0.0, 0.0, 1.0]], "width": 1600, "height": 900}, {"file_name": "samples/CAM_FRONT/n015-2018-07-18-11-07-57+0800__CAM_FRONT__1531883530412470.jpg", "id": "020d7b4f858147558106c504f7f31bef", "token": "e93e98b63d3b40209056d129dc53ceee", "cam2ego_rotation": [0.4998015430569128, -0.5030316162024876, 0.4997798114386805, -0.49737083824542755], "cam2ego_translation": [1.70079118954, 0.0159456324149, 1.51095763913], "ego2global_rotation": [-0.7495886280607293, -0.0077695335695504636, 0.00829759813869316, -0.6618063711504101], "ego2global_translation": [1010.1328353833223, 610.8111652918716, 0.0], "cam_intrinsic": [[1266.417203046554, 0.0, 816.2670197447984], [0.0, 1266.417203046554, 491.50706579294757], [0.0, 0.0, 1.0]], "width": 1600, "height": 900}, {"file_name": "samples/CAM_FRONT_RIGHT/n015-2018-07-18-11-07-57+0800__CAM_FRONT_RIGHT__1531883530420339.jpg", "id": "16d39ff22a8545b0a4ee3236a0fe1c20", "token": "e93e98b63d3b40209056d129dc53ceee", "cam2ego_rotation": [0.2060347966337182, -0.2026940577919598, 0.6824507824531167, -0.6713610884174485], "cam2ego_translation": [1.5508477543, -0.493404796419, 1.49574800619], "ego2global_rotation": [-0.7495886280607293, -0.0077695335695504636, 0.00829759813869316, -0.6618063711504101], "ego2global_translation": [1010.1328353833223, 610.8111652918716, 0.0], "cam_intrinsic": [[1260.8474446004698, 0.0, 807.968244525554], [0.0, 1260.8474446004698, 495.3344268742088], [0.0, 0.0, 1.0]], "width": 1600, "height": 900}, {"file_name": "samples/CAM_FRONT_LEFT/n015-2018-07-18-11-07-57+0800__CAM_FRONT_LEFT__1531883530404844.jpg", "id": "24332e9c554a406f880430f17771b608", "token": "e93e98b63d3b40209056d129dc53ceee", "cam2ego_rotation": [0.6757265034669446, -0.6736266522251881, 0.21214015046209478, -0.21122827103904068], "cam2ego_translation": [1.52387798135, 0.494631336551, 1.50932822144], "ego2global_rotation": [-0.7495886280607293, -0.0077695335695504636, 0.00829759813869316, -0.6618063711504101], "ego2global_translation": [1010.1328353833223, 610.8111652918716, 0.0], "cam_intrinsic": [[1272.5979470598488, 0.0, 826.6154927353808], [0.0, 1272.5979470598488, 479.75165386361925], [0.0, 0.0, 1.0]], "width": 1600, "height": 900}, {"file_name": "samples/CAM_BACK/n015-2018-07-18-11-07-57+0800__CAM_BACK__1531883530437525.jpg", "id": "aab35aeccbda42de82b2ff5c278a0d48", "token": "e93e98b63d3b40209056d129dc53ceee", "cam2ego_rotation": [0.5037872666382278, -0.49740249788611096, -0.4941850223835201, 0.5045496097725578], "cam2ego_translation": [0.0283260309358, 0.00345136761476, 1.57910346144], "ego2global_rotation": [-0.7495886280607293, -0.0077695335695504636, 0.00829759813869316, -0.6618063711504101], "ego2global_translation": [1010.1328353833223, 610.8111652918716, 0.0], "cam_intrinsic": [[809.2209905677063, 0.0, 829.2196003259838], [0.0, 809.2209905677063, 481.77842384512485], [0.0, 0.0, 1.0]], "width": 1600, 
"height": 900}, {"file_name": "samples/CAM_BACK_RIGHT/n015-2018-07-18-11-07-57+0800__CAM_BACK_RIGHT__1531883530427893.jpg", "id": "ec7096278e484c9ebe6894a2ad5682e9", "token": "e93e98b63d3b40209056d129dc53ceee", "cam2ego_rotation": [0.12280980120078765, -0.132400842670559, -0.7004305821388234, 0.690496031265798], "cam2ego_translation": [1.0148780988, -0.480568219723, 1.56239545128], "ego2global_rotation": [-0.7495886280607293, -0.0077695335695504636, 0.00829759813869316, -0.6618063711504101], "ego2global_translation": [1010.1328353833223, 610.8111652918716, 0.0], "cam_intrinsic": [[1259.5137405846733, 0.0, 807.2529053838625], [0.0, 1259.5137405846733, 501.19579884916527], [0.0, 0.0, 1.0]], "width": 1600, "height": 900}], "annotations": [{"file_name": "samples/CAM_FRONT/n015-2018-07-18-11-07-57+0800__CAM_FRONT__1531883530412470.jpg", "image_id": "020d7b4f858147558106c504f7f31bef", "area": 85383.89600714693, "category_name": "truck", "category_id": 1, "bbox": [0.0, 357.732750319127, 342.56437261895206, 249.24920053528984], "iscrowd": 0, "bbox_cam3d": [-10.356295829208502, -0.06394600736590471, 18.785737229926998, 2.312, 7.516, 3.093, -0.5996975863361309], "velo_cam3d": [0.05742557272436208, 0.06990201482350666], "center2d": [118.11016609440316, 487.19622492451936, 18.785737229926998], "attribute_name": "vehicle.parked", "attribute_id": 6, "segmentation": [], "id": 0}, {"file_name": "samples/CAM_FRONT_LEFT/n015-2018-07-18-11-07-57+0800__CAM_FRONT_LEFT__1531883530404844.jpg", "image_id": "24332e9c554a406f880430f17771b608", "area": 76274.38331683438, "category_name": "truck", "category_id": 1, "bbox": [1305.1296604171719, 350.75901341602525, 294.87033958282814, 258.6709243959383], "iscrowd": 0, "bbox_cam3d": [9.795917040815693, 0.07538275380197612, 19.033148401567978, 2.312, 7.516, 3.093, -1.5546044317874126], "velo_cam3d": [0.09022854769195846, -0.0065096147400431695], "center2d": [1481.5919397578637, 484.79190972187814, 19.033148401567978], "attribute_name": "vehicle.parked", "attribute_id": 6, "segmentation": [], "id": 1}, {"file_name": "samples/CAM_FRONT_LEFT/n015-2018-07-18-11-07-57+0800__CAM_FRONT_LEFT__1531883530404844.jpg", "image_id": "24332e9c554a406f880430f17771b608", "area": 5248.9339273703135, "category_name": "truck", "category_id": 1, "bbox": [808.1218983320856, 436.2076328554, 75.28483638734929, 69.72099800235912], "iscrowd": 0, "bbox_cam3d": [0.7896581102503435, -0.32866532307883706, 58.48166239420381, 2.877, 6.372, 2.978, 1.641180695066564], "velo_cam3d": [0.009938485543455734, 0.0010084200213775884], "center2d": [843.7989524532317, 472.5996886441534, 58.48166239420381], "attribute_name": "vehicle.parked", "attribute_id": 6, "segmentation": [], "id": 2}, {"file_name": "samples/CAM_FRONT_LEFT/n015-2018-07-18-11-07-57+0800__CAM_FRONT_LEFT__1531883530404844.jpg", "image_id": "24332e9c554a406f880430f17771b608", "area": 25266.816070927107, "category_name": "truck", "category_id": 1, "bbox": [1133.5883785276196, 424.4436001005383, 202.5256666350731, 124.75858734712807], "iscrowd": 0, "bbox_cam3d": [9.39338221449255, 0.19762751304835102, 30.01455814405707, 2.156, 6.227, 2.601, -1.4587684025759116], "velo_cam3d": [0.0, 0.0], "center2d": [1224.88885277412, 488.1309332180172, 30.01455814405707], "attribute_name": "vehicle.parked", "attribute_id": 6, "segmentation": [], "id": 3}, {"file_name": "samples/CAM_BACK/n015-2018-07-18-11-07-57+0800__CAM_BACK__1531883530437525.jpg", "image_id": "aab35aeccbda42de82b2ff5c278a0d48", "area": 31981.88483023472, "category_name": "car", "category_id": 0, 
"bbox": [652.8710695836726, 487.2457293359287, 256.3734471348506, 124.74725907715583], "iscrowd": 0, "bbox_cam3d": [-0.48041137691585667, 0.8426032188612489, 12.27160016308813, 1.871, 4.478, 1.456, -2.0402647554154876], "velo_cam3d": [-2.4043357184501866, -4.232358489028598], "center2d": [797.5400340802389, 537.3418550489371, 12.27160016308813], "attribute_name": "vehicle.moving", "attribute_id": 5, "segmentation": [], "id": 4}, {"file_name": "samples/CAM_BACK_LEFT/n015-2018-07-18-11-07-57+0800__CAM_BACK_LEFT__1531883530447423.jpg", "image_id": "86e6806d626b4711a6d0f5015b090116", "area": 1874.1656394574547, "category_name": "traffic_cone", "category_id": 8, "bbox": [1084.536273989852, 513.7567766430512, 30.043100006470013, 62.382565016720605], "iscrowd": 0, "bbox_cam3d": [3.745641322414848, 0.6321604510604618, 15.319339525420224, 0.3, 0.291, 0.734, 1.4550554479430875], "velo_cam3d": [0.028202672296939114, -0.001622377193634249], "center2d": [1099.3910188026568, 544.635832278593, 15.319339525420224], "attribute_name": "None", "attribute_id": 8, "segmentation": [], "id": 5}, {"file_name": "samples/CAM_BACK_LEFT/n015-2018-07-18-11-07-57+0800__CAM_BACK_LEFT__1531883530447423.jpg", "image_id": "86e6806d626b4711a6d0f5015b090116", "area": 1641.3529623313364, "category_name": "traffic_cone", "category_id": 8, "bbox": [823.5058461203419, 512.0451382733748, 27.545987206560085, 59.58591899514306], "iscrowd": 0, "bbox_cam3d": [0.558956408408079, 0.6054486006477211, 15.607344275188172, 0.315, 0.338, 0.712, 1.5596704833049395], "velo_cam3d": [0.07717355032092023, -0.0013264953734539453], "center2d": [837.1211093045397, 541.5279466177432, 15.607344275188172], "attribute_name": "None", "attribute_id": 8, "segmentation": [], "id": 6}, {"file_name": "samples/CAM_BACK_LEFT/n015-2018-07-18-11-07-57+0800__CAM_BACK_LEFT__1531883530447423.jpg", "image_id": "86e6806d626b4711a6d0f5015b090116", "area": 11464.868967812941, "category_name": "pedestrian", "category_id": 7, "bbox": [1091.57108913607, 427.8805195896188, 76.29701915190844, 150.2662763926101], "iscrowd": 0, "bbox_cam3d": [3.953820859983739, 0.11100574170732268, 14.75668416993455, 0.739, 0.563, 1.711, 1.4550554479430875], "velo_cam3d": [0.10262495353364391, -0.0064695610507391095], "center2d": [1128.8366393735657, 502.22946380348515, 14.75668416993455], "attribute_name": "pedestrian.sitting_lying_down", "attribute_id": 4, "segmentation": [], "id": 7}, {"file_name": "samples/CAM_BACK_LEFT/n015-2018-07-18-11-07-57+0800__CAM_BACK_LEFT__1531883530447423.jpg", "image_id": "86e6806d626b4711a6d0f5015b090116", "area": 10887.814254422945, "category_name": "pedestrian", "category_id": 7, "bbox": [1160.5755663065963, 427.76823935577545, 72.04645850373822, 151.1221298109749], "iscrowd": 0, "bbox_cam3d": [4.7798492054669035, 0.1162134030605403, 14.880252178422799, 0.665, 0.544, 1.739, 1.4550554479430875], "velo_cam3d": [0.08665208940588605, -0.12554131041835265], "center2d": [1195.8043058026105, 502.5907820768639, 14.880252178422799], "attribute_name": "pedestrian.sitting_lying_down", "attribute_id": 4, "segmentation": [], "id": 8}, {"file_name": "samples/CAM_BACK_LEFT/n015-2018-07-18-11-07-57+0800__CAM_BACK_LEFT__1531883530447423.jpg", "image_id": "86e6806d626b4711a6d0f5015b090116", "area": 1840.396836351825, "category_name": "traffic_cone", "category_id": 8, "bbox": [976.5016497372175, 515.0039595028874, 30.627062877370918, 60.09054292018379], "iscrowd": 0, "bbox_cam3d": [2.4596094747766615, 0.6404788797338883, 15.49228428713527, 0.338, 0.309, 0.712, 
1.461625206011101], "velo_cam3d": [0.02389033738396964, -0.0027892907804445547], "center2d": [991.6372663187118, 544.7316983348808, 15.49228428713527], "attribute_name": "None", "attribute_id": 8, "segmentation": [], "id": 9}, {"file_name": "samples/CAM_BACK_RIGHT/n015-2018-07-18-11-07-57+0800__CAM_BACK_RIGHT__1531883530427893.jpg", "image_id": "ec7096278e484c9ebe6894a2ad5682e9", "area": 130637.82232697189, "category_name": "car", "category_id": 0, "bbox": [806.290660237549, 470.86948127698895, 564.486943265249, 231.42753589888787], "iscrowd": 0, "bbox_cam3d": [2.041080764231013, 0.5400087467741127, 10.16381197333443, 1.638, 4.25, 1.44, 2.3008777344302445], "velo_cam3d": [-3.11975390859937, 4.71824099865795], "center2d": [1060.1864774468488, 568.1144351228712, 10.16381197333443], "attribute_name": "vehicle.moving", "attribute_id": 5, "segmentation": [], "id": 10}], "categories": [{"id": 0, "name": "car"}, {"id": 1, "name": "truck"}, {"id": 2, "name": "trailer"}, {"id": 3, "name": "bus"}, {"id": 4, "name": "construction_vehicle"}, {"id": 5, "name": "bicycle"}, {"id": 6, "name": "motorcycle"}, {"id": 7, "name": "pedestrian"}, {"id": 8, "name": "traffic_cone"}, {"id": 9, "name": "barrier"}]}
\ No newline at end of file
import mmcv
import numpy as np
import pytest
import torch
from mmdet3d.datasets import NuScenesMonoDataset
def test_getitem():
np.random.seed(0)
class_names = [
'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
]
img_norm_cfg = dict(
mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False)
pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='LoadAnnotations3D',
with_bbox=True,
with_label=True,
with_attr_label=True,
with_bbox_3d=True,
with_label_3d=True,
with_bbox_depth=True),
dict(type='Resize', img_scale=(1600, 900), keep_ratio=True),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=1.0),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
keys=[
'img', 'gt_bboxes', 'gt_labels', 'attr_labels', 'gt_bboxes_3d',
'gt_labels_3d', 'centers2d', 'depths'
]),
]
nus_dataset = NuScenesMonoDataset(
ann_file='tests/data/nuscenes/nus_infos_mono3d.coco.json',
pipeline=pipeline,
data_root='tests/data/nuscenes/',
img_prefix='tests/data/nuscenes/',
test_mode=False)
data = nus_dataset[0]
img_metas = data['img_metas']._data
filename = img_metas['filename']
img_shape = img_metas['img_shape']
pad_shape = img_metas['pad_shape']
flip = img_metas['flip']
bboxes = data['gt_bboxes']._data
attrs = data['attr_labels']._data
labels3d = data['gt_labels_3d']._data
labels = data['gt_labels']._data
centers2d = data['centers2d']._data
depths = data['depths']._data
expected_filename = 'tests/data/nuscenes/samples/CAM_BACK_LEFT/' + \
'n015-2018-07-18-11-07-57+0800__CAM_BACK_LEFT__1531883530447423.jpg'
expected_img_shape = (900, 1600, 3)
expected_pad_shape = (928, 1600, 3)
expected_flip = True
expected_bboxes = torch.tensor([[485.4207, 513.7568, 515.4637, 576.1393],
[748.9482, 512.0452, 776.4941, 571.6310],
[432.1318, 427.8805, 508.4290, 578.1468],
[367.3779, 427.7682, 439.4244, 578.8904],
[592.8713, 515.0040, 623.4984, 575.0945]])
expected_attr_labels = torch.tensor([8, 8, 4, 4, 8])
expected_labels = torch.tensor([8, 8, 7, 7, 8])
expected_centers2d = torch.tensor([[500.6090, 544.6358],
[762.8789, 541.5280],
[471.1633, 502.2295],
[404.1957, 502.5908],
[608.3627, 544.7317]])
expected_depths = torch.tensor(
[15.3193, 15.6073, 14.7567, 14.8803, 15.4923])
assert filename == expected_filename
assert img_shape == expected_img_shape
assert pad_shape == expected_pad_shape
assert flip == expected_flip
assert torch.allclose(bboxes, expected_bboxes, 1e-5)
assert torch.all(attrs == expected_attr_labels)
assert torch.all(labels == expected_labels)
assert torch.all(labels3d == expected_labels)
assert torch.allclose(centers2d, expected_centers2d, 1e-5)
assert torch.allclose(depths, expected_depths, 1e-5)
def test_format_results():
if not torch.cuda.is_available():
pytest.skip('test requires GPU and torch+cuda')
root_path = 'tests/data/nuscenes/'
ann_file = 'tests/data/nuscenes/nus_infos_mono3d.coco.json'
class_names = [
'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
]
pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='LoadAnnotations3D',
with_bbox=True,
with_label=True,
with_attr_label=True,
with_bbox_3d=True,
with_label_3d=True,
with_bbox_depth=True),
dict(type='Resize', img_scale=(1600, 900), keep_ratio=True),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
keys=[
'img', 'gt_bboxes', 'gt_labels', 'attr_labels', 'gt_bboxes_3d',
'gt_labels_3d', 'centers2d', 'depths'
]),
]
nus_dataset = NuScenesMonoDataset(
ann_file=ann_file,
pipeline=pipeline,
data_root=root_path,
test_mode=True)
results = mmcv.load('tests/data/nuscenes/mono3d_sample_results.pkl')
result_files, tmp_dir = nus_dataset.format_results(results)
result_data = mmcv.load(result_files['img_bbox'])
assert len(result_data['results'].keys()) == 1
assert len(result_data['results']['e93e98b63d3b40209056d129dc53ceee']) == 8
det = result_data['results']['e93e98b63d3b40209056d129dc53ceee'][0]
expected_token = 'e93e98b63d3b40209056d129dc53ceee'
expected_trans = torch.tensor(
[1018.753821915645, 605.190386124652, 0.7266818822266328])
expected_size = torch.tensor([1.6380000114440918, 4.25, 1.440000057220459])
expected_rotation = torch.tensor([
-0.9924980733795628, -0.013604682549109839, 0.01027292674776989,
-0.12106590736714223
])
expected_detname = 'car'
expected_attr = 'vehicle.moving'
assert det['sample_token'] == expected_token
assert torch.allclose(
torch.tensor(det['translation']), expected_trans, 1e-5)
assert torch.allclose(torch.tensor(det['size']), expected_size, 1e-5)
assert torch.allclose(
torch.tensor(det['rotation']), expected_rotation, 1e-5)
assert det['detection_name'] == expected_detname
assert det['attribute_name'] == expected_attr
@@ -130,10 +130,11 @@ def test_load_annotations3D():
scannet_pts_semantic_mask = scannet_results['pts_semantic_mask']
repr_str = repr(scannet_load_annotations3D)
expected_repr_str = 'LoadAnnotations3D(\n with_bbox_3d=True, ' \
'with_label_3d=True, with_mask_3d=True, ' \
'with_seg_3d=True, with_bbox=False, ' \
'with_label=False, with_mask=False, ' \
'with_seg=False, poly2mask=True)'
'with_label_3d=True, with_attr_label=False, ' \
'with_mask_3d=True, with_seg_3d=True, ' \
'with_bbox=False, with_label=False, ' \
'with_mask=False, with_seg=False, ' \
'with_bbox_depth=False, poly2mask=True)'
assert repr_str == expected_repr_str
assert scannet_gt_boxes.tensor.shape == (27, 7)
assert scannet_gt_labels.shape == (27, )
@@ -55,6 +55,9 @@ def nuscenes_data_prep(root_path,
root_path, info_prefix, version=version, max_sweeps=max_sweeps)
if version == 'v1.0-test':
info_test_path = osp.join(root_path, f'{info_prefix}_infos_test.pkl')
nuscenes_converter.export_2d_annotation(
root_path, info_test_path, version=version)
return
info_train_path = osp.join(root_path, f'{info_prefix}_infos_train.pkl')
@@ -9,12 +9,18 @@ from pyquaternion import Quaternion
from shapely.geometry import MultiPoint, box
from typing import List, Tuple, Union
from mmdet3d.core.bbox.box_np_ops import points_cam2img
from mmdet3d.datasets import NuScenesDataset
nus_categories = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle',
'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone',
'barrier')
nus_attributes = ('cycle.with_rider', 'cycle.without_rider',
'pedestrian.moving', 'pedestrian.standing',
'pedestrian.sitting_lying_down', 'vehicle.moving',
'vehicle.parked', 'vehicle.stopped', 'None')
def create_nuscenes_infos(root_path,
info_prefix,
@@ -324,13 +330,14 @@ def obtain_sensor2top(nusc,
return sweep
def export_2d_annotation(root_path, info_path, version):
def export_2d_annotation(root_path, info_path, version, mono3d=True):
"""Export 2d annotation from the info file and raw data.
Args:
root_path (str): Root path of the raw data.
info_path (str): Path of the info file.
version (str): Dataset version.
mono3d (bool): Whether to export mono3d annotation. Default: True.
"""
# get bbox annotations for camera
camera_types = [
@@ -356,12 +363,20 @@ def export_2d_annotation(root_path, info_path, version):
coco_infos = get_2d_boxes(
nusc,
cam_info['sample_data_token'],
visibilities=['', '1', '2', '3', '4'])
visibilities=['', '1', '2', '3', '4'],
mono3d=mono3d)
(height, width, _) = mmcv.imread(cam_info['data_path']).shape
coco_2d_dict['images'].append(
dict(
file_name=cam_info['data_path'],
file_name=cam_info['data_path'].split('data/nuscenes/')
[-1],
id=cam_info['sample_data_token'],
token=info['token'],
cam2ego_rotation=cam_info['sensor2ego_rotation'],
cam2ego_translation=cam_info['sensor2ego_translation'],
ego2global_rotation=info['ego2global_rotation'],
ego2global_translation=info['ego2global_translation'],
cam_intrinsic=cam_info['cam_intrinsic'],
width=width,
height=height))
for coco_info in coco_infos:
@@ -372,16 +387,24 @@ def export_2d_annotation(root_path, info_path, version):
coco_info['id'] = coco_ann_id
coco_2d_dict['annotations'].append(coco_info)
coco_ann_id += 1
mmcv.dump(coco_2d_dict, f'{info_path[:-4]}.coco.json')
if mono3d:
json_prefix = f'{info_path[:-4]}_mono3d'
else:
json_prefix = f'{info_path[:-4]}'
mmcv.dump(coco_2d_dict, f'{json_prefix}.coco.json')
def get_2d_boxes(nusc, sample_data_token: str,
visibilities: List[str]) -> List[OrderedDict]:
def get_2d_boxes(nusc,
sample_data_token: str,
visibilities: List[str],
mono3d=True):
"""Get the 2D annotation records for a given `sample_data_token`.
Args:
sample_data_token: Sample data token belonging to a camera keyframe.
visibilities: Visibility filter.
sample_data_token (str): Sample data token belonging to a camera \
keyframe.
visibilities (list[str]): Visibility filter.
mono3d (bool): Whether to get boxes with mono3d annotation.
Return:
list[dict]: List of 2D annotation record that belongs to the input
@@ -456,6 +479,43 @@ def get_2d_boxes(nusc, sample_data_token: str,
# Generate dictionary record to be included in the .json file.
repro_rec = generate_record(ann_rec, min_x, min_y, max_x, max_y,
sample_data_token, sd_rec['filename'])
# If mono3d=True, add 3D annotations in camera coordinates
if mono3d and (repro_rec is not None):
loc = box.center.tolist()
dim = box.wlh.tolist()
rot = [box.orientation.yaw_pitch_roll[0]]
global_velo2d = nusc.box_velocity(box.token)[:2]
global_velo3d = np.array([*global_velo2d, 0.0])
e2g_r_mat = Quaternion(pose_rec['rotation']).rotation_matrix
c2e_r_mat = Quaternion(cs_rec['rotation']).rotation_matrix
cam_velo3d = global_velo3d @ np.linalg.inv(
e2g_r_mat).T @ np.linalg.inv(c2e_r_mat).T
velo = cam_velo3d[0::2].tolist()
repro_rec['bbox_cam3d'] = loc + dim + rot
repro_rec['velo_cam3d'] = velo
center3d = np.array(loc).reshape([1, 3])
center2d = points_cam2img(
center3d, camera_intrinsic, with_depth=True)
repro_rec['center2d'] = center2d.squeeze().tolist()
# normalized center2d + depth
# samples with depth <= 0 will be removed
if repro_rec['center2d'][2] <= 0:
continue
ann_token = nusc.get('sample_annotation',
box.token)['attribute_tokens']
if len(ann_token) == 0:
attr_name = 'None'
else:
attr_name = nusc.get('attribute', ann_token[0])['name']
attr_id = nus_attributes.index(attr_name)
repro_rec['attribute_name'] = attr_name
repro_rec['attribute_id'] = attr_id
repro_recs.append(repro_rec)
return repro_recs
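
A small sanity check of the velocity transform above (velocities are row vectors, so each inverse rotation is applied transposed): assuming the ego frame is rotated 90 degrees about z relative to the global frame and the camera frame coincides with the ego frame, a global velocity along +x becomes -y in the camera frame.

import numpy as np
from pyquaternion import Quaternion

e2g_r_mat = Quaternion(axis=[0, 0, 1], radians=np.pi / 2).rotation_matrix
c2e_r_mat = np.eye(3)  # camera frame == ego frame, for simplicity
v_global = np.array([1.0, 0.0, 0.0])
v_cam = v_global @ np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(c2e_r_mat).T
assert np.allclose(v_cam, [0.0, -1.0, 0.0])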