Unverified commit ecd0d06a authored by twang, committed by GitHub

[Feature] Support monocular 3D detection on nuScenes (#392)

* Support nuscenes mono3d json info generation

* Support nuscenes mono3d dataset class

* Support attribute and bbox2d prediction in bbox3dnms and bbox3d2result

* Rename dataset class and add comments to 'attrs'

* Support mono3d related pipelines

* Fix unittest for loading 3D annotations

* Add unit test for nuscenes mono3d dataset

* Rename the sample result file

* Upload sample data for mono3d unit test

* Upload sample data for mono3d unit test

* Upload sample image for unit test

* Delete tests/data/nuscenes/samples/LIDAR_TOP/CAM_BACK_LEFT directory

* Add files via upload

* Remove unnecessary 'f'

* Remove unnecessary \ in arguments

* Remove check for pycocotools version because it is already done in CocoDataset

* Remove unnecessary comma, add TODO and change init of attrs in format_results

* Merge RandomFlip3D and RandomFlipMono3D

* Add pytest to check whether cuda is available in the unit test

* Add visualization TODO

* Remove useless init in loading mono3d images
parent 3a5a2010
@@ -335,21 +335,40 @@ def rotation_points_single_angle(points, angle, axis=0):
return points @ rot_mat_T, rot_mat_T
def points_cam2img(points_3d, proj_mat):
def points_cam2img(points_3d, proj_mat, with_depth=False):
"""Project points in camera coordinates to image coordinates.
Args:
points_3d (np.ndarray): Points in shape (N, 3)
proj_mat (np.ndarray): Projection matrix from camera to image coordinates, of shape 3x3, 3x4 or 4x4.
with_depth (bool): Whether to keep depth in the output.
Returns:
np.ndarray: Points in image coordinates with shape [N, 2]. If ``with_depth`` is True, the depth is appended as a third column, giving shape [N, 3].
"""
points_shape = list(points_3d.shape)
points_shape[-1] = 1
assert len(proj_mat.shape) == 2, 'The dimension of the projection'\
f' matrix should be 2 instead of {len(proj_mat.shape)}.'
d1, d2 = proj_mat.shape[:2]
assert (d1 == 3 and d2 == 3) or (d1 == 3 and d2 == 4) or (
d1 == 4 and d2 == 4), 'The shape of the projection matrix'\
f' ({d1}*{d2}) is not supported.'
if d1 == 3:
proj_mat_expanded = np.eye(4, dtype=proj_mat.dtype)
proj_mat_expanded[:d1, :d2] = proj_mat
proj_mat = proj_mat_expanded
points_4 = np.concatenate([points_3d, np.ones(points_shape)], axis=-1)
point_2d = points_4 @ proj_mat.T
point_2d_res = point_2d[..., :2] / point_2d[..., 2:3]
if with_depth:
points_2d_depth = np.concatenate([point_2d_res, point_2d[..., 2:3]],
axis=-1)
return points_2d_depth
return point_2d_res
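
A minimal usage sketch of the extended NumPy version, using the CAM_FRONT intrinsic matrix from the sample info file below (the import path matches the one used in nuscenes_converter.py):

import numpy as np
from mmdet3d.core.bbox.box_np_ops import points_cam2img

# 3x3 camera intrinsic; 3x4 and 4x4 projection matrices are also accepted
K = np.array([[1266.417, 0.0, 816.267],
              [0.0, 1266.417, 491.507],
              [0.0, 0.0, 1.0]])
pts = np.array([[0.0, 0.0, 10.0]])  # one point 10 m straight ahead of the camera

uv = points_cam2img(pts, K)                    # [[816.267, 491.507]]
uvd = points_cam2img(pts, K, with_depth=True)  # [[816.267, 491.507, 10.0]]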
@@ -124,11 +124,11 @@ def points_cam2img(points_3d, proj_mat):
points_num = list(points_3d.shape)[:-1]
points_shape = np.concatenate([points_num, [1]], axis=0).tolist()
assert len(proj_mat.shape) == 2, f'The dimension of the projection'\
f'matrix should be 2 instead of {len(proj_mat.shape)}.'
assert len(proj_mat.shape) == 2, 'The dimension of the projection'\
f' matrix should be 2 instead of {len(proj_mat.shape)}.'
d1, d2 = proj_mat.shape[:2]
assert (d1 == 3 and d2 == 3) or (d1 == 3 and d2 == 4) or (
d1 == 4 and d2 == 4), f'The shape of the projection matrix'\
d1 == 4 and d2 == 4), 'The shape of the projection matrix'\
f' ({d1}*{d2}) is not supported.'
if d1 == 3:
proj_mat_expanded = torch.eye(
@@ -46,13 +46,15 @@ def bbox3d2roi(bbox_list):
return rois
def bbox3d2result(bboxes, scores, labels):
def bbox3d2result(bboxes, scores, labels, attrs=None):
"""Convert detection results to a list of numpy arrays.
Args:
bboxes (torch.Tensor): Bounding boxes with shape of (n, 5).
labels (torch.Tensor): Labels with shape of (n, ).
scores (torch.Tensor): Scores with shape of (n, ).
attrs (torch.Tensor, optional): Attributes with shape of (n, ). \
Defaults to None.
Returns:
dict[str, torch.Tensor]: Bounding box results in cpu mode.
@@ -60,8 +62,14 @@ def bbox3d2result(bboxes, scores, labels):
- boxes_3d (torch.Tensor): 3D boxes.
- scores (torch.Tensor): Prediction scores.
- labels_3d (torch.Tensor): Box labels.
- attrs_3d (torch.Tensor, optional): Box attributes.
"""
return dict(
result_dict = dict(
boxes_3d=bboxes.to('cpu'),
scores_3d=scores.cpu(),
labels_3d=labels.cpu())
if attrs is not None:
result_dict['attrs_3d'] = attrs.cpu()
return result_dict
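
A hedged sketch of packing mono3D head outputs with the new attrs argument; the tensors are illustrative and the import path is assumed to be the usual mmdet3d.core re-export:

import torch
from mmdet3d.core import bbox3d2result  # assumed re-export path

boxes = torch.rand(2, 9)       # illustrative 3D boxes with velocity
scores = torch.rand(2)
labels = torch.tensor([0, 7])  # e.g. 'car', 'pedestrian'
attrs = torch.tensor([5, 4])   # attribute ids, e.g. 'vehicle.moving'

res = bbox3d2result(boxes, scores, labels, attrs=attrs)
# keys: 'boxes_3d', 'scores_3d', 'labels_3d' and, since attrs was given, 'attrs_3d'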
@@ -11,7 +11,9 @@ def box3d_multiclass_nms(mlvl_bboxes,
score_thr,
max_num,
cfg,
mlvl_dir_scores=None):
mlvl_dir_scores=None,
mlvl_attr_scores=None,
mlvl_bboxes2d=None):
"""Multi-class nms for 3D boxes.
Args:
@@ -27,10 +29,15 @@ def box3d_multiclass_nms(mlvl_bboxes,
cfg (dict): Configuration dict of NMS.
mlvl_dir_scores (torch.Tensor, optional): Multi-level scores
of direction classifier. Defaults to None.
mlvl_attr_scores (torch.Tensor, optional): Multi-level scores
of attribute classifier. Defaults to None.
mlvl_bboxes2d (torch.Tensor, optional): Multi-level 2D bounding
boxes. Defaults to None.
Returns:
tuple[torch.Tensor]: Return results after nms, including 3D \
bounding boxes, scores, labels and direction scores.
bounding boxes, scores, labels, direction scores, attribute \
scores (optional) and 2D bounding boxes (optional).
"""
# do multi class nms
# the fg class id range: [0, num_classes-1]
@@ -39,6 +46,8 @@ def box3d_multiclass_nms(mlvl_bboxes,
scores = []
labels = []
dir_scores = []
attr_scores = []
bboxes2d = []
for i in range(0, num_classes):
# get bboxes and scores of this class
cls_inds = mlvl_scores[:, i] > score_thr
@@ -65,6 +74,12 @@ def box3d_multiclass_nms(mlvl_bboxes,
if mlvl_dir_scores is not None:
_mlvl_dir_scores = mlvl_dir_scores[cls_inds]
dir_scores.append(_mlvl_dir_scores[selected])
if mlvl_attr_scores is not None:
_mlvl_attr_scores = mlvl_attr_scores[cls_inds]
attr_scores.append(_mlvl_attr_scores[selected])
if mlvl_bboxes2d is not None:
_mlvl_bboxes2d = mlvl_bboxes2d[cls_inds]
bboxes2d.append(_mlvl_bboxes2d[selected])
if bboxes:
bboxes = torch.cat(bboxes, dim=0)
@@ -72,6 +87,10 @@ def box3d_multiclass_nms(mlvl_bboxes,
labels = torch.cat(labels, dim=0)
if mlvl_dir_scores is not None:
dir_scores = torch.cat(dir_scores, dim=0)
if mlvl_attr_scores is not None:
attr_scores = torch.cat(attr_scores, dim=0)
if mlvl_bboxes2d is not None:
bboxes2d = torch.cat(bboxes2d, dim=0)
if bboxes.shape[0] > max_num:
_, inds = scores.sort(descending=True)
inds = inds[:max_num]
@@ -80,12 +99,31 @@ def box3d_multiclass_nms(mlvl_bboxes,
scores = scores[inds]
if mlvl_dir_scores is not None:
dir_scores = dir_scores[inds]
if mlvl_attr_scores is not None:
attr_scores = attr_scores[inds]
if mlvl_bboxes2d is not None:
bboxes2d = bboxes2d[inds]
else:
bboxes = mlvl_scores.new_zeros((0, mlvl_bboxes.size(-1)))
scores = mlvl_scores.new_zeros((0, ))
labels = mlvl_scores.new_zeros((0, ), dtype=torch.long)
dir_scores = mlvl_scores.new_zeros((0, ))
return bboxes, scores, labels, dir_scores
if mlvl_dir_scores is not None:
dir_scores = mlvl_scores.new_zeros((0, ))
if mlvl_attr_scores is not None:
attr_scores = mlvl_scores.new_zeros((0, ))
if mlvl_bboxes2d is not None:
bboxes2d = mlvl_scores.new_zeros((0, 4))
results = (bboxes, scores, labels)
if mlvl_dir_scores is not None:
results = results + (dir_scores, )
if mlvl_attr_scores is not None:
results = results + (attr_scores, )
if mlvl_bboxes2d is not None:
results = results + (bboxes2d, )
return results
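
Since the tuple length now depends on which optional inputs were given, callers unpack accordingly. A fragment, not a standalone script, assuming the usual positional arguments (mlvl_bboxes, mlvl_bboxes_for_nms, mlvl_scores) precede those shown in this hunk:

results = box3d_multiclass_nms(
    mlvl_bboxes, mlvl_bboxes_for_nms, mlvl_scores,
    score_thr, max_num, cfg,
    mlvl_dir_scores=mlvl_dir_scores,
    mlvl_attr_scores=mlvl_attr_scores,
    mlvl_bboxes2d=mlvl_bboxes2d)
# with all three optional inputs provided, the tuple has six elements
bboxes, scores, labels, dir_scores, attr_scores, bboxes2d = results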
def aligned_3d_nms(boxes, scores, classes, thresh):
@@ -5,6 +5,7 @@ from .custom_3d_seg import Custom3DSegDataset
from .kitti_dataset import KittiDataset
from .lyft_dataset import LyftDataset
from .nuscenes_dataset import NuScenesDataset
from .nuscenes_mono_dataset import NuScenesMonoDataset
from .pipelines import (BackgroundPointsFilter, GlobalRotScaleTrans,
IndoorPointSample, LoadAnnotations3D,
LoadPointsFromFile, LoadPointsFromMultiSweeps,
@@ -19,9 +20,9 @@ from .waymo_dataset import WaymoDataset
__all__ = [
'KittiDataset', 'GroupSampler', 'DistributedGroupSampler',
'build_dataloader', 'RepeatFactorDataset', 'DATASETS', 'build_dataset',
'CocoDataset', 'NuScenesDataset', 'LyftDataset', 'ObjectSample',
'RandomFlip3D', 'ObjectNoise', 'GlobalRotScaleTrans', 'PointShuffle',
'ObjectRangeFilter', 'PointsRangeFilter', 'Collect3D',
'CocoDataset', 'NuScenesDataset', 'NuScenesMonoDataset', 'LyftDataset',
'ObjectSample', 'RandomFlip3D', 'ObjectNoise', 'GlobalRotScaleTrans',
'PointShuffle', 'ObjectRangeFilter', 'PointsRangeFilter', 'Collect3D',
'LoadPointsFromFile', 'NormalizePointsColor', 'IndoorPointSample',
'LoadAnnotations3D', 'SUNRGBDDataset', 'ScanNetDataset',
'ScanNetSegDataset', 'SemanticKITTIDataset', 'Custom3DDataset',
@@ -51,7 +51,8 @@ class DefaultFormatBundle(object):
results['img'] = DC(to_tensor(img), stack=True)
for key in [
'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels',
'gt_labels_3d', 'pts_instance_mask', 'pts_semantic_mask'
'gt_labels_3d', 'attr_labels', 'pts_instance_mask',
'pts_semantic_mask', 'centers2d', 'depths'
]:
if key not in results:
continue
@@ -134,11 +135,11 @@ class Collect3D(object):
keys,
meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img',
'pad_shape', 'scale_factor', 'flip',
'pcd_horizontal_flip', 'pcd_vertical_flip',
'box_mode_3d', 'box_type_3d', 'img_norm_cfg',
'rect', 'Trv2c', 'P2', 'pcd_trans', 'sample_idx',
'pcd_scale_factor', 'pcd_rotation', 'pts_filename',
'transformation_3d_flow')):
'cam_intrinsic', 'pcd_horizontal_flip',
'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d',
'img_norm_cfg', 'rect', 'Trv2c', 'P2', 'pcd_trans',
'sample_idx', 'pcd_scale_factor', 'pcd_rotation',
'pts_filename', 'transformation_3d_flow')):
self.keys = keys
self.meta_keys = meta_keys
@@ -222,6 +223,11 @@ class DefaultFormatBundle3D(DefaultFormatBundle):
if 'gt_names_3d' in results:
results['gt_names_3d'] = results['gt_names_3d'][
gt_bboxes_3d_mask]
if 'centers2d' in results:
results['centers2d'] = results['centers2d'][
gt_bboxes_3d_mask]
if 'depths' in results:
results['depths'] = results['depths'][gt_bboxes_3d_mask]
if 'gt_bboxes_mask' in results:
gt_bboxes_mask = results['gt_bboxes_mask']
if 'gt_bboxes' in results:
@@ -230,6 +236,7 @@ class DefaultFormatBundle3D(DefaultFormatBundle):
if self.with_label:
if 'gt_names' in results and len(results['gt_names']) == 0:
results['gt_labels'] = np.array([], dtype=np.int64)
results['attr_labels'] = np.array([], dtype=np.int64)
elif 'gt_names' in results and isinstance(
results['gt_names'][0], list):
# gt_labels might be a list of list in multi-view setting
@@ -3,7 +3,7 @@ import numpy as np
from mmdet3d.core.points import BasePoints, get_points_type
from mmdet.datasets.builder import PIPELINES
from mmdet.datasets.pipelines import LoadAnnotations
from mmdet.datasets.pipelines import LoadAnnotations, LoadImageFromFile
@PIPELINES.register_module()
@@ -65,6 +65,30 @@ class LoadMultiViewImageFromFiles(object):
f"color_type='{self.color_type}')"
@PIPELINES.register_module()
class LoadImageFromFileMono3D(LoadImageFromFile):
"""Load an image from file in monocular 3D object detection. Compared to 2D
detection, additional camera parameters need to be loaded.
Args:
kwargs (dict): Arguments are the same as those in \
:class:`LoadImageFromFile`.
"""
def __call__(self, results):
"""Call functions to load image and get image meta information.
Args:
results (dict): Result dict from :obj:`mmdet.CustomDataset`.
Returns:
dict: The dict contains loaded image and meta information.
"""
super().__call__(results)
results['cam_intrinsic'] = results['img_info']['cam_intrinsic']
return results
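
A minimal sketch of what the new step consumes and adds, with img_info laid out as in the sample JSON below:

loader = LoadImageFromFileMono3D()
results = dict(
    img_prefix='tests/data/nuscenes/',
    img_info=dict(
        filename='samples/CAM_BACK_LEFT/n015-2018-07-18-11-07-57+0800'
                 '__CAM_BACK_LEFT__1531883530447423.jpg',
        cam_intrinsic=[[1256.741, 0.0, 792.113],
                       [0.0, 1256.741, 492.776],
                       [0.0, 0.0, 1.0]]))
results = loader(results)
# on top of 'img', 'img_shape', etc. set by LoadImageFromFile,
# results now carries results['cam_intrinsic']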
@PIPELINES.register_module()
class LoadPointsFromMultiSweeps(object):
"""Load points from multiple sweeps.
@@ -426,6 +450,8 @@ class LoadAnnotations3D(LoadAnnotations):
Defaults to True.
with_label_3d (bool, optional): Whether to load 3D labels.
Defaults to True.
with_attr_label (bool, optional): Whether to load attribute label.
Defaults to False.
with_mask_3d (bool, optional): Whether to load 3D instance masks
for points. Defaults to False.
with_seg_3d (bool, optional): Whether to load 3D semantic masks.
@@ -438,6 +464,8 @@ class LoadAnnotations3D(LoadAnnotations):
Defaults to False.
with_seg (bool, optional): Whether to load 2D semantic masks.
Defaults to False.
with_bbox_depth (bool, optional): Whether to load 2.5D boxes (projected 2D centers with depths).
Defaults to False.
poly2mask (bool, optional): Whether to convert polygon annotations
to bitmasks. Defaults to True.
seg_3d_dtype (dtype, optional): Dtype of 3D semantic masks.
@@ -450,12 +478,14 @@ class LoadAnnotations3D(LoadAnnotations):
def __init__(self,
with_bbox_3d=True,
with_label_3d=True,
with_attr_label=False,
with_mask_3d=False,
with_seg_3d=False,
with_bbox=False,
with_label=False,
with_mask=False,
with_seg=False,
with_bbox_depth=False,
poly2mask=True,
seg_3d_dtype='int',
file_client_args=dict(backend='disk')):
@@ -467,7 +497,9 @@ class LoadAnnotations3D(LoadAnnotations):
poly2mask,
file_client_args=file_client_args)
self.with_bbox_3d = with_bbox_3d
self.with_bbox_depth = with_bbox_depth
self.with_label_3d = with_label_3d
self.with_attr_label = with_attr_label
self.with_mask_3d = with_mask_3d
self.with_seg_3d = with_seg_3d
self.seg_3d_dtype = seg_3d_dtype
@@ -485,6 +517,19 @@ class LoadAnnotations3D(LoadAnnotations):
results['bbox3d_fields'].append('gt_bboxes_3d')
return results
def _load_bboxes_depth(self, results):
"""Private function to load 2.5D bounding box annotations.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict containing loaded 2.5D bounding box annotations.
"""
results['centers2d'] = results['ann_info']['centers2d']
results['depths'] = results['ann_info']['depths']
return results
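
For reference, a sketch of the ann_info fields this helper expects; the shapes follow the unit test further down:

import numpy as np
ann_info = dict(
    centers2d=np.array([[1099.39, 544.64]], dtype=np.float32),  # (N, 2) projected 3D centers in pixels
    depths=np.array([15.32], dtype=np.float32))                 # (N, ) depths of those centers in meters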
def _load_labels_3d(self, results):
"""Private function to load label annotations.
@@ -497,6 +542,18 @@ class LoadAnnotations3D(LoadAnnotations):
results['gt_labels_3d'] = results['ann_info']['gt_labels_3d']
return results
def _load_attr_labels(self, results):
"""Private function to load label annotations.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict containing loaded label annotations.
"""
results['attr_labels'] = results['ann_info']['attr_labels']
return results
def _load_masks_3d(self, results):
"""Private function to load 3D mask annotations.
@@ -564,8 +621,14 @@ class LoadAnnotations3D(LoadAnnotations):
results = self._load_bboxes_3d(results)
if results is None:
return None
if self.with_bbox_depth:
results = self._load_bboxes_depth(results)
if results is None:
return None
if self.with_label_3d:
results = self._load_labels_3d(results)
if self.with_attr_label:
results = self._load_attr_labels(results)
if self.with_mask_3d:
results = self._load_masks_3d(results)
if self.with_seg_3d:
@@ -579,11 +642,13 @@ class LoadAnnotations3D(LoadAnnotations):
repr_str = self.__class__.__name__ + '(\n'
repr_str += f'{indent_str}with_bbox_3d={self.with_bbox_3d}, '
repr_str += f'{indent_str}with_label_3d={self.with_label_3d}, '
repr_str += f'{indent_str}with_attr_label={self.with_attr_label}, '
repr_str += f'{indent_str}with_mask_3d={self.with_mask_3d}, '
repr_str += f'{indent_str}with_seg_3d={self.with_seg_3d}, '
repr_str += f'{indent_str}with_bbox={self.with_bbox}, '
repr_str += f'{indent_str}with_label={self.with_label}, '
repr_str += f'{indent_str}with_mask={self.with_mask}, '
repr_str += f'{indent_str}with_seg={self.with_seg}, '
repr_str += f'{indent_str}with_bbox_depth={self.with_bbox_depth}, '
repr_str += f'{indent_str}poly2mask={self.poly2mask})'
return repr_str
@@ -65,8 +65,17 @@ class RandomFlip3D(RandomFlip):
np.array([], dtype=np.float32))
assert len(input_dict['bbox3d_fields']) == 1
for key in input_dict['bbox3d_fields']:
input_dict['points'] = input_dict[key].flip(
direction, points=input_dict['points'])
if 'points' in input_dict:
input_dict['points'] = input_dict[key].flip(
direction, points=input_dict['points'])
else:
input_dict[key].flip(direction)
if 'centers2d' in input_dict:
assert self.sync_2d is True and direction == 'horizontal', \
'Only support sync_2d=True and horizontal flip with images'
w = input_dict['img_shape'][1]
input_dict['centers2d'][..., 0] = \
w - input_dict['centers2d'][..., 0]
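
Worked example of the flip above, using the first traffic cone in the sample data: with image width w = 1600 and a projected center x of 1099.391, the flipped x is 1600 - 1099.391 = 500.609, which matches expected_centers2d in the unit test below.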
def __call__(self, input_dict):
"""Call function to flip points, values in the ``bbox3d_fields`` and \
{"images": [{"file_name": "samples/CAM_BACK_LEFT/n015-2018-07-18-11-07-57+0800__CAM_BACK_LEFT__1531883530447423.jpg", "id": "86e6806d626b4711a6d0f5015b090116", "token": "e93e98b63d3b40209056d129dc53ceee", "cam2ego_rotation": [0.6924185592174665, -0.7031619420114925, -0.11648342771943819, 0.11203317912370753], "cam2ego_translation": [1.03569100218, 0.484795032713, 1.59097014818], "ego2global_rotation": [-0.7495886280607293, -0.0077695335695504636, 0.00829759813869316, -0.6618063711504101], "ego2global_translation": [1010.1328353833223, 610.8111652918716, 0.0], "cam_intrinsic": [[1256.7414812095406, 0.0, 792.1125740759628], [0.0, 1256.7414812095406, 492.7757465151356], [0.0, 0.0, 1.0]], "width": 1600, "height": 900}, {"file_name": "samples/CAM_FRONT/n015-2018-07-18-11-07-57+0800__CAM_FRONT__1531883530412470.jpg", "id": "020d7b4f858147558106c504f7f31bef", "token": "e93e98b63d3b40209056d129dc53ceee", "cam2ego_rotation": [0.4998015430569128, -0.5030316162024876, 0.4997798114386805, -0.49737083824542755], "cam2ego_translation": [1.70079118954, 0.0159456324149, 1.51095763913], "ego2global_rotation": [-0.7495886280607293, -0.0077695335695504636, 0.00829759813869316, -0.6618063711504101], "ego2global_translation": [1010.1328353833223, 610.8111652918716, 0.0], "cam_intrinsic": [[1266.417203046554, 0.0, 816.2670197447984], [0.0, 1266.417203046554, 491.50706579294757], [0.0, 0.0, 1.0]], "width": 1600, "height": 900}, {"file_name": "samples/CAM_FRONT_RIGHT/n015-2018-07-18-11-07-57+0800__CAM_FRONT_RIGHT__1531883530420339.jpg", "id": "16d39ff22a8545b0a4ee3236a0fe1c20", "token": "e93e98b63d3b40209056d129dc53ceee", "cam2ego_rotation": [0.2060347966337182, -0.2026940577919598, 0.6824507824531167, -0.6713610884174485], "cam2ego_translation": [1.5508477543, -0.493404796419, 1.49574800619], "ego2global_rotation": [-0.7495886280607293, -0.0077695335695504636, 0.00829759813869316, -0.6618063711504101], "ego2global_translation": [1010.1328353833223, 610.8111652918716, 0.0], "cam_intrinsic": [[1260.8474446004698, 0.0, 807.968244525554], [0.0, 1260.8474446004698, 495.3344268742088], [0.0, 0.0, 1.0]], "width": 1600, "height": 900}, {"file_name": "samples/CAM_FRONT_LEFT/n015-2018-07-18-11-07-57+0800__CAM_FRONT_LEFT__1531883530404844.jpg", "id": "24332e9c554a406f880430f17771b608", "token": "e93e98b63d3b40209056d129dc53ceee", "cam2ego_rotation": [0.6757265034669446, -0.6736266522251881, 0.21214015046209478, -0.21122827103904068], "cam2ego_translation": [1.52387798135, 0.494631336551, 1.50932822144], "ego2global_rotation": [-0.7495886280607293, -0.0077695335695504636, 0.00829759813869316, -0.6618063711504101], "ego2global_translation": [1010.1328353833223, 610.8111652918716, 0.0], "cam_intrinsic": [[1272.5979470598488, 0.0, 826.6154927353808], [0.0, 1272.5979470598488, 479.75165386361925], [0.0, 0.0, 1.0]], "width": 1600, "height": 900}, {"file_name": "samples/CAM_BACK/n015-2018-07-18-11-07-57+0800__CAM_BACK__1531883530437525.jpg", "id": "aab35aeccbda42de82b2ff5c278a0d48", "token": "e93e98b63d3b40209056d129dc53ceee", "cam2ego_rotation": [0.5037872666382278, -0.49740249788611096, -0.4941850223835201, 0.5045496097725578], "cam2ego_translation": [0.0283260309358, 0.00345136761476, 1.57910346144], "ego2global_rotation": [-0.7495886280607293, -0.0077695335695504636, 0.00829759813869316, -0.6618063711504101], "ego2global_translation": [1010.1328353833223, 610.8111652918716, 0.0], "cam_intrinsic": [[809.2209905677063, 0.0, 829.2196003259838], [0.0, 809.2209905677063, 481.77842384512485], [0.0, 0.0, 1.0]], "width": 1600, 
"height": 900}, {"file_name": "samples/CAM_BACK_RIGHT/n015-2018-07-18-11-07-57+0800__CAM_BACK_RIGHT__1531883530427893.jpg", "id": "ec7096278e484c9ebe6894a2ad5682e9", "token": "e93e98b63d3b40209056d129dc53ceee", "cam2ego_rotation": [0.12280980120078765, -0.132400842670559, -0.7004305821388234, 0.690496031265798], "cam2ego_translation": [1.0148780988, -0.480568219723, 1.56239545128], "ego2global_rotation": [-0.7495886280607293, -0.0077695335695504636, 0.00829759813869316, -0.6618063711504101], "ego2global_translation": [1010.1328353833223, 610.8111652918716, 0.0], "cam_intrinsic": [[1259.5137405846733, 0.0, 807.2529053838625], [0.0, 1259.5137405846733, 501.19579884916527], [0.0, 0.0, 1.0]], "width": 1600, "height": 900}], "annotations": [{"file_name": "samples/CAM_FRONT/n015-2018-07-18-11-07-57+0800__CAM_FRONT__1531883530412470.jpg", "image_id": "020d7b4f858147558106c504f7f31bef", "area": 85383.89600714693, "category_name": "truck", "category_id": 1, "bbox": [0.0, 357.732750319127, 342.56437261895206, 249.24920053528984], "iscrowd": 0, "bbox_cam3d": [-10.356295829208502, -0.06394600736590471, 18.785737229926998, 2.312, 7.516, 3.093, -0.5996975863361309], "velo_cam3d": [0.05742557272436208, 0.06990201482350666], "center2d": [118.11016609440316, 487.19622492451936, 18.785737229926998], "attribute_name": "vehicle.parked", "attribute_id": 6, "segmentation": [], "id": 0}, {"file_name": "samples/CAM_FRONT_LEFT/n015-2018-07-18-11-07-57+0800__CAM_FRONT_LEFT__1531883530404844.jpg", "image_id": "24332e9c554a406f880430f17771b608", "area": 76274.38331683438, "category_name": "truck", "category_id": 1, "bbox": [1305.1296604171719, 350.75901341602525, 294.87033958282814, 258.6709243959383], "iscrowd": 0, "bbox_cam3d": [9.795917040815693, 0.07538275380197612, 19.033148401567978, 2.312, 7.516, 3.093, -1.5546044317874126], "velo_cam3d": [0.09022854769195846, -0.0065096147400431695], "center2d": [1481.5919397578637, 484.79190972187814, 19.033148401567978], "attribute_name": "vehicle.parked", "attribute_id": 6, "segmentation": [], "id": 1}, {"file_name": "samples/CAM_FRONT_LEFT/n015-2018-07-18-11-07-57+0800__CAM_FRONT_LEFT__1531883530404844.jpg", "image_id": "24332e9c554a406f880430f17771b608", "area": 5248.9339273703135, "category_name": "truck", "category_id": 1, "bbox": [808.1218983320856, 436.2076328554, 75.28483638734929, 69.72099800235912], "iscrowd": 0, "bbox_cam3d": [0.7896581102503435, -0.32866532307883706, 58.48166239420381, 2.877, 6.372, 2.978, 1.641180695066564], "velo_cam3d": [0.009938485543455734, 0.0010084200213775884], "center2d": [843.7989524532317, 472.5996886441534, 58.48166239420381], "attribute_name": "vehicle.parked", "attribute_id": 6, "segmentation": [], "id": 2}, {"file_name": "samples/CAM_FRONT_LEFT/n015-2018-07-18-11-07-57+0800__CAM_FRONT_LEFT__1531883530404844.jpg", "image_id": "24332e9c554a406f880430f17771b608", "area": 25266.816070927107, "category_name": "truck", "category_id": 1, "bbox": [1133.5883785276196, 424.4436001005383, 202.5256666350731, 124.75858734712807], "iscrowd": 0, "bbox_cam3d": [9.39338221449255, 0.19762751304835102, 30.01455814405707, 2.156, 6.227, 2.601, -1.4587684025759116], "velo_cam3d": [0.0, 0.0], "center2d": [1224.88885277412, 488.1309332180172, 30.01455814405707], "attribute_name": "vehicle.parked", "attribute_id": 6, "segmentation": [], "id": 3}, {"file_name": "samples/CAM_BACK/n015-2018-07-18-11-07-57+0800__CAM_BACK__1531883530437525.jpg", "image_id": "aab35aeccbda42de82b2ff5c278a0d48", "area": 31981.88483023472, "category_name": "car", "category_id": 0, 
"bbox": [652.8710695836726, 487.2457293359287, 256.3734471348506, 124.74725907715583], "iscrowd": 0, "bbox_cam3d": [-0.48041137691585667, 0.8426032188612489, 12.27160016308813, 1.871, 4.478, 1.456, -2.0402647554154876], "velo_cam3d": [-2.4043357184501866, -4.232358489028598], "center2d": [797.5400340802389, 537.3418550489371, 12.27160016308813], "attribute_name": "vehicle.moving", "attribute_id": 5, "segmentation": [], "id": 4}, {"file_name": "samples/CAM_BACK_LEFT/n015-2018-07-18-11-07-57+0800__CAM_BACK_LEFT__1531883530447423.jpg", "image_id": "86e6806d626b4711a6d0f5015b090116", "area": 1874.1656394574547, "category_name": "traffic_cone", "category_id": 8, "bbox": [1084.536273989852, 513.7567766430512, 30.043100006470013, 62.382565016720605], "iscrowd": 0, "bbox_cam3d": [3.745641322414848, 0.6321604510604618, 15.319339525420224, 0.3, 0.291, 0.734, 1.4550554479430875], "velo_cam3d": [0.028202672296939114, -0.001622377193634249], "center2d": [1099.3910188026568, 544.635832278593, 15.319339525420224], "attribute_name": "None", "attribute_id": 8, "segmentation": [], "id": 5}, {"file_name": "samples/CAM_BACK_LEFT/n015-2018-07-18-11-07-57+0800__CAM_BACK_LEFT__1531883530447423.jpg", "image_id": "86e6806d626b4711a6d0f5015b090116", "area": 1641.3529623313364, "category_name": "traffic_cone", "category_id": 8, "bbox": [823.5058461203419, 512.0451382733748, 27.545987206560085, 59.58591899514306], "iscrowd": 0, "bbox_cam3d": [0.558956408408079, 0.6054486006477211, 15.607344275188172, 0.315, 0.338, 0.712, 1.5596704833049395], "velo_cam3d": [0.07717355032092023, -0.0013264953734539453], "center2d": [837.1211093045397, 541.5279466177432, 15.607344275188172], "attribute_name": "None", "attribute_id": 8, "segmentation": [], "id": 6}, {"file_name": "samples/CAM_BACK_LEFT/n015-2018-07-18-11-07-57+0800__CAM_BACK_LEFT__1531883530447423.jpg", "image_id": "86e6806d626b4711a6d0f5015b090116", "area": 11464.868967812941, "category_name": "pedestrian", "category_id": 7, "bbox": [1091.57108913607, 427.8805195896188, 76.29701915190844, 150.2662763926101], "iscrowd": 0, "bbox_cam3d": [3.953820859983739, 0.11100574170732268, 14.75668416993455, 0.739, 0.563, 1.711, 1.4550554479430875], "velo_cam3d": [0.10262495353364391, -0.0064695610507391095], "center2d": [1128.8366393735657, 502.22946380348515, 14.75668416993455], "attribute_name": "pedestrian.sitting_lying_down", "attribute_id": 4, "segmentation": [], "id": 7}, {"file_name": "samples/CAM_BACK_LEFT/n015-2018-07-18-11-07-57+0800__CAM_BACK_LEFT__1531883530447423.jpg", "image_id": "86e6806d626b4711a6d0f5015b090116", "area": 10887.814254422945, "category_name": "pedestrian", "category_id": 7, "bbox": [1160.5755663065963, 427.76823935577545, 72.04645850373822, 151.1221298109749], "iscrowd": 0, "bbox_cam3d": [4.7798492054669035, 0.1162134030605403, 14.880252178422799, 0.665, 0.544, 1.739, 1.4550554479430875], "velo_cam3d": [0.08665208940588605, -0.12554131041835265], "center2d": [1195.8043058026105, 502.5907820768639, 14.880252178422799], "attribute_name": "pedestrian.sitting_lying_down", "attribute_id": 4, "segmentation": [], "id": 8}, {"file_name": "samples/CAM_BACK_LEFT/n015-2018-07-18-11-07-57+0800__CAM_BACK_LEFT__1531883530447423.jpg", "image_id": "86e6806d626b4711a6d0f5015b090116", "area": 1840.396836351825, "category_name": "traffic_cone", "category_id": 8, "bbox": [976.5016497372175, 515.0039595028874, 30.627062877370918, 60.09054292018379], "iscrowd": 0, "bbox_cam3d": [2.4596094747766615, 0.6404788797338883, 15.49228428713527, 0.338, 0.309, 0.712, 
1.461625206011101], "velo_cam3d": [0.02389033738396964, -0.0027892907804445547], "center2d": [991.6372663187118, 544.7316983348808, 15.49228428713527], "attribute_name": "None", "attribute_id": 8, "segmentation": [], "id": 9}, {"file_name": "samples/CAM_BACK_RIGHT/n015-2018-07-18-11-07-57+0800__CAM_BACK_RIGHT__1531883530427893.jpg", "image_id": "ec7096278e484c9ebe6894a2ad5682e9", "area": 130637.82232697189, "category_name": "car", "category_id": 0, "bbox": [806.290660237549, 470.86948127698895, 564.486943265249, 231.42753589888787], "iscrowd": 0, "bbox_cam3d": [2.041080764231013, 0.5400087467741127, 10.16381197333443, 1.638, 4.25, 1.44, 2.3008777344302445], "velo_cam3d": [-3.11975390859937, 4.71824099865795], "center2d": [1060.1864774468488, 568.1144351228712, 10.16381197333443], "attribute_name": "vehicle.moving", "attribute_id": 5, "segmentation": [], "id": 10}], "categories": [{"id": 0, "name": "car"}, {"id": 1, "name": "truck"}, {"id": 2, "name": "trailer"}, {"id": 3, "name": "bus"}, {"id": 4, "name": "construction_vehicle"}, {"id": 5, "name": "bicycle"}, {"id": 6, "name": "motorcycle"}, {"id": 7, "name": "pedestrian"}, {"id": 8, "name": "traffic_cone"}, {"id": 9, "name": "barrier"}]}
\ No newline at end of file
import mmcv
import numpy as np
import pytest
import torch
from mmdet3d.datasets import NuScenesMonoDataset
def test_getitem():
np.random.seed(0)
class_names = [
'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
]
img_norm_cfg = dict(
mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False)
pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='LoadAnnotations3D',
with_bbox=True,
with_label=True,
with_attr_label=True,
with_bbox_3d=True,
with_label_3d=True,
with_bbox_depth=True),
dict(type='Resize', img_scale=(1600, 900), keep_ratio=True),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=1.0),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
keys=[
'img', 'gt_bboxes', 'gt_labels', 'attr_labels', 'gt_bboxes_3d',
'gt_labels_3d', 'centers2d', 'depths'
]),
]
nus_dataset = NuScenesMonoDataset(
ann_file='tests/data/nuscenes/nus_infos_mono3d.coco.json',
pipeline=pipeline,
data_root='tests/data/nuscenes/',
img_prefix='tests/data/nuscenes/',
test_mode=False)
data = nus_dataset[0]
img_metas = data['img_metas']._data
filename = img_metas['filename']
img_shape = img_metas['img_shape']
pad_shape = img_metas['pad_shape']
flip = img_metas['flip']
bboxes = data['gt_bboxes']._data
attrs = data['attr_labels']._data
labels3d = data['gt_labels_3d']._data
labels = data['gt_labels']._data
centers2d = data['centers2d']._data
depths = data['depths']._data
expected_filename = 'tests/data/nuscenes/samples/CAM_BACK_LEFT/' + \
'n015-2018-07-18-11-07-57+0800__CAM_BACK_LEFT__1531883530447423.jpg'
expected_img_shape = (900, 1600, 3)
expected_pad_shape = (928, 1600, 3)
expected_flip = True
expected_bboxes = torch.tensor([[485.4207, 513.7568, 515.4637, 576.1393],
[748.9482, 512.0452, 776.4941, 571.6310],
[432.1318, 427.8805, 508.4290, 578.1468],
[367.3779, 427.7682, 439.4244, 578.8904],
[592.8713, 515.0040, 623.4984, 575.0945]])
expected_attr_labels = torch.tensor([8, 8, 4, 4, 8])
expected_labels = torch.tensor([8, 8, 7, 7, 8])
expected_centers2d = torch.tensor([[500.6090, 544.6358],
[762.8789, 541.5280],
[471.1633, 502.2295],
[404.1957, 502.5908],
[608.3627, 544.7317]])
expected_depths = torch.tensor(
[15.3193, 15.6073, 14.7567, 14.8803, 15.4923])
assert filename == expected_filename
assert img_shape == expected_img_shape
assert pad_shape == expected_pad_shape
assert flip == expected_flip
assert torch.allclose(bboxes, expected_bboxes, 1e-5)
assert torch.all(attrs == expected_attr_labels)
assert torch.all(labels == expected_labels)
assert torch.all(labels3d == expected_labels)
assert torch.allclose(centers2d, expected_centers2d, 1e-5)
assert torch.allclose(depths, expected_depths, 1e-5)
def test_format_results():
if not torch.cuda.is_available():
pytest.skip('test requires GPU and torch+cuda')
root_path = 'tests/data/nuscenes/'
ann_file = 'tests/data/nuscenes/nus_infos_mono3d.coco.json'
class_names = [
'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
]
pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='LoadAnnotations3D',
with_bbox=True,
with_label=True,
with_attr_label=True,
with_bbox_3d=True,
with_label_3d=True,
with_bbox_depth=True),
dict(type='Resize', img_scale=(1600, 900), keep_ratio=True),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
keys=[
'img', 'gt_bboxes', 'gt_labels', 'attr_labels', 'gt_bboxes_3d',
'gt_labels_3d', 'centers2d', 'depths'
]),
]
nus_dataset = NuScenesMonoDataset(
ann_file=ann_file,
pipeline=pipeline,
data_root=root_path,
test_mode=True)
results = mmcv.load('tests/data/nuscenes/mono3d_sample_results.pkl')
result_files, tmp_dir = nus_dataset.format_results(results)
result_data = mmcv.load(result_files['img_bbox'])
assert len(result_data['results'].keys()) == 1
assert len(result_data['results']['e93e98b63d3b40209056d129dc53ceee']) == 8
det = result_data['results']['e93e98b63d3b40209056d129dc53ceee'][0]
expected_token = 'e93e98b63d3b40209056d129dc53ceee'
expected_trans = torch.tensor(
[1018.753821915645, 605.190386124652, 0.7266818822266328])
expected_size = torch.tensor([1.6380000114440918, 4.25, 1.440000057220459])
expected_rotation = torch.tensor([
-0.9924980733795628, -0.013604682549109839, 0.01027292674776989,
-0.12106590736714223
])
expected_detname = 'car'
expected_attr = 'vehicle.moving'
assert det['sample_token'] == expected_token
assert torch.allclose(
torch.tensor(det['translation']), expected_trans, 1e-5)
assert torch.allclose(torch.tensor(det['size']), expected_size, 1e-5)
assert torch.allclose(
torch.tensor(det['rotation']), expected_rotation, 1e-5)
assert det['detection_name'] == expected_detname
assert det['attribute_name'] == expected_attr
@@ -130,10 +130,11 @@ def test_load_annotations3D():
scannet_pts_semantic_mask = scannet_results['pts_semantic_mask']
repr_str = repr(scannet_load_annotations3D)
expected_repr_str = 'LoadAnnotations3D(\n with_bbox_3d=True, ' \
'with_label_3d=True, with_mask_3d=True, ' \
'with_seg_3d=True, with_bbox=False, ' \
'with_label=False, with_mask=False, ' \
'with_seg=False, poly2mask=True)'
'with_label_3d=True, with_attr_label=False, ' \
'with_mask_3d=True, with_seg_3d=True, ' \
'with_bbox=False, with_label=False, ' \
'with_mask=False, with_seg=False, ' \
'with_bbox_depth=False, poly2mask=True)'
assert repr_str == expected_repr_str
assert scannet_gt_boxes.tensor.shape == (27, 7)
assert scannet_gt_labels.shape == (27, )
@@ -55,6 +55,9 @@ def nuscenes_data_prep(root_path,
root_path, info_prefix, version=version, max_sweeps=max_sweeps)
if version == 'v1.0-test':
info_test_path = osp.join(root_path, f'{info_prefix}_infos_test.pkl')
nuscenes_converter.export_2d_annotation(
root_path, info_test_path, version=version)
return
info_train_path = osp.join(root_path, f'{info_prefix}_infos_train.pkl')
@@ -9,12 +9,18 @@ from pyquaternion import Quaternion
from shapely.geometry import MultiPoint, box
from typing import List, Tuple, Union
from mmdet3d.core.bbox.box_np_ops import points_cam2img
from mmdet3d.datasets import NuScenesDataset
nus_categories = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle',
'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone',
'barrier')
nus_attributes = ('cycle.with_rider', 'cycle.without_rider',
'pedestrian.moving', 'pedestrian.standing',
'pedestrian.sitting_lying_down', 'vehicle.moving',
'vehicle.parked', 'vehicle.stopped', 'None')
def create_nuscenes_infos(root_path,
info_prefix,
@@ -324,13 +330,14 @@ def obtain_sensor2top(nusc,
return sweep
def export_2d_annotation(root_path, info_path, version):
def export_2d_annotation(root_path, info_path, version, mono3d=True):
"""Export 2d annotation from the info file and raw data.
Args:
root_path (str): Root path of the raw data.
info_path (str): Path of the info file.
version (str): Dataset version.
mono3d (bool): Whether to export mono3d annotation. Default: True.
"""
# get bbox annotations for camera
camera_types = [
@@ -356,12 +363,20 @@ def export_2d_annotation(root_path, info_path, version):
coco_infos = get_2d_boxes(
nusc,
cam_info['sample_data_token'],
visibilities=['', '1', '2', '3', '4'])
visibilities=['', '1', '2', '3', '4'],
mono3d=mono3d)
(height, width, _) = mmcv.imread(cam_info['data_path']).shape
coco_2d_dict['images'].append(
dict(
file_name=cam_info['data_path'],
file_name=cam_info['data_path'].split('data/nuscenes/')
[-1],
id=cam_info['sample_data_token'],
token=info['token'],
cam2ego_rotation=cam_info['sensor2ego_rotation'],
cam2ego_translation=cam_info['sensor2ego_translation'],
ego2global_rotation=info['ego2global_rotation'],
ego2global_translation=info['ego2global_translation'],
cam_intrinsic=cam_info['cam_intrinsic'],
width=width,
height=height))
for coco_info in coco_infos:
@@ -372,16 +387,24 @@ def export_2d_annotation(root_path, info_path, version):
coco_info['id'] = coco_ann_id
coco_2d_dict['annotations'].append(coco_info)
coco_ann_id += 1
mmcv.dump(coco_2d_dict, f'{info_path[:-4]}.coco.json')
if mono3d:
json_prefix = f'{info_path[:-4]}_mono3d'
else:
json_prefix = f'{info_path[:-4]}'
mmcv.dump(coco_2d_dict, f'{json_prefix}.coco.json')
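
For example, with a hypothetical info_path of 'data/nuscenes/nuscenes_infos_val.pkl', the dump goes to 'data/nuscenes/nuscenes_infos_val_mono3d.coco.json' when mono3d=True and to 'data/nuscenes/nuscenes_infos_val.coco.json' otherwise.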
def get_2d_boxes(nusc, sample_data_token: str,
visibilities: List[str]) -> List[OrderedDict]:
def get_2d_boxes(nusc,
sample_data_token: str,
visibilities: List[str],
mono3d=True):
"""Get the 2D annotation records for a given `sample_data_token`.
Args:
sample_data_token: Sample data token belonging to a camera keyframe.
visibilities: Visibility filter.
sample_data_token (str): Sample data token belonging to a camera \
keyframe.
visibilities (list[str]): Visibility filter.
mono3d (bool): Whether to get boxes with mono3d annotation.
Returns:
list[dict]: List of 2D annotation records that belong to the input
`sample_data_token`.
@@ -456,6 +479,43 @@
# Generate dictionary record to be included in the .json file.
repro_rec = generate_record(ann_rec, min_x, min_y, max_x, max_y,
sample_data_token, sd_rec['filename'])
# If mono3d=True, add 3D annotations in camera coordinates
if mono3d and (repro_rec is not None):
loc = box.center.tolist()
dim = box.wlh.tolist()
rot = [box.orientation.yaw_pitch_roll[0]]
global_velo2d = nusc.box_velocity(box.token)[:2]
global_velo3d = np.array([*global_velo2d, 0.0])
e2g_r_mat = Quaternion(pose_rec['rotation']).rotation_matrix
c2e_r_mat = Quaternion(cs_rec['rotation']).rotation_matrix
cam_velo3d = global_velo3d @ np.linalg.inv(
e2g_r_mat).T @ np.linalg.inv(c2e_r_mat).T
velo = cam_velo3d[0::2].tolist()
repro_rec['bbox_cam3d'] = loc + dim + rot
repro_rec['velo_cam3d'] = velo
center3d = np.array(loc).reshape([1, 3])
center2d = points_cam2img(
center3d, camera_intrinsic, with_depth=True)
repro_rec['center2d'] = center2d.squeeze().tolist()
# center2d holds the normalized projected center (x, y) plus depth;
# samples with depth <= 0 are removed
if repro_rec['center2d'][2] <= 0:
continue
ann_token = nusc.get('sample_annotation',
box.token)['attribute_tokens']
if len(ann_token) == 0:
attr_name = 'None'
else:
attr_name = nusc.get('attribute', ann_token[0])['name']
attr_id = nus_attributes.index(attr_name)
repro_rec['attribute_name'] = attr_name
repro_rec['attribute_id'] = attr_id
repro_recs.append(repro_rec)
return repro_recs
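
A quick standalone check of the velocity transform above: for row vectors, v @ A.T equals A @ v, so the chained expression applies inv(R_cam2ego) @ inv(R_ego2global) to the global-frame velocity. Random orthogonal matrices stand in for the rotations:

import numpy as np
rng = np.random.default_rng(0)
v = rng.standard_normal(3)
R_e2g = np.linalg.qr(rng.standard_normal((3, 3)))[0]  # stand-in ego2global rotation
R_c2e = np.linalg.qr(rng.standard_normal((3, 3)))[0]  # stand-in cam2ego rotation
chained = v @ np.linalg.inv(R_e2g).T @ np.linalg.inv(R_c2e).T
direct = np.linalg.inv(R_c2e) @ np.linalg.inv(R_e2g) @ v
assert np.allclose(chained, direct)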