Unverified Commit e37f5d5e authored by twang, committed by GitHub

[Feature] Support monocular 3D detection on KITTI (#415)

* Support nuscenes mono3d json info generation

* Support nuscenes mono3d dataset class

* Support attribute and bbox2d prediction in bbox3dnms and bbox3d2result

* Rename dataset class and add comments to 'attrs'

* Support mono3d related pipelines

* Fix unittest for loading 3D annotations

* Add unit test for nuscenes mono3d dataset

* Rename the sample result file

* Upload sample data for mono3d unit test

* Upload sample data for mono3d unit test

* Upload sample image for unit test

* Delete tests/data/nuscenes/samples/LIDAR_TOP/CAM_BACK_LEFT directory

* Add files via upload

* Remove unnecessary 'f'

* Remove unnecessary \ in arguments

* Remove check for pycocotools version because it has been done in the cocodataset

* Remove unnecessary comma, add TODO and change init of attrs in format_results

* Merge RandomFlip3D and RandomFlipMono3D

* Add pytest to check whether cuda is available in the unit test

* Support monocular 3D detection on KITTI dataset

* Add visualization TODO

* Merge nus_mono3d and update dataset init

* Remove duplicated loading images in mono3d

* Remove aos evaluation of bbox2d predictions on KITTI

* Add unit test for kitti mono3d dataset

* Add accidentally deleted classes in the dataset init

* Replace .format with f-string in kitti dataset

* Clean comma

* Toy data for unit test of kitti mono dataset

* Sample image data for unit test
parent a03100ea
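To give a sense of how the new class is wired up, here is a minimal, illustrative config sketch. The paths and pipeline are placeholders, not part of this commit; the annotation file name follows the converter's `*_mono3d.coco.json` output pattern:

data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type='KittiMonoDataset',
        data_root='data/kitti/',
        ann_file='data/kitti/kitti_infos_train_mono3d.coco.json',
        info_file='data/kitti/kitti_infos_train.pkl',
        img_prefix='data/kitti/',
        pipeline=[dict(type='LoadImageFromFileMono3D')]))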
mmdet3d/core/evaluation/kitti_utils/eval.py
@@ -690,7 +690,8 @@ def kitti_eval(gt_annos,
pred_alpha = False
valid_alpha_gt = False
for anno in dt_annos:
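# Note: alpha == -10 is the placeholder that bbox2result_kitti2d writes for
# 2D-only predictions, so it must not count as a valid alpha prediction here.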
-if anno['alpha'].shape[0] != 0:
+mask = (anno['alpha'] != -10)
+if anno['alpha'][mask].shape[0] != 0:
pred_alpha = True
break
for anno in gt_annos:
mmdet3d/datasets/__init__.py
@@ -3,6 +3,7 @@ from .builder import DATASETS, build_dataset
from .custom_3d import Custom3DDataset
from .custom_3d_seg import Custom3DSegDataset
from .kitti_dataset import KittiDataset
+from .kitti_mono_dataset import KittiMonoDataset
from .lyft_dataset import LyftDataset
from .nuscenes_dataset import NuScenesDataset
from .nuscenes_mono_dataset import NuScenesMonoDataset
@@ -19,14 +20,15 @@ from .utils import get_loading_pipeline
from .waymo_dataset import WaymoDataset
__all__ = [
-'KittiDataset', 'GroupSampler', 'DistributedGroupSampler',
-'build_dataloader', 'RepeatFactorDataset', 'DATASETS', 'build_dataset',
-'CocoDataset', 'NuScenesDataset', 'NuScenesMonoDataset', 'LyftDataset',
-'ObjectSample', 'RandomFlip3D', 'ObjectNoise', 'GlobalRotScaleTrans',
-'PointShuffle', 'ObjectRangeFilter', 'PointsRangeFilter', 'Collect3D',
-'LoadPointsFromFile', 'NormalizePointsColor', 'IndoorPointSample',
-'LoadAnnotations3D', 'SUNRGBDDataset', 'ScanNetDataset',
-'ScanNetSegDataset', 'SemanticKITTIDataset', 'Custom3DDataset',
-'Custom3DSegDataset', 'LoadPointsFromMultiSweeps', 'WaymoDataset',
-'BackgroundPointsFilter', 'VoxelBasedPointSampler', 'get_loading_pipeline'
+'KittiDataset', 'KittiMonoDataset', 'GroupSampler',
+'DistributedGroupSampler', 'build_dataloader', 'RepeatFactorDataset',
+'DATASETS', 'build_dataset', 'CocoDataset', 'NuScenesDataset',
+'NuScenesMonoDataset', 'LyftDataset', 'ObjectSample', 'RandomFlip3D',
+'ObjectNoise', 'GlobalRotScaleTrans', 'PointShuffle', 'ObjectRangeFilter',
+'PointsRangeFilter', 'Collect3D', 'LoadPointsFromFile',
+'NormalizePointsColor', 'IndoorPointSample', 'LoadAnnotations3D',
+'SUNRGBDDataset', 'ScanNetDataset', 'ScanNetSegDataset',
+'SemanticKITTIDataset', 'Custom3DDataset', 'Custom3DSegDataset',
+'LoadPointsFromMultiSweeps', 'WaymoDataset', 'BackgroundPointsFilter',
+'VoxelBasedPointSampler', 'get_loading_pipeline'
]
mmdet3d/datasets/kitti_dataset.py
@@ -580,7 +580,7 @@ class KittiDataset(Custom3DDataset):
anno['score'][idx]),
file=f,
)
-print('Result is saved to {}'.format(submission_prefix))
+print(f'Result is saved to {submission_prefix}')
return det_annos
@@ -658,8 +658,7 @@ class KittiDataset(Custom3DDataset):
box3d_lidar=box_preds[valid_inds].tensor.numpy(),
scores=scores[valid_inds].numpy(),
label_preds=labels[valid_inds].numpy(),
-sample_idx=sample_idx,
-)
+sample_idx=sample_idx)
else:
return dict(
bbox=np.zeros([0, 4]),
@@ -667,8 +666,7 @@ class KittiDataset(Custom3DDataset):
box3d_lidar=np.zeros([0, 7]),
scores=np.zeros([0]),
label_preds=np.zeros([0, 4]),
-sample_idx=sample_idx,
-)
+sample_idx=sample_idx)
def show(self, results, out_dir, show=True):
"""Results visualization.
mmdet3d/datasets/kitti_mono_dataset.py (new file)
import copy
import mmcv
import numpy as np
import tempfile
import torch
from mmcv.utils import print_log
from os import path as osp
from mmdet.datasets import DATASETS
from ..core.bbox import Box3DMode, CameraInstance3DBoxes, points_cam2img
from .nuscenes_mono_dataset import NuScenesMonoDataset
@DATASETS.register_module()
class KittiMonoDataset(NuScenesMonoDataset):
"""Monocular 3D detection on KITTI Dataset.
Args:
data_root (str): Path of dataset root.
info_file (str): Path of info file.
load_interval (int, optional): Interval of loading the dataset. It is
used to uniformly sample the dataset. Defaults to 1.
with_velocity (bool, optional): Whether include velocity prediction
into the experiments. Defaults to False.
eval_version (str, optional): Configuration version of evaluation.
Defaults to None.
version (str, optional): Dataset version. Defaults to None.
kwargs (dict): Other arguments are the same as those of
NuScenesMonoDataset.
"""
CLASSES = ('Pedestrian', 'Cyclist', 'Car')
def __init__(self,
data_root,
info_file,
load_interval=1,
with_velocity=False,
eval_version=None,
version=None,
**kwargs):
super().__init__(
data_root=data_root,
load_interval=load_interval,
with_velocity=with_velocity,
eval_version=eval_version,
version=version,
**kwargs)
self.anno_infos = mmcv.load(info_file)
self.bbox_code_size = 7
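# 7 encodes (x, y, z, l, h, w, yaw); there are no velocity terms since
# KITTI provides none (with_velocity defaults to False above)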
def _parse_ann_info(self, img_info, ann_info):
"""Parse bbox and mask annotation.
Args:
img_info (dict): Information of the image, e.g., filename and size.
ann_info (list[dict]): Annotation info of the image.
Returns:
dict: A dict containing the following keys: bboxes, bboxes_ignore,\
labels, masks, seg_map. "masks" are raw annotations and not \
decoded into binary masks.
"""
gt_bboxes = []
gt_labels = []
gt_bboxes_ignore = []
gt_masks_ann = []
gt_bboxes_cam3d = []
centers2d = []
depths = []
for i, ann in enumerate(ann_info):
if ann.get('ignore', False):
continue
x1, y1, w, h = ann['bbox']
inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0))
inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0))
if inter_w * inter_h == 0:
continue
if ann['area'] <= 0 or w < 1 or h < 1:
continue
if ann['category_id'] not in self.cat_ids:
continue
bbox = [x1, y1, x1 + w, y1 + h]
if ann.get('iscrowd', False):
gt_bboxes_ignore.append(bbox)
else:
gt_bboxes.append(bbox)
gt_labels.append(self.cat2label[ann['category_id']])
gt_masks_ann.append(ann.get('segmentation', None))
# 3D annotations in camera coordinates
bbox_cam3d = np.array(ann['bbox_cam3d']).reshape(-1, )
# change orientation to local yaw
bbox_cam3d[6] = -np.arctan2(bbox_cam3d[0],
bbox_cam3d[2]) + bbox_cam3d[6]
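# i.e. alpha = rotation_y - arctan2(x, z): subtracting the viewing angle
# makes the orientation target independent of the object's position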
gt_bboxes_cam3d.append(bbox_cam3d)
# 2.5D annotations in camera coordinates
center2d = ann['center2d'][:2]
depth = ann['center2d'][2]
centers2d.append(center2d)
depths.append(depth)
if gt_bboxes:
gt_bboxes = np.array(gt_bboxes, dtype=np.float32)
gt_labels = np.array(gt_labels, dtype=np.int64)
else:
gt_bboxes = np.zeros((0, 4), dtype=np.float32)
gt_labels = np.array([], dtype=np.int64)
if gt_bboxes_cam3d:
gt_bboxes_cam3d = np.array(gt_bboxes_cam3d, dtype=np.float32)
centers2d = np.array(centers2d, dtype=np.float32)
depths = np.array(depths, dtype=np.float32)
else:
gt_bboxes_cam3d = np.zeros((0, self.bbox_code_size),
dtype=np.float32)
centers2d = np.zeros((0, 2), dtype=np.float32)
depths = np.zeros((0), dtype=np.float32)
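# the converter already shifted KITTI's bottom-center locations to the
# geometric center, hence origin=(0.5, 0.5, 0.5) below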
gt_bboxes_cam3d = CameraInstance3DBoxes(
gt_bboxes_cam3d,
box_dim=gt_bboxes_cam3d.shape[-1],
origin=(0.5, 0.5, 0.5))
gt_labels_3d = copy.deepcopy(gt_labels)
if gt_bboxes_ignore:
gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32)
else:
gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32)
seg_map = img_info['filename'].replace('jpg', 'png')
ann = dict(
bboxes=gt_bboxes,
labels=gt_labels,
gt_bboxes_3d=gt_bboxes_cam3d,
gt_labels_3d=gt_labels_3d,
centers2d=centers2d,
depths=depths,
bboxes_ignore=gt_bboxes_ignore,
masks=gt_masks_ann,
seg_map=seg_map)
return ann
def format_results(self,
outputs,
pklfile_prefix=None,
submission_prefix=None):
"""Format the results to pkl file.
Args:
outputs (list[dict]): Testing results of the dataset.
pklfile_prefix (str | None): The prefix of pkl files. It includes
the file path and the prefix of filename, e.g., "a/b/prefix".
If not specified, a temp file will be created. Default: None.
submission_prefix (str | None): The prefix of submitted files. It
includes the file path and the prefix of filename, e.g.,
"a/b/prefix". If not specified, a temp file will be created.
Default: None.
Returns:
tuple: (result_files, tmp_dir), result_files is a dict containing \
the formatted results, tmp_dir is the temporary directory created \
for saving files when pklfile_prefix is not specified.
"""
if pklfile_prefix is None:
tmp_dir = tempfile.TemporaryDirectory()
pklfile_prefix = osp.join(tmp_dir.name, 'results')
else:
tmp_dir = None
if not isinstance(outputs[0], dict):
result_files = self.bbox2result_kitti2d(outputs, self.CLASSES,
pklfile_prefix,
submission_prefix)
elif 'pts_bbox' in outputs[0] or 'img_bbox' in outputs[0] or \
'img_bbox2d' in outputs[0]:
result_files = dict()
for name in outputs[0]:
results_ = [out[name] for out in outputs]
pklfile_prefix_ = pklfile_prefix + name
if submission_prefix is not None:
submission_prefix_ = submission_prefix + name
else:
submission_prefix_ = None
if '2d' in name:
result_files_ = self.bbox2result_kitti2d(
results_, self.CLASSES, pklfile_prefix_,
submission_prefix_)
else:
result_files_ = self.bbox2result_kitti(
results_, self.CLASSES, pklfile_prefix_,
submission_prefix_)
result_files[name] = result_files_
else:
result_files = self.bbox2result_kitti(outputs, self.CLASSES,
pklfile_prefix,
submission_prefix)
return result_files, tmp_dir
def evaluate(self,
results,
metric=None,
logger=None,
pklfile_prefix=None,
submission_prefix=None,
show=False,
out_dir=None):
"""Evaluation in KITTI protocol.
Args:
results (list[dict]): Testing results of the dataset.
metric (str | list[str]): Metrics to be evaluated.
logger (logging.Logger | str | None): Logger used for printing
related information during evaluation. Default: None.
pklfile_prefix (str | None): The prefix of pkl files. It includes
the file path and the prefix of filename, e.g., "a/b/prefix".
If not specified, a temp file will be created. Default: None.
submission_prefix (str | None): The prefix of submission files.
If not specified, the submission data will not be generated.
show (bool): Whether to visualize.
Default: False.
out_dir (str): Path to save the visualization results.
Default: None.
Returns:
dict[str, float]: Results of each evaluation metric.
"""
result_files, tmp_dir = self.format_results(results, pklfile_prefix)
from mmdet3d.core.evaluation import kitti_eval
gt_annos = [info['annos'] for info in self.anno_infos]
if isinstance(result_files, dict):
ap_dict = dict()
for name, result_files_ in result_files.items():
eval_types = ['bbox', 'bev', '3d']
if '2d' in name:
eval_types = ['bbox']
ap_result_str, ap_dict_ = kitti_eval(
gt_annos,
result_files_,
self.CLASSES,
eval_types=eval_types)
for ap_type, ap in ap_dict_.items():
ap_dict[f'{name}/{ap_type}'] = float(f'{ap:.4f}')
print_log(
f'Results of {name}:\n' + ap_result_str, logger=logger)
else:
if metric == 'img_bbox2d':
ap_result_str, ap_dict = kitti_eval(
gt_annos, result_files, self.CLASSES, eval_types=['bbox'])
else:
ap_result_str, ap_dict = kitti_eval(gt_annos, result_files,
self.CLASSES)
print_log('\n' + ap_result_str, logger=logger)
if tmp_dir is not None:
tmp_dir.cleanup()
if show:
self.show(results, out_dir)
return ap_dict
def bbox2result_kitti(self,
net_outputs,
class_names,
pklfile_prefix=None,
submission_prefix=None):
"""Convert 3D detection results to kitti format for evaluation and test
submission.
Args:
net_outputs (list[np.ndarray]): List of array storing the \
inferenced bounding boxes and scores.
class_names (list[String]): A list of class names.
pklfile_prefix (str | None): The prefix of pkl file.
submission_prefix (str | None): The prefix of submission file.
Returns:
list[dict]: A list of dictionaries with the kitti format.
"""
assert len(net_outputs) == len(self.anno_infos)
if submission_prefix is not None:
mmcv.mkdir_or_exist(submission_prefix)
det_annos = []
print('\nConverting prediction to KITTI format')
for idx, pred_dicts in enumerate(
mmcv.track_iter_progress(net_outputs)):
annos = []
info = self.anno_infos[idx]
sample_idx = info['image']['image_idx']
image_shape = info['image']['image_shape'][:2]
box_dict = self.convert_valid_bboxes(pred_dicts, info)
anno = {
'name': [],
'truncated': [],
'occluded': [],
'alpha': [],
'bbox': [],
'dimensions': [],
'location': [],
'rotation_y': [],
'score': []
}
if len(box_dict['bbox']) > 0:
box_2d_preds = box_dict['bbox']
box_preds = box_dict['box3d_camera']
scores = box_dict['scores']
box_preds_lidar = box_dict['box3d_lidar']
label_preds = box_dict['label_preds']
for box, box_lidar, bbox, score, label in zip(
box_preds, box_preds_lidar, box_2d_preds, scores,
label_preds):
bbox[2:] = np.minimum(bbox[2:], image_shape[::-1])
bbox[:2] = np.maximum(bbox[:2], [0, 0])
anno['name'].append(class_names[int(label)])
anno['truncated'].append(0.0)
anno['occluded'].append(0)
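# KITTI observation angle: alpha = rotation_y - arctan2(x, z)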
anno['alpha'].append(-np.arctan2(box[0], box[2]) + box[6])
anno['bbox'].append(bbox)
anno['dimensions'].append(box[3:6])
anno['location'].append(box[:3])
anno['rotation_y'].append(box[6])
anno['score'].append(score)
anno = {k: np.stack(v) for k, v in anno.items()}
annos.append(anno)
else:
anno = {
'name': np.array([]),
'truncated': np.array([]),
'occluded': np.array([]),
'alpha': np.array([]),
'bbox': np.zeros([0, 4]),
'dimensions': np.zeros([0, 3]),
'location': np.zeros([0, 3]),
'rotation_y': np.array([]),
'score': np.array([]),
}
annos.append(anno)
if submission_prefix is not None:
curr_file = f'{submission_prefix}/{sample_idx:06d}.txt'
with open(curr_file, 'w') as f:
bbox = anno['bbox']
loc = anno['location']
dims = anno['dimensions']  # stored as lhw; written out below as hwl
for j in range(len(bbox)):
print(
'{} -1 -1 {:.4f} {:.4f} {:.4f} {:.4f} '
'{:.4f} {:.4f} {:.4f} '
'{:.4f} {:.4f} {:.4f} {:.4f} {:.4f} {:.4f}'.format(
anno['name'][j], anno['alpha'][j],
bbox[j][0], bbox[j][1], bbox[j][2],
bbox[j][3], dims[j][1], dims[j][2],
dims[j][0], loc[j][0], loc[j][1],
loc[j][2], anno['rotation_y'][j],
anno['score'][j]),
file=f)
annos[-1]['sample_idx'] = np.array(
[sample_idx] * len(annos[-1]['score']), dtype=np.int64)
det_annos += annos
if pklfile_prefix is not None:
out = pklfile_prefix
if not out.endswith(('.pkl', '.pickle')):
out = f'{pklfile_prefix}.pkl'
mmcv.dump(det_annos, out)
print(f'Result is saved to {out}')
return det_annos
def bbox2result_kitti2d(self,
net_outputs,
class_names,
pklfile_prefix=None,
submission_prefix=None):
"""Convert 2D detection results to kitti format for evaluation and test
submission.
Args:
net_outputs (list[np.ndarray]): List of array storing the \
inferenced bounding boxes and scores.
class_names (list[String]): A list of class names.
pklfile_prefix (str | None): The prefix of pkl file.
submission_prefix (str | None): The prefix of submission file.
Returns:
list[dict]: A list of dictionaries have the kitti format
"""
assert len(net_outputs) == len(self.anno_infos)
det_annos = []
print('\nConverting prediction to KITTI format')
for i, bboxes_per_sample in enumerate(
mmcv.track_iter_progress(net_outputs)):
annos = []
anno = dict(
name=[],
truncated=[],
occluded=[],
alpha=[],
bbox=[],
dimensions=[],
location=[],
rotation_y=[],
score=[])
sample_idx = self.anno_infos[i]['image']['image_idx']
num_example = 0
for label in range(len(bboxes_per_sample)):
bbox = bboxes_per_sample[label]
for j in range(bbox.shape[0]):
anno['name'].append(class_names[int(label)])
anno['truncated'].append(0.0)
anno['occluded'].append(0)
anno['alpha'].append(-10)
anno['bbox'].append(bbox[j, :4])
# set dimensions (height, width, length) to zero
anno['dimensions'].append(
np.zeros(shape=[3], dtype=np.float32))
# set the 3D translation to (-1000, -1000, -1000)
anno['location'].append(
np.ones(shape=[3], dtype=np.float32) * (-1000.0))
anno['rotation_y'].append(0.0)
anno['score'].append(bbox[j, 4])
num_example += 1
if num_example == 0:
annos.append(
dict(
name=np.array([]),
truncated=np.array([]),
occluded=np.array([]),
alpha=np.array([]),
bbox=np.zeros([0, 4]),
dimensions=np.zeros([0, 3]),
location=np.zeros([0, 3]),
rotation_y=np.array([]),
score=np.array([]),
))
else:
anno = {k: np.stack(v) for k, v in anno.items()}
annos.append(anno)
annos[-1]['sample_idx'] = np.array(
[sample_idx] * num_example, dtype=np.int64)
det_annos += annos
if pklfile_prefix is not None:
out = pklfile_prefix
if not out.endswith(('.pkl', '.pickle')):
out = f'{pklfile_prefix}.pkl'
mmcv.dump(det_annos, out)
print(f'Result is saved to {out}')
if submission_prefix is not None:
# save file in submission format
mmcv.mkdir_or_exist(submission_prefix)
print(f'Saving KITTI submission to {submission_prefix}')
for i, anno in enumerate(det_annos):
sample_idx = self.anno_infos[i]['image']['image_idx']
cur_det_file = f'{submission_prefix}/{sample_idx:06d}.txt'
with open(cur_det_file, 'w') as f:
bbox = anno['bbox']
loc = anno['location']
dims = anno['dimensions'][:, [1, 2, 0]]  # lhw -> hwl
for idx in range(len(bbox)):
print(
'{} -1 -1 {:.4f} {:.4f} {:.4f} {:.4f} {:.4f} {:.4f} '
'{:.4f} {:.4f} {:.4f} {:.4f} {:.4f} {:.4f} {:.4f}'.format(
anno['name'][idx],
anno['alpha'][idx],
*bbox[idx], # 4 float
*dims[idx], # 3 float
*loc[idx], # 3 float
anno['rotation_y'][idx],
anno['score'][idx]),
file=f,
)
print(f'Result is saved to {submission_prefix}')
return det_annos
def convert_valid_bboxes(self, box_dict, info):
"""Convert the predicted boxes into valid ones.
Args:
box_dict (dict): Box dictionaries to be converted.
- boxes_3d (:obj:`CameraInstance3DBoxes`): 3D bounding boxes.
- scores_3d (torch.Tensor): Scores of boxes.
- labels_3d (torch.Tensor): Class labels of boxes.
info (dict): Data info.
Returns:
dict: Valid predicted boxes.
- bbox (np.ndarray): 2D bounding boxes.
- box3d_camera (np.ndarray): 3D bounding boxes in \
camera coordinates.
- box3d_lidar (np.ndarray): 3D bounding boxes in \
LiDAR coordinates.
- scores (np.ndarray): Scores of boxes.
- label_preds (np.ndarray): Class label predictions.
- sample_idx (int): Sample index.
"""
box_preds = box_dict['boxes_3d']
scores = box_dict['scores_3d']
labels = box_dict['labels_3d']
sample_idx = info['image']['image_idx']
if len(box_preds) == 0:
return dict(
bbox=np.zeros([0, 4]),
box3d_camera=np.zeros([0, 7]),
box3d_lidar=np.zeros([0, 7]),
scores=np.zeros([0]),
label_preds=np.zeros([0]),
sample_idx=sample_idx)
rect = info['calib']['R0_rect'].astype(np.float32)
Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32)
P2 = info['calib']['P2'].astype(np.float32)
img_shape = info['image']['image_shape']
P2 = box_preds.tensor.new_tensor(P2)
box_preds_camera = box_preds
box_preds_lidar = box_preds.convert_to(Box3DMode.LIDAR,
np.linalg.inv(rect @ Trv2c))
box_corners = box_preds_camera.corners
box_corners_in_image = points_cam2img(box_corners, P2)
# box_corners_in_image: [N, 8, 2]
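# the axis-aligned 2D box is the per-box min/max over the 8 projected
# corners: [x1, y1, x2, y2]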
minxy = torch.min(box_corners_in_image, dim=1)[0]
maxxy = torch.max(box_corners_in_image, dim=1)[0]
box_2d_preds = torch.cat([minxy, maxxy], dim=1)
# Post-processing: keep only boxes whose projected 2D box
# intersects the image canvas
image_shape = box_preds.tensor.new_tensor(img_shape)
valid_cam_inds = ((box_2d_preds[:, 0] < image_shape[1]) &
(box_2d_preds[:, 1] < image_shape[0]) &
(box_2d_preds[:, 2] > 0) & (box_2d_preds[:, 3] > 0))
valid_inds = valid_cam_inds
if valid_inds.sum() > 0:
return dict(
bbox=box_2d_preds[valid_inds, :].numpy(),
box3d_camera=box_preds_camera[valid_inds].tensor.numpy(),
box3d_lidar=box_preds_lidar[valid_inds].tensor.numpy(),
scores=scores[valid_inds].numpy(),
label_preds=labels[valid_inds].numpy(),
sample_idx=sample_idx)
else:
return dict(
bbox=np.zeros([0, 4]),
box3d_camera=np.zeros([0, 7]),
box3d_lidar=np.zeros([0, 7]),
scores=np.zeros([0]),
label_preds=np.zeros([0]),
sample_idx=sample_idx)
{"images": [{"file_name": "training/image_2/000007.png", "id": 7, "Tri2v": [[0.9999976, 0.0007553071, -0.002035826, -0.8086759], [-0.0007854027, 0.9998898, -0.01482298, 0.3195559], [0.002024406, 0.01482454, 0.9998881, -0.7997231], [0.0, 0.0, 0.0, 1.0]], "Trv2c": [[0.007533745, -0.9999714, -0.000616602, -0.004069766], [0.01480249, 0.0007280733, -0.9998902, -0.07631618], [0.9998621, 0.00752379, 0.01480755, -0.2717806], [0.0, 0.0, 0.0, 1.0]], "rect": [[0.9999239, 0.00983776, -0.007445048, 0.0], [-0.009869795, 0.9999421, -0.004278459, 0.0], [0.007402527, 0.004351614, 0.9999631, 0.0], [0.0, 0.0, 0.0, 1.0]], "cam_intrinsic": [[721.5377, 0.0, 609.5593, 44.85728], [0.0, 721.5377, 172.854, 0.2163791], [0.0, 0.0, 1.0, 0.002745884], [0.0, 0.0, 0.0, 1.0]], "width": 1242, "height": 375}], "annotations": [{"file_name": "training/image_2/000007.png", "image_id": 7, "area": 2556.023616260146, "category_name": "Car", "category_id": 2, "bbox": [565.4822720402807, 175.01202566042497, 51.17323679197273, 49.94844525177848], "iscrowd": 0, "bbox_cam3d": [-0.627830982208252, 0.8849999904632568, 25.010000228881836, 3.200000047683716, 1.6100000143051147, 1.659999966621399, -1.590000033378601], "velo_cam3d": -1, "center2d": [591.3814672167642, 198.3730937263457, 25.012745884], "attribute_name": -1, "attribute_id": -1, "segmentation": [], "id": 2}, {"file_name": "training/image_2/000007.png", "image_id": 7, "area": 693.1538564468428, "category_name": "Car", "category_id": 2, "bbox": [481.8496708488522, 179.85710612050596, 30.55976691329198, 22.681909139344754], "iscrowd": 0, "bbox_cam3d": [-7.367831230163574, 1.1799999475479126, 47.54999923706055, 3.700000047683716, 1.399999976158142, 1.5099999904632568, 1.5499999523162842], "velo_cam3d": -1, "center2d": [497.72892067550754, 190.75320250122618, 47.552745884], "attribute_name": -1, "attribute_id": -1, "segmentation": [], "id": 3}, {"file_name": "training/image_2/000007.png", "image_id": 7, "area": 419.21693566410073, "category_name": "Car", "category_id": 2, "bbox": [542.2247151650495, 175.73341152322814, 23.019633917835904, 18.211277258379255], "iscrowd": 0, "bbox_cam3d": [-4.647830963134766, 0.9800000190734863, 60.52000045776367, 4.050000190734863, 1.4600000381469727, 1.659999966621399, 1.559999942779541], "velo_cam3d": -1, "center2d": [554.1213152040074, 184.53305847203026, 60.522745884], "attribute_name": -1, "attribute_id": -1, "segmentation": [], "id": 4}, {"file_name": "training/image_2/000007.png", "image_id": 7, "area": 928.9555081918186, "category_name": "Cyclist", "category_id": 1, "bbox": [330.84191493374504, 176.13804311926262, 24.65593879860404, 37.67674456769879], "iscrowd": 0, "bbox_cam3d": [-12.567831039428711, 1.0199999809265137, 34.09000015258789, 1.9500000476837158, 1.7200000286102295, 0.5, 1.5399999618530273], "velo_cam3d": -1, "center2d": [343.52506265845847, 194.43366972124528, 34.092745884], "attribute_name": -1, "attribute_id": -1, "segmentation": [], "id": 5}], "categories": [{"id": 0, "name": "Pedestrian"}, {"id": 1, "name": "Cyclist"}, {"id": 2, "name": "Car"}]}
\ No newline at end of file
import mmcv
import numpy as np
import pytest
import torch
from mmdet3d.datasets import KittiMonoDataset
def test_getitem():
np.random.seed(0)
class_names = ['Pedestrian', 'Cyclist', 'Car']
img_norm_cfg = dict(
mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='LoadAnnotations3D',
with_bbox=True,
with_label=True,
with_attr_label=False,
with_bbox_3d=True,
with_label_3d=True,
with_bbox_depth=True),
dict(type='Resize', img_scale=(1242, 375), keep_ratio=True),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=1.0),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
keys=[
'img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_3d',
'gt_labels_3d', 'centers2d', 'depths'
]),
]
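# flip_ratio_bev_horizontal=1.0 above makes RandomFlip3D deterministic,
# so the expected boxes/centers below are the mirrored annotations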
kitti_dataset = KittiMonoDataset(
ann_file='tests/data/kitti/kitti_infos_mono3d.coco.json',
info_file='tests/data/kitti/kitti_infos_mono3d.pkl',
pipeline=pipeline,
data_root='tests/data/kitti/',
img_prefix='tests/data/kitti/',
test_mode=False)
data = kitti_dataset[0]
img_metas = data['img_metas']._data
filename = img_metas['filename']
img_shape = img_metas['img_shape']
pad_shape = img_metas['pad_shape']
flip = img_metas['flip']
bboxes = data['gt_bboxes']._data
labels3d = data['gt_labels_3d']._data
labels = data['gt_labels']._data
centers2d = data['centers2d']._data
depths = data['depths']._data
expected_filename = 'tests/data/kitti/training/image_2/000007.png'
expected_img_shape = (375, 1242, 3)
expected_pad_shape = (384, 1248, 3)
expected_flip = True
expected_bboxes = torch.tensor([[625.3445, 175.0120, 676.5177, 224.9605],
[729.5906, 179.8571, 760.1503, 202.5390],
[676.7557, 175.7334, 699.7753, 193.9447],
[886.5021, 176.1380, 911.1581, 213.8148]])
expected_labels = torch.tensor([2, 2, 2, 1])
expected_centers2d = torch.tensor([[650.6185, 198.3731],
[744.2711, 190.7532],
[687.8787, 184.5331],
[898.4750, 194.4337]])
expected_depths = torch.tensor([25.0127, 47.5527, 60.5227, 34.0927])
assert filename == expected_filename
assert img_shape == expected_img_shape
assert pad_shape == expected_pad_shape
assert flip == expected_flip
assert torch.allclose(bboxes, expected_bboxes, 1e-5)
assert torch.all(labels == expected_labels)
assert torch.all(labels3d == expected_labels)
assert torch.allclose(centers2d, expected_centers2d, 1e-5)
assert torch.allclose(depths, expected_depths, 1e-5)
def test_format_results():
root_path = 'tests/data/kitti/'
info_file = 'tests/data/kitti/kitti_infos_mono3d.pkl'
ann_file = 'tests/data/kitti/kitti_infos_mono3d.coco.json'
class_names = ['Pedestrian', 'Cyclist', 'Car']
pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='LoadAnnotations3D',
with_bbox=True,
with_label=True,
with_attr_label=False,
with_bbox_3d=True,
with_label_3d=True,
with_bbox_depth=True),
dict(type='Resize', img_scale=(1242, 375), keep_ratio=True),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
keys=[
'img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_3d',
'gt_labels_3d', 'centers2d', 'depths'
]),
]
kitti_dataset = KittiMonoDataset(
ann_file=ann_file,
info_file=info_file,
pipeline=pipeline,
data_root=root_path,
test_mode=True)
# format 3D detection results
results = mmcv.load('tests/data/kitti/mono3d_sample_results.pkl')
result_files, tmp_dir = kitti_dataset.format_results(results)
result_data = result_files['img_bbox']
assert len(result_data) == 1
assert len(result_data[0]['name']) == 4
det = result_data[0]
expected_bbox = torch.tensor([[565.4989, 175.02547, 616.70184, 225.00565],
[481.85907, 179.8642, 512.43414, 202.5624],
[542.23157, 175.73912, 565.26263, 193.96303],
[330.8572, 176.1482, 355.53937, 213.8469]])
expected_dims = torch.tensor([[3.201, 1.6110001, 1.661],
[3.701, 1.401, 1.511],
[4.051, 1.4610001, 1.661],
[1.9510001, 1.7210001, 0.501]])
expected_rotation = torch.tensor([-1.59, 1.55, 1.56, 1.54])
expected_detname = ['Car', 'Car', 'Car', 'Cyclist']
assert torch.allclose(torch.from_numpy(det['bbox']), expected_bbox, 1e-5)
assert torch.allclose(
torch.from_numpy(det['dimensions']), expected_dims, 1e-5)
assert torch.allclose(
torch.from_numpy(det['rotation_y']), expected_rotation, 1e-5)
assert det['name'].tolist() == expected_detname
# format 2D detection results
results = mmcv.load('tests/data/kitti/mono3d_sample_results2d.pkl')
result_files, tmp_dir = kitti_dataset.format_results(results)
result_data = result_files['img_bbox2d']
assert len(result_data) == 1
assert len(result_data[0]['name']) == 4
det = result_data[0]
expected_bbox = torch.tensor(
[[330.84191493, 176.13804312, 355.49885373, 213.81578769],
[565.48227204, 175.01202566, 616.65650883, 224.96147091],
[481.84967085, 179.85710612, 512.41043776, 202.54001526],
[542.22471517, 175.73341152, 565.24534908, 193.94568878]])
expected_dims = torch.tensor([[0., 0., 0.], [0., 0., 0.], [0., 0., 0.],
[0., 0., 0.]])
expected_rotation = torch.tensor([0., 0., 0., 0.])
expected_detname = ['Cyclist', 'Car', 'Car', 'Car']
assert torch.allclose(
torch.from_numpy(det['bbox']).float(), expected_bbox, 1e-5)
assert torch.allclose(
torch.from_numpy(det['dimensions']).float(), expected_dims, 1e-5)
assert torch.allclose(
torch.from_numpy(det['rotation_y']).float(), expected_rotation, 1e-5)
assert det['name'].tolist() == expected_detname
def test_evaluate():
if not torch.cuda.is_available():
pytest.skip('test requires GPU and torch+cuda')
root_path = 'tests/data/kitti/'
info_file = 'tests/data/kitti/kitti_infos_mono3d.pkl'
ann_file = 'tests/data/kitti/kitti_infos_mono3d.coco.json'
class_names = ['Pedestrian', 'Cyclist', 'Car']
pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='LoadAnnotations3D',
with_bbox=True,
with_label=True,
with_attr_label=False,
with_bbox_3d=True,
with_label_3d=True,
with_bbox_depth=True),
dict(type='Resize', img_scale=(1242, 375), keep_ratio=True),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
keys=[
'img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_3d',
'gt_labels_3d', 'centers2d', 'depths'
]),
]
kitti_dataset = KittiMonoDataset(
ann_file=ann_file,
info_file=info_file,
pipeline=pipeline,
data_root=root_path,
test_mode=True)
# format 3D detection results
results = mmcv.load('tests/data/kitti/mono3d_sample_results.pkl')
results2d = mmcv.load('tests/data/kitti/mono3d_sample_results2d.pkl')
results[0]['img_bbox2d'] = results2d[0]['img_bbox2d']
metric = ['mAP']
ap_dict = kitti_dataset.evaluate(results, metric)
assert np.isclose(ap_dict['img_bbox/KITTI/Overall_3D_easy'], 3.0303)
assert np.isclose(ap_dict['img_bbox/KITTI/Overall_3D_moderate'], 6.0606)
assert np.isclose(ap_dict['img_bbox/KITTI/Overall_3D_hard'], 6.0606)
assert np.isclose(ap_dict['img_bbox2d/KITTI/Overall_2D_easy'], 3.0303)
assert np.isclose(ap_dict['img_bbox2d/KITTI/Overall_2D_moderate'], 6.0606)
assert np.isclose(ap_dict['img_bbox2d/KITTI/Overall_2D_hard'], 6.0606)
tools/create_data.py
@@ -22,6 +22,17 @@ def kitti_data_prep(root_path, info_prefix, version, out_dir):
"""
kitti.create_kitti_info_file(root_path, info_prefix)
kitti.create_reduced_point_cloud(root_path, info_prefix)
info_train_path = osp.join(root_path, f'{info_prefix}_infos_train.pkl')
info_val_path = osp.join(root_path, f'{info_prefix}_infos_val.pkl')
info_trainval_path = osp.join(root_path,
f'{info_prefix}_infos_trainval.pkl')
info_test_path = osp.join(root_path, f'{info_prefix}_infos_test.pkl')
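# export per-image, COCO-style 2D/mono3D annotations for every split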
kitti.export_2d_annotation(root_path, info_train_path)
kitti.export_2d_annotation(root_path, info_val_path)
kitti.export_2d_annotation(root_path, info_trainval_path)
kitti.export_2d_annotation(root_path, info_test_path)
create_groundtruth_database(
'KittiDataset',
root_path,
tools/data_converter/kitti_converter.py
import mmcv
import numpy as np
from collections import OrderedDict
from nuscenes.utils.geometry_utils import view_points
from os import path as osp
from pathlib import Path
from mmdet3d.core.bbox import box_np_ops
from .kitti_data_utils import get_kitti_image_info, get_waymo_image_info
from .nuscenes_converter import post_process_coords
kitti_categories = ('Pedestrian', 'Cyclist', 'Car')
def convert_to_kitti_info_version2(info):
@@ -321,3 +326,218 @@ def create_reduced_point_cloud(data_path,
data_path, val_info_path, save_path, back=True)
_create_reduced_point_cloud(
data_path, test_info_path, save_path, back=True)
def export_2d_annotation(root_path, info_path, mono3d=True):
"""Export 2d annotation from the info file and raw data.
Args:
root_path (str): Root path of the raw data.
info_path (str): Path of the info file.
mono3d (bool): Whether to export mono3d annotation. Default: True.
"""
# get bbox annotations for camera
kitti_infos = mmcv.load(info_path)
cat2Ids = [
dict(id=kitti_categories.index(cat_name), name=cat_name)
for cat_name in kitti_categories
]
coco_ann_id = 0
coco_2d_dict = dict(annotations=[], images=[], categories=cat2Ids)
for info in mmcv.track_iter_progress(kitti_infos):
coco_infos = get_2d_boxes(info, occluded=[0, 1, 2, 3], mono3d=mono3d)
(height, width,
_) = mmcv.imread(osp.join(root_path,
info['image']['image_path'])).shape
coco_2d_dict['images'].append(
dict(
file_name=info['image']['image_path'],
id=info['image']['image_idx'],
Tri2v=info['calib']['Tr_imu_to_velo'],
Trv2c=info['calib']['Tr_velo_to_cam'],
rect=info['calib']['R0_rect'],
cam_intrinsic=info['calib']['P2'],
width=width,
height=height))
for coco_info in coco_infos:
if coco_info is None:
continue
# add an empty key for coco format
coco_info['segmentation'] = []
coco_info['id'] = coco_ann_id
coco_2d_dict['annotations'].append(coco_info)
coco_ann_id += 1
if mono3d:
json_prefix = f'{info_path[:-4]}_mono3d'
else:
json_prefix = f'{info_path[:-4]}'
mmcv.dump(coco_2d_dict, f'{json_prefix}.coco.json')
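# Example (hypothetical paths): for 'data/kitti/kitti_infos_train.pkl' the
# call below writes 'data/kitti/kitti_infos_train_mono3d.coco.json':
# export_2d_annotation('data/kitti', 'data/kitti/kitti_infos_train.pkl')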
def get_2d_boxes(info, occluded, mono3d=True):
"""Get the 2D annotation records for a given info.
Args:
info (dict): Information of the given sample data.
occluded (list[int]): Integers (0, 1, 2, 3) indicating the occlusion \
state: 0 = fully visible, 1 = partly occluded, 2 = largely \
occluded, 3 = unknown, -1 = DontCare.
mono3d (bool): Whether to get boxes with mono3d annotation.
Returns:
list[dict]: List of 2D annotation records for the given sample data.
"""
# Get calibration information
P2 = info['calib']['P2']
repro_recs = []
# if no annotations in info (test dataset), then return
if 'annos' not in info:
return repro_recs
# Get all the annotations with the specified occlusion levels.
ann_dicts = info['annos']
mask = [(ocld in occluded) for ocld in ann_dicts['occluded']]
for k in ann_dicts.keys():
ann_dicts[k] = ann_dicts[k][mask]
# convert dict of list to list of dict
ann_recs = []
for i in range(len(ann_dicts['occluded'])):
ann_rec = {}
for k in ann_dicts.keys():
ann_rec[k] = ann_dicts[k][i]
ann_recs.append(ann_rec)
for ann_idx, ann_rec in enumerate(ann_recs):
# Augment sample_annotation with token information.
ann_rec['sample_annotation_token'] = \
f"{info['image']['image_idx']}.{ann_idx}"
ann_rec['sample_data_token'] = info['image']['image_idx']
sample_data_token = info['image']['image_idx']
loc = ann_rec['location'][np.newaxis, :]
dim = ann_rec['dimensions'][np.newaxis, :]
rot = ann_rec['rotation_y'][np.newaxis, np.newaxis]
# transform the center from [0.5, 1.0, 0.5] to [0.5, 0.5, 0.5]
dst = np.array([0.5, 0.5, 0.5])
src = np.array([0.5, 1.0, 0.5])
loc = loc + dim * (dst - src)
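# KITTI stores the location at the bottom center of the box (camera y
# points down), so y is raised by half the height to the geometric center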
offset = (info['calib']['P2'][0, 3] - info['calib']['P0'][0, 3]) \
/ info['calib']['P2'][0, 0]
loc_3d = np.copy(loc)
loc_3d[0, 0] += offset
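# fold the translation column of P2 into x so that bbox_cam3d is
# expressed in the cam2 (left color camera) frame rather than cam0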
gt_bbox_3d = np.concatenate([loc, dim, rot], axis=1).astype(np.float32)
# Filter out the corners that are not in front of the calibrated
# sensor.
corners_3d = box_np_ops.center_to_corner_box3d(
gt_bbox_3d[:, :3],
gt_bbox_3d[:, 3:6],
gt_bbox_3d[:, 6], [0.5, 0.5, 0.5],
axis=1)
corners_3d = corners_3d[0].T # (1, 8, 3) -> (3, 8)
in_front = np.argwhere(corners_3d[2, :] > 0).flatten()
corners_3d = corners_3d[:, in_front]
# Project 3d box to 2d.
camera_intrinsic = P2
corner_coords = view_points(corners_3d, camera_intrinsic,
True).T[:, :2].tolist()
# Keep only corners that fall within the image.
final_coords = post_process_coords(corner_coords)
# Skip if the convex hull of the re-projected corners
# does not intersect the image canvas.
if final_coords is None:
continue
else:
min_x, min_y, max_x, max_y = final_coords
# Generate dictionary record to be included in the .json file.
repro_rec = generate_record(ann_rec, min_x, min_y, max_x, max_y,
sample_data_token,
info['image']['image_path'])
# If mono3d=True, add 3D annotations in camera coordinates
if mono3d and (repro_rec is not None):
repro_rec['bbox_cam3d'] = np.concatenate(
[loc_3d, dim, rot],
axis=1).astype(np.float32).squeeze().tolist()
repro_rec['velo_cam3d'] = -1 # no velocity in KITTI
center3d = np.array(loc).reshape([1, 3])
center2d = box_np_ops.points_cam2img(
center3d, camera_intrinsic, with_depth=True)
repro_rec['center2d'] = center2d.squeeze().tolist()
# projected center2d holds (u, v) in pixels plus depth;
# samples with depth <= 0 are removed below
if repro_rec['center2d'][2] <= 0:
continue
repro_rec['attribute_name'] = -1 # no attribute in KITTI
repro_rec['attribute_id'] = -1
repro_recs.append(repro_rec)
return repro_recs
def generate_record(ann_rec, x1, y1, x2, y2, sample_data_token, filename):
"""Generate one 2D annotation record given various informations on top of
the 2D bounding box coordinates.
Args:
ann_rec (dict): Original 3d annotation record.
x1 (float): Minimum value of the x coordinate.
y1 (float): Minimum value of the y coordinate.
x2 (float): Maximum value of the x coordinate.
y2 (float): Maximum value of the y coordinate.
sample_data_token (str): Sample data token.
filename (str): The corresponding image file where the annotation
is present.
Returns:
dict: A sample 2D annotation record.
- file_name (str): file name
- image_id (str): sample data token
- area (float): 2d box area
- category_name (str): category name
- category_id (int): category id
- bbox (list[float]): left x, top y, dx, dy of 2d box
- iscrowd (int): whether the area is crowd
"""
repro_rec = OrderedDict()
repro_rec['sample_data_token'] = sample_data_token
coco_rec = dict()
key_mapping = {
'name': 'category_name',
'num_points_in_gt': 'num_lidar_pts',
'sample_annotation_token': 'sample_annotation_token',
'sample_data_token': 'sample_data_token',
}
for key, value in ann_rec.items():
if key in key_mapping.keys():
repro_rec[key_mapping[key]] = value
repro_rec['bbox_corners'] = [x1, y1, x2, y2]
repro_rec['filename'] = filename
coco_rec['file_name'] = filename
coco_rec['image_id'] = sample_data_token
coco_rec['area'] = (y2 - y1) * (x2 - x1)
if repro_rec['category_name'] not in kitti_categories:
return None
cat_name = repro_rec['category_name']
coco_rec['category_name'] = cat_name
coco_rec['category_id'] = kitti_categories.index(cat_name)
coco_rec['bbox'] = [x1, y1, x2 - x1, y2 - y1]
coco_rec['iscrowd'] = 0
return coco_rec