Unverified Commit e21e61e0 authored by Ziyi Wu, committed by GitHub

[Feature] Support multi-modality visualization (demos and dataset show function) (#405)



* incomplete

* support lidar2img loading

* add box_type_3d args in MVX-Net config file

* support multi-modality demo for LiDAR point clouds

* support multi-modality demo for indoor (depth) point clouds

* move demo data into folder and modify docs

* add input check & more general filename matching

* update docs for demo and add README file for demo

* add score_threshold option to demos

* add data for ScanNet & KITTI dataset multi-modality test

* add multi-modality visualization in ScanNet and KITTI dataset

* add unittest for modified visualization function

* delete saved temp file and dirs in unittests using TemporaryDirectory

* fix typos in docs & move README of demos to docs/

* add demo docs to documentation

* fix link error
Co-authored-by: wHao-Wu <wenhaowu.chn@gmail.com>
parent 825f47a4
......@@ -202,7 +202,8 @@ data = dict(
pipeline=train_pipeline,
modality=input_modality,
classes=class_names,
test_mode=False)),
test_mode=False,
box_type_3d='LiDAR')),
val=dict(
type=dataset_type,
data_root=data_root,
......@@ -212,7 +213,8 @@ data = dict(
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True),
test_mode=True,
box_type_3d='LiDAR'),
test=dict(
type=dataset_type,
data_root=data_root,
......@@ -222,7 +224,8 @@ data = dict(
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True))
test_mode=True,
box_type_3d='LiDAR'))
# Training settings
optimizer = dict(type='AdamW', lr=0.003, betas=(0.95, 0.99), weight_decay=0.01)
# max_norm=10 is better for SECOND
......
from argparse import ArgumentParser
from mmdet3d.apis import (inference_multi_modality_detector, init_detector,
show_result_meshlab)
def main():
parser = ArgumentParser()
parser.add_argument('pcd', help='Point cloud file')
parser.add_argument('image', help='Image file')
parser.add_argument('ann', help='Annotation file')
parser.add_argument('config', help='Config file')
parser.add_argument('checkpoint', help='Checkpoint file')
parser.add_argument(
'--device', default='cuda:0', help='Device used for inference')
parser.add_argument(
'--score-thr', type=float, default=0.0, help='bbox score threshold')
parser.add_argument(
'--out-dir', type=str, default='demo', help='dir to save results')
args = parser.parse_args()
# build the model from a config file and a checkpoint file
model = init_detector(args.config, args.checkpoint, device=args.device)
# test a single sample (point cloud + image)
result, data = inference_multi_modality_detector(model, args.pcd,
args.image, args.ann)
# show the results
show_result_meshlab(data, result, args.out_dir, args.score_thr)
if __name__ == '__main__':
main()
......@@ -11,7 +11,7 @@ def main():
parser.add_argument(
'--device', default='cuda:0', help='Device used for inference')
parser.add_argument(
'--score-thr', type=float, default=0.6, help='bbox score threshold')
'--score-thr', type=float, default=0.0, help='bbox score threshold')
parser.add_argument(
'--out-dir', type=str, default='demo', help='dir to save results')
args = parser.parse_args()
......@@ -21,7 +21,7 @@ def main():
# test a single point cloud sample
result, data = inference_detector(model, args.pcd)
# show the results
show_result_meshlab(data, result, args.out_dir)
show_result_meshlab(data, result, args.out_dir, args.score_thr)
if __name__ == '__main__':
......
# Demo
## Introduction
We provide scripts for multi-modality/single-modality and indoor/outdoor 3D detection demos. The pre-trained models can be downloaded from the [model zoo](../docs/model_zoo.md). We provide pre-processed sample data from the KITTI and SUN RGB-D datasets. You can also use any other data, as long as it follows our pre-processing steps.
## Testing
### Single-modality demo
To test a 3D detector on point cloud data, simply run:
```shell
python demo/pcd_demo.py ${PCD_FILE} ${CONFIG_FILE} ${CHECKPOINT_FILE} [--device ${GPU_ID}] [--score-thr ${SCORE_THR}] [--out-dir ${OUT_DIR}]
```
The visualization results, including a point cloud and predicted 3D bounding boxes, will be saved in ```demo/PCD_NAME```, which you can open using [MeshLab](http://www.meshlab.net/).
Example on KITTI data using [SECOND](https://github.com/open-mmlab/mmdetection3d/tree/master/configs/second) model:
```shell
python demo/pcd_demo.py demo/data/kitti/kitti_000008.bin configs/second/hv_second_secfpn_6x8_80e_kitti-3d-car.py checkpoints/hv_second_secfpn_6x8_80e_kitti-3d-car_20200620_230238-393f000c.pth
```
Example on SUN RGB-D data using [VoteNet](https://github.com/open-mmlab/mmdetection3d/tree/master/configs/votenet) model:
```shell
python demo/pcd_demo.py demo/data/sunrgbd/sunrgbd_000017.bin configs/votenet/votenet_16x8_sunrgbd-3d-10class.py checkpoints/votenet_16x8_sunrgbd-3d-10class_20200620_230238-4483c0c0.pth
```
Remember to convert the VoteNet checkpoint if you are using mmdetection3d version >= 0.6.0. See its [README](https://github.com/open-mmlab/mmdetection3d/blob/master/configs/votenet/README.md) for detailed instructions on how to convert the checkpoint.
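A hedged sketch of the conversion command (the exact script path may differ between mmdetection3d versions, so treat it as an assumption and check the README above for the authoritative instructions):
```shell
python tools/model_converters/convert_votenet_checkpoints.py ${ORIGINAL_CHECKPOINT_PATH} --out=${NEW_CHECKPOINT_PATH}
```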
### Multi-modality demo
To test a 3D detector on multi-modality data (typically point cloud and image), simply run:
```shell
python demo/multi_modality_demo.py ${PCD_FILE} ${IMAGE_FILE} ${ANNOTATION_FILE} ${CONFIG_FILE} ${CHECKPOINT_FILE} [--device ${GPU_ID}] [--score-thr ${SCORE_THR}] [--out-dir ${OUT_DIR}]
```
where the ```ANNOTATION_FILE``` should provide the 3D-to-2D projection matrix. The visualization results, including a point cloud, an image, predicted 3D bounding boxes and their projection onto the image, will be saved in ```demo/PCD_NAME```.
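For reference, a minimal sketch of how to check that an annotation file carries the required calibration; the key names below follow the KITTI-style and SUN RGB-D-style info files used by the demo script, and are assumptions for data organized differently:
```python
import mmcv

# load the info .pkl and inspect the calibration of the first sample
infos = mmcv.load('demo/data/kitti/kitti_000008_infos.pkl')
calib = infos[0]['calib']
# KITTI-style (LiDAR) data needs these three matrices to build lidar2img
print(calib['P2'], calib['R0_rect'], calib['Tr_velo_to_cam'])
# depth-style data (e.g. SUN RGB-D) instead needs the 'Rt' and 'K' matrices
```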
Example on KITTI data using [MVX-Net](https://github.com/open-mmlab/mmdetection3d/tree/master/configs/mvxnet) model:
```shell
python demo/multi_modality_demo.py demo/data/kitti/kitti_000008.bin demo/data/kitti/kitti_000008.png demo/data/kitti/kitti_000008_infos.pkl configs/mvxnet/dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class.py checkpoints/dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class_20200621_003904-10140f2d.pth
```
Example on SUN RGB-D data using [ImVoteNet](https://github.com/open-mmlab/mmdetection3d/tree/master/configs/imvotenet) model:
```shell
python demo/multi_modality_demo.py demo/data/sunrgbd/sunrgbd_000017.bin demo/data/sunrgbd/sunrgbd_000017.jpg demo/data/sunrgbd/sunrgbd_000017_infos.pkl configs/imvotenet/imvotenet_stage2_16x8_sunrgbd-3d-10class.py checkpoints/imvotenet_stage2_16x8_sunrgbd-3d-10class_20210323_184021-d44dcb66.pth
```
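The demo above can also be driven through the high-level Python API; a minimal sketch using the same KITTI files as in the MVX-Net example:
```python
from mmdet3d.apis import (inference_multi_modality_detector, init_detector,
                          show_result_meshlab)

# build the model from the config and checkpoint used in the KITTI example
model = init_detector(
    'configs/mvxnet/dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class.py',
    'checkpoints/dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class_20200621_003904-10140f2d.pth',
    device='cuda:0')
# run inference on a point cloud + image pair with its annotation file
result, data = inference_multi_modality_detector(
    model, 'demo/data/kitti/kitti_000008.bin',
    'demo/data/kitti/kitti_000008.png',
    'demo/data/kitti/kitti_000008_infos.pkl')
# write the point cloud, image and projected boxes to demo/kitti_000008
show_result_meshlab(data, result, 'demo', score_thr=0.0)
```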
......@@ -194,7 +194,7 @@ PYTHONPATH="$(dirname $0)/..":$PYTHONPATH
### Point cloud demo
We provide a demo script to test a single sample. Pre-trained models can be downloaded from [model zoo](model_zoo.md)
We provide several demo scripts to test a single sample. Pre-trained models can be downloaded from the [model zoo](model_zoo.md). To test single-modality 3D detection on point cloud scenes:
```shell
python demo/pcd_demo.py ${PCD_FILE} ${CONFIG_FILE} ${CHECKPOINT_FILE} [--device ${GPU_ID}] [--score-thr ${SCORE_THR}] [--out-dir ${OUT_DIR}]
......@@ -203,16 +203,18 @@ python demo/pcd_demo.py ${PCD_FILE} ${CONFIG_FILE} ${CHECKPOINT_FILE} [--device
Examples:
```shell
python demo/pcd_demo.py demo/kitti_000008.bin configs/second/hv_second_secfpn_6x8_80e_kitti-3d-car.py checkpoints/hv_second_secfpn_6x8_80e_kitti-3d-car_20200620_230238-393f000c.pth
python demo/pcd_demo.py demo/data/kitti/kitti_000008.bin configs/second/hv_second_secfpn_6x8_80e_kitti-3d-car.py checkpoints/hv_second_secfpn_6x8_80e_kitti-3d-car_20200620_230238-393f000c.pth
```
If you want to input a `ply` file, you can use the following function to convert it to `bin` format, and then run the demo on the converted `bin` file.
Note that you need to install `pandas` and `plyfile` before using this script. This function can also be used to preprocess `ply` data for training.
```python
import numpy as np
import pandas as pd
from plyfile import PlyData
def conver_ply(input_path, output_path):
def convert_ply(input_path, output_path):
plydata = PlyData.read(input_path) # read file
data = plydata.elements[0].data # read data
data_pd = pd.DataFrame(data) # convert to DataFrame
......@@ -223,12 +225,31 @@ def conver_ply(input_path, output_path):
data_np[:, i] = data_pd[name]
data_np.astype(np.float32).tofile(output_path)
```
Examples:
```python
convert_ply('./test.ply', './test.bin')
```
If you have point clouds in another format (`off`, `obj`, etc.), you can use `trimesh` to convert them into `ply`.
```python
import trimesh
def to_ply(input_path, output_path, original_type):
mesh = trimesh.load(input_path, file_type=original_type) # read file
mesh.export(output_path, file_type='ply') # convert to ply
```
Examples:
```python
to_ply('./test.obj', './test.ply', 'obj')
```
More demos about single/multi-modality and indoor/outdoor 3D detection can be found in [demo](0_demo.md).
## High-level APIs for testing point clouds
### Synchronous interface
......
......@@ -13,6 +13,7 @@ Welcome to MMDetection3D's documentation!
:maxdepth: 2
:caption: Quick Run
0_demo.md
1_exist_data_model.md
2_new_data_model.md
......
from .inference import (convert_SyncBN, inference_detector, init_detector,
from .inference import (convert_SyncBN, inference_detector,
inference_multi_modality_detector, init_detector,
show_result_meshlab)
from .test import single_gpu_test
__all__ = [
'inference_detector', 'init_detector', 'single_gpu_test',
'show_result_meshlab', 'convert_SyncBN'
'show_result_meshlab', 'convert_SyncBN',
'inference_multi_modality_detector'
]
import mmcv
import numpy as np
import re
import torch
from copy import deepcopy
from mmcv.parallel import collate, scatter
from mmcv.runner import load_checkpoint
from os import path as osp
from mmdet3d.core import Box3DMode, show_result
from mmdet3d.core import (Box3DMode, DepthInstance3DBoxes,
LiDARInstance3DBoxes, show_multi_modality_result,
show_result)
from mmdet3d.core.bbox import get_box_type
from mmdet3d.datasets.pipelines import Compose
from mmdet3d.models import build_detector
......@@ -106,13 +110,89 @@ def inference_detector(model, pcd):
return result, data
def show_result_meshlab(data, result, out_dir):
def inference_multi_modality_detector(model, pcd, image, ann_file):
"""Inference point cloud with the multimodality detector.
Args:
model (nn.Module): The loaded detector.
pcd (str): Point cloud files.
image (str): Image files.
ann_file (str): Annotation files.
Returns:
tuple: Predicted results and data from pipeline.
"""
cfg = model.cfg
device = next(model.parameters()).device # model device
# build the data pipeline
test_pipeline = deepcopy(cfg.data.test.pipeline)
test_pipeline = Compose(test_pipeline)
box_type_3d, box_mode_3d = get_box_type(cfg.data.test.box_type_3d)
# get data info containing calib
data_infos = mmcv.load(ann_file)
image_idx = int(re.findall(r'\d+', image)[-1]) # xxx/sunrgbd_000017.jpg
for x in data_infos:
if int(x['image']['image_idx']) != image_idx:
continue
info = x
break
data = dict(
pts_filename=pcd,
img_prefix=osp.dirname(image),
img_info=dict(filename=osp.basename(image)),
box_type_3d=box_type_3d,
box_mode_3d=box_mode_3d,
img_fields=[],
bbox3d_fields=[],
pts_mask_fields=[],
pts_seg_fields=[],
bbox_fields=[],
mask_fields=[],
seg_fields=[])
# depth map points to image conversion
if box_mode_3d == Box3DMode.DEPTH:
data.update(dict(calib=info['calib']))
data = test_pipeline(data)
# LiDAR to image conversion
if box_mode_3d == Box3DMode.LIDAR:
rect = info['calib']['R0_rect'].astype(np.float32)
Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32)
P2 = info['calib']['P2'].astype(np.float32)
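# KITTI calibration: Tr_velo_to_cam maps LiDAR points into the camera frame,
# R0_rect rectifies them, and P2 projects rectified points onto the image,
# so the composed lidar2img maps homogeneous LiDAR points directly to pixels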
lidar2img = P2 @ rect @ Trv2c
data['img_metas'][0].data['lidar2img'] = lidar2img
elif box_mode_3d == Box3DMode.DEPTH:
data['calib'][0]['Rt'] = data['calib'][0]['Rt'].astype(np.float32)
data['calib'][0]['K'] = data['calib'][0]['K'].astype(np.float32)
data = collate([data], samples_per_gpu=1)
if next(model.parameters()).is_cuda:
# scatter to specified GPU
data = scatter(data, [device.index])[0]
else:
# this is a workaround to avoid the bug of MMDataParallel
data['img_metas'] = data['img_metas'][0].data
data['points'] = data['points'][0].data
data['img'] = data['img'][0].data
if box_mode_3d == Box3DMode.DEPTH:
data['calib'] = data['calib'][0].data
# forward the model
with torch.no_grad():
result = model(return_loss=False, rescale=True, **data)
return result, data
def show_result_meshlab(data, result, out_dir, score_thr=0.0):
"""Show result by meshlab.
Args:
data (dict): Contain data from pipeline.
result (dict): Predicted result from model.
out_dir (str): Directory to save visualized result.
score_thr (float): Minimum score of bboxes to be shown. Default: 0.0
"""
points = data['points'][0][0].cpu().numpy()
pts_filename = data['img_metas'][0][0]['pts_filename']
......@@ -122,14 +202,67 @@ def show_result_meshlab(data, result, out_dir):
if 'pts_bbox' in result[0].keys():
pred_bboxes = result[0]['pts_bbox']['boxes_3d'].tensor.numpy()
pred_scores = result[0]['pts_bbox']['scores_3d'].numpy()
else:
pred_bboxes = result[0]['boxes_3d'].tensor.numpy()
pred_scores = result[0]['scores_3d'].numpy()
# filter out low score bboxes for visualization
if score_thr > 0:
inds = pred_scores > score_thr
pred_bboxes = pred_bboxes[inds]
# for now we convert points into depth mode
if data['img_metas'][0][0]['box_mode_3d'] != Box3DMode.DEPTH:
box_mode = data['img_metas'][0][0]['box_mode_3d']
if box_mode != Box3DMode.DEPTH:
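# e.g. LiDAR (x forward, y left, z up) -> depth (x right, y forward, z up):
# swap the x/y axes, then negate the new x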
points = points[..., [1, 0, 2]]
points[..., 0] *= -1
pred_bboxes = Box3DMode.convert(pred_bboxes,
data['img_metas'][0][0]['box_mode_3d'],
Box3DMode.DEPTH)
show_result(points, None, pred_bboxes, out_dir, file_name, show=False)
show_bboxes = Box3DMode.convert(pred_bboxes, box_mode, Box3DMode.DEPTH)
else:
show_bboxes = deepcopy(pred_bboxes)
show_result(points, None, show_bboxes, out_dir, file_name, show=False)
if 'img' not in data.keys():
return out_dir, file_name
# multi-modality visualization
# project 3D bbox to 2D image plane
if box_mode == Box3DMode.LIDAR:
if 'lidar2img' not in data['img_metas'][0][0]:
raise NotImplementedError(
'LiDAR to image transformation matrix is not provided')
show_bboxes = LiDARInstance3DBoxes(pred_bboxes, origin=(0.5, 0.5, 0))
img = mmcv.imread(data['img_metas'][0][0]['filename'])
show_multi_modality_result(
img,
None,
show_bboxes,
data['img_metas'][0][0]['lidar2img'],
out_dir,
file_name,
show=False)
elif box_mode == Box3DMode.DEPTH:
if 'calib' not in data.keys():
raise NotImplementedError(
'camera calibration information is not provided')
show_bboxes = DepthInstance3DBoxes(pred_bboxes, origin=(0.5, 0.5, 0))
img = mmcv.imread(data['img_metas'][0][0]['filename'])
show_multi_modality_result(
img,
None,
show_bboxes,
data['calib'][0],
out_dir,
file_name,
depth_bbox=True,
img_metas=data['img_metas'][0][0],
show=False)
else:
raise NotImplementedError(
f'visualization of {box_mode} bbox is not supported')
return out_dir, file_name
from .show_result import show_result, show_seg_result
from .show_result import (show_multi_modality_result, show_result,
show_seg_result)
__all__ = ['show_result', 'show_seg_result']
__all__ = ['show_result', 'show_seg_result', 'show_multi_modality_result']
......@@ -365,7 +365,7 @@ def project_pts_on_img(points,
color=tuple(color),
thickness=thickness,
)
cv2.imshow('project_pts_img', img)
cv2.imshow('project_pts_img', img.astype(np.uint8))
cv2.waitKey(100)
......@@ -407,10 +407,117 @@ def project_bbox3d_on_img(bboxes3d,
(corners[end, 0], corners[end, 1]), color, thickness,
cv2.LINE_AA)
cv2.imshow('project_bbox3d_img', img)
cv2.imshow('project_bbox3d_img', img.astype(np.uint8))
cv2.waitKey(0)
def draw_lidar_bbox3d_on_img(bboxes3d,
raw_img,
lidar2img_rt,
img_metas,
color=(0, 255, 0),
thickness=1):
"""Project the 3D bbox on 2D plane and draw on input image.
Args:
bboxes3d (numpy.array, shape=[M, 7]):
3d bbox (x, y, z, dx, dy, dz, yaw) to visualize.
raw_img (numpy.array): The numpy array of image.
lidar2img_rt (numpy.array, shape=[4, 4]): The projection matrix
from LiDAR coordinates to image pixels (camera intrinsics
composed with the LiDAR-to-camera extrinsics).
img_metas (dict): Not used here; kept for a unified interface
with the depth version.
color (tuple[int]): The color to draw bboxes. Default: (0, 255, 0).
thickness (int, optional): The thickness of bboxes. Default: 1.
"""
img = raw_img.copy()
corners_3d = bboxes3d.corners
num_bbox = corners_3d.shape[0]
pts_4d = np.concatenate(
[corners_3d.reshape(-1, 3),
np.ones((num_bbox * 8, 1))], axis=-1)
lidar2img_rt = copy.deepcopy(lidar2img_rt).reshape(4, 4)
if isinstance(lidar2img_rt, torch.Tensor):
lidar2img_rt = lidar2img_rt.cpu().numpy()
pts_2d = pts_4d @ lidar2img_rt.T
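# clamp depth before the perspective division below so corners at or behind
# the camera cannot cause division by zero or sign flips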
pts_2d[:, 2] = np.clip(pts_2d[:, 2], a_min=1e-5, a_max=1e5)
pts_2d[:, 0] /= pts_2d[:, 2]
pts_2d[:, 1] /= pts_2d[:, 2]
imgfov_pts_2d = pts_2d[..., :2].reshape(num_bbox, 8, 2)
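# each box has 8 corners; these 12 index pairs trace its wireframe edges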
line_indices = ((0, 1), (0, 3), (0, 4), (1, 2), (1, 5), (3, 2), (3, 7),
(4, 5), (4, 7), (2, 6), (5, 6), (6, 7))
for i in range(num_bbox):
corners = imgfov_pts_2d[i].astype(np.int)
for start, end in line_indices:
cv2.line(img, (corners[start, 0], corners[start, 1]),
(corners[end, 0], corners[end, 1]), color, thickness,
cv2.LINE_AA)
return img.astype(np.uint8)
def draw_depth_bbox3d_on_img(bboxes3d,
raw_img,
calibs,
img_metas,
color=(0, 255, 0),
thickness=1):
"""Project the 3D bbox on 2D plane and draw on input image.
Args:
bboxes3d (numpy.array, shape=[M, 7]):
3d bbox in depth coordinates (x, y, z, dx, dy, dz, yaw) to visualize.
raw_img (numpy.array): The numpy array of image.
calibs (dict): Camera calibration information, Rt and K.
img_metas (dict): Used in the coordinate transformation (to reverse
the dataset's 3D data augmentations).
color (tuple[int]): The color to draw bboxes. Default: (0, 255, 0).
thickness (int, optional): The thickness of bboxes. Default: 1.
"""
from mmdet3d.core import Coord3DMode
from mmdet3d.core.bbox import points_cam2img
from mmdet3d.models import apply_3d_transformation
img = raw_img.copy()
calibs = copy.deepcopy(calibs)
img_metas = copy.deepcopy(img_metas)
corners_3d = bboxes3d.corners
num_bbox = corners_3d.shape[0]
points_3d = corners_3d.reshape(-1, 3)
assert ('Rt' in calibs.keys() and 'K' in calibs.keys()), \
'Rt and K matrices should be provided as camera calibration information'
if not isinstance(calibs['Rt'], torch.Tensor):
calibs['Rt'] = torch.from_numpy(np.array(calibs['Rt']))
if not isinstance(calibs['K'], torch.Tensor):
calibs['K'] = torch.from_numpy(np.array(calibs['K']))
calibs['Rt'] = calibs['Rt'].reshape(3, 3).float().cpu()
calibs['K'] = calibs['K'].reshape(3, 3).float().cpu()
# first reverse the data transformations
xyz_depth = apply_3d_transformation(
points_3d, 'DEPTH', img_metas, reverse=True)
# then convert from depth coords to camera coords
xyz_cam = Coord3DMode.convert_point(
xyz_depth, Coord3DMode.DEPTH, Coord3DMode.CAM, rt_mat=calibs['Rt'])
# project to 2d to get image coords (uv)
uv_origin = points_cam2img(xyz_cam, calibs['K'])
uv_origin = (uv_origin - 1).round()
imgfov_pts_2d = uv_origin[..., :2].reshape(num_bbox, 8, 2).numpy()
line_indices = ((0, 1), (0, 3), (0, 4), (1, 2), (1, 5), (3, 2), (3, 7),
(4, 5), (4, 7), (2, 6), (5, 6), (6, 7))
for i in range(num_bbox):
corners = imgfov_pts_2d[i].astype(np.int)
for start, end in line_indices:
cv2.line(img, (corners[start, 0], corners[start, 1]),
(corners[end, 0], corners[end, 1]), color, thickness,
cv2.LINE_AA)
return img.astype(np.uint8)
class Visualizer(object):
r"""Online visualizer implemented with Open3d.
......
......@@ -77,7 +77,7 @@ def show_result(points, gt_bboxes, pred_bboxes, out_dir, filename, show=True):
pred_bboxes (np.ndarray): Predicted boxes.
out_dir (str): Path of output directory
filename (str): Filename of the current frame.
show (bool): Visualize the results online.
show (bool): Visualize the results online. Defaults to True.
"""
if show:
from .open3d_vis import Visualizer
......@@ -178,3 +178,70 @@ def show_seg_result(points,
if pred_seg is not None:
_write_obj(pred_seg_color, osp.join(result_path,
f'{filename}_pred.obj'))
def show_multi_modality_result(img,
gt_bboxes,
pred_bboxes,
proj_mat,
out_dir,
filename,
depth_bbox=False,
img_metas=None,
show=True,
gt_bbox_color=(61, 102, 255),
pred_bbox_color=(241, 101, 72)):
"""Convert multi-modality detection results into 2D results.
Project the predicted 3D bboxes onto the 2D image plane and visualize them.
Args:
img (np.ndarray): The numpy array of image in cv2 fashion.
gt_bboxes (np.ndarray): Ground truth boxes.
pred_bboxes (np.ndarray): Predicted boxes.
proj_mat (numpy.array, shape=[4, 4] or dict): The projection matrix
from 3D coordinates to image pixels for LiDAR boxes, or the
calibration dict (Rt, K) for depth boxes.
out_dir (str): Path of output directory
filename (str): Filename of the current frame.
depth_bbox (bool): Whether the boxes are in depth coordinates (True)
or LiDAR coordinates (False).
img_metas (dict): Used when projecting depth bboxes.
show (bool): Visualize the results online. Defaults to True.
gt_bbox_color (str or tuple(int)): Color of gt bbox lines.
The tuple of color should be in BGR order. Default: (61, 102, 255)
pred_bbox_color (str or tuple(int)): Color of predicted bbox lines.
The tuple of color should be in BGR order. Default: (241, 101, 72)
"""
if depth_bbox:
from .open3d_vis import draw_depth_bbox3d_on_img as draw_bbox
else:
from .open3d_vis import draw_lidar_bbox3d_on_img as draw_bbox
result_path = osp.join(out_dir, filename)
mmcv.mkdir_or_exist(result_path)
if show:
show_img = img.copy()
if gt_bboxes is not None:
show_img = draw_bbox(
gt_bboxes, show_img, proj_mat, img_metas, color=gt_bbox_color)
if pred_bboxes is not None:
show_img = draw_bbox(
pred_bboxes,
show_img,
proj_mat,
img_metas,
color=pred_bbox_color)
mmcv.imshow(show_img, win_name='project_bbox3d_img', wait_time=0)
if img is not None:
mmcv.imwrite(img, osp.join(result_path, f'{filename}_img.png'))
if gt_bboxes is not None:
gt_img = draw_bbox(
gt_bboxes, img, proj_mat, img_metas, color=gt_bbox_color)
mmcv.imwrite(gt_img, osp.join(result_path, f'{filename}_gt.png'))
if pred_bboxes is not None:
pred_img = draw_bbox(
pred_bboxes, img, proj_mat, img_metas, color=pred_bbox_color)
mmcv.imwrite(pred_img, osp.join(result_path, f'{filename}_pred.png'))
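As a usage sketch for the LiDAR branch (mirroring the call in `show_result_meshlab`; the box values, image and projection matrix below are placeholders):
```python
import numpy as np

from mmdet3d.core import show_multi_modality_result
from mmdet3d.core.bbox import LiDARInstance3DBoxes

# one dummy box (x, y, z, dx, dy, dz, yaw) in LiDAR coordinates
boxes = LiDARInstance3DBoxes(
    np.array([[10.0, 1.0, -1.5, 3.9, 1.6, 1.5, 0.0]]), origin=(0.5, 0.5, 0))
lidar2img = np.eye(4)  # placeholder; use the matrix derived from real calib
img = np.zeros((375, 1242, 3), dtype=np.uint8)  # placeholder image

# saves sample_img.png and sample_pred.png under demo/sample
show_multi_modality_result(
    img, None, boxes, lidar2img, 'demo', 'sample', show=False)
```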
......@@ -8,9 +8,9 @@ from mmcv.utils import print_log
from os import path as osp
from mmdet.datasets import DATASETS
from ..core import show_result
from ..core import show_multi_modality_result, show_result
from ..core.bbox import (Box3DMode, CameraInstance3DBoxes, Coord3DMode,
points_cam2img)
LiDARInstance3DBoxes, points_cam2img)
from .custom_3d import Custom3DDataset
......@@ -688,11 +688,28 @@ class KittiDataset(Custom3DDataset):
points = example['points'][0]._data.numpy()
points = Coord3DMode.convert_point(points, Coord3DMode.LIDAR,
Coord3DMode.DEPTH)
gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d'].tensor
gt_bboxes = Box3DMode.convert(gt_bboxes, Box3DMode.LIDAR,
Box3DMode.DEPTH)
gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d'].tensor.numpy()
show_gt_bboxes = Box3DMode.convert(gt_bboxes, Box3DMode.LIDAR,
Box3DMode.DEPTH)
pred_bboxes = result['boxes_3d'].tensor.numpy()
pred_bboxes = Box3DMode.convert(pred_bboxes, Box3DMode.LIDAR,
Box3DMode.DEPTH)
show_result(points, gt_bboxes, pred_bboxes, out_dir, file_name,
show)
show_pred_bboxes = Box3DMode.convert(pred_bboxes, Box3DMode.LIDAR,
Box3DMode.DEPTH)
show_result(points, show_gt_bboxes, show_pred_bboxes, out_dir,
file_name, show)
# multi-modality visualization
if self.modality['use_camera'] and \
'lidar2img' in example['img_metas'][0]._data.keys():
img = mmcv.imread(example['img_metas'][0]._data['filename'])
show_pred_bboxes = LiDARInstance3DBoxes(
pred_bboxes, origin=(0.5, 0.5, 0))
show_gt_bboxes = LiDARInstance3DBoxes(
gt_bboxes, origin=(0.5, 0.5, 0))
show_multi_modality_result(
img,
show_gt_bboxes,
show_pred_bboxes,
example['img_metas'][0]._data['lidar2img'],
out_dir,
file_name,
show=False)
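For context, a minimal sketch of driving this dataset-level visualization; the results file and the exact `show()` signature are assumptions (results can e.g. be dumped via `tools/test.py --out results.pkl`):
```python
import mmcv
from mmcv import Config

from mmdet3d.datasets import build_dataset

cfg = Config.fromfile(
    'configs/mvxnet/dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class.py')
dataset = build_dataset(cfg.data.test)
# per-sample detection results, e.g. dumped by tools/test.py --out results.pkl
results = mmcv.load('results.pkl')
# writes point cloud .obj files and, when camera data and lidar2img are
# available, images with projected boxes under ./vis
dataset.show(results, out_dir='./vis')
```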
......@@ -415,7 +415,7 @@ class LyftDataset(Custom3DDataset):
points = Coord3DMode.convert_point(points, Coord3DMode.LIDAR,
Coord3DMode.DEPTH)
inds = result['pts_bbox']['scores_3d'] > 0.1
gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d'].tensor
gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d'].tensor.numpy()
gt_bboxes = Box3DMode.convert(gt_bboxes, Box3DMode.LIDAR,
Box3DMode.DEPTH)
pred_bboxes = result['pts_bbox']['boxes_3d'][inds].tensor.numpy()
......
......@@ -507,7 +507,7 @@ class NuScenesDataset(Custom3DDataset):
points = Coord3DMode.convert_point(points, Coord3DMode.LIDAR,
Coord3DMode.DEPTH)
inds = result['pts_bbox']['scores_3d'] > 0.1
gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d'].tensor
gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d'].tensor.numpy()
gt_bboxes = Box3DMode.convert(gt_bboxes, Box3DMode.LIDAR,
Box3DMode.DEPTH)
pred_bboxes = result['pts_bbox']['boxes_3d'][inds].tensor.numpy()
......