Unverified Commit 35fd8394 authored by Jingwei Zhang, committed by GitHub

[Enhance] Support visualization of multi-modality 3D detector using multi-view images (#2488)

* init commit

* add demo image

* support wait-time in hook

* add demo of bevfusion

* polish docs

* more smooth multi-modal vis

* fix visualization.md

* support depth vis adaptively
parent 1f0aeba1
......@@ -49,8 +49,15 @@ def main(args):
result, data = inference_multi_modality_detector(model, args.pcd, args.img,
args.ann, args.cam_type)
points = data['inputs']['points']
img = mmcv.imread(args.img)
img = mmcv.imconvert(img, 'bgr', 'rgb')
if isinstance(result.img_path, list):
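# multi-view case: `result.img_path` is a list of per-camera image paths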
img = []
for img_path in result.img_path:
single_img = mmcv.imread(img_path)
single_img = mmcv.imconvert(single_img, 'bgr', 'rgb')
img.append(single_img)
else:
img = mmcv.imread(result.img_path)
img = mmcv.imconvert(img, 'bgr', 'rgb')
data_input = dict(points=points, img=img)
# show the results
......
......@@ -78,6 +78,12 @@ Example on SUN RGB-D data using [ImVoteNet model](https://download.openmmlab.com
python demo/multi_modality_demo.py demo/data/sunrgbd/000017.bin demo/data/sunrgbd/000017.jpg demo/data/sunrgbd/sunrgbd_000017_infos.pkl configs/imvotenet/imvotenet_stage2_8xb16_sunrgbd-3d.py ${CHECKPOINT_FILE} --cam-type CAM0 --show --score-thr 0.6
```
Example on NuScenes data using [BEVFusion model](https://drive.google.com/file/d/1QkvbYDk4G2d6SZoeJqish13qSyXA4lp3/view?usp=share_link):
```shell
python demo/multi_modality_demo.py demo/data/nuscenes/n015-2018-07-24-11-22-45+0800__LIDAR_TOP__1532402927647951.pcd.bin demo/data/nuscenes/ demo/data/nuscenes/n015-2018-07-24-11-22-45+0800.pkl projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py ${CHECKPOINT_FILE} --cam-type all --score-thr 0.2 --show
```
### 3D Segmentation
To test a 3D segmentor on point cloud data, simply run:
......
......@@ -42,18 +42,19 @@ We support drawing 3D boxes on point cloud by using `draw_bboxes_3d`.
```python
import torch
import numpy as np
from mmdet3d.visualization import Det3DLocalVisualizer
from mmdet3d.structures import LiDARInstance3DBoxes
points = np.fromfile('tests/data/kitti/training/velodyne/000000.bin', dtype=np.float32)
points = np.fromfile('demo/data/kitti/000008.bin', dtype=np.float32)
points = points.reshape(-1, 4)
visualizer = Det3DLocalVisualizer()
# set point cloud in visualizer
visualizer.set_points(points)
bboxes_3d = LiDARInstance3DBoxes(torch.tensor(
[[8.7314, -1.8559, -1.5997, 1.2000, 0.4800, 1.8900,
-1.5808]])),
bboxes_3d = LiDARInstance3DBoxes(
torch.tensor([[8.7314, -1.8559, -1.5997, 4.2000, 3.4800, 1.8900,
-1.5808]]))
# Draw 3D bboxes
visualizer.draw_bboxes_3d(bboxes_3d)
visualizer.show()
......@@ -92,8 +93,6 @@ visualizer.draw_proj_bboxes_3d(gt_bboxes_3d, input_meta)
visualizer.show()
```
![mono3d](../../../resources/mono3d.png)
### Drawing BEV Boxes
We support drawing BEV boxes by using `draw_bev_bboxes`.
......@@ -120,23 +119,22 @@ visualizer.draw_bev_bboxes(gt_bboxes_3d, edge_colors='orange')
visualizer.show()
```
<img src="../../../resources/bev.png" width = "50%" />
### Drawing 3D Semantic Mask
We support drawing segmentation masks via per-point colorization using `draw_seg_mask`.
```python
import torch
import numpy as np
from mmdet3d.visualization import Det3DLocalVisualizer
points = np.fromfile('tests/data/s3dis/points/Area_1_office_2.bin', dtype=np.float32)
points = np.fromfile('demo/data/sunrgbd/000017.bin', dtype=np.float32)
points = points.reshape(-1, 3)
visualizer = Det3DLocalVisualizer()
mask = np.random.rand(points.shape[0], 3)
points_with_mask = np.concatenate((points, mask), axis=-1)
# Draw 3D points with mask
visualizer.set_points(points, pcd_mode=2, vis_mode='add')
visualizer.draw_seg_mask(points_with_mask)
visualizer.show()
```
......@@ -168,10 +166,10 @@ This allows the inference and results generation to be done in remote server and
We also provide scripts to visualize the dataset without inference. You can use `tools/misc/browse_dataset.py` to show loaded data and ground-truth online and save them on the disk. Currently we support single-modality 3D detection and 3D segmentation on all the datasets, multi-modality 3D detection on KITTI and SUN RGB-D, as well as monocular 3D detection on nuScenes. To browse the KITTI dataset, you can run the following command:
```shell
python tools/misc/browse_dataset.py configs/_base_/datasets/kitti-3d-3class.py --task det --output-dir ${OUTPUT_DIR}
python tools/misc/browse_dataset.py configs/_base_/datasets/kitti-3d-3class.py --task lidar_det --output-dir ${OUTPUT_DIR}
```
**Notice**: Once specifying `--output-dir`, the images of views specified by users will be saved when pressing `_ESC_` in open3d window.
**Notice**: Once `--output-dir` is specified, the images of the views specified by the user will be saved when pressing `_ESC_` in the Open3D window. If you want to zoom in or out on the point cloud to inspect more details, you can specify `--show-interval=0` in the command.
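For example, a command like the following (reusing the KITTI config above) keeps the Open3D window open so you can zoom and rotate before saving the views:
```shell
python tools/misc/browse_dataset.py configs/_base_/datasets/kitti-3d-3class.py --task lidar_det --output-dir ${OUTPUT_DIR} --show-interval=0
```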
To verify the data consistency and the effect of data augmentation, you can also add `--aug` flag to visualize the data after data augmentation using the command as below:
......@@ -182,7 +180,7 @@ python tools/misc/browse_dataset.py configs/_base_/datasets/kitti-3d-3class.py -
If you also want to show 2D images with 3D bounding boxes projected onto them, you need to find a config that supports multi-modality data loading, and then change the `--task` argument to `multi-modality_det`. An example is shown below:
```shell
python tools/misc/browse_dataset.py configs/mvxnet/dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class.py --task multi-modality_det --output-dir ${OUTPUT_DIR}
python tools/misc/browse_dataset.py configs/mvxnet/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class.py --task multi-modality_det --output-dir ${OUTPUT_DIR}
```
![](../../../resources/browse_dataset_multi_modality.png)
......@@ -190,7 +188,7 @@ python tools/misc/browse_dataset.py configs/mvxnet/dv_mvx-fpn_second_secfpn_adam
You can simply browse different datasets using different configs, e.g. visualizing the ScanNet dataset in 3D semantic segmentation task:
```shell
python tools/misc/browse_dataset.py configs/_base_/datasets/scannet-seg.py --task lidar_seg --output-dir ${OUTPUT_DIR} --online
python tools/misc/browse_dataset.py configs/_base_/datasets/scannet-seg.py --task lidar_seg --output-dir ${OUTPUT_DIR}
```
![](../../../resources/browse_dataset_seg.png)
......@@ -198,7 +196,7 @@ python tools/misc/browse_dataset.py configs/_base_/datasets/scannet-seg.py --tas
And browsing the nuScenes dataset in monocular 3D detection task:
```shell
python tools/misc/browse_dataset.py configs/_base_/datasets/nus-mono3d.py --task mono_det --output-dir ${OUTPUT_DIR} --online
python tools/misc/browse_dataset.py configs/_base_/datasets/nus-mono3d.py --task mono_det --output-dir ${OUTPUT_DIR}
```
![](../../../resources/browse_dataset_mono.png)
......@@ -78,6 +78,12 @@ python demo/multi_modality_demo.py demo/data/kitti/000008.bin demo/data/kitti/00
python demo/multi_modality_demo.py demo/data/sunrgbd/000017.bin demo/data/sunrgbd/000017.jpg demo/data/sunrgbd/sunrgbd_000017_infos.pkl configs/imvotenet/imvotenet_stage2_8xb16_sunrgbd-3d.py ${CHECKPOINT_FILE} --cam-type CAM0 --show --score-thr 0.6
```
Test the [BEVFusion model](https://drive.google.com/file/d/1QkvbYDk4G2d6SZoeJqish13qSyXA4lp3/view?usp=share_link) on nuScenes data:
```shell
python demo/multi_modality_demo.py demo/data/nuscenes/n015-2018-07-24-11-22-45+0800__LIDAR_TOP__1532402927647951.pcd.bin demo/data/nuscenes/ demo/data/nuscenes/n015-2018-07-24-11-22-45+0800.pkl projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py ${CHECKPOINT_FILE} --cam-type all --score-thr 0.2 --show
```
### 3D Segmentation
To test a 3D segmentor on point cloud data, run:
......
......@@ -42,18 +42,19 @@ visualizer.show()
```python
import torch
import numpy as np
from mmdet3d.visualization import Det3DLocalVisualizer
from mmdet3d.structures import LiDARInstance3DBoxes
points = np.fromfile('tests/data/kitti/training/velodyne/000000.bin', dtype=np.float32)
points = np.fromfile('demo/data/kitti/000008.bin', dtype=np.float32)
points = points.reshape(-1, 4)
visualizer = Det3DLocalVisualizer()
# set point cloud in visualizer
visualizer.set_points(points)
bboxes_3d = LiDARInstance3DBoxes(torch.tensor(
[[8.7314, -1.8559, -1.5997, 1.2000, 0.4800, 1.8900,
-1.5808]])),
bboxes_3d = LiDARInstance3DBoxes(
torch.tensor([[8.7314, -1.8559, -1.5997, 4.2000, 3.4800, 1.8900,
-1.5808]]))
# Draw 3D bboxes
visualizer.draw_bboxes_3d(bboxes_3d)
visualizer.show()
......@@ -92,8 +93,6 @@ visualizer.draw_proj_bboxes_3d(gt_bboxes_3d, input_meta)
visualizer.show()
```
![mono3d](../../../resources/mono3d.png)
### Drawing BEV Boxes
We support drawing BEV boxes using `draw_bev_bboxes`.
......@@ -120,23 +119,22 @@ visualizer.draw_bev_bboxes(gt_bboxes_3d, edge_colors='orange')
visualizer.show()
```
<img src="../../../resources/bev.png" width = "50%" />
### Drawing 3D Semantic Mask
We support drawing segmentation masks via per-point colorization using `draw_seg_mask`.
```python
import torch
import numpy as np
from mmdet3d.visualization import Det3DLocalVisualizer
points = np.fromfile('tests/data/s3dis/points/Area_1_office_2.bin', dtype=np.float32)
points = np.fromfile('demo/data/sunrgbd/000017.bin', dtype=np.float32)
points = points.reshape(-1, 3)
visualizer = Det3DLocalVisualizer()
mask = np.random.rand(points.shape[0], 3)
points_with_mask = np.concatenate((points, mask), axis=-1)
# Draw 3D points with mask
visualizer.set_points(points, pcd_mode=2, vis_mode='add')
visualizer.draw_seg_mask(points_with_mask)
visualizer.show()
```
......@@ -171,7 +169,7 @@ python tools/misc/visualize_results.py ${CONFIG_FILE} --result ${RESULTS_PATH} -
python tools/misc/browse_dataset.py configs/_base_/datasets/kitti-3d-3class.py --task lidar_det --output-dir ${OUTPUT_DIR}
```
**Notice**: Once `--output-dir` is specified, the images of the views specified by the user will be saved when pressing `_ESC_` in the Open3D window.
**Notice**: Once `--output-dir` is specified, the images of the views specified by the user will be saved when pressing `_ESC_` in the Open3D window. If you want to zoom in or out on the point cloud to inspect more details, you can specify `--show-interval=0` in the command.
To verify the data consistency and the effect of data augmentation, you can also add the `--aug` flag to visualize the data after augmentation with the command shown below:
......@@ -182,7 +180,7 @@ python tools/misc/browse_dataset.py configs/_base_/datasets/kitti-3d-3class.py -
If you also want to show 2D images with projected 3D bounding boxes, you need to find a config that supports multi-modality data loading, and then change the `--task` argument to `multi-modality_det`. An example is shown below:
```shell
python tools/misc/browse_dataset.py configs/mvxnet/dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class.py --task multi-modality_det --output-dir ${OUTPUT_DIR}
python tools/misc/browse_dataset.py configs/mvxnet/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class.py --task multi-modality_det --output-dir ${OUTPUT_DIR}
```
![](../../../resources/browse_dataset_multi_modality.png)
......@@ -190,7 +188,7 @@ python tools/misc/browse_dataset.py configs/mvxnet/dv_mvx-fpn_second_secfpn_adam
You can browse different datasets simply by using different configs, e.g. visualizing the ScanNet dataset in the 3D semantic segmentation task:
```shell
python tools/misc/browse_dataset.py configs/_base_/datasets/scannet-seg.py --task lidar_seg --output-dir ${OUTPUT_DIR} --online
python tools/misc/browse_dataset.py configs/_base_/datasets/scannet-seg.py --task lidar_seg --output-dir ${OUTPUT_DIR}
```
![](../../../resources/browse_dataset_seg.png)
......@@ -198,7 +196,7 @@ python tools/misc/browse_dataset.py configs/_base_/datasets/scannet-seg.py --tas
And browsing the nuScenes dataset in the monocular 3D detection task:
```shell
python tools/misc/browse_dataset.py configs/_base_/datasets/nus-mono3d.py --task mono_det --output-dir ${OUTPUT_DIR} --online
python tools/misc/browse_dataset.py configs/_base_/datasets/nus-mono3d.py --task mono_det --output-dir ${OUTPUT_DIR}
```
![](../../../resources/browse_dataset_mono.png)
......@@ -188,10 +188,10 @@ def inference_multi_modality_detector(model: nn.Module,
imgs (str, Sequence[str]):
Either image files or loaded images.
ann_file (str, Sequence[str]): Annotation files.
cam_type (str): Image of Camera chose to infer.
For kitti dataset, it should be 'CAM2',
and for nuscenes dataset, it should be
'CAM_FRONT'. Defaults to 'CAM_FRONT'.
cam_type (str): Camera view chosen for inference. When the detector
only uses a single-view image, a camera view must be specified. For
the KITTI dataset, it should be 'CAM2'. For SUN RGB-D, it should be
'CAM0'. When the detector uses multi-view images, set it to 'all'.
Returns:
:obj:`Det3DDataSample` or list[:obj:`Det3DDataSample`]:
......@@ -220,37 +220,51 @@ def inference_multi_modality_detector(model: nn.Module,
data = []
for index, pcd in enumerate(pcds):
# get data info containing calib
img = imgs[index]
data_info = data_list[index]
img_path = data_info['images'][cam_type]['img_path']
if osp.basename(img_path) != osp.basename(img):
raise ValueError(f'the info file of {img_path} is not provided.')
img = imgs[index]
# TODO: check the name consistency of
# image file and point cloud file
# TODO: support multi-view image loading
data_ = dict(
lidar_points=dict(lidar_path=pcd),
img_path=img,
box_type_3d=box_type_3d,
box_mode_3d=box_mode_3d)
if cam_type != 'all':
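# single-view detector: use the image from the specified camera view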
assert osp.isfile(img), f'{img} must be a file.'
img_path = data_info['images'][cam_type]['img_path']
if osp.basename(img_path) != osp.basename(img):
raise ValueError(
f'the info file of {img_path} is not provided.')
data_ = dict(
lidar_points=dict(lidar_path=pcd),
img_path=img,
box_type_3d=box_type_3d,
box_mode_3d=box_mode_3d)
data_info['images'][cam_type]['img_path'] = img
if 'cam2img' in data_info['images'][cam_type]:
# The data annotation in the SUN RGB-D dataset does not contain
# `cam2img`
data_['cam2img'] = np.array(
data_info['images'][cam_type]['cam2img'])
# LiDAR to image conversion for KITTI dataset
if box_mode_3d == Box3DMode.LIDAR:
if 'lidar2img' in data_info['images'][cam_type]:
data_['lidar2img'] = np.array(
data_info['images'][cam_type]['lidar2img'])
# Depth to image conversion for SUNRGBD dataset
elif box_mode_3d == Box3DMode.DEPTH:
data_['depth2img'] = np.array(
data_info['images'][cam_type]['depth2img'])
else:
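# multi-view detector: `img` is a directory containing all camera images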
assert osp.isdir(img), f'{img} must be a directory'
for _, img_info in data_info['images'].items():
img_info['img_path'] = osp.join(img, img_info['img_path'])
assert osp.isfile(img_info['img_path']
), f'{img_info["img_path"]} does not exist.'
data_ = dict(
lidar_points=dict(lidar_path=pcd),
images=data_info['images'],
box_type_3d=box_type_3d,
box_mode_3d=box_mode_3d)
data_info['images'][cam_type]['img_path'] = img
if 'cam2img' in data_info['images'][cam_type]:
# The data annotation in the SUN RGB-D dataset does not contain
# `cam2img`
data_['cam2img'] = np.array(
data_info['images'][cam_type]['cam2img'])
# LiDAR to image conversion for KITTI dataset
if box_mode_3d == Box3DMode.LIDAR:
data_['lidar2img'] = np.array(
data_info['images'][cam_type]['lidar2img'])
# Depth to image conversion for SUNRGBD dataset
elif box_mode_3d == Box3DMode.DEPTH:
data_['depth2img'] = np.array(
data_info['images'][cam_type]['depth2img'])
if 'timestamp' in data_info:
# Using multi-sweeps need `timestamp`
data_['timestamp'] = data_info['timestamp']
data_ = test_pipeline(data_)
data.append(data_)
......
......@@ -102,8 +102,17 @@ class Det3DVisualizationHook(Hook):
]:
assert 'img_path' in outputs[0], 'img_path is not in outputs[0]'
img_path = outputs[0].img_path
img_bytes = get(img_path, backend_args=self.backend_args)
img = mmcv.imfrombytes(img_bytes, channel_order='rgb')
if isinstance(img_path, list):
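# multi-view case: `img_path` is a list of per-camera image paths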
img = []
for single_img_path in img_path:
img_bytes = get(
single_img_path, backend_args=self.backend_args)
single_img = mmcv.imfrombytes(
img_bytes, channel_order='rgb')
img.append(single_img)
else:
img_bytes = get(img_path, backend_args=self.backend_args)
img = mmcv.imfrombytes(img_bytes, channel_order='rgb')
data_input['img'] = img
if self.vis_task in ['lidar_det', 'multi-modality_det', 'lidar_seg']:
......@@ -161,10 +170,21 @@ class Det3DVisualizationHook(Hook):
assert 'img_path' in data_sample, \
'img_path is not in data_sample'
img_path = data_sample.img_path
img_bytes = get(img_path, backend_args=self.backend_args)
img = mmcv.imfrombytes(img_bytes, channel_order='rgb')
if isinstance(img_path, list):
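# multi-view case: load each camera image separately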
img = []
for single_img_path in img_path:
img_bytes = get(
single_img_path, backend_args=self.backend_args)
single_img = mmcv.imfrombytes(
img_bytes, channel_order='rgb')
img.append(single_img)
else:
img_bytes = get(img_path, backend_args=self.backend_args)
img = mmcv.imfrombytes(img_bytes, channel_order='rgb')
data_input['img'] = img
if self.test_out_dir is not None:
if isinstance(img_path, list):
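# name the saved file after the first camera view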
img_path = img_path[0]
out_file = osp.basename(img_path)
out_file = osp.join(self.test_out_dir, out_file)
......
# Copyright (c) OpenMMLab. All rights reserved.
import copy
from typing import List, Optional, Tuple, Union
import math
import time
from typing import List, Optional, Sequence, Tuple, Union
import matplotlib.pyplot as plt
import mmcv
......@@ -67,6 +69,8 @@ class Det3DLocalVisualizer(DetLocalVisualizer):
Defaults to dict(size=1, origin=[0, 0, 0]).
alpha (int or float): The transparency of bboxes or mask.
Defaults to 0.8.
multi_imgs_col (int): The number of columns in the layout when showing
multi-view images. Defaults to 3.
fig_show_cfg (dict): The config of the figure used to show images.
Defaults to dict(figsize=(18, 12)).
Examples:
>>> import numpy as np
......@@ -102,19 +106,23 @@ class Det3DLocalVisualizer(DetLocalVisualizer):
... vis_task='lidar_seg')
"""
def __init__(self,
name: str = 'visualizer',
points: Optional[np.ndarray] = None,
image: Optional[np.ndarray] = None,
pcd_mode: int = 0,
vis_backends: Optional[List[dict]] = None,
save_dir: Optional[str] = None,
bbox_color: Optional[Union[str, Tuple[int]]] = None,
text_color: Union[str, Tuple[int]] = (200, 200, 200),
mask_color: Optional[Union[str, Tuple[int]]] = None,
line_width: Union[int, float] = 3,
frame_cfg: dict = dict(size=1, origin=[0, 0, 0]),
alpha: Union[int, float] = 0.8) -> None:
def __init__(
self,
name: str = 'visualizer',
points: Optional[np.ndarray] = None,
image: Optional[np.ndarray] = None,
pcd_mode: int = 0,
vis_backends: Optional[List[dict]] = None,
save_dir: Optional[str] = None,
bbox_color: Optional[Union[str, Tuple[int]]] = None,
text_color: Union[str, Tuple[int]] = (200, 200, 200),
mask_color: Optional[Union[str, Tuple[int]]] = None,
line_width: Union[int, float] = 3,
frame_cfg: dict = dict(size=1, origin=[0, 0, 0]),
alpha: Union[int, float] = 0.8,
multi_imgs_col: int = 3,
fig_show_cfg: dict = dict(figsize=(18, 12))
) -> None:
super().__init__(
name=name,
image=image,
......@@ -128,6 +136,8 @@ class Det3DLocalVisualizer(DetLocalVisualizer):
if points is not None:
self.set_points(points, pcd_mode=pcd_mode, frame_cfg=frame_cfg)
self.pts_seg_num = 0
self.multi_imgs_col = multi_imgs_col
self.fig_show_cfg.update(fig_show_cfg)
def _clear_o3d_vis(self) -> None:
"""Clear open3d vis."""
......@@ -163,7 +173,7 @@ class Det3DLocalVisualizer(DetLocalVisualizer):
pcd_mode: int = 0,
vis_mode: str = 'replace',
frame_cfg: dict = dict(size=1, origin=[0, 0, 0]),
points_color: Tuple[float] = (0.5, 0.5, 0.5),
points_color: Tuple[float] = (1, 1, 1),
points_size: int = 2,
mode: str = 'xyz') -> None:
"""Set the point cloud to draw.
......@@ -183,7 +193,7 @@ class Det3DLocalVisualizer(DetLocalVisualizer):
visualization initialization.
Defaults to dict(size=1, origin=[0, 0, 0]).
points_color (Tuple[float]): The color of points.
Defaults to (0.5, 0.5, 0.5).
Defaults to (1, 1, 1).
points_size (int): The size of points to show on visualizer.
Defaults to 2.
mode (str): Indicate type of the input points, available mode
......@@ -204,8 +214,10 @@ class Det3DLocalVisualizer(DetLocalVisualizer):
self.o3d_vis.remove_geometry(self.pcd)
# set points size in Open3D
if self.o3d_vis.get_render_option() is not None:
self.o3d_vis.get_render_option().point_size = points_size
render_option = self.o3d_vis.get_render_option()
if render_option is not None:
render_option.point_size = points_size
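# use a black background so the (default white) points are clearly visible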
render_option.background_color = np.asarray([0, 0, 0])
points = points.copy()
pcd = geometry.PointCloud()
......@@ -412,7 +424,8 @@ class Det3DLocalVisualizer(DetLocalVisualizer):
def draw_points_on_image(self,
points: Union[np.ndarray, Tensor],
pts2img: np.ndarray,
sizes: Union[np.ndarray, int] = 10) -> None:
sizes: Union[np.ndarray, int] = 3,
max_depth: Optional[float] = None) -> None:
"""Draw projected points on the image.
Args:
......@@ -420,13 +433,18 @@ class Det3DLocalVisualizer(DetLocalVisualizer):
pts2img (np.ndarray): The transformation matrix from the coordinate
of point cloud to image plane.
sizes (np.ndarray or int): The marker size. Defaults to 3.
max_depth (float, optional): The maximum depth of the color map. If
None, the maximum depth of the projected points is used. Defaults to
None.
"""
check_type('points', points, (np.ndarray, Tensor))
points = tensor2ndarray(points)
assert self._image is not None, 'Please set image using `set_image`'
projected_points = points_cam2img(points, pts2img, with_depth=True)
depths = projected_points[:, 2]
colors = (depths % 20) / 20
# Show depth adaptively considering different scenes
if max_depth is None:
max_depth = depths.max()
colors = (depths % max_depth) / max_depth
# use colormap to obtain the render color
color_map = plt.get_cmap('jet')
self.ax_save.scatter(
......@@ -435,7 +453,7 @@ class Det3DLocalVisualizer(DetLocalVisualizer):
c=colors,
cmap=color_map,
s=sizes,
alpha=0.5,
alpha=0.7,
edgecolors='none')
# TODO: set bbox color according to palette
......@@ -450,7 +468,8 @@ class Det3DLocalVisualizer(DetLocalVisualizer):
line_widths: Union[int, float, List[Union[int, float]]] = 2,
face_colors: Union[str, Tuple[int],
List[Union[str, Tuple[int]]]] = 'royalblue',
alpha: Union[int, float] = 0.4):
alpha: Union[int, float] = 0.4,
img_size: Optional[Tuple] = None):
"""Draw projected 3D boxes on the image.
Args:
......@@ -476,6 +495,7 @@ class Det3DLocalVisualizer(DetLocalVisualizer):
face_colors (str or Tuple[int] or List[str or Tuple[int]]):
The face colors. Defaults to 'royalblue'.
alpha (int or float): The transparency of bboxes. Defaults to 0.4.
img_size (tuple, optional): The size (w, h) of the image.
"""
check_type('bboxes', bboxes_3d, BaseInstance3DBoxes)
......@@ -490,6 +510,14 @@ class Det3DLocalVisualizer(DetLocalVisualizer):
raise NotImplementedError('unsupported box type!')
corners_2d = proj_bbox3d_to_img(bboxes_3d, input_meta)
if img_size is not None:
# Filter out bboxes where more than half of the corners (fewer than 4
# of the 8) fall outside the image. This is used when visualizing
# multi-view images.
valid_point_idx = (corners_2d[..., 0] >= 0) & \
(corners_2d[..., 0] <= img_size[0]) & \
(corners_2d[..., 1] >= 0) & (corners_2d[..., 1] <= img_size[1]) # noqa: E501
valid_bbox_idx = valid_point_idx.sum(axis=-1) >= 4
corners_2d = corners_2d[valid_bbox_idx]
lines_verts_idx = [0, 1, 2, 3, 7, 6, 5, 4, 0, 3, 7, 4, 5, 1, 2, 6]
lines_verts = corners_2d[:, lines_verts_idx, :]
......@@ -593,16 +621,59 @@ class Det3DLocalVisualizer(DetLocalVisualizer):
if vis_task in ['mono_det', 'multi-modality_det']:
assert 'img' in data_input
img = data_input['img']
if isinstance(data_input['img'], Tensor):
img = img.permute(1, 2, 0).numpy()
img = img[..., [2, 1, 0]] # bgr to rgb
self.set_image(img)
self.draw_proj_bboxes_3d(bboxes_3d, input_meta)
if vis_task == 'mono_det' and hasattr(instances, 'centers_2d'):
centers_2d = instances.centers_2d
self.draw_points(centers_2d)
drawn_img = self.get_image()
data_3d['img'] = drawn_img
if isinstance(img, list) or (isinstance(img, (np.ndarray, Tensor))
and len(img.shape) == 4):
# show multi-view images
img_size = img[0].shape[:2] if isinstance(
img, list) else img.shape[-2:] # noqa: E501
img_col = self.multi_imgs_col
img_row = math.ceil(len(img) / img_col)
composed_img = np.zeros(
(img_size[0] * img_row, img_size[1] * img_col, 3),
dtype=np.uint8)
for i, single_img in enumerate(img):
# Note that we should keep the same order of elements both
# in `img` and `input_meta`
if isinstance(single_img, Tensor):
single_img = single_img.permute(1, 2, 0).numpy()
single_img = single_img[..., [2, 1, 0]] # bgr to rgb
self.set_image(single_img)
single_img_meta = dict()
for key, meta in input_meta.items():
if isinstance(meta,
(Sequence, np.ndarray,
Tensor)) and len(meta) == len(img):
single_img_meta[key] = meta[i]
else:
single_img_meta[key] = meta
self.draw_proj_bboxes_3d(
bboxes_3d,
single_img_meta,
img_size=single_img.shape[:2][::-1])
if vis_task == 'mono_det' and hasattr(
instances, 'centers_2d'):
centers_2d = instances.centers_2d
self.draw_points(centers_2d)
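# paste the drawn view into its grid cell (row-major order)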
composed_img[(i // img_col) *
img_size[0]:(i // img_col + 1) * img_size[0],
(i % img_col) *
img_size[1]:(i % img_col + 1) *
img_size[1]] = self.get_image()
data_3d['img'] = composed_img
else:
# show single-view image
# TODO: Solve the problem: some line segments of 3d bboxes are
# out of image by a large margin
if isinstance(data_input['img'], Tensor):
img = img.permute(1, 2, 0).numpy()
img = img[..., [2, 1, 0]] # bgr to rgb
self.set_image(img)
self.draw_proj_bboxes_3d(bboxes_3d, input_meta)
if vis_task == 'mono_det' and hasattr(instances, 'centers_2d'):
centers_2d = instances.centers_2d
self.draw_points(centers_2d)
drawn_img = self.get_image()
data_3d['img'] = drawn_img
return data_3d
......@@ -644,7 +715,8 @@ class Det3DLocalVisualizer(DetLocalVisualizer):
drawn_img: Optional[np.ndarray] = None,
win_name: str = 'image',
wait_time: int = 0,
continue_key: str = ' ') -> None:
continue_key: str = ' ',
vis_task: str = 'lidar_det') -> None:
"""Show the drawn point cloud/image.
Args:
......@@ -661,22 +733,46 @@ class Det3DLocalVisualizer(DetLocalVisualizer):
means "forever". Defaults to 0.
continue_key (str): The key for users to continue. Defaults to ' '.
"""
if vis_task == 'multi-modality_det':
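# for multi-modality results, pause only briefly on the image so the
# Open3D window can be shown right afterwards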
img_wait_time = 0.5
else:
img_wait_time = wait_time
# In order to show multi-modal results at the same time, we show the
# image first and then the point cloud, since running Open3D will
# block the process
if hasattr(self, '_image'):
if drawn_img is None and drawn_img_3d is None:
# use the image got by Visualizer.get_image()
super().show(drawn_img_3d, win_name, img_wait_time,
continue_key)
else:
if drawn_img_3d is not None:
super().show(drawn_img_3d, win_name, img_wait_time,
continue_key)
if drawn_img is not None:
super().show(drawn_img, win_name, img_wait_time,
continue_key)
if hasattr(self, 'o3d_vis'):
self.o3d_vis.run()
self.o3d_vis.poll_events()
self.o3d_vis.update_renderer()
if wait_time > 0:
time.sleep(wait_time)
else:
self.o3d_vis.run()
if save_path is not None:
if not (save_path.endswith('.png')
or save_path.endswith('.jpg')):
save_path += '.png'
self.o3d_vis.capture_screen_image(save_path)
# TODO: support more flexible window control
self.o3d_vis.clear_geometries()
self.o3d_vis.destroy_window()
self.o3d_vis.close()
self._clear_o3d_vis()
if hasattr(self, '_image'):
if drawn_img_3d is not None:
super().show(drawn_img_3d, win_name, wait_time, continue_key)
if drawn_img is not None:
super().show(drawn_img, win_name, wait_time, continue_key)
# TODO: Support visualizing the 3D results from the image and the
# point cloud respectively
@master_only
......@@ -823,7 +919,8 @@ class Det3DLocalVisualizer(DetLocalVisualizer):
drawn_img_3d,
drawn_img,
win_name=name,
wait_time=wait_time)
wait_time=wait_time,
vis_task=vis_task)
if out_file is not None:
# check the suffix of the name of image file
......
......@@ -29,6 +29,14 @@ We implement BEVFusion and provide the results and pretrained checkpoints on NuS
python projects/BEVFusion/setup.py develop
```
### Demo
Run a demo on NuScenes data using [BEVFusion model](https://drive.google.com/file/d/1QkvbYDk4G2d6SZoeJqish13qSyXA4lp3/view?usp=share_link):
```shell
python demo/multi_modality_demo.py demo/data/nuscenes/n015-2018-07-24-11-22-45+0800__LIDAR_TOP__1532402927647951.pcd.bin demo/data/nuscenes/ demo/data/nuscenes/n015-2018-07-24-11-22-45+0800.pkl projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py ${CHECKPOINT_FILE} --cam-type all --score-thr 0.2 --show
```
### Training commands
In MMDetection3D's root directory, run the following command to train the model:
......
_base_ = ['mmdet3d::_base_/default_runtime.py']
_base_ = ['../../../configs/_base_/default_runtime.py']
custom_imports = dict(
imports=['projects.BEVFusion.bevfusion'], allow_failed_imports=False)
......@@ -323,7 +323,7 @@ test_pipeline = [
meta_keys=[
'cam2img', 'ori_cam2img', 'lidar2cam', 'lidar2img', 'cam2lidar',
'ori_lidar2img', 'img_aug_matrix', 'box_type_3d', 'sample_idx',
'lidar_path', 'img_path'
'lidar_path', 'img_path', 'num_pts_feats', 'num_views'
])
]
......
......@@ -67,6 +67,9 @@ def build_data_cfg(config_path, aug, cfg_options):
# use only first dataset for `ConcatDataset`
if cfg.train_dataloader.dataset['type'] == 'ConcatDataset':
cfg.train_dataloader.dataset = cfg.train_dataloader.dataset.datasets[0]
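# unwrap the inner dataset for `CBGSDataset`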
if cfg.train_dataloader.dataset['type'] == 'CBGSDataset':
cfg.train_dataloader.dataset = cfg.train_dataloader.dataset.dataset
train_data_cfg = cfg.train_dataloader.dataset
if aug:
......