Unverified Commit db39fd4a authored by Sun Jiahao, committed by GitHub

[Enhance] Use Inferencer to implement Demo (#2763)

parent f4c032e4
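In short, the demos drop the hand-rolled `init_model` / `inference_detector` / `VISUALIZERS` flow in favor of the high-level inferencer classes. A minimal sketch of the two styles for the LiDAR demo (config, checkpoint, and model-alias names are illustrative):

```python
# before: build the model, then drive inference and visualization by hand
from mmdet3d.apis import inference_detector, init_model

config_file = 'configs/second/hv_second_secfpn_6x8_80e_kitti-3d-car.py'
checkpoint_file = 'work_dirs/second/epoch_40.pth'
model = init_model(config_file, checkpoint_file, device='cuda:0')
result, data = inference_detector(model, 'demo/data/kitti/000008.bin')

# after: a single inferencer wraps model init, inference,
# visualization and result dumping
from mmdet3d.apis import LidarDet3DInferencer

inferencer = LidarDet3DInferencer('pointpillars_kitti-3class')
inferencer(dict(points='demo/data/kitti/000008.bin'), out_dir='outputs')
```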
......@@ -2,117 +2,83 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"source": [
"from mmdet3d.apis import inference_detector, init_model\n",
"from mmdet3d.registry import VISUALIZERS\n",
"from mmdet3d.utils import register_all_modules"
],
"outputs": [],
"execution_count": 25,
"metadata": {
"pycharm": {
"is_executing": false
}
}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"# register all modules in mmdet3d into the registries\n",
"register_all_modules()"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 8,
"source": [
"config_file = '../configs/second/hv_second_secfpn_6x8_80e_kitti-3d-car.py'\n",
"# download the checkpoint from model zoo and put it in `checkpoints/`\n",
"checkpoint_file = '../work_dirs/second/epoch_40.pth'"
],
"outputs": [],
"metadata": {
"pycharm": {
"is_executing": false
}
}
"from mmdet3d.apis import LidarDet3DInferencer"
]
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"# build the model from a config file and a checkpoint file\n",
"model = init_model(config_file, checkpoint_file, device='cuda:0')"
],
"metadata": {},
"outputs": [],
"metadata": {}
"source": [
"# initialize inferencer\n",
"inferencer = LidarDet3DInferencer('pointpillars_kitti-3class')"
]
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"# init visualizer\n",
"visualizer = VISUALIZERS.build(model.cfg.visualizer)\n",
"visualizer.dataset_meta = {\n",
" 'CLASSES': model.CLASSES,\n",
" 'PALETTE': model.PALETTE\n",
"}"
],
"outputs": [],
"metadata": {
"pycharm": {
"is_executing": false
}
}
},
"outputs": [],
"source": [
"# inference\n",
"inputs = dict(points='./data/kitti/000008.bin')\n",
"inferencer(inputs)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"source": [
"# test a single sample\n",
"pcd = './data/kitti/000008.bin'\n",
"result, data = inference_detector(model, pcd)\n",
"points = data['inputs']['points']\n",
"data_input = dict(points=points)"
],
"execution_count": null,
"metadata": {},
"outputs": [],
"metadata": {
"pycharm": {
"is_executing": false
}
}
"source": [
"# inference and visualize\n",
"# NOTE: use the `Esc` key to exit Open3D window in Jupyter Notebook Environment\n",
"inferencer(inputs, show=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"# show the results\n",
"out_dir = './'\n",
"visualizer.add_datasample(\n",
" 'result',\n",
" data_input,\n",
" data_sample=result,\n",
" draw_gt=False,\n",
" show=True,\n",
" wait_time=0,\n",
" out_file=out_dir,\n",
" vis_task='det')"
],
"metadata": {},
"outputs": [],
"metadata": {
"pycharm": {
"is_executing": false
}
}
"source": [
"# If your operating environment does not have a display device,\n",
"# (e.g. a remote server), you can save the predictions and visualize\n",
"# them in local devices.\n",
"inferencer(inputs, show=False, out_dir='./remote_outputs')\n",
"\n",
"# Simulate the migration process\n",
"%mv ./remote_outputs ./local_outputs\n",
"\n",
"# Visualize the predictions from the saved files\n",
"# NOTE: use the `Esc` key to exit Open3D window in Jupyter Notebook Environment\n",
"local_inferencer = LidarDet3DInferencer('pointpillars_kitti-3class')\n",
"inputs = local_inferencer._inputs_to_list(inputs)\n",
"local_inferencer.visualize_preds_fromfile(inputs, ['local_outputs/preds/000008.json'], show=True)"
]
}
],
"metadata": {
"interpreter": {
"hash": "a0c343fece975dd89087e8c2194dd4d3db28d7000f1b32ed9ed9d584dd54dbbe"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3.7.6 64-bit ('torch1.7-cu10.1': conda)"
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
......@@ -124,19 +90,16 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
"version": "3.9.16"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"source": [],
"metadata": {
"collapsed": false
}
}
},
"interpreter": {
"hash": "a0c343fece975dd89087e8c2194dd4d3db28d7000f1b32ed9ed9d584dd54dbbe"
"source": []
}
}
},
"nbformat": 4,
......
# Copyright (c) OpenMMLab. All rights reserved.
import logging
import os
from argparse import ArgumentParser
import mmcv
from mmengine.logging import print_log
from mmdet3d.apis import inference_mono_3d_detector, init_model
from mmdet3d.registry import VISUALIZERS
from mmdet3d.apis import MonoDet3DInferencer
def parse_args():
parser = ArgumentParser()
parser.add_argument('img', help='image file')
parser.add_argument('ann', help='ann file')
parser.add_argument('config', help='Config file')
parser.add_argument('checkpoint', help='Checkpoint file')
parser.add_argument('img', help='Image file')
parser.add_argument('infos', help='Infos file with annotations')
parser.add_argument('model', help='Config file')
parser.add_argument('weights', help='Checkpoint file')
parser.add_argument(
'--device', default='cuda:0', help='Device used for inference')
parser.add_argument(
......@@ -21,50 +22,77 @@ def parse_args():
default='CAM_BACK',
help='Camera type to use for inference')
parser.add_argument(
'--score-thr', type=float, default=0.30, help='bbox score threshold')
'--pred-score-thr',
type=float,
default=0.3,
help='bbox score threshold')
parser.add_argument(
'--out-dir', type=str, default='demo', help='dir to save results')
'--out-dir',
type=str,
default='outputs',
help='Output directory of prediction and visualization results.')
parser.add_argument(
'--show',
action='store_true',
help='show online visualization results')
help='Show online visualization results')
parser.add_argument(
'--wait-time',
type=float,
default=-1,
help='The interval of show (s). The demo blocks while showing '
'results if wait_time is -1. Defaults to -1.')
parser.add_argument(
'--snapshot',
'--no-save-vis',
action='store_true',
help='whether to save online visualization results')
args = parser.parse_args()
return args
help='Do not save detection visualization results')
parser.add_argument(
'--no-save-pred',
action='store_true',
help='Do not save detection prediction results')
parser.add_argument(
'--print-result',
action='store_true',
help='Whether to print the results.')
call_args = vars(parser.parse_args())
call_args['inputs'] = dict(
img=call_args.pop('img'), infos=call_args.pop('infos'))
call_args.pop('cam_type')
if call_args['no_save_vis'] and call_args['no_save_pred']:
call_args['out_dir'] = ''
init_kws = ['model', 'weights', 'device']
init_args = {}
for init_kw in init_kws:
init_args[init_kw] = call_args.pop(init_kw)
# NOTE: If your operating environment does not have a display device
# (e.g. a remote server), you can save the predictions and visualize
# them on a local device.
if os.environ.get('DISPLAY') is None and call_args['show']:
print_log(
'Display device not found. `--show` is forced to False',
logger='current',
level=logging.WARNING)
call_args['show'] = False
def main(args):
# build the model from a config file and a checkpoint file
model = init_model(args.config, args.checkpoint, device=args.device)
return init_args, call_args
# init visualizer
visualizer = VISUALIZERS.build(model.cfg.visualizer)
visualizer.dataset_meta = model.dataset_meta
# test a single image
result = inference_mono_3d_detector(model, args.img, args.ann,
args.cam_type)
def main():
# TODO: Support inference of point cloud numpy file.
init_args, call_args = parse_args()
img = mmcv.imread(args.img)
img = mmcv.imconvert(img, 'bgr', 'rgb')
inferencer = MonoDet3DInferencer(**init_args)
inferencer(**call_args)
data_input = dict(img=img)
# show the results
visualizer.add_datasample(
'result',
data_input,
data_sample=result,
draw_gt=False,
show=args.show,
wait_time=-1,
out_file=args.out_dir,
pred_score_thr=args.score_thr,
vis_task='mono_det')
if call_args['out_dir'] != '' and not (call_args['no_save_vis']
and call_args['no_save_pred']):
print_log(
f'results have been saved at {call_args["out_dir"]}',
logger='current')
if __name__ == '__main__':
args = parse_args()
main(args)
main()
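For reference, a sketch of driving `MonoDet3DInferencer` directly from Python, equivalent to running this script; the model alias and file paths are illustrative:

```python
from mmdet3d.apis import MonoDet3DInferencer

# 'pgd_kitti' is an illustrative metafile alias; a config path plus
# `weights=` works as well
inferencer = MonoDet3DInferencer(model='pgd_kitti', device='cuda:0')

# `infos` is the .pkl info file that supplies the camera matrices
inferencer(
    inputs=dict(img='demo/data/kitti/000008.png',
                infos='demo/data/kitti/000008.pkl'),
    pred_score_thr=0.3,
    out_dir='outputs')
```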
# Copyright (c) OpenMMLab. All rights reserved.
import logging
import os
from argparse import ArgumentParser
import mmcv
from mmengine.logging import print_log
from mmdet3d.apis import inference_multi_modality_detector, init_model
from mmdet3d.registry import VISUALIZERS
from mmdet3d.apis import MultiModalityDet3DInferencer
def parse_args():
parser = ArgumentParser()
parser.add_argument('pcd', help='Point cloud file')
parser.add_argument('img', help='image file')
parser.add_argument('ann', help='ann file')
parser.add_argument('config', help='Config file')
parser.add_argument('checkpoint', help='Checkpoint file')
parser.add_argument('img', help='Image file')
parser.add_argument('infos', help='Infos file with annotations')
parser.add_argument('model', help='Config file')
parser.add_argument('weights', help='Checkpoint file')
parser.add_argument(
'--device', default='cuda:0', help='Device used for inference')
parser.add_argument(
......@@ -22,57 +23,79 @@ def parse_args():
default='CAM_FRONT',
help='Camera type to use for inference')
parser.add_argument(
'--score-thr', type=float, default=0.0, help='bbox score threshold')
'--pred-score-thr',
type=float,
default=0.3,
help='bbox score threshold')
parser.add_argument(
'--out-dir', type=str, default='demo', help='dir to save results')
'--out-dir',
type=str,
default='outputs',
help='Output directory of prediction and visualization results.')
parser.add_argument(
'--show',
action='store_true',
help='show online visualization results')
help='Show online visualization results')
parser.add_argument(
'--wait-time',
type=float,
default=-1,
help='The interval of show (s). The demo blocks while showing '
'results if wait_time is -1. Defaults to -1.')
parser.add_argument(
'--no-save-vis',
action='store_true',
help='Do not save detection visualization results')
parser.add_argument(
'--no-save-pred',
action='store_true',
help='Do not save detection prediction results')
parser.add_argument(
'--snapshot',
'--print-result',
action='store_true',
help='whether to save online visualization results')
args = parser.parse_args()
return args
help='Whether to print the results.')
call_args = vars(parser.parse_args())
call_args['inputs'] = dict(
points=call_args.pop('pcd'),
img=call_args.pop('img'),
infos=call_args.pop('infos'))
call_args.pop('cam_type')
if call_args['no_save_vis'] and call_args['no_save_pred']:
call_args['out_dir'] = ''
init_kws = ['model', 'weights', 'device']
init_args = {}
for init_kw in init_kws:
init_args[init_kw] = call_args.pop(init_kw)
# NOTE: If your operating environment does not have a display device
# (e.g. a remote server), you can save the predictions and visualize
# them on a local device.
if os.environ.get('DISPLAY') is None and call_args['show']:
print_log(
'Display device not found. `--show` is forced to False',
logger='current',
level=logging.WARNING)
call_args['show'] = False
return init_args, call_args
def main(args):
# build the model from a config file and a checkpoint file
model = init_model(args.config, args.checkpoint, device=args.device)
# init visualizer
visualizer = VISUALIZERS.build(model.cfg.visualizer)
visualizer.dataset_meta = model.dataset_meta
def main():
# TODO: Support inference of point cloud numpy file.
init_args, call_args = parse_args()
# test a single image and point cloud sample
result, data = inference_multi_modality_detector(model, args.pcd, args.img,
args.ann, args.cam_type)
points = data['inputs']['points']
if isinstance(result.img_path, list):
img = []
for img_path in result.img_path:
single_img = mmcv.imread(img_path)
single_img = mmcv.imconvert(single_img, 'bgr', 'rgb')
img.append(single_img)
else:
img = mmcv.imread(result.img_path)
img = mmcv.imconvert(img, 'bgr', 'rgb')
data_input = dict(points=points, img=img)
inferencer = MultiModalityDet3DInferencer(**init_args)
inferencer(**call_args)
# show the results
visualizer.add_datasample(
'result',
data_input,
data_sample=result,
draw_gt=False,
show=args.show,
wait_time=-1,
out_file=args.out_dir,
pred_score_thr=args.score_thr,
vis_task='multi-modality_det')
if call_args['out_dir'] != '' and not (call_args['no_save_vis']
and call_args['no_save_pred']):
print_log(
f'results have been saved at {call_args["out_dir"]}',
logger='current')
if __name__ == '__main__':
args = parse_args()
main(args)
main()
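The multi-modality script can likewise be reproduced from Python; a sketch with an illustrative model alias, where the infos `.pkl` supplies the `cam2img`/`lidar2cam`/`lidar2img` matrices:

```python
from mmdet3d.apis import MultiModalityDet3DInferencer

# 'mvxnet_kitti-3class' is an illustrative alias; paths are illustrative too
inferencer = MultiModalityDet3DInferencer(
    model='mvxnet_kitti-3class', device='cuda:0')
inferencer(
    inputs=dict(points='demo/data/kitti/000008.bin',
                img='demo/data/kitti/000008.png',
                infos='demo/data/kitti/000008.pkl'),
    out_dir='outputs')
```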
# Copyright (c) OpenMMLab. All rights reserved.
import logging
import os
from argparse import ArgumentParser
from mmdet3d.apis import inference_detector, init_model
from mmdet3d.registry import VISUALIZERS
from mmengine.logging import print_log
from mmdet3d.apis import LidarDet3DInferencer
def parse_args():
parser = ArgumentParser()
parser.add_argument('pcd', help='Point cloud file')
parser.add_argument('config', help='Config file')
parser.add_argument('checkpoint', help='Checkpoint file')
parser.add_argument('model', help='Config file')
parser.add_argument('weights', help='Checkpoint file')
parser.add_argument(
'--device', default='cuda:0', help='Device used for inference')
parser.add_argument(
'--score-thr', type=float, default=0.0, help='bbox score threshold')
'--pred-score-thr',
type=float,
default=0.3,
help='bbox score threshold')
parser.add_argument(
'--out-dir', type=str, default='demo', help='dir to save results')
'--out-dir',
type=str,
default='outputs',
help='Output directory of prediction and visualization results.')
parser.add_argument(
'--show',
action='store_true',
help='show online visualization results')
help='Show online visualization results')
parser.add_argument(
'--wait-time',
type=float,
default=-1,
help='The interval of show (s). The demo blocks while showing '
'results if wait_time is -1. Defaults to -1.')
parser.add_argument(
'--no-save-vis',
action='store_true',
help='Do not save detection visualization results')
parser.add_argument(
'--snapshot',
'--no-save-pred',
action='store_true',
help='whether to save online visualization results')
args = parser.parse_args()
return args
help='Do not save detection prediction results')
parser.add_argument(
'--print-result',
action='store_true',
help='Whether to print the results.')
call_args = vars(parser.parse_args())
call_args['inputs'] = dict(points=call_args.pop('pcd'))
if call_args['no_save_vis'] and call_args['no_save_pred']:
call_args['out_dir'] = ''
def main(args):
init_kws = ['model', 'weights', 'device']
init_args = {}
for init_kw in init_kws:
init_args[init_kw] = call_args.pop(init_kw)
# NOTE: If your operating environment does not have a display device
# (e.g. a remote server), you can save the predictions and visualize
# them on a local device.
if os.environ.get('DISPLAY') is None and call_args['show']:
print_log(
'Display device not found. `--show` is forced to False',
logger='current',
level=logging.WARNING)
call_args['show'] = False
return init_args, call_args
def main():
# TODO: Support inference of point cloud numpy file.
# build the model from a config file and a checkpoint file
model = init_model(args.config, args.checkpoint, device=args.device)
# init visualizer
visualizer = VISUALIZERS.build(model.cfg.visualizer)
visualizer.dataset_meta = model.dataset_meta
# test a single point cloud sample
result, data = inference_detector(model, args.pcd)
points = data['inputs']['points']
data_input = dict(points=points)
# show the results
visualizer.add_datasample(
'result',
data_input,
data_sample=result,
draw_gt=False,
show=args.show,
wait_time=-1,
out_file=args.out_dir,
pred_score_thr=args.score_thr,
vis_task='lidar_det')
init_args, call_args = parse_args()
inferencer = LidarDet3DInferencer(**init_args)
inferencer(**call_args)
if call_args['out_dir'] != '' and not (call_args['no_save_vis']
and call_args['no_save_pred']):
print_log(
f'results have been saved at {call_args["out_dir"]}',
logger='current')
if __name__ == '__main__':
args = parse_args()
main(args)
main()
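Because `_inputs_to_list` expands a directory of point-cloud files into one input per file, the same entry point also covers batch inference; a sketch (directory path illustrative):

```python
from mmdet3d.apis import LidarDet3DInferencer

inferencer = LidarDet3DInferencer('pointpillars_kitti-3class')

# a directory of .bin files becomes one input dict per file; predictions
# land in outputs/preds/, one JSON per sample
inferencer(
    inputs=dict(points='demo/data/kitti/'),
    batch_size=2,
    no_save_vis=True,
    out_dir='outputs')
```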
# Copyright (c) OpenMMLab. All rights reserved.
import logging
import os
from argparse import ArgumentParser
from mmdet3d.apis import inference_segmentor, init_model
from mmdet3d.registry import VISUALIZERS
from mmengine.logging import print_log
from mmdet3d.apis import LidarSeg3DInferencer
def parse_args():
parser = ArgumentParser()
parser.add_argument('pcd', help='Point cloud file')
parser.add_argument('config', help='Config file')
parser.add_argument('checkpoint', help='Checkpoint file')
parser.add_argument('model', help='Config file')
parser.add_argument('weights', help='Checkpoint file')
parser.add_argument(
'--device', default='cuda:0', help='Device used for inference')
parser.add_argument(
'--out-dir', type=str, default='demo', help='dir to save results')
'--out-dir',
type=str,
default='outputs',
help='Output directory of prediction and visualization results.')
parser.add_argument(
'--show',
action='store_true',
help='show online visualization results')
help='Show online visualization results')
parser.add_argument(
'--wait-time',
type=float,
default=-1,
help='The interval of show (s). The demo blocks while showing '
'results if wait_time is -1. Defaults to -1.')
parser.add_argument(
'--no-save-vis',
action='store_true',
help='Do not save detection visualization results')
parser.add_argument(
'--snapshot',
'--no-save-pred',
action='store_true',
help='whether to save online visualization results')
args = parser.parse_args()
return args
def main(args):
# build the model from a config file and a checkpoint file
model = init_model(args.config, args.checkpoint, device=args.device)
# init visualizer
visualizer = VISUALIZERS.build(model.cfg.visualizer)
visualizer.dataset_meta = model.dataset_meta
# test a single point cloud sample
result, data = inference_segmentor(model, args.pcd)
points = data['inputs']['points']
data_input = dict(points=points)
# show the results
visualizer.add_datasample(
'result',
data_input,
data_sample=result,
draw_gt=False,
show=args.show,
wait_time=-1,
out_file=args.out_dir,
vis_task='lidar_seg')
help='Do not save detection prediction results')
parser.add_argument(
'--print-result',
action='store_true',
help='Whether to print the results.')
call_args = vars(parser.parse_args())
call_args['inputs'] = dict(points=call_args.pop('pcd'))
if call_args['no_save_vis'] and call_args['no_save_pred']:
call_args['out_dir'] = ''
init_kws = ['model', 'weights', 'device']
init_args = {}
for init_kw in init_kws:
init_args[init_kw] = call_args.pop(init_kw)
# NOTE: If your operating environment does not have a display device
# (e.g. a remote server), you can save the predictions and visualize
# them on a local device.
if os.environ.get('DISPLAY') is None and call_args['show']:
print_log(
'Display device not found. `--show` is forced to False',
logger='current',
level=logging.WARNING)
call_args['show'] = False
return init_args, call_args
def main():
# TODO: Support inference of point cloud numpy file.
init_args, call_args = parse_args()
inferencer = LidarSeg3DInferencer(**init_args)
inferencer(**call_args)
if call_args['out_dir'] != '' and not (call_args['no_save_vis']
and call_args['no_save_pred']):
print_log(
f'results have been saved at {call_args["out_dir"]}',
logger='current')
if __name__ == '__main__':
args = parse_args()
main(args)
main()
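The segmentation demo mirrors the detection one; a sketch with an illustrative model alias and point-cloud path, where the saved JSON carries a `pts_semantic_mask` list instead of boxes:

```python
from mmdet3d.apis import LidarSeg3DInferencer

# 'pointnet2-ssg_s3dis-seg' is an illustrative alias
inferencer = LidarSeg3DInferencer('pointnet2-ssg_s3dis-seg')
inferencer(
    inputs=dict(points='demo/data/s3dis/Area_1_office_2.bin'),
    out_dir='outputs')
```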
......@@ -141,6 +141,10 @@ You will see a visualizer interface with point cloud, where bounding boxes are p
**Note**:
If you install MMDetection3D on a remote server without a display device, you can leave out the `--show` argument. The demo will still save the predictions to the `outputs/pred/000008.json` file.
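To sketch that save-then-visualize workflow end to end, mirroring the demo notebook (model alias and paths illustrative):

```python
from mmdet3d.apis import LidarDet3DInferencer

inferencer = LidarDet3DInferencer('pointpillars_kitti-3class')
inputs = dict(points='demo/data/kitti/000008.bin')

# on the remote server: run headless and dump predictions as JSON
inferencer(inputs, show=False, out_dir='outputs')

# on a local device: rebuild the predictions from the saved JSON
inputs = inferencer._inputs_to_list(inputs)
inferencer.visualize_preds_fromfile(
    inputs, ['outputs/preds/000008.json'], show=True)
```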
**Note**:
If you want to input a `.ply` file, you can use the following function to convert it to `.bin` format and then run the demo with the converted file.
Note that you need to install `pandas` and `plyfile` before using this script. The function can also serve as a data-preprocessing step when training on `.ply` data.
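A minimal sketch of this conversion, assuming the `.ply` stores a single element whose per-point properties (e.g. x, y, z, intensity) are flattened into a float32 `.bin`:

```python
import numpy as np
import pandas as pd
from plyfile import PlyData


def convert_ply(input_path, output_path):
    plydata = PlyData.read(input_path)  # read the .ply file
    data = plydata.elements[0].data  # structured array of points
    data_pd = pd.DataFrame(data)  # convert to a DataFrame
    # copy each property (x, y, z, ...) into a plain float array
    data_np = np.zeros(data_pd.shape, dtype=np.float64)
    for i, name in enumerate(data[0].dtype.names):
        data_np[:, i] = data_pd[name]
    # dump as float32, the layout expected by the .bin loader
    data_np.astype(np.float32).tofile(output_path)


convert_ply('./test.ply', './test.bin')
```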
......
......@@ -139,6 +139,10 @@ python demo/pcd_demo.py demo/data/kitti/000008.bin pointpillars_hv_secfpn_8xb6-1
**Note**:
If you install MMDetection3D on a server without a display device, you can leave out the `--show` argument. The demo will still save the predictions to the `outputs/pred/000008.json` file.
**Note**:
If you want to input a `.ply` file, you can use the following function to convert it to `.bin` format and then run the demo with the converted file. Note that you need to install `pandas` and `plyfile` before using this script. The function can also serve as a data-preprocessing step when training on `.ply` data.
```python
......
......@@ -7,7 +7,7 @@ from mmengine.utils import digit_version
from .version import __version__, version_info
mmcv_minimum_version = '2.0.0rc4'
mmcv_maximum_version = '2.1.0'
mmcv_maximum_version = '2.2.0'
mmcv_version = digit_version(mmcv.__version__)
mmengine_minimum_version = '0.8.0'
......
......@@ -392,7 +392,8 @@ def inference_segmentor(model: nn.Module, pcds: PointsType):
new_test_pipeline = []
for pipeline in test_pipeline:
if pipeline['type'] != 'LoadAnnotations3D':
if pipeline['type'] != 'LoadAnnotations3D' and pipeline[
'type'] != 'PointSegClassMapping':
new_test_pipeline.append(pipeline)
test_pipeline = Compose(new_test_pipeline)
......
# Copyright (c) OpenMMLab. All rights reserved.
import logging
import os.path as osp
from copy import deepcopy
from typing import Dict, List, Optional, Sequence, Tuple, Union
import mmengine
import numpy as np
import torch.nn as nn
from mmengine.fileio import (get_file_backend, isdir, join_path,
list_dir_or_file)
from mmengine import dump, print_log
from mmengine.infer.infer import BaseInferencer, ModelType
from mmengine.model.utils import revert_sync_batchnorm
from mmengine.registry import init_default_scope
from mmengine.runner import load_checkpoint
from mmengine.structures import InstanceData
from mmengine.visualization import Visualizer
from rich.progress import track
from mmdet3d.registry import MODELS
from mmdet3d.registry import DATASETS, MODELS
from mmdet3d.structures import Box3DMode, Det3DDataSample
from mmdet3d.utils import ConfigType
InstanceList = List[InstanceData]
......@@ -44,14 +48,14 @@ class Base3DInferencer(BaseInferencer):
priority is palette -> config -> checkpoint. Defaults to 'none'.
"""
preprocess_kwargs: set = set()
preprocess_kwargs: set = {'cam_type'}
forward_kwargs: set = set()
visualize_kwargs: set = {
'return_vis', 'show', 'wait_time', 'draw_pred', 'pred_score_thr',
'img_out_dir'
'img_out_dir', 'no_save_vis', 'cam_type_dir'
}
postprocess_kwargs: set = {
'print_result', 'pred_out_file', 'return_datasample'
'print_result', 'pred_out_dir', 'return_datasample', 'no_save_pred'
}
def __init__(self,
......@@ -60,10 +64,14 @@ class Base3DInferencer(BaseInferencer):
device: Optional[str] = None,
scope: str = 'mmdet3d',
palette: str = 'none') -> None:
# A global counter tracking the number of frames processed, for
# naming of the output results
self.num_predicted_frames = 0
self.palette = palette
init_default_scope(scope)
super().__init__(
model=model, weights=weights, device=device, scope=scope)
self.model = revert_sync_batchnorm(self.model)
def _convert_syncbn(self, cfg: ConfigType):
"""Convert config's naiveSyncBN to BN.
......@@ -108,56 +116,19 @@ class Base3DInferencer(BaseInferencer):
if 'PALETTE' in checkpoint.get('meta', {}): # 3D Segmentor
model.dataset_meta['palette'] = checkpoint['meta']['PALETTE']
test_dataset_cfg = deepcopy(cfg.test_dataloader.dataset)
# lazy init. We only need the metainfo.
test_dataset_cfg['lazy_init'] = True
metainfo = DATASETS.build(test_dataset_cfg).metainfo
cfg_palette = metainfo.get('palette', None)
if cfg_palette is not None:
model.dataset_meta['palette'] = cfg_palette
model.cfg = cfg # save the config in the model for convenience
model.to(device)
model.eval()
return model
def _inputs_to_list(
self,
inputs: Union[dict, list],
modality_key: Union[str, List[str]] = 'points') -> list:
"""Preprocess the inputs to a list.
Preprocess inputs to a list according to its type:
- list or tuple: return inputs
- dict: the value of key 'points'/`img` is
- Directory path: return all files in the directory
- other cases: return a list containing the string. The string
could be a path to file, a url or other types of string according
to the task.
Args:
inputs (Union[dict, list]): Inputs for the inferencer.
modality_key (Union[str, List[str]]): The key of the modality.
Defaults to 'points'.
Returns:
list: List of input for the :meth:`preprocess`.
"""
if isinstance(modality_key, str):
modality_key = [modality_key]
assert set(modality_key).issubset({'points', 'img'})
for key in modality_key:
if isinstance(inputs, dict) and isinstance(inputs[key], str):
img = inputs[key]
backend = get_file_backend(img)
if hasattr(backend, 'isdir') and isdir(img):
# Backends like HttpsBackend do not implement `isdir`, so
# only those backends that implement `isdir` could accept
# the inputs as a directory
filename_list = list_dir_or_file(img, list_dir=False)
inputs = [{
f'{key}': join_path(img, filename)
} for filename in filename_list]
if not isinstance(inputs, (list, tuple)):
inputs = [inputs]
return list(inputs)
def _get_transform_idx(self, pipeline_cfg: ConfigType, name: str) -> int:
"""Returns the index of the transform in a pipeline.
......@@ -173,64 +144,81 @@ class Base3DInferencer(BaseInferencer):
visualizer.dataset_meta = self.model.dataset_meta
return visualizer
def _dispatch_kwargs(self,
out_dir: str = '',
cam_type: str = '',
**kwargs) -> Tuple[Dict, Dict, Dict, Dict]:
"""Dispatch kwargs to preprocess(), forward(), visualize() and
postprocess() according to the actual demands.
Args:
out_dir (str): Dir to save the inference results.
cam_type (str): Camera type. Defaults to ''.
**kwargs (dict): Key words arguments passed to :meth:`preprocess`,
:meth:`forward`, :meth:`visualize` and :meth:`postprocess`.
Each key in kwargs should be in the corresponding set of
``preprocess_kwargs``, ``forward_kwargs``, ``visualize_kwargs``
and ``postprocess_kwargs``.
Returns:
Tuple[Dict, Dict, Dict, Dict]: kwargs passed to preprocess,
forward, visualize and postprocess respectively.
"""
kwargs['img_out_dir'] = out_dir
kwargs['pred_out_dir'] = out_dir
if cam_type != '':
kwargs['cam_type_dir'] = cam_type
return super()._dispatch_kwargs(**kwargs)
def __call__(self,
inputs: InputsType,
return_datasamples: bool = False,
batch_size: int = 1,
return_vis: bool = False,
show: bool = False,
wait_time: int = 0,
draw_pred: bool = True,
pred_score_thr: float = 0.3,
img_out_dir: str = '',
print_result: bool = False,
pred_out_file: str = '',
**kwargs) -> dict:
return_datasamples: bool = False,
**kwargs) -> Optional[dict]:
"""Call the inferencer.
Args:
inputs (InputsType): Inputs for the inferencer.
batch_size (int): Batch size. Defaults to 1.
return_datasamples (bool): Whether to return results as
:obj:`BaseDataElement`. Defaults to False.
batch_size (int): Inference batch size. Defaults to 1.
return_vis (bool): Whether to return the visualization result.
Defaults to False.
show (bool): Whether to display the visualization results in a
popup window. Defaults to False.
wait_time (float): The interval of show (s). Defaults to 0.
draw_pred (bool): Whether to draw predicted bounding boxes.
Defaults to True.
pred_score_thr (float): Minimum score of bboxes to draw.
Defaults to 0.3.
img_out_dir (str): Output directory of visualization results.
If left as empty, no file will be saved. Defaults to ''.
print_result (bool): Whether to print the inference result w/o
visualization to the console. Defaults to False.
pred_out_file (str): File to save the inference results w/o
visualization. If left as empty, no file will be saved.
Defaults to ''.
**kwargs: Other keyword arguments passed to :meth:`preprocess`,
**kwargs: Key words arguments passed to :meth:`preprocess`,
:meth:`forward`, :meth:`visualize` and :meth:`postprocess`.
Each key in kwargs should be in the corresponding set of
``preprocess_kwargs``, ``forward_kwargs``, ``visualize_kwargs``
and ``postprocess_kwargs``.
Returns:
dict: Inference and visualization results.
"""
return super().__call__(
inputs,
(
preprocess_kwargs,
forward_kwargs,
visualize_kwargs,
postprocess_kwargs,
) = self._dispatch_kwargs(**kwargs)
cam_type = preprocess_kwargs.pop('cam_type', 'CAM2')
ori_inputs = self._inputs_to_list(inputs, cam_type=cam_type)
inputs = self.preprocess(
ori_inputs, batch_size=batch_size, **preprocess_kwargs)
preds = []
results_dict = {'predictions': [], 'visualization': []}
for data in (track(inputs, description='Inference')
if self.show_progress else inputs):
preds.extend(self.forward(data, **forward_kwargs))
visualization = self.visualize(ori_inputs, preds,
**visualize_kwargs)
results = self.postprocess(preds, visualization,
return_datasamples,
batch_size,
return_vis=return_vis,
show=show,
wait_time=wait_time,
draw_pred=draw_pred,
pred_score_thr=pred_score_thr,
img_out_dir=img_out_dir,
print_result=print_result,
pred_out_file=pred_out_file,
**kwargs)
**postprocess_kwargs)
results_dict['predictions'].extend(results['predictions'])
if results['visualization'] is not None:
results_dict['visualization'].extend(results['visualization'])
return results_dict
def postprocess(
self,
......@@ -238,7 +226,8 @@ class Base3DInferencer(BaseInferencer):
visualization: Optional[List[np.ndarray]] = None,
return_datasample: bool = False,
print_result: bool = False,
pred_out_file: str = '',
no_save_pred: bool = False,
pred_out_dir: str = '',
) -> Union[ResType, Tuple[ResType, np.ndarray]]:
"""Process the predictions and visualization results from ``forward``
and ``visualize``.
......@@ -258,7 +247,7 @@ class Base3DInferencer(BaseInferencer):
Defaults to False.
print_result (bool): Whether to print the inference result w/o
visualization to the console. Defaults to False.
pred_out_file (str): File to save the inference results w/o
pred_out_dir (str): Directory to save the inference results w/o
visualization. If left as empty, no file will be saved.
Defaults to ''.
......@@ -273,35 +262,56 @@ class Base3DInferencer(BaseInferencer):
json-serializable dict containing only basic data elements such
as strings and numbers.
"""
if no_save_pred is True:
pred_out_dir = ''
result_dict = {}
results = preds
if not return_datasample:
results = []
for pred in preds:
result = self.pred2dict(pred)
result = self.pred2dict(pred, pred_out_dir)
results.append(result)
elif pred_out_dir != '':
print_log(
'Saving datasamples is currently not supported when '
'`return_datasample` is set to True, so the prediction '
'results are not saved!',
level=logging.WARNING)
# Add img to the results after printing and dumping
result_dict['predictions'] = results
if print_result:
print(result_dict)
if pred_out_file != '':
mmengine.dump(result_dict, pred_out_file)
result_dict['visualization'] = visualization
return result_dict
def pred2dict(self, data_sample: InstanceData) -> Dict:
# TODO: The data format and fields saved in json need further discussion.
# Maybe should include model name, timestamp, filename, image info etc.
def pred2dict(self,
data_sample: Det3DDataSample,
pred_out_dir: str = '') -> Dict:
"""Extract elements necessary to represent a prediction into a
dictionary.
It's better to contain only basic data elements such as strings and
numbers in order to guarantee it's json-serializable.
Args:
data_sample (:obj:`Det3DDataSample`): Predictions of the model.
pred_out_dir (str): Dir to save the inference results w/o
visualization. If left as empty, no file will be saved.
Defaults to ''.
Returns:
dict: Prediction results.
"""
result = {}
if 'pred_instances_3d' in data_sample:
pred_instances_3d = data_sample.pred_instances_3d.numpy()
result = {
'bboxes_3d': pred_instances_3d.bboxes_3d.tensor.cpu().tolist(),
'labels_3d': pred_instances_3d.labels_3d.tolist(),
'scores_3d': pred_instances_3d.scores_3d.tolist()
'scores_3d': pred_instances_3d.scores_3d.tolist(),
'bboxes_3d': pred_instances_3d.bboxes_3d.tensor.cpu().tolist()
}
if 'pred_pts_seg' in data_sample:
......@@ -309,4 +319,28 @@ class Base3DInferencer(BaseInferencer):
result['pts_semantic_mask'] = \
pred_pts_seg.pts_semantic_mask.tolist()
if data_sample.box_mode_3d == Box3DMode.LIDAR:
result['box_type_3d'] = 'LiDAR'
elif data_sample.box_mode_3d == Box3DMode.CAM:
result['box_type_3d'] = 'Camera'
elif data_sample.box_mode_3d == Box3DMode.DEPTH:
result['box_type_3d'] = 'Depth'
if pred_out_dir != '':
if 'lidar_path' in data_sample:
lidar_path = osp.basename(data_sample.lidar_path)
lidar_path = osp.splitext(lidar_path)[0]
out_json_path = osp.join(pred_out_dir, 'preds',
lidar_path + '.json')
elif 'img_path' in data_sample:
img_path = osp.basename(data_sample.img_path)
img_path = osp.splitext(img_path)[0]
out_json_path = osp.join(pred_out_dir, 'preds',
img_path + '.json')
else:
out_json_path = osp.join(
pred_out_dir, 'preds',
f'{str(self.num_predicted_frames).zfill(8)}.json')
dump(result, out_json_path)
return result
......@@ -4,11 +4,16 @@ from typing import Dict, List, Optional, Sequence, Union
import mmengine
import numpy as np
import torch
from mmengine.dataset import Compose
from mmengine.fileio import (get_file_backend, isdir, join_path,
list_dir_or_file)
from mmengine.infer.infer import ModelType
from mmengine.structures import InstanceData
from mmdet3d.registry import INFERENCERS
from mmdet3d.structures import (CameraInstance3DBoxes, DepthInstance3DBoxes,
Det3DDataSample, LiDARInstance3DBoxes)
from mmdet3d.utils import ConfigType
from .base_3d_inferencer import Base3DInferencer
......@@ -43,16 +48,6 @@ class LidarDet3DInferencer(Base3DInferencer):
priority is palette -> config -> checkpoint. Defaults to 'none'.
"""
preprocess_kwargs: set = set()
forward_kwargs: set = set()
visualize_kwargs: set = {
'return_vis', 'show', 'wait_time', 'draw_pred', 'pred_score_thr',
'img_out_dir'
}
postprocess_kwargs: set = {
'print_result', 'pred_out_file', 'return_datasample'
}
def __init__(self,
model: Union[ModelType, str, None] = None,
weights: Optional[str] = None,
......@@ -69,7 +64,7 @@ class LidarDet3DInferencer(Base3DInferencer):
scope=scope,
palette=palette)
def _inputs_to_list(self, inputs: Union[dict, list]) -> list:
def _inputs_to_list(self, inputs: Union[dict, list], **kwargs) -> list:
"""Preprocess the inputs to a list.
Preprocess inputs to a list according to its type:
......@@ -87,7 +82,22 @@ class LidarDet3DInferencer(Base3DInferencer):
Returns:
list: List of input for the :meth:`preprocess`.
"""
return super()._inputs_to_list(inputs, modality_key='points')
if isinstance(inputs, dict) and isinstance(inputs['points'], str):
pcd = inputs['points']
backend = get_file_backend(pcd)
if hasattr(backend, 'isdir') and isdir(pcd):
# Backends like HttpsBackend do not implement `isdir`, so
# only those backends that implement `isdir` could accept
# the inputs as a directory
filename_list = list_dir_or_file(pcd, list_dir=False)
inputs = [{
'points': join_path(pcd, filename)
} for filename in filename_list]
if not isinstance(inputs, (list, tuple)):
inputs = [inputs]
return list(inputs)
def _init_pipeline(self, cfg: ConfigType) -> Compose:
"""Initialize the test pipeline."""
......@@ -113,9 +123,10 @@ class LidarDet3DInferencer(Base3DInferencer):
preds: PredType,
return_vis: bool = False,
show: bool = False,
wait_time: int = 0,
wait_time: int = -1,
draw_pred: bool = True,
pred_score_thr: float = 0.3,
no_save_vis: bool = False,
img_out_dir: str = '') -> Union[List[np.ndarray], None]:
"""Visualize predictions.
......@@ -126,11 +137,13 @@ class LidarDet3DInferencer(Base3DInferencer):
Defaults to False.
show (bool): Whether to display the image in a popup window.
Defaults to False.
wait_time (float): The interval of show (s). Defaults to 0.
wait_time (float): The interval of show (s). Defaults to -1.
draw_pred (bool): Whether to draw predicted bounding boxes.
Defaults to True.
pred_score_thr (float): Minimum score of bboxes to draw.
Defaults to 0.3.
no_save_vis (bool): Whether to force not to save prediction
vis results. Defaults to False.
img_out_dir (str): Output directory of visualization results.
If left as empty, no file will be saved. Defaults to ''.
......@@ -138,8 +151,10 @@ class LidarDet3DInferencer(Base3DInferencer):
List[np.ndarray] or None: Returns visualization results only if
applicable.
"""
if self.visualizer is None or (not show and img_out_dir == ''
and not return_vis):
if no_save_vis is True:
img_out_dir = ''
if not show and img_out_dir == '' and not return_vis:
return None
if getattr(self, 'visualizer') is None:
......@@ -160,13 +175,16 @@ class LidarDet3DInferencer(Base3DInferencer):
elif isinstance(single_input, np.ndarray):
points = single_input.copy()
pc_num = str(self.num_visualized_frames).zfill(8)
pc_name = f'pc_{pc_num}.png'
pc_name = f'{pc_num}.png'
else:
raise ValueError('Unsupported input type: '
f'{type(single_input)}')
o3d_save_path = osp.join(img_out_dir, pc_name) \
if img_out_dir != '' else None
if img_out_dir != '' and show:
o3d_save_path = osp.join(img_out_dir, 'vis_lidar', pc_name)
mmengine.mkdir_or_exist(osp.dirname(o3d_save_path))
else:
o3d_save_path = None
data_input = dict(points=points)
self.visualizer.add_datasample(
......@@ -185,3 +203,40 @@ class LidarDet3DInferencer(Base3DInferencer):
self.num_visualized_frames += 1
return results
def visualize_preds_fromfile(self, inputs: InputsType, preds: PredType,
**kwargs) -> Union[List[np.ndarray], None]:
"""Visualize predictions from `*.json` files.
Args:
inputs (InputsType): Inputs for the inferencer.
preds (PredType): Paths of the saved `*.json` prediction files.
Returns:
List[np.ndarray] or None: Returns visualization results only if
applicable.
"""
data_samples = []
for pred in preds:
pred = mmengine.load(pred)
data_sample = Det3DDataSample()
data_sample.pred_instances_3d = InstanceData()
data_sample.pred_instances_3d.labels_3d = torch.tensor(
pred['labels_3d'])
data_sample.pred_instances_3d.scores_3d = torch.tensor(
pred['scores_3d'])
if pred['box_type_3d'] == 'LiDAR':
data_sample.pred_instances_3d.bboxes_3d = \
LiDARInstance3DBoxes(pred['bboxes_3d'])
elif pred['box_type_3d'] == 'Camera':
data_sample.pred_instances_3d.bboxes_3d = \
CameraInstance3DBoxes(pred['bboxes_3d'])
elif pred['box_type_3d'] == 'Depth':
data_sample.pred_instances_3d.bboxes_3d = \
DepthInstance3DBoxes(pred['bboxes_3d'])
else:
raise ValueError('Unsupported box type: '
f'{pred["box_type_3d"]}')
data_samples.append(data_sample)
return self.visualize(inputs=inputs, preds=data_samples, **kwargs)
......@@ -5,6 +5,8 @@ from typing import Dict, List, Optional, Sequence, Union
import mmengine
import numpy as np
from mmengine.dataset import Compose
from mmengine.fileio import (get_file_backend, isdir, join_path,
list_dir_or_file)
from mmengine.infer.infer import ModelType
from mmengine.structures import InstanceData
......@@ -43,16 +45,6 @@ class LidarSeg3DInferencer(Base3DInferencer):
priority is palette -> config -> checkpoint. Defaults to 'none'.
"""
preprocess_kwargs: set = set()
forward_kwargs: set = set()
visualize_kwargs: set = {
'return_vis', 'show', 'wait_time', 'draw_pred', 'pred_score_thr',
'img_out_dir'
}
postprocess_kwargs: set = {
'print_result', 'pred_out_file', 'return_datasample'
}
def __init__(self,
model: Union[ModelType, str, None] = None,
weights: Optional[str] = None,
......@@ -69,7 +61,7 @@ class LidarSeg3DInferencer(Base3DInferencer):
scope=scope,
palette=palette)
def _inputs_to_list(self, inputs: Union[dict, list]) -> list:
def _inputs_to_list(self, inputs: Union[dict, list], **kwargs) -> list:
"""Preprocess the inputs to a list.
Preprocess inputs to a list according to its type:
......@@ -87,7 +79,22 @@ class LidarSeg3DInferencer(Base3DInferencer):
Returns:
list: List of input for the :meth:`preprocess`.
"""
return super()._inputs_to_list(inputs, modality_key='points')
if isinstance(inputs, dict) and isinstance(inputs['points'], str):
pcd = inputs['points']
backend = get_file_backend(pcd)
if hasattr(backend, 'isdir') and isdir(pcd):
# Backends like HttpsBackend do not implement `isdir`, so
# only those backends that implement `isdir` could accept
# the inputs as a directory
filename_list = list_dir_or_file(pcd, list_dir=False)
inputs = [{
'points': join_path(pcd, filename)
} for filename in filename_list]
if not isinstance(inputs, (list, tuple)):
inputs = [inputs]
return list(inputs)
def _init_pipeline(self, cfg: ConfigType) -> Compose:
"""Initialize the test pipeline."""
......@@ -124,6 +131,7 @@ class LidarSeg3DInferencer(Base3DInferencer):
wait_time: int = 0,
draw_pred: bool = True,
pred_score_thr: float = 0.3,
no_save_vis: bool = False,
img_out_dir: str = '') -> Union[List[np.ndarray], None]:
"""Visualize predictions.
......@@ -139,6 +147,7 @@ class LidarSeg3DInferencer(Base3DInferencer):
Defaults to True.
pred_score_thr (float): Minimum score of bboxes to draw.
Defaults to 0.3.
no_save_vis (bool): Whether to force not to save the visualization
results. Defaults to False.
img_out_dir (str): Output directory of visualization results.
If left as empty, no file will be saved. Defaults to ''.
......@@ -146,8 +155,10 @@ class LidarSeg3DInferencer(Base3DInferencer):
List[np.ndarray] or None: Returns visualization results only if
applicable.
"""
if self.visualizer is None or (not show and img_out_dir == ''
and not return_vis):
if no_save_vis is True:
img_out_dir = ''
if not show and img_out_dir == '' and not return_vis:
return None
if getattr(self, 'visualizer') is None:
......@@ -168,13 +179,16 @@ class LidarSeg3DInferencer(Base3DInferencer):
elif isinstance(single_input, np.ndarray):
points = single_input.copy()
pc_num = str(self.num_visualized_frames).zfill(8)
pc_name = f'pc_{pc_num}.png'
pc_name = f'{pc_num}.png'
else:
raise ValueError('Unsupported input type: '
f'{type(single_input)}')
o3d_save_path = osp.join(img_out_dir, pc_name) \
if img_out_dir != '' else None
if img_out_dir != '' and show:
o3d_save_path = osp.join(img_out_dir, 'vis_lidar', pc_name)
mmengine.mkdir_or_exist(osp.dirname(o3d_save_path))
else:
o3d_save_path = None
data_input = dict(points=points)
self.visualizer.add_datasample(
......
......@@ -6,6 +6,8 @@ import mmcv
import mmengine
import numpy as np
from mmengine.dataset import Compose
from mmengine.fileio import (get_file_backend, isdir, join_path,
list_dir_or_file)
from mmengine.infer.infer import ModelType
from mmengine.structures import InstanceData
......@@ -44,16 +46,6 @@ class MonoDet3DInferencer(Base3DInferencer):
priority is palette -> config -> checkpoint. Defaults to 'none'.
"""
preprocess_kwargs: set = set()
forward_kwargs: set = set()
visualize_kwargs: set = {
'return_vis', 'show', 'wait_time', 'draw_pred', 'pred_score_thr',
'img_out_dir'
}
postprocess_kwargs: set = {
'print_result', 'pred_out_file', 'return_datasample'
}
def __init__(self,
model: Union[ModelType, str, None] = None,
weights: Optional[str] = None,
......@@ -70,7 +62,10 @@ class MonoDet3DInferencer(Base3DInferencer):
scope=scope,
palette=palette)
def _inputs_to_list(self, inputs: Union[dict, list]) -> list:
def _inputs_to_list(self,
inputs: Union[dict, list],
cam_type='CAM2',
**kwargs) -> list:
"""Preprocess the inputs to a list.
Preprocess inputs to a list according to its type:
......@@ -88,7 +83,79 @@ class MonoDet3DInferencer(Base3DInferencer):
Returns:
list: List of input for the :meth:`preprocess`.
"""
return super()._inputs_to_list(inputs, modality_key='img')
if isinstance(inputs, dict):
assert 'infos' in inputs
infos = inputs.pop('infos')
if isinstance(inputs['img'], str):
img = inputs['img']
backend = get_file_backend(img)
if hasattr(backend, 'isdir') and isdir(img):
# Backends like HttpsBackend do not implement `isdir`, so
# only those backends that implement `isdir` could accept
# the inputs as a directory
filename_list = list_dir_or_file(img, list_dir=False)
inputs = [{
'img': join_path(img, filename)
} for filename in filename_list]
if not isinstance(inputs, (list, tuple)):
inputs = [inputs]
# get cam2img, lidar2cam and lidar2img from infos
info_list = mmengine.load(infos)['data_list']
assert len(info_list) == len(inputs)
for index, input in enumerate(inputs):
data_info = info_list[index]
img_path = data_info['images'][cam_type]['img_path']
if isinstance(input['img'], str) and \
osp.basename(img_path) != osp.basename(input['img']):
raise ValueError(
f'the info entry of {img_path} does not match '
f"the input image {input['img']}.")
cam2img = np.asarray(
data_info['images'][cam_type]['cam2img'], dtype=np.float32)
lidar2cam = np.asarray(
data_info['images'][cam_type]['lidar2cam'],
dtype=np.float32)
if 'lidar2img' in data_info['images'][cam_type]:
lidar2img = np.asarray(
data_info['images'][cam_type]['lidar2img'],
dtype=np.float32)
else:
lidar2img = cam2img @ lidar2cam
input['cam2img'] = cam2img
input['lidar2cam'] = lidar2cam
input['lidar2img'] = lidar2img
elif isinstance(inputs, (list, tuple)):
# get cam2img, lidar2cam and lidar2img from infos
for input in inputs:
assert 'infos' in input
infos = input.pop('infos')
info_list = mmengine.load(infos)['data_list']
assert len(info_list) == 1, 'Only support single sample info ' \
'in `.pkl` when inputs is a list.'
data_info = info_list[0]
img_path = data_info['images'][cam_type]['img_path']
if isinstance(input['img'], str) and \
osp.basename(img_path) != osp.basename(input['img']):
raise ValueError(
f'the info entry of {img_path} does not match '
f"the input image {input['img']}.")
cam2img = np.asarray(
data_info['images'][cam_type]['cam2img'], dtype=np.float32)
lidar2cam = np.asarray(
data_info['images'][cam_type]['lidar2cam'],
dtype=np.float32)
if 'lidar2img' in data_info['images'][cam_type]:
lidar2img = np.asarray(
data_info['images'][cam_type]['lidar2img'],
dtype=np.float32)
else:
lidar2img = cam2img @ lidar2cam
input['cam2img'] = cam2img
input['lidar2cam'] = lidar2cam
input['lidar2img'] = lidar2img
return list(inputs)
def _init_pipeline(self, cfg: ConfigType) -> Compose:
"""Initialize the test pipeline."""
......@@ -110,7 +177,9 @@ class MonoDet3DInferencer(Base3DInferencer):
wait_time: int = 0,
draw_pred: bool = True,
pred_score_thr: float = 0.3,
img_out_dir: str = '') -> Union[List[np.ndarray], None]:
no_save_vis: bool = False,
img_out_dir: str = '',
cam_type_dir: str = 'CAM2') -> Union[List[np.ndarray], None]:
"""Visualize predictions.
Args:
......@@ -125,15 +194,19 @@ class MonoDet3DInferencer(Base3DInferencer):
Defaults to True.
pred_score_thr (float): Minimum score of bboxes to draw.
Defaults to 0.3.
no_save_vis (bool): Whether to force not to save the visualization
results. Defaults to False.
img_out_dir (str): Output directory of visualization results.
If left as empty, no file will be saved. Defaults to ''.
cam_type_dir (str): Camera type directory. Defaults to 'CAM2'.
Returns:
List[np.ndarray] or None: Returns visualization results only if
applicable.
"""
if self.visualizer is None or (not show and img_out_dir == ''
and not return_vis):
if no_save_vis is True:
img_out_dir = ''
if not show and img_out_dir == '' and not return_vis:
return None
if getattr(self, 'visualizer') is None:
......@@ -156,8 +229,8 @@ class MonoDet3DInferencer(Base3DInferencer):
raise ValueError('Unsupported input type: '
f"{type(single_input['img'])}")
out_file = osp.join(img_out_dir, img_name) if img_out_dir != '' \
else None
out_file = osp.join(img_out_dir, 'vis_camera', cam_type_dir,
img_name) if img_out_dir != '' else None
data_input = dict(img=img)
self.visualizer.add_datasample(
......
......@@ -7,6 +7,8 @@ import mmcv
import mmengine
import numpy as np
from mmengine.dataset import Compose
from mmengine.fileio import (get_file_backend, isdir, join_path,
list_dir_or_file)
from mmengine.infer.infer import ModelType
from mmengine.structures import InstanceData
......@@ -44,16 +46,6 @@ class MultiModalityDet3DInferencer(Base3DInferencer):
palette (str): The palette of visualization. Defaults to 'none'.
"""
preprocess_kwargs: set = set()
forward_kwargs: set = set()
visualize_kwargs: set = {
'return_vis', 'show', 'wait_time', 'draw_pred', 'pred_score_thr',
'img_out_dir'
}
postprocess_kwargs: set = {
'print_result', 'pred_out_file', 'return_datasample'
}
def __init__(self,
model: Union[ModelType, str, None] = None,
weights: Optional[str] = None,
......@@ -70,7 +62,10 @@ class MultiModalityDet3DInferencer(Base3DInferencer):
scope=scope,
palette=palette)
def _inputs_to_list(self, inputs: Union[dict, list]) -> list:
def _inputs_to_list(self,
inputs: Union[dict, list],
cam_type: str = 'CAM2',
**kwargs) -> list:
"""Preprocess the inputs to a list.
Preprocess inputs to a list according to its type:
......@@ -88,7 +83,86 @@ class MultiModalityDet3DInferencer(Base3DInferencer):
Returns:
list: List of input for the :meth:`preprocess`.
"""
return super()._inputs_to_list(inputs, modality_key=['points', 'img'])
if isinstance(inputs, dict):
assert 'infos' in inputs
infos = inputs.pop('infos')
if isinstance(inputs['img'], str):
img, pcd = inputs['img'], inputs['points']
backend = get_file_backend(img)
if hasattr(backend, 'isdir') and isdir(img) and isdir(pcd):
# Backends like HttpsBackend do not implement `isdir`, so
# only those backends that implement `isdir` could accept
# the inputs as a directory
img_filename_list = list_dir_or_file(
img, list_dir=False, suffix=['.png', '.jpg'])
pcd_filename_list = list_dir_or_file(
pcd, list_dir=False, suffix='.bin')
assert len(img_filename_list) == len(pcd_filename_list)
inputs = [{
'img': join_path(img, img_filename),
'points': join_path(pcd, pcd_filename)
} for pcd_filename, img_filename in zip(
pcd_filename_list, img_filename_list)]
if not isinstance(inputs, (list, tuple)):
inputs = [inputs]
# get cam2img, lidar2cam and lidar2img from infos
info_list = mmengine.load(infos)['data_list']
assert len(info_list) == len(inputs)
for index, input in enumerate(inputs):
data_info = info_list[index]
img_path = data_info['images'][cam_type]['img_path']
if isinstance(input['img'], str) and \
osp.basename(img_path) != osp.basename(input['img']):
raise ValueError(
f'the info entry of {img_path} does not match '
f"the input image {input['img']}.")
cam2img = np.asarray(
data_info['images'][cam_type]['cam2img'], dtype=np.float32)
lidar2cam = np.asarray(
data_info['images'][cam_type]['lidar2cam'],
dtype=np.float32)
if 'lidar2img' in data_info['images'][cam_type]:
lidar2img = np.asarray(
data_info['images'][cam_type]['lidar2img'],
dtype=np.float32)
else:
lidar2img = cam2img @ lidar2cam
input['cam2img'] = cam2img
input['lidar2cam'] = lidar2cam
input['lidar2img'] = lidar2img
elif isinstance(inputs, (list, tuple)):
# get cam2img, lidar2cam and lidar2img from infos
for input in inputs:
assert 'infos' in input
infos = input.pop('infos')
info_list = mmengine.load(infos)['data_list']
assert len(info_list) == 1, 'Only support single sample ' \
'info in `.pkl` when input is a list.'
data_info = info_list[0]
img_path = data_info['images'][cam_type]['img_path']
if isinstance(input['img'], str) and \
osp.basename(img_path) != osp.basename(input['img']):
raise ValueError(
f'the info entry of {img_path} does not match '
f"the input image {input['img']}.")
cam2img = np.asarray(
data_info['images'][cam_type]['cam2img'], dtype=np.float32)
lidar2cam = np.asarray(
data_info['images'][cam_type]['lidar2cam'],
dtype=np.float32)
if 'lidar2img' in data_info['images'][cam_type]:
lidar2img = np.asarray(
data_info['images'][cam_type]['lidar2img'],
dtype=np.float32)
else:
lidar2img = cam2img @ lidar2cam
input['cam2img'] = cam2img
input['lidar2cam'] = lidar2cam
input['lidar2img'] = lidar2img
return list(inputs)
def _init_pipeline(self, cfg: ConfigType) -> Compose:
"""Initialize the test pipeline."""
......@@ -144,7 +218,9 @@ class MultiModalityDet3DInferencer(Base3DInferencer):
wait_time: int = 0,
draw_pred: bool = True,
pred_score_thr: float = 0.3,
img_out_dir: str = '') -> Union[List[np.ndarray], None]:
no_save_vis: bool = False,
img_out_dir: str = '',
cam_type_dir: str = 'CAM2') -> Union[List[np.ndarray], None]:
"""Visualize predictions.
Args:
......@@ -157,6 +233,7 @@ class MultiModalityDet3DInferencer(Base3DInferencer):
wait_time (float): The interval of show (s). Defaults to 0.
draw_pred (bool): Whether to draw predicted bounding boxes.
Defaults to True.
no_save_vis (bool): Whether to force not to save the visualization
results. Defaults to False.
pred_score_thr (float): Minimum score of bboxes to draw.
Defaults to 0.3.
img_out_dir (str): Output directory of visualization results.
......@@ -166,8 +243,10 @@ class MultiModalityDet3DInferencer(Base3DInferencer):
List[np.ndarray] or None: Returns visualization results only if
applicable.
"""
if self.visualizer is None or (not show and img_out_dir == ''
and not return_vis):
if no_save_vis is True:
img_out_dir = ''
if not show and img_out_dir == '' and not return_vis:
return None
if getattr(self, 'visualizer') is None:
......@@ -188,13 +267,16 @@ class MultiModalityDet3DInferencer(Base3DInferencer):
elif isinstance(points_input, np.ndarray):
points = points_input.copy()
pc_num = str(self.num_visualized_frames).zfill(8)
pc_name = f'pc_{pc_num}.png'
pc_name = f'{pc_num}.png'
else:
raise ValueError('Unsupported input type: '
f'{type(points_input)}')
o3d_save_path = osp.join(img_out_dir, pc_name) \
if img_out_dir != '' else None
if img_out_dir != '' and show:
o3d_save_path = osp.join(img_out_dir, 'vis_lidar', pc_name)
mmengine.mkdir_or_exist(osp.dirname(o3d_save_path))
else:
o3d_save_path = None
img_input = single_input['img']
if isinstance(single_input['img'], str):
......@@ -210,8 +292,8 @@ class MultiModalityDet3DInferencer(Base3DInferencer):
raise ValueError('Unsupported input type: '
f'{type(img_input)}')
out_file = osp.join(img_out_dir, img_name) if img_out_dir != '' \
else None
out_file = osp.join(img_out_dir, 'vis_camera', cam_type_dir,
img_name) if img_out_dir != '' else None
data_input = dict(points=points, img=img)
self.visualizer.add_datasample(
......
......@@ -1153,7 +1153,6 @@ class MonoDet3DInferencerLoader(BaseTransform):
Added keys:
- img
- cam2img
- box_type_3d
- box_mode_3d
......@@ -1176,32 +1175,19 @@ class MonoDet3DInferencerLoader(BaseTransform):
dict: The dict contains loaded image and meta information.
"""
box_type_3d, box_mode_3d = get_box_type('camera')
assert 'calib' in single_input and 'img' in single_input, \
"key 'calib' and 'img' must be in input dict"
if isinstance(single_input['calib'], str):
calib_path = single_input['calib']
with open(calib_path, 'r') as f:
lines = f.readlines()
cam2img = np.array([
float(info) for info in lines[0].split(' ')[0:16]
]).reshape([4, 4])
elif isinstance(single_input['calib'], np.ndarray):
cam2img = single_input['calib']
else:
raise ValueError('Unsupported input calib type: '
f"{type(single_input['calib'])}")
if isinstance(single_input['img'], str):
inputs = dict(
images=dict(
CAM_FRONT=dict(
img_path=single_input['img'], cam2img=cam2img)),
img_path=single_input['img'],
cam2img=single_input['cam2img'])),
box_mode_3d=box_mode_3d,
box_type_3d=box_type_3d)
elif isinstance(single_input['img'], np.ndarray):
inputs = dict(
img=single_input['img'],
cam2img=cam2img,
cam2img=single_input['cam2img'],
box_type_3d=box_type_3d,
box_mode_3d=box_mode_3d)
else:
......@@ -1252,9 +1238,9 @@ class MultiModalityDet3DInferencerLoader(BaseTransform):
dict: The dict contains loaded image, point cloud and meta
information.
"""
assert 'points' in single_input and 'img' in single_input and \
'calib' in single_input, "key 'points', 'img' and 'calib' must be "
f'in input dict, but got {single_input}'
assert 'points' in single_input and 'img' in single_input, \
"key 'points' and 'img' must be in input dict, " \
f'but got {single_input}'
if isinstance(single_input['points'], str):
inputs = dict(
lidar_points=dict(lidar_path=single_input['points']),
......@@ -1283,36 +1269,21 @@ class MultiModalityDet3DInferencerLoader(BaseTransform):
multi_modality_inputs = points_inputs
box_type_3d, box_mode_3d = get_box_type('lidar')
if isinstance(single_input['calib'], str):
calib = mmengine.load(single_input['calib'])
elif isinstance(single_input['calib'], dict):
calib = single_input['calib']
else:
raise ValueError('Unsupported input calib type: '
f"{type(single_input['calib'])}")
cam2img = np.asarray(calib['cam2img'], dtype=np.float32)
lidar2cam = np.asarray(calib['lidar2cam'], dtype=np.float32)
if 'lidar2cam' in calib:
lidar2img = np.asarray(calib['lidar2img'], dtype=np.float32)
else:
lidar2img = cam2img @ lidar2cam
if isinstance(single_input['img'], str):
inputs = dict(
img_path=single_input['img'],
cam2img=cam2img,
lidar2img=lidar2img,
lidar2cam=lidar2cam,
cam2img=single_input['cam2img'],
lidar2img=single_input['lidar2img'],
lidar2cam=single_input['lidar2cam'],
box_mode_3d=box_mode_3d,
box_type_3d=box_type_3d)
elif isinstance(single_input['img'], np.ndarray):
inputs = dict(
img=single_input['img'],
cam2img=cam2img,
lidar2img=lidar2img,
lidar2cam=lidar2cam,
cam2img=single_input['cam2img'],
lidar2img=single_input['lidar2img'],
lidar2cam=single_input['lidar2cam'],
box_type_3d=box_type_3d,
box_mode_3d=box_mode_3d)
else:
......
# Copyright (c) OpenMMLab. All rights reserved.
import copy
import math
import os
import sys
import time
from typing import List, Optional, Sequence, Tuple, Union
......@@ -155,7 +156,7 @@ class Det3DLocalVisualizer(DetLocalVisualizer):
if hasattr(self, 'pcd'):
del self.pcd
def _initialize_o3d_vis(self) -> Visualizer:
def _initialize_o3d_vis(self, show=True) -> Visualizer:
"""Initialize open3d vis according to frame_cfg.
Args:
@@ -176,6 +177,7 @@ class Det3DLocalVisualizer(DetLocalVisualizer):
         o3d_vis.register_key_action_callback(glfw_key_space,
                                              self.space_action_callback)
         o3d_vis.register_key_callback(glfw_key_right, self.right_callback)
-        o3d_vis.create_window()
-        self.view_control = o3d_vis.get_view_control()
+        if os.environ.get('DISPLAY', None) is not None and show:
+            o3d_vis.create_window()
+            self.view_control = o3d_vis.get_view_control()
         return o3d_vis
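The guard added here is worth spelling out: a window is created only when an X display exists and the caller requested one, which is what lets the inferencer run on headless servers. A standalone sketch of the same check (the function name is illustrative, not part of the library):

```python
import os


def can_open_window(show: bool = True) -> bool:
    """Mirror the guard above: only open an Open3D window when a display
    is available (DISPLAY is set) and the caller asked to show results."""
    return show and os.environ.get('DISPLAY', None) is not None
```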
@@ -859,6 +861,9 @@ class Det3DLocalVisualizer(DetLocalVisualizer):
                     self.view_port)
             self.flag_exit = not self.o3d_vis.poll_events()
             self.o3d_vis.update_renderer()
+            # if not hasattr(self, 'view_control'):
+            #     self.o3d_vis.create_window()
+            #     self.view_control = self.o3d_vis.get_view_control()
             self.view_port = \
                 self.view_control.convert_to_pinhole_camera_parameters()  # noqa: E501
             if wait_time != -1:
@@ -976,7 +981,7 @@ class Det3DLocalVisualizer(DetLocalVisualizer):
         # For object detection datasets, no palette is saved
         palette = self.dataset_meta.get('palette', None)
         ignore_index = self.dataset_meta.get('ignore_index', None)
-        if ignore_index is not None and 'gt_pts_seg' in data_sample and vis_task == 'lidar_seg':  # noqa: E501
+        if vis_task == 'lidar_seg' and ignore_index is not None and 'pts_semantic_mask' in data_sample.gt_pts_seg:  # noqa: E501
             keep_index = data_sample.gt_pts_seg.pts_semantic_mask != ignore_index  # noqa: E501
         else:
             keep_index = None
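The reordered condition short-circuits on the vis task first and then checks the ground-truth mask; the resulting `keep_index` is just a boolean filter over per-point labels. A tiny sketch with made-up values:

```python
import numpy as np

pts_semantic_mask = np.array([0, 2, 255, 1])  # toy per-point labels
ignore_index = 255                            # a common ignore label
keep_index = pts_semantic_mask != ignore_index
print(keep_index)  # [ True  True False  True] -> the ignored point is dropped
```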
@@ -986,6 +991,12 @@ class Det3DLocalVisualizer(DetLocalVisualizer):
         gt_img_data = None
         pred_img_data = None

+        if not hasattr(self, 'o3d_vis') and vis_task in [
+                'multi-view_det', 'lidar_det', 'lidar_seg',
+                'multi-modality_det'
+        ]:
+            self.o3d_vis = self._initialize_o3d_vis(show=show)
+
         if draw_gt and data_sample is not None:
             if 'gt_instances_3d' in data_sample:
                 gt_data_3d = self._draw_instances_3d(
@@ -1083,6 +1094,7 @@ class Det3DLocalVisualizer(DetLocalVisualizer):
             if drawn_img_3d is not None:
                 mmcv.imwrite(drawn_img_3d[..., ::-1], out_file)
             if drawn_img is not None:
-                mmcv.imwrite(drawn_img[..., ::-1], out_file)
+                mmcv.imwrite(drawn_img[..., ::-1],
+                             out_file[:-4] + '_2d' + out_file[-4:])
         else:
             self.add_image(name, drawn_img_3d, step)
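The slicing splices a `_2d` tag in front of the extension so the 2D rendering no longer overwrites the 3D one (this assumes a three-character extension such as `.png`):

```python
out_file = 'outputs/000008.png'
out_file_2d = out_file[:-4] + '_2d' + out_file[-4:]
print(out_file_2d)  # outputs/000008_2d.png
```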
@@ -34,7 +34,7 @@ python projects/BEVFusion/setup.py develop
Run a demo on NuScenes data using [BEVFusion model](https://drive.google.com/file/d/1QkvbYDk4G2d6SZoeJqish13qSyXA4lp3/view?usp=share_link):

```shell
-python demo/multi_modality_demo.py demo/data/nuscenes/n015-2018-07-24-11-22-45+0800__LIDAR_TOP__1532402927647951.pcd.bin demo/data/nuscenes/ demo/data/nuscenes/n015-2018-07-24-11-22-45+0800.pkl projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py ${CHECKPOINT_FILE} --cam-type all --score-thr 0.2 --show
+python projects/BEVFusion/demo/multi_modality_demo.py demo/data/nuscenes/n015-2018-07-24-11-22-45+0800__LIDAR_TOP__1532402927647951.pcd.bin demo/data/nuscenes/ demo/data/nuscenes/n015-2018-07-24-11-22-45+0800.pkl projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py ${CHECKPOINT_FILE} --cam-type all --score-thr 0.2 --show
```

### Training commands
......
# Copyright (c) OpenMMLab. All rights reserved.
from argparse import ArgumentParser

import mmcv

from mmdet3d.apis import inference_multi_modality_detector, init_model
from mmdet3d.registry import VISUALIZERS


def parse_args():
    parser = ArgumentParser()
    parser.add_argument('pcd', help='Point cloud file')
    parser.add_argument('img', help='Image file')
    parser.add_argument('ann', help='Annotation file')
    parser.add_argument('config', help='Config file')
    parser.add_argument('checkpoint', help='Checkpoint file')
    parser.add_argument(
        '--device', default='cuda:0', help='Device used for inference')
    parser.add_argument(
        '--cam-type',
        type=str,
        default='CAM_FRONT',
        help='choose camera type to inference')
    parser.add_argument(
        '--score-thr', type=float, default=0.0, help='bbox score threshold')
    parser.add_argument(
        '--out-dir', type=str, default='demo', help='dir to save results')
    parser.add_argument(
        '--show',
        action='store_true',
        help='show online visualization results')
    parser.add_argument(
        '--snapshot',
        action='store_true',
        help='whether to save online visualization results')
    args = parser.parse_args()
    return args


def main(args):
    # build the model from a config file and a checkpoint file
    model = init_model(args.config, args.checkpoint, device=args.device)

    # init visualizer
    visualizer = VISUALIZERS.build(model.cfg.visualizer)
    visualizer.dataset_meta = model.dataset_meta

    # test a single image and point cloud sample
    result, data = inference_multi_modality_detector(model, args.pcd, args.img,
                                                     args.ann, args.cam_type)
    points = data['inputs']['points']
    if isinstance(result.img_path, list):
        img = []
        for img_path in result.img_path:
            single_img = mmcv.imread(img_path)
            single_img = mmcv.imconvert(single_img, 'bgr', 'rgb')
            img.append(single_img)
    else:
        img = mmcv.imread(result.img_path)
        img = mmcv.imconvert(img, 'bgr', 'rgb')
    data_input = dict(points=points, img=img)

    # show the results
    visualizer.add_datasample(
        'result',
        data_input,
        data_sample=result,
        draw_gt=False,
        show=args.show,
        wait_time=-1,
        out_file=args.out_dir,
        pred_score_thr=args.score_thr,
        vis_task='multi-modality_det')


if __name__ == '__main__':
    args = parse_args()
    main(args)
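For reference, the script is launched like the README command earlier in this commit; the variables below are placeholders for concrete files (with `--cam-type all`, the image argument is a directory of camera images):

```shell
python projects/BEVFusion/demo/multi_modality_demo.py \
    ${PCD_FILE} ${IMG_FILE} ${ANN_FILE} ${CONFIG_FILE} ${CHECKPOINT_FILE} \
    --cam-type all --score-thr 0.2 --show
```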
@@ -89,7 +89,7 @@ class TestLidarDet3DInferencer(TestCase):
         inputs = dict(points='tests/data/kitti/training/velodyne/000000.bin')
         # img_out_dir
         with tempfile.TemporaryDirectory() as tmp_dir:
-            self.inferencer(inputs, img_out_dir=tmp_dir)
+            self.inferencer(inputs, out_dir=tmp_dir)
         # TODO: For LiDAR-based detection, the saved image only exists when
         # show=True.
         # self.assertTrue(osp.exists(osp.join(tmp_dir, '000000.png')))
@@ -102,11 +102,9 @@ class TestLidarDet3DInferencer(TestCase):
         res = self.inferencer(inputs, return_datasamples=True)
         self.assertTrue(is_list_of(res['predictions'], Det3DDataSample))

-        # pred_out_file
+        # pred_out_dir
         with tempfile.TemporaryDirectory() as tmp_dir:
-            pred_out_file = osp.join(tmp_dir, 'tmp.json')
-            res = self.inferencer(
-                inputs, print_result=True, pred_out_file=pred_out_file)
-            dumped_res = mmengine.load(pred_out_file)
-            self.assert_predictions_equal(res['predictions'],
-                                          dumped_res['predictions'])
+            res = self.inferencer(inputs, print_result=True, out_dir=tmp_dir)
+            dumped_res = mmengine.load(
+                osp.join(tmp_dir, 'preds', '000000.json'))
+            self.assertEqual(res['predictions'][0], dumped_res)
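The updated assertions encode the new output layout: with `out_dir` set, each sample's prediction is dumped to `<out_dir>/preds/<input_stem>.json` (visualizations, when produced, land in a sibling `vis` directory under mmengine's BaseInferencer convention; an assumption worth verifying for your version). Reading one back is a one-liner:

```python
import os.path as osp

import mmengine

# After `inferencer(inputs, out_dir='outputs')` on 000000.bin:
pred = mmengine.load(osp.join('outputs', 'preds', '000000.json'))
print(pred.keys())  # e.g. labels_3d / scores_3d / bboxes_3d fields
```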
@@ -91,7 +91,7 @@ class TestLiDARSeg3DInferencer(TestCase):
         inputs = dict(points='tests/data/s3dis/points/Area_1_office_2.bin')
         # img_out_dir
         with tempfile.TemporaryDirectory() as tmp_dir:
-            self.inferencer(inputs, img_out_dir=tmp_dir)
+            self.inferencer(inputs, out_dir=tmp_dir)

     def test_post_processor(self):
         if not torch.cuda.is_available():
@@ -101,11 +101,9 @@ class TestLiDARSeg3DInferencer(TestCase):
         res = self.inferencer(inputs, return_datasamples=True)
         self.assertTrue(is_list_of(res['predictions'], Det3DDataSample))

-        # pred_out_file
+        # pred_out_dir
         with tempfile.TemporaryDirectory() as tmp_dir:
-            pred_out_file = osp.join(tmp_dir, 'tmp.json')
-            res = self.inferencer(
-                inputs, print_result=True, pred_out_file=pred_out_file)
-            dumped_res = mmengine.load(pred_out_file)
-            self.assert_predictions_equal(res['predictions'],
-                                          dumped_res['predictions'])
+            res = self.inferencer(inputs, print_result=True, out_dir=tmp_dir)
+            dumped_res = mmengine.load(
+                osp.join(tmp_dir, 'preds', 'Area_1_office_2.json'))
+            self.assertEqual(res['predictions'][0], dumped_res)