raw_mmdetection

7aa442d5 · raojy · 9c03eaa8 · 7aa442d5 · 7aa442d5 · 7aa442d5
Commit 7aa442d5 authored Apr 01, 2026 by raojy
20 changed files
--- a/mmdetection3d/demo/mono_det_demo.py
+++ b/mmdetection3d/demo/mono_det_demo.py
+# Copyright (c) OpenMMLab. All rights reserved.
+import logging
+import os
+from argparse import ArgumentParser
+
+from mmengine.logging import print_log
+
+from mmdet3d.apis import MonoDet3DInferencer
+
+
+def parse_args():
+    parser = ArgumentParser()
+    parser.add_argument('img', help='Image file')
+    parser.add_argument('infos', help='Infos file with annotations')
+    parser.add_argument('model', help='Config file')
+    parser.add_argument('weights', help='Checkpoint file')
+    parser.add_argument(
+        '--device', default='cuda:0', help='Device used for inference')
+    parser.add_argument(
+        '--cam-type',
+        type=str,
+        default='CAM_BACK',
+        help='choose camera type to inference')
+    parser.add_argument(
+        '--pred-score-thr',
+        type=float,
+        default=0.3,
+        help='bbox score threshold')
+    parser.add_argument(
+        '--out-dir',
+        type=str,
+        default='outputs',
+        help='Output directory of prediction and visualization results.')
+    parser.add_argument(
+        '--show',
+        action='store_true',
+        help='Show online visualization results')
+    parser.add_argument(
+        '--wait-time',
+        type=float,
+        default=-1,
+        help='The interval of show (s). Demo will be blocked in showing'
+        'results, if wait_time is -1. Defaults to -1.')
+    parser.add_argument(
+        '--no-save-vis',
+        action='store_true',
+        help='Do not save detection visualization results')
+    parser.add_argument(
+        '--no-save-pred',
+        action='store_true',
+        help='Do not save detection prediction results')
+    parser.add_argument(
+        '--print-result',
+        action='store_true',
+        help='Whether to print the results.')
+    call_args = vars(parser.parse_args())
+
+    call_args['inputs'] = dict(
+        img=call_args.pop('img'), infos=call_args.pop('infos'))
+    call_args.pop('cam_type')
+
+    if call_args['no_save_vis'] and call_args['no_save_pred']:
+        call_args['out_dir'] = ''
+
+    init_kws = ['model', 'weights', 'device']
+    init_args = {}
+    for init_kw in init_kws:
+        init_args[init_kw] = call_args.pop(init_kw)
+
+    # NOTE: If your operating environment does not have a display device,
+    # (e.g. a remote server), you can save the predictions and visualize
+    # them in local devices.
+    if os.environ.get('DISPLAY') is None and call_args['show']:
+        print_log(
+            'Display device not found. `--show` is forced to False',
+            logger='current',
+            level=logging.WARNING)
+        call_args['show'] = False
+
+    return init_args, call_args
+
+
+def main():
+    # TODO: Support inference of point cloud numpy file.
+    init_args, call_args = parse_args()
+
+    inferencer = MonoDet3DInferencer(**init_args)
+    inferencer(**call_args)
+
+    if call_args['out_dir'] != '' and not (call_args['no_save_vis']
+                                           and call_args['no_save_pred']):
+        print_log(
+            f'results have been saved at {call_args["out_dir"]}',
+            logger='current')
+
+
+if __name__ == '__main__':
+    main()
--- a/mmdetection3d/demo/multi_modality_demo.py
+++ b/mmdetection3d/demo/multi_modality_demo.py
+# Copyright (c) OpenMMLab. All rights reserved.
+import logging
+import os
+from argparse import ArgumentParser
+
+from mmengine.logging import print_log
+
+from mmdet3d.apis import MultiModalityDet3DInferencer
+
+
+def parse_args():
+    parser = ArgumentParser()
+    parser.add_argument('pcd', help='Point cloud file')
+    parser.add_argument('img', help='Image file')
+    parser.add_argument('infos', help='Infos file with annotations')
+    parser.add_argument('model', help='Config file')
+    parser.add_argument('weights', help='Checkpoint file')
+    parser.add_argument(
+        '--device', default='cuda:0', help='Device used for inference')
+    parser.add_argument(
+        '--cam-type',
+        type=str,
+        default='CAM_FRONT',
+        help='choose camera type to inference')
+    parser.add_argument(
+        '--pred-score-thr',
+        type=float,
+        default=0.3,
+        help='bbox score threshold')
+    parser.add_argument(
+        '--out-dir',
+        type=str,
+        default='outputs',
+        help='Output directory of prediction and visualization results.')
+    parser.add_argument(
+        '--show',
+        action='store_true',
+        help='Show online visualization results')
+    parser.add_argument(
+        '--wait-time',
+        type=float,
+        default=-1,
+        help='The interval of show (s). Demo will be blocked in showing'
+        'results, if wait_time is -1. Defaults to -1.')
+    parser.add_argument(
+        '--no-save-vis',
+        action='store_true',
+        help='Do not save detection visualization results')
+    parser.add_argument(
+        '--no-save-pred',
+        action='store_true',
+        help='Do not save detection prediction results')
+    parser.add_argument(
+        '--print-result',
+        action='store_true',
+        help='Whether to print the results.')
+    call_args = vars(parser.parse_args())
+
+    call_args['inputs'] = dict(
+        points=call_args.pop('pcd'),
+        img=call_args.pop('img'),
+        infos=call_args.pop('infos'))
+    call_args.pop('cam_type')
+
+    if call_args['no_save_vis'] and call_args['no_save_pred']:
+        call_args['out_dir'] = ''
+
+    init_kws = ['model', 'weights', 'device']
+    init_args = {}
+    for init_kw in init_kws:
+        init_args[init_kw] = call_args.pop(init_kw)
+
+    # NOTE: If your operating environment does not have a display device,
+    # (e.g. a remote server), you can save the predictions and visualize
+    # them in local devices.
+    if os.environ.get('DISPLAY') is None and call_args['show']:
+        print_log(
+            'Display device not found. `--show` is forced to False',
+            logger='current',
+            level=logging.WARNING)
+        call_args['show'] = False
+
+    return init_args, call_args
+
+
+def main():
+    # TODO: Support inference of point cloud numpy file.
+    init_args, call_args = parse_args()
+
+    inferencer = MultiModalityDet3DInferencer(**init_args)
+    inferencer(**call_args)
+
+    if call_args['out_dir'] != '' and not (call_args['no_save_vis']
+                                           and call_args['no_save_pred']):
+        print_log(
+            f'results have been saved at {call_args["out_dir"]}',
+            logger='current')
+
+
+if __name__ == '__main__':
+    main()
--- a/mmdetection3d/demo/pcd_demo.py
+++ b/mmdetection3d/demo/pcd_demo.py
+# Copyright (c) OpenMMLab. All rights reserved.
+import logging
+import os
+from argparse import ArgumentParser
+
+from mmengine.logging import print_log
+
+from mmdet3d.apis import LidarDet3DInferencer
+
+
+def parse_args():
+    parser = ArgumentParser()
+    parser.add_argument('pcd', help='Point cloud file')
+    parser.add_argument('model', help='Config file')
+    parser.add_argument('weights', help='Checkpoint file')
+    parser.add_argument(
+        '--device', default='cuda:0', help='Device used for inference')
+    parser.add_argument(
+        '--pred-score-thr',
+        type=float,
+        default=0.3,
+        help='bbox score threshold')
+    parser.add_argument(
+        '--out-dir',
+        type=str,
+        default='outputs',
+        help='Output directory of prediction and visualization results.')
+    parser.add_argument(
+        '--show',
+        action='store_true',
+        help='Show online visualization results')
+    parser.add_argument(
+        '--wait-time',
+        type=float,
+        default=-1,
+        help='The interval of show (s). Demo will be blocked in showing'
+        'results, if wait_time is -1. Defaults to -1.')
+    parser.add_argument(
+        '--no-save-vis',
+        action='store_true',
+        help='Do not save detection visualization results')
+    parser.add_argument(
+        '--no-save-pred',
+        action='store_true',
+        help='Do not save detection prediction results')
+    parser.add_argument(
+        '--print-result',
+        action='store_true',
+        help='Whether to print the results.')
+    call_args = vars(parser.parse_args())
+
+    call_args['inputs'] = dict(points=call_args.pop('pcd'))
+
+    if call_args['no_save_vis'] and call_args['no_save_pred']:
+        call_args['out_dir'] = ''
+
+    init_kws = ['model', 'weights', 'device']
+    init_args = {}
+    for init_kw in init_kws:
+        init_args[init_kw] = call_args.pop(init_kw)
+
+    # NOTE: If your operating environment does not have a display device,
+    # (e.g. a remote server), you can save the predictions and visualize
+    # them in local devices.
+    if os.environ.get('DISPLAY') is None and call_args['show']:
+        print_log(
+            'Display device not found. `--show` is forced to False',
+            logger='current',
+            level=logging.WARNING)
+        call_args['show'] = False
+
+    return init_args, call_args
+
+
+def main():
+    # TODO: Support inference of point cloud numpy file.
+    init_args, call_args = parse_args()
+
+    inferencer = LidarDet3DInferencer(**init_args)
+    inferencer(**call_args)
+
+    if call_args['out_dir'] != '' and not (call_args['no_save_vis']
+                                           and call_args['no_save_pred']):
+        print_log(
+            f'results have been saved at {call_args["out_dir"]}',
+            logger='current')
+
+
+if __name__ == '__main__':
+    main()
--- a/mmdetection3d/demo/pcd_seg_demo.py
+++ b/mmdetection3d/demo/pcd_seg_demo.py
+# Copyright (c) OpenMMLab. All rights reserved.
+import logging
+import os
+from argparse import ArgumentParser
+
+from mmengine.logging import print_log
+
+from mmdet3d.apis import LidarSeg3DInferencer
+
+
+def parse_args():
+    parser = ArgumentParser()
+    parser.add_argument('pcd', help='Point cloud file')
+    parser.add_argument('model', help='Config file')
+    parser.add_argument('weights', help='Checkpoint file')
+    parser.add_argument(
+        '--device', default='cuda:0', help='Device used for inference')
+    parser.add_argument(
+        '--out-dir',
+        type=str,
+        default='outputs',
+        help='Output directory of prediction and visualization results.')
+    parser.add_argument(
+        '--show',
+        action='store_true',
+        help='Show online visualization results')
+    parser.add_argument(
+        '--wait-time',
+        type=float,
+        default=-1,
+        help='The interval of show (s). Demo will be blocked in showing'
+        'results, if wait_time is -1. Defaults to -1.')
+    parser.add_argument(
+        '--no-save-vis',
+        action='store_true',
+        help='Do not save detection visualization results')
+    parser.add_argument(
+        '--no-save-pred',
+        action='store_true',
+        help='Do not save detection prediction results')
+    parser.add_argument(
+        '--print-result',
+        action='store_true',
+        help='Whether to print the results.')
+    call_args = vars(parser.parse_args())
+
+    call_args['inputs'] = dict(points=call_args.pop('pcd'))
+
+    if call_args['no_save_vis'] and call_args['no_save_pred']:
+        call_args['out_dir'] = ''
+
+    init_kws = ['model', 'weights', 'device']
+    init_args = {}
+    for init_kw in init_kws:
+        init_args[init_kw] = call_args.pop(init_kw)
+
+    # NOTE: If your operating environment does not have a display device,
+    # (e.g. a remote server), you can save the predictions and visualize
+    # them in local devices.
+    if os.environ.get('DISPLAY') is None and call_args['show']:
+        print_log(
+            'Display device not found. `--show` is forced to False',
+            logger='current',
+            level=logging.WARNING)
+        call_args['show'] = False
+
+    return init_args, call_args
+
+
+def main():
+    # TODO: Support inference of point cloud numpy file.
+    init_args, call_args = parse_args()
+
+    inferencer = LidarSeg3DInferencer(**init_args)
+    inferencer(**call_args)
+
+    if call_args['out_dir'] != '' and not (call_args['no_save_vis']
+                                           and call_args['no_save_pred']):
+        print_log(
+            f'results have been saved at {call_args["out_dir"]}',
+            logger='current')
+
+
+if __name__ == '__main__':
+    main()
--- a/mmdetection3d/docker/Dockerfile
+++ b/mmdetection3d/docker/Dockerfile
+# Development image for MMDetection3D built on the official PyTorch image.
+ARG PYTORCH="1.9.0"
+ARG CUDA="11.1"
+ARG CUDNN="8"
+
+FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel
+
+# NOTE: ENV performs no shell command substitution, so
+# `$(dirname $(which conda))/../` would be stored as a literal string.
+# The conda prefix is spelled out instead (assumes conda lives under
+# /opt/conda in the base image -- confirm if the base image changes).
+ENV TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6+PTX" \
+    TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
+    CMAKE_PREFIX_PATH="/opt/conda" \
+    FORCE_CUDA="1"
+
+# Avoid Public GPG key error
+# https://github.com/NVIDIA/nvidia-docker/issues/1631
+RUN rm /etc/apt/sources.list.d/cuda.list \
+    && rm /etc/apt/sources.list.d/nvidia-ml.list \
+    && apt-key del 7fa2af80 \
+    && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub \
+    && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub
+
+# (Optional, use Mirror to speed up downloads)
+# RUN sed -i 's/http:\/\/archive.ubuntu.com\/ubuntu\//http:\/\/mirrors.aliyun.com\/ubuntu\//g' /etc/apt/sources.list && \
+#    pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
+
+# Install the required packages (each package listed once)
+RUN apt-get update \
+    && apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libxrender-dev \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install MMEngine, MMCV and MMDetection
+RUN pip install openmim && \
+    mim install "mmengine" "mmcv>=2.0.0rc4" "mmdet>=3.0.0"
+
+# Install MMDetection3D (editable install from the dev-1.x branch)
+RUN conda clean --all \
+    && git clone https://github.com/open-mmlab/mmdetection3d.git -b dev-1.x /mmdetection3d \
+    && cd /mmdetection3d \
+    && pip install --no-cache-dir -e .
+
+WORKDIR /mmdetection3d
--- a/mmdetection3d/docker/serve/Dockerfile
+++ b/mmdetection3d/docker/serve/Dockerfile
+# TorchServe image for serving MMDetection3D models.
+ARG PYTORCH="1.9.0"
+ARG CUDA="11.1"
+ARG CUDNN="8"
+FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel
+
+# Pinned OpenMMLab package versions installed further down.
+ARG MMCV="2.0.0rc4"
+ARG MMDET="3.3.0"
+ARG MMDET3D="1.4.0"
+
+# Use the `key=value` form; the space-separated `ENV key value` form is
+# legacy syntax.
+ENV PYTHONUNBUFFERED=TRUE
+
+# Avoid Public GPG key error
+# https://github.com/NVIDIA/nvidia-docker/issues/1631
+RUN rm /etc/apt/sources.list.d/cuda.list \
+    && rm /etc/apt/sources.list.d/nvidia-ml.list \
+    && apt-key del 7fa2af80 \
+    && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub \
+    && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub
+
+# (Optional, use Mirror to speed up downloads)
+# RUN sed -i 's/http:\/\/archive.ubuntu.com\/ubuntu\//http:\/\/mirrors.aliyun.com\/ubuntu\//g' /etc/apt/sources.list
+
+# Install the required packages (each package listed once)
+RUN apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \
+    ca-certificates \
+    g++ \
+    openjdk-11-jre-headless \
+    # MMDet3D Requirements
+    ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libxrender-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+ENV PATH="/opt/conda/bin:$PATH" \
+    FORCE_CUDA="1"
+
+# TORCHSERVE
+RUN pip install torchserve torch-model-archiver
+
+# MMLAB
+ARG PYTORCH
+ARG CUDA
+RUN pip install openmim
+RUN mim install mmengine
+RUN mim install mmcv==${MMCV}
+RUN mim install mmdet==${MMDET}
+RUN mim install mmdet3d==${MMDET3D}
+
+# Unprivileged user that runs the server, plus its temp directory.
+RUN useradd -m model-server \
+    && mkdir -p /home/model-server/tmp
+
+COPY entrypoint.sh /usr/local/bin/entrypoint.sh
+
+RUN chmod +x /usr/local/bin/entrypoint.sh \
+    && chown -R model-server /home/model-server
+
+COPY config.properties /home/model-server/config.properties
+RUN mkdir /home/model-server/model-store && chown -R model-server /home/model-server/model-store
+
+# Ports match the addresses configured in config.properties.
+EXPOSE 8080 8081 8082
+
+USER model-server
+WORKDIR /home/model-server
+ENV TEMP=/home/model-server/tmp
+ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
+CMD ["serve"]
--- a/mmdetection3d/docker/serve/config.properties
+++ b/mmdetection3d/docker/serve/config.properties
+# TorchServe configuration. The Dockerfile in this directory copies this
+# file to /home/model-server/config.properties, and entrypoint.sh passes it
+# to `torchserve --ts-config`.
+# The three listeners bind to all interfaces (0.0.0.0) on the ports the
+# Dockerfile exposes (8080, 8081, 8082).
+inference_address=http://0.0.0.0:8080
+management_address=http://0.0.0.0:8081
+metrics_address=http://0.0.0.0:8082
+# Directory created and chown'ed to `model-server` by the Dockerfile.
+model_store=/home/model-server/model-store
+# Presumably loads every model found in the model store at startup --
+# confirm against the TorchServe configuration docs.
+load_models=all
--- a/mmdetection3d/docker/serve/entrypoint.sh
+++ b/mmdetection3d/docker/serve/entrypoint.sh
+#!/bin/bash
+set -e
+
+if [[ "$1" = "serve" ]]; then
+    shift 1
+    torchserve --start --ts-config /home/model-server/config.properties
+else
+    eval "$@"
+fi
+
+# prevent docker exit
+tail -f /dev/null
--- a/mmdetection3d/docs/en/Makefile
+++ b/mmdetection3d/docs/en/Makefile
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+# (`?=` only assigns when the variable is not already defined.)
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SOURCEDIR     = .
+BUILDDIR      = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+# `help` and `Makefile` are declared phony, i.e. not treated as files to
+# be built.
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+# `$@` expands to the requested target name, e.g. `make html` runs
+# `sphinx-build -M html ...`.
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--- a/mmdetection3d/docs/en/_static/css/readthedocs.css
+++ b/mmdetection3d/docs/en/_static/css/readthedocs.css
+/* Header logo: a 182.5px x 40px box with the logo image scaled to fill it. */
+.header-logo {
+    width: 182.5px;
+    height: 40px;
+    background-image: url("../image/mmdet3d-logo.png");
+    background-size: 182.5px 40px;
+}
--- a/mmdetection3d/docs/en/advanced_guides/customize_dataset.md
+++ b/mmdetection3d/docs/en/advanced_guides/customize_dataset.md
+# Customize Datasets
+
+In this note, you will learn how to train and test predefined models with customized datasets.
+
+The basic steps are as below:
+
+1. Prepare data
+2. Prepare a config
+3. Train, test, and run inference with models on the customized dataset
+
+## Data Preparation
+
+The ideal situation is that we can reorganize the customized raw data and convert the annotation format into KITTI style. However, considering some calibration files and 3D annotations in KITTI format are difficult to obtain for customized datasets, we introduce the basic data format in the doc.
+
+### Basic Data Format
+
+#### Point cloud Format
+
+Currently, we only support `.bin` format point clouds for training and inference. Before training on your own datasets, you need to convert point cloud files in other formats to `.bin` files. Common point cloud data formats include `.pcd` and `.las`; we list some open-source tools for reference.
+
+1. Convert `.pcd` to `.bin`: https://github.com/DanielPollithy/pypcd
+
+- You can install `pypcd` with the following command:
+
+  ```bash
+  pip install git+https://github.com/DanielPollithy/pypcd.git
+  ```
+
+- You can use the following script to read the `.pcd` file and convert it to `.bin` format for saving:
+
+  ```python
+  import numpy as np
+  from pypcd import pypcd
+
+  pcd_data = pypcd.PointCloud.from_path('point_cloud_data.pcd')
+  points = np.zeros([pcd_data.width, 4], dtype=np.float32)
+  points[:, 0] = pcd_data.pc_data['x'].copy()
+  points[:, 1] = pcd_data.pc_data['y'].copy()
+  points[:, 2] = pcd_data.pc_data['z'].copy()
+  points[:, 3] = pcd_data.pc_data['intensity'].copy().astype(np.float32)
+  with open('point_cloud_data.bin', 'wb') as f:
+      f.write(points.tobytes())
+  ```
+
+2. Convert `.las` to `.bin`: The common conversion path is `.las -> .pcd -> .bin`, and the conversion path `.las -> .pcd` can be achieved through [this tool](https://github.com/Hitachi-Automotive-And-Industry-Lab/semantic-segmentation-editor).
+
+#### Label Format
+
+The most basic information, i.e. the 3D bounding boxes and category labels of each scene, needs to be contained in the `.txt` annotation file. Each line represents a 3D box in a certain scene as follows:
+
+```
+# format: [x, y, z, dx, dy, dz, yaw, category_name]
+1.23 1.42 0.23 3.96 1.65 1.55 1.56 Car
+3.51 2.15 0.42 1.05 0.87 1.86 1.23 Pedestrian
+...
+```
+
+**Note**: Currently we only support KITTI Metric evaluation for customized datasets.
+
+The 3D Box should be stored in unified 3D coordinates.
+
+#### Calibration Format
+
+The point cloud data collected by each LiDAR are usually fused and converted to a certain LiDAR coordinate system. So typically the `.txt` calibration file should contain the intrinsic matrix of each camera and the extrinsic transformation matrix from the LiDAR to each camera, where `Px` represents the intrinsic matrix of `camera_x` and `lidar2camx` represents the extrinsic transformation matrix from the `lidar` to `camera_x`.
+
+```
+P0
+P1
+P2
+P3
+P4
+...
+lidar2cam0
+lidar2cam1
+lidar2cam2
+lidar2cam3
+lidar2cam4
+...
+```
+
+### Raw Data Structure
+
+#### LiDAR-Based 3D Detection
+
+The raw data for LiDAR-based 3D object detection are typically organized as follows, where `ImageSets` contains split files indicating which files belong to training/validation set, `points` includes point cloud data which are supposed to be stored in `.bin` format and `labels` includes label files for 3D detection.
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── custom
+│   │   ├── ImageSets
+│   │   │   ├── train.txt
+│   │   │   ├── val.txt
+│   │   ├── points
+│   │   │   ├── 000000.bin
+│   │   │   ├── 000001.bin
+│   │   │   ├── ...
+│   │   ├── labels
+│   │   │   ├── 000000.txt
+│   │   │   ├── 000001.txt
+│   │   │   ├── ...
+```
+
+#### Vision-Based 3D Detection
+
+The raw data for vision-based 3D object detection are typically organized as follows, where `ImageSets` contains split files indicating which files belong to training/validation set, `images` contains the images from different cameras, for example, images from `camera_x` need to be placed in `images/images_x`, `calibs` contains calibration information files which store the camera intrinsic matrix of each camera, and `labels` includes label files for 3D detection.
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── custom
+│   │   ├── ImageSets
+│   │   │   ├── train.txt
+│   │   │   ├── val.txt
+│   │   ├── calibs
+│   │   │   ├── 000000.txt
+│   │   │   ├── 000001.txt
+│   │   │   ├── ...
+│   │   ├── images
+│   │   │   ├── images_0
+│   │   │   │   ├── 000000.png
+│   │   │   │   ├── 000001.png
+│   │   │   │   ├── ...
+│   │   │   ├── images_1
+│   │   │   ├── images_2
+│   │   │   ├── ...
+│   │   ├── labels
+│   │   │   ├── 000000.txt
+│   │   │   ├── 000001.txt
+│   │   │   ├── ...
+```
+
+#### Multi-Modality 3D Detection
+
+The raw data for multi-modality 3D object detection are typically organized as follows. Different from vision-based 3D object detection, calibration information files in `calibs` store the camera intrinsic matrix of each camera and extrinsic matrix.
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── custom
+│   │   ├── ImageSets
+│   │   │   ├── train.txt
+│   │   │   ├── val.txt
+│   │   ├── calibs
+│   │   │   ├── 000000.txt
+│   │   │   ├── 000001.txt
+│   │   │   ├── ...
+│   │   ├── points
+│   │   │   ├── 000000.bin
+│   │   │   ├── 000001.bin
+│   │   │   ├── ...
+│   │   ├── images
+│   │   │   ├── images_0
+│   │   │   │   ├── 000000.png
+│   │   │   │   ├── 000001.png
+│   │   │   │   ├── ...
+│   │   │   ├── images_1
+│   │   │   ├── images_2
+│   │   │   ├── ...
+│   │   ├── labels
+│   │   │   ├── 000000.txt
+│   │   │   ├── 000001.txt
+│   │   │   ├── ...
+```
+
+#### LiDAR-Based 3D Semantic Segmentation
+
+The raw data for LiDAR-based 3D semantic segmentation are typically organized as follows, where `ImageSets` contains split files indicating which files belong to training/validation set, `points` includes point cloud data, and `semantic_mask` includes point-level label.
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── custom
+│   │   ├── ImageSets
+│   │   │   ├── train.txt
+│   │   │   ├── val.txt
+│   │   ├── points
+│   │   │   ├── 000000.bin
+│   │   │   ├── 000001.bin
+│   │   │   ├── ...
+│   │   ├── semantic_mask
+│   │   │   ├── 000000.bin
+│   │   │   ├── 000001.bin
+│   │   │   ├── ...
+```
+
+### Data Converter
+
+Once you have prepared the raw data following our instructions, you can directly use the following command to generate training/validation information files.
+
+```bash
+python tools/create_data.py custom --root-path ./data/custom --out-dir ./data/custom --extra-tag custom
+```
+
+## An example of customized dataset
+
+Once we finish data preparation, we can create a new dataset in `mmdet3d/datasets/my_dataset.py` to load the data.
+
+```python
+import mmengine
+
+from mmdet3d.registry import DATASETS
+from .det3d_dataset import Det3DDataset
+
+
+@DATASETS.register_module()
+class MyDataset(Det3DDataset):
+
+    # replace with all the classes in customized pkl info file
+    METAINFO = {
+        'classes': ('Pedestrian', 'Cyclist', 'Car')
+    }
+
+    def parse_ann_info(self, info):
+        """Process the `instances` in data info to `ann_info`.
+
+        Args:
+            info (dict): Data information of single data sample.
+
+        Returns:
+            dict: Annotation information consists of the following keys:
+
+                - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`):
+                  3D ground truth bboxes.
+                - gt_labels_3d (np.ndarray): Labels of ground truths.
+        """
+        ann_info = super().parse_ann_info(info)
+        if ann_info is None:
+            ann_info = dict()
+            # empty instance
+            ann_info['gt_bboxes_3d'] = np.zeros((0, 7), dtype=np.float32)
+            ann_info['gt_labels_3d'] = np.zeros(0, dtype=np.int64)
+
+        # filter the gt classes not used in training
+        ann_info = self._remove_dontcare(ann_info)
+        gt_bboxes_3d = LiDARInstance3DBoxes(ann_info['gt_bboxes_3d'])
+        ann_info['gt_bboxes_3d'] = gt_bboxes_3d
+        return ann_info
+```
+
+After the data pre-processing, there are two steps for users to train the customized new dataset:
+
+1. Modify the config file for using the customized dataset.
+2. Check the annotations of the customized dataset.
+
+Here we take training PointPillars on customized dataset as an example:
+
+### Prepare a config
+
+Here we demonstrate a config sample for pure point cloud training.
+
+#### Prepare dataset config
+
+In `configs/_base_/datasets/custom.py`:
+
+```python
+# dataset settings
+dataset_type = 'MyDataset'
+data_root = 'data/custom/'
+class_names = ['Pedestrian', 'Cyclist', 'Car']  # replace with your dataset class
+point_cloud_range = [0, -40, -3, 70.4, 40, 1]  # adjust according to your dataset
+input_modality = dict(use_lidar=True, use_camera=False)
+metainfo = dict(classes=class_names)
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,  # replace with your point cloud data dimension
+        use_dim=4),  # replace with the actual dimension used in training and inference
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=True,
+        with_label_3d=True),
+    dict(
+        type='ObjectNoise',
+        num_try=100,
+        translation_std=[1.0, 1.0, 0.5],
+        global_rot_range=[0.0, 0.0],
+        rot_range=[-0.78539816, 0.78539816]),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05]),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,  # replace with your point cloud data dimension
+        use_dim=4),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+# construct a pipeline for data and gt loading in show function
+eval_pipeline = [
+    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
+    dict(type='Pack3DDetInputs', keys=['points']),
+]
+train_dataloader = dict(
+    batch_size=6,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='RepeatDataset',
+        times=2,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='custom_infos_train.pkl',  # specify your training pkl info
+            data_prefix=dict(pts='points'),
+            pipeline=train_pipeline,
+            modality=input_modality,
+            test_mode=False,
+            metainfo=metainfo,
+            box_type_3d='LiDAR')))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(pts='points'),
+        ann_file='custom_infos_val.pkl',  # specify your validation pkl info
+        pipeline=test_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        box_type_3d='LiDAR'))
+val_evaluator = dict(
+    type='KittiMetric',
+    ann_file=data_root + 'custom_infos_val.pkl',  # specify your validation pkl info
+    metric='bbox')
+```
+
+#### Prepare model config
+
+For voxel-based detectors such as SECOND, PointPillars and CenterPoint, the point cloud range and voxel size should be adjusted according to your dataset.
+Theoretically, `voxel_size` is linked to the setting of `point_cloud_range`. Setting a smaller `voxel_size` will increase the voxel num and the corresponding memory consumption. In addition, the following issues need to be noted:
+
+If the `point_cloud_range` and `voxel_size` are set to be `[0, -40, -3, 70.4, 40, 1]` and `[0.05, 0.05, 0.1]` respectively, then the shape of intermediate feature map should be `[(1-(-3))/0.1+1, (40-(-40))/0.05, (70.4-0)/0.05]=[41, 1600, 1408]`. When changing `point_cloud_range`, remember to change the shape of intermediate feature map in `middle_encoder` according to the `voxel_size`.
+
+Regarding the setting of `anchor_range`, it is generally adjusted according to the dataset. Note that the `z` value needs to be adjusted according to the position of the point cloud; please refer to this [issue](https://github.com/open-mmlab/mmdetection3d/issues/986).
+
+Regarding the setting of `anchor_size`, it is usually necessary to count the average length, width and height of objects in the entire training dataset as `anchor_size` to obtain the best results.
+
+In `configs/_base_/models/pointpillars_hv_secfpn_custom.py`:
+
+```python
+voxel_size = [0.16, 0.16, 4]  # adjust according to your dataset
+point_cloud_range = [0, -39.68, -3, 69.12, 39.68, 1]  # adjust according to your dataset
+model = dict(
+    type='VoxelNet',
+    data_preprocessor=dict(
+        type='Det3DDataPreprocessor',
+        voxel=True,
+        voxel_layer=dict(
+            max_num_points=32,
+            point_cloud_range=point_cloud_range,
+            voxel_size=voxel_size,
+            max_voxels=(16000, 40000))),
+    voxel_encoder=dict(
+        type='PillarFeatureNet',
+        in_channels=4,
+        feat_channels=[64],
+        with_distance=False,
+        voxel_size=voxel_size,
+        point_cloud_range=point_cloud_range),
+    # the `output_shape` should be adjusted according to `point_cloud_range`
+    # and `voxel_size`
+    middle_encoder=dict(
+        type='PointPillarsScatter', in_channels=64, output_shape=[496, 432]),
+    backbone=dict(
+        type='SECOND',
+        in_channels=64,
+        layer_nums=[3, 5, 5],
+        layer_strides=[2, 2, 2],
+        out_channels=[64, 128, 256]),
+    neck=dict(
+        type='SECONDFPN',
+        in_channels=[64, 128, 256],
+        upsample_strides=[1, 2, 4],
+        out_channels=[128, 128, 128]),
+    bbox_head=dict(
+        type='Anchor3DHead',
+        num_classes=3,
+        in_channels=384,
+        feat_channels=384,
+        use_direction_classifier=True,
+        assign_per_class=True,
+        # adjust the `ranges` and `sizes` according to your dataset
+        anchor_generator=dict(
+            type='AlignedAnchor3DRangeGenerator',
+            ranges=[
+                [0, -39.68, -0.6, 69.12, 39.68, -0.6],
+                [0, -39.68, -0.6, 69.12, 39.68, -0.6],
+                [0, -39.68, -1.78, 69.12, 39.68, -1.78],
+            ],
+            sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]],
+            rotations=[0, 1.57],
+            reshape_out=False),
+        diff_rad_by_sin=True,
+        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+        loss_cls=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(
+            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
+        loss_dir=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=False,
+            loss_weight=0.2)),
+    # model training and testing settings
+    train_cfg=dict(
+        assigner=[
+            dict(  # for Pedestrian
+                type='Max3DIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.35,
+                min_pos_iou=0.35,
+                ignore_iof_thr=-1),
+            dict(  # for Cyclist
+                type='Max3DIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.35,
+                min_pos_iou=0.35,
+                ignore_iof_thr=-1),
+            dict(  # for Car
+                type='Max3DIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.6,
+                neg_iou_thr=0.45,
+                min_pos_iou=0.45,
+                ignore_iof_thr=-1),
+        ],
+        allowed_border=0,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        use_rotate_nms=True,
+        nms_across_levels=False,
+        nms_thr=0.01,
+        score_thr=0.1,
+        min_bbox_size=0,
+        nms_pre=100,
+        max_num=50))
+```
+
+#### Prepare overall config
+
+We combine all the configs above in `configs/pointpillars/pointpillars_hv_secfpn_8xb6_custom.py`:
+
+```python
+_base_ = [
+    '../_base_/models/pointpillars_hv_secfpn_custom.py',
+    '../_base_/datasets/custom.py',
+    '../_base_/schedules/cyclic-40e.py', '../_base_/default_runtime.py'
+]
+```
+
+#### Visualize your dataset (optional)
+
+To validate whether your prepared data and config are correct, it's highly recommended to use the `tools/misc/browse_dataset.py` script
+to visualize your dataset and annotations before training and validation. Please refer to the [visualization doc](https://mmdetection3d.readthedocs.io/en/dev-1.x/user_guides/visualization.html) for more details.
+
+## Evaluation
+
+Once the data and config have been prepared, you can directly run the training/testing script following our doc.
+
+**Note**: We only provide an implementation for KITTI style evaluation for the customized dataset. It should be included in the dataset config:
+
+```python
+val_evaluator = dict(
+    type='KittiMetric',
+    ann_file=data_root + 'custom_infos_val.pkl',  # specify your validation pkl info
+    metric='bbox')
+```
--- a/mmdetection3d/docs/en/advanced_guides/customize_models.md
+++ b/mmdetection3d/docs/en/advanced_guides/customize_models.md
+# Customize Models
+
+We basically categorize model components into 6 types:
+
+- encoder: Including voxel encoder and middle encoder used in voxel-based methods before backbone, e.g., `HardVFE` and `PointPillarsScatter`.
+- backbone: Usually an FCN network to extract feature maps, e.g., `ResNet`, `SECOND`.
+- neck: The component between backbones and heads, e.g., `FPN`, `SECONDFPN`.
+- head: The component for specific tasks, e.g., `bbox prediction` and `mask prediction`.
+- RoI extractor: The part for extracting RoI features from feature maps, e.g., `Single3DRoIAwareExtractor` (used by RoI heads such as `H3DRoIHead` and `PartAggregationROIHead`).
+- loss: The component in heads for calculating losses, e.g., `FocalLoss`, `L1Loss`, and `GHMLoss`.
+
+## Develop new components
+
+### Add a new encoder
+
+Here we show how to develop new components with an example of HardVFE.
+
+#### 1. Define a new voxel encoder (e.g. HardVFE: Voxel feature encoder used in HV-SECOND)
+
+Create a new file `mmdet3d/models/voxel_encoders/voxel_encoder.py`.
+
+```python
+import torch.nn as nn
+
+from mmdet3d.registry import MODELS
+
+
+@MODELS.register_module()
+class HardVFE(nn.Module):
+
+    def __init__(self, arg1, arg2):
+        pass
+
+    def forward(self, x):  # should return a tuple
+        pass
+```
+
+#### 2. Import the module
+
+You can either add the following line to `mmdet3d/models/voxel_encoders/__init__.py`:
+
+```python
+from .voxel_encoder import HardVFE
+```
+
+or alternatively add
+
+```python
+custom_imports = dict(
+    imports=['mmdet3d.models.voxel_encoders.voxel_encoder'],
+    allow_failed_imports=False)
+```
+
+to the config file to avoid modifying the original code.
+
+#### 3. Use the voxel encoder in your config file
+
+```python
+model = dict(
+    ...
+    voxel_encoder=dict(
+        type='HardVFE',
+        arg1=xxx,
+        arg2=yyy),
+    ...
+)
+```
+
+### Add a new backbone
+
+Here we show how to develop new components with an example of [SECOND](https://www.mdpi.com/1424-8220/18/10/3337) (Sparsely Embedded Convolutional Detection).
+
+#### 1. Define a new backbone (e.g. SECOND)
+
+Create a new file `mmdet3d/models/backbones/second.py`.
+
+```python
+from mmengine.model import BaseModule
+
+from mmdet3d.registry import MODELS
+
+
+@MODELS.register_module()
+class SECOND(BaseModule):
+
+    def __init__(self, arg1, arg2):
+        pass
+
+    def forward(self, x):  # should return a tuple
+        pass
+```
+
+#### 2. Import the module
+
+You can either add the following line to `mmdet3d/models/backbones/__init__.py`:
+
+```python
+from .second import SECOND
+```
+
+or alternatively add
+
+```python
+custom_imports = dict(
+    imports=['mmdet3d.models.backbones.second'],
+    allow_failed_imports=False)
+```
+
+to the config file to avoid modifying the original code.
+
+#### 3. Use the backbone in your config file
+
+```python
+model = dict(
+    ...
+    backbone=dict(
+        type='SECOND',
+        arg1=xxx,
+        arg2=yyy),
+    ...
+)
+```
+
+### Add a new neck
+
+#### 1. Define a new neck (e.g. SECONDFPN)
+
+Create a new file `mmdet3d/models/necks/second_fpn.py`.
+
+```python
+from mmengine.model import BaseModule
+
+from mmdet3d.registry import MODELS
+
+
+@MODELS.register_module()
+class SECONDFPN(BaseModule):
+
+    def __init__(self,
+                 in_channels=[128, 128, 256],
+                 out_channels=[256, 256, 256],
+                 upsample_strides=[1, 2, 4],
+                 norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
+                 upsample_cfg=dict(type='deconv', bias=False),
+                 conv_cfg=dict(type='Conv2d', bias=False),
+                 use_conv_for_no_stride=False,
+                 init_cfg=None):
+        pass
+
+    def forward(self, x):
+        # implementation is ignored
+        pass
+```
+
+#### 2. Import the module
+
+You can either add the following line to `mmdet3d/models/necks/__init__.py`:
+
+```python
+from .second_fpn import SECONDFPN
+```
+
+or alternatively add
+
+```python
+custom_imports = dict(
+    imports=['mmdet3d.models.necks.second_fpn'],
+    allow_failed_imports=False)
+```
+
+to the config file to avoid modifying the original code.
+
+#### 3. Use the neck in your config file
+
+```python
+model = dict(
+    ...
+    neck=dict(
+        type='SECONDFPN',
+        in_channels=[64, 128, 256],
+        upsample_strides=[1, 2, 4],
+        out_channels=[128, 128, 128]),
+    ...
+)
+```
+
+### Add a new head
+
+Here we show how to develop a new head, using [PartA2 Head](https://arxiv.org/abs/1907.03670) as an example.
+
+**Note**: The `PartA2 RoI Head` used as the example here belongs to the second stage. For one-stage heads, please refer to the examples in `mmdet3d/models/dense_heads/`. They are more commonly used in 3D detection for autonomous driving due to their simplicity and high efficiency.
+
+First, add a new bbox head in `mmdet3d/models/roi_heads/bbox_heads/parta2_bbox_head.py`.
+`PartA2 RoI Head` implements a new bbox head for object detection.
+To implement a bbox head, basically we need to implement two functions of the new module as the following. Sometimes other related functions like `loss` and `get_targets` are also required.
+
+```python
+from mmengine.model import BaseModule
+
+from mmdet3d.registry import MODELS
+
+
+@MODELS.register_module()
+class PartA2BboxHead(BaseModule):
+    """PartA2 RoI head."""
+
+    def __init__(self,
+                 num_classes,
+                 seg_in_channels,
+                 part_in_channels,
+                 seg_conv_channels=None,
+                 part_conv_channels=None,
+                 merge_conv_channels=None,
+                 down_conv_channels=None,
+                 shared_fc_channels=None,
+                 cls_channels=None,
+                 reg_channels=None,
+                 dropout_ratio=0.1,
+                 roi_feat_size=14,
+                 with_corner_loss=True,
+                 bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+                 conv_cfg=dict(type='Conv1d'),
+                 norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
+                 loss_bbox=dict(
+                     type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
+                 loss_cls=dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=True,
+                     reduction='none',
+                     loss_weight=1.0),
+                 init_cfg=None):
+        super(PartA2BboxHead, self).__init__(init_cfg=init_cfg)
+
+    def forward(self, seg_feats, part_feats):
+        pass
+```
+
+Second, implement a new RoI Head if necessary. We plan to inherit the new `PartAggregationROIHead` from `Base3DRoIHead`, which already implements the following functions.
+
+```python
+from mmdet.models.roi_heads import BaseRoIHead
+
+from mmdet3d.registry import MODELS, TASK_UTILS
+
+
+class Base3DRoIHead(BaseRoIHead):
+    """Base class for 3d RoIHeads."""
+
+    def __init__(self,
+                 bbox_head=None,
+                 bbox_roi_extractor=None,
+                 mask_head=None,
+                 mask_roi_extractor=None,
+                 train_cfg=None,
+                 test_cfg=None,
+                 init_cfg=None):
+        super(Base3DRoIHead, self).__init__(
+            bbox_head=bbox_head,
+            bbox_roi_extractor=bbox_roi_extractor,
+            mask_head=mask_head,
+            mask_roi_extractor=mask_roi_extractor,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            init_cfg=init_cfg)
+
+    def init_bbox_head(self, bbox_roi_extractor: dict,
+                       bbox_head: dict) -> None:
+        """Initialize box head and box roi extractor.
+
+        Args:
+            bbox_roi_extractor (dict or ConfigDict): Config of box
+                roi extractor.
+            bbox_head (dict or ConfigDict): Config of box in box head.
+        """
+        self.bbox_roi_extractor = MODELS.build(bbox_roi_extractor)
+        self.bbox_head = MODELS.build(bbox_head)
+
+    def init_assigner_sampler(self):
+        """Initialize assigner and sampler."""
+        self.bbox_assigner = None
+        self.bbox_sampler = None
+        if self.train_cfg:
+            if isinstance(self.train_cfg.assigner, dict):
+                self.bbox_assigner = TASK_UTILS.build(self.train_cfg.assigner)
+            elif isinstance(self.train_cfg.assigner, list):
+                self.bbox_assigner = [
+                    TASK_UTILS.build(res) for res in self.train_cfg.assigner
+                ]
+            self.bbox_sampler = TASK_UTILS.build(self.train_cfg.sampler)
+
+    def init_mask_head(self):
+        """Initialize mask head, skip since ``PartAggregationROIHead`` does not
+        have one."""
+        pass
+```
+
+The modification in `PartAggregationROIHead` is mainly in the bbox forward logic, and it inherits the other logic from `Base3DRoIHead`.
+In `mmdet3d/models/roi_heads/part_aggregation_roi_head.py`, we implement the new RoI Head as follows:
+
+```python
+from typing import Dict, List, Tuple
+
+from mmdet.models.task_modules import AssignResult, SamplingResult
+from mmengine import ConfigDict
+from torch import Tensor
+from torch.nn import functional as F
+
+from mmdet3d.registry import MODELS
+from mmdet3d.structures import bbox3d2roi
+from mmdet3d.utils import InstanceList
+from ...structures.det3d_data_sample import SampleList
+from .base_3droi_head import Base3DRoIHead
+
+
+@MODELS.register_module()
+class PartAggregationROIHead(Base3DRoIHead):
+    """Part aggregation roi head for PartA2.
+
+    Args:
+        semantic_head (ConfigDict): Config of semantic head.
+        num_classes (int): The number of classes.
+        seg_roi_extractor (ConfigDict): Config of seg_roi_extractor.
+        bbox_roi_extractor (ConfigDict): Config of part_roi_extractor.
+        bbox_head (ConfigDict): Config of bbox_head.
+        train_cfg (ConfigDict): Training config.
+        test_cfg (ConfigDict): Testing config.
+    """
+
+    def __init__(self,
+                 semantic_head: dict,
+                 num_classes: int = 3,
+                 seg_roi_extractor: dict = None,
+                 bbox_head: dict = None,
+                 bbox_roi_extractor: dict = None,
+                 train_cfg: dict = None,
+                 test_cfg: dict = None,
+                 init_cfg: dict = None) -> None:
+        super(PartAggregationROIHead, self).__init__(
+            bbox_head=bbox_head,
+            bbox_roi_extractor=bbox_roi_extractor,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            init_cfg=init_cfg)
+        self.num_classes = num_classes
+        assert semantic_head is not None
+        self.init_seg_head(seg_roi_extractor, semantic_head)
+
+    def init_seg_head(self, seg_roi_extractor: dict,
+                      semantic_head: dict) -> None:
+        """Initialize semantic head and seg roi extractor.
+
+        Args:
+            seg_roi_extractor (dict): Config of seg
+                roi extractor.
+            semantic_head (dict): Config of semantic head.
+        """
+        self.semantic_head = MODELS.build(semantic_head)
+        self.seg_roi_extractor = MODELS.build(seg_roi_extractor)
+
+    @property
+    def with_semantic(self):
+        """bool: whether the head has semantic branch"""
+        return hasattr(self,
+                       'semantic_head') and self.semantic_head is not None
+
+    def predict(self,
+                feats_dict: Dict,
+                rpn_results_list: InstanceList,
+                batch_data_samples: SampleList,
+                rescale: bool = False,
+                **kwargs) -> InstanceList:
+        """Perform forward propagation of the roi head and predict detection
+        results on the features of the upstream network.
+
+        Args:
+            feats_dict (dict): Contains features from the first stage.
+            rpn_results_list (List[:obj:`InstanceData`]): Detection results
+                of rpn head.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                samples. It usually includes information such as
+                `gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each sample
+            after the post process.
+            Each item usually contains following keys.
+
+            - scores_3d (Tensor): Classification scores, has a shape
+              (num_instances, )
+            - labels_3d (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes_3d (BaseInstance3DBoxes): Prediction of bboxes,
+              contains a tensor with shape (num_instances, C), where
+              C >= 7.
+        """
+        assert self.with_bbox, 'Bbox head must be implemented in PartA2.'
+        assert self.with_semantic, 'Semantic head must be implemented' \
+                                   ' in PartA2.'
+
+        batch_input_metas = [
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+        voxels_dict = feats_dict.pop('voxels_dict')
+        # TODO: Split predict semantic and bbox
+        results_list = self.predict_bbox(feats_dict, voxels_dict,
+                                         batch_input_metas, rpn_results_list,
+                                         self.test_cfg)
+        return results_list
+
+    def predict_bbox(self, feats_dict: Dict, voxel_dict: Dict,
+                     batch_input_metas: List[dict],
+                     rpn_results_list: InstanceList,
+                     test_cfg: ConfigDict) -> InstanceList:
+        """Perform forward propagation of the bbox head and predict detection
+        results on the features of the upstream network.
+
+        Args:
+            feats_dict (dict): Contains features from the first stage.
+            voxel_dict (dict): Contains information of voxels.
+            batch_input_metas (list[dict], Optional): Batch image meta info.
+                Defaults to None.
+            rpn_results_list (List[:obj:`InstanceData`]): Detection results
+                of rpn head.
+            test_cfg (Config): Test config.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each sample
+            after the post process.
+            Each item usually contains following keys.
+
+            - scores_3d (Tensor): Classification scores, has a shape
+              (num_instances, )
+            - labels_3d (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes_3d (BaseInstance3DBoxes): Prediction of bboxes,
+              contains a tensor with shape (num_instances, C), where
+              C >= 7.
+        """
+        ...
+
+    def loss(self, feats_dict: Dict, rpn_results_list: InstanceList,
+             batch_data_samples: SampleList, **kwargs) -> dict:
+        """Perform forward propagation and loss calculation of the detection
+        roi on the features of the upstream network.
+
+        Args:
+            feats_dict (dict): Contains features from the first stage.
+            rpn_results_list (List[:obj:`InstanceData`]): Detection results
+                of rpn head.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                samples. It usually includes information such as
+                `gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components
+        """
+        assert len(rpn_results_list) == len(batch_data_samples)
+        losses = dict()
+        batch_gt_instances_3d = []
+        batch_gt_instances_ignore = []
+        voxels_dict = feats_dict.pop('voxels_dict')
+        for data_sample in batch_data_samples:
+            batch_gt_instances_3d.append(data_sample.gt_instances_3d)
+            if 'ignored_instances' in data_sample:
+                batch_gt_instances_ignore.append(data_sample.ignored_instances)
+            else:
+                batch_gt_instances_ignore.append(None)
+        if self.with_semantic:
+            semantic_results = self._semantic_forward_train(
+                feats_dict, voxels_dict, batch_gt_instances_3d)
+            losses.update(semantic_results.pop('loss_semantic'))
+
+        sample_results = self._assign_and_sample(rpn_results_list,
+                                                 batch_gt_instances_3d)
+        if self.with_bbox:
+            feats_dict.update(semantic_results)
+            bbox_results = self._bbox_forward_train(feats_dict, voxels_dict,
+                                                    sample_results)
+            losses.update(bbox_results['loss_bbox'])
+
+        return losses
+```
+
+Here we omit more details related to other functions. Please see the [code](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/models/roi_heads/part_aggregation_roi_head.py) for more details.
+
+Last, the users need to add the modules in
+`mmdet3d/models/roi_heads/bbox_heads/__init__.py` and `mmdet3d/models/roi_heads/__init__.py` so that the corresponding registry can find and load them.
+
+Alternatively, the users can add
+
+```python
+custom_imports=dict(
+    imports=['mmdet3d.models.roi_heads.part_aggregation_roi_head', 'mmdet3d.models.roi_heads.bbox_heads.parta2_bbox_head'],
+    allow_failed_imports=False)
+```
+
+to the config file and achieve the same goal.
+
+The config file of `PartAggregationROIHead` is as the following:
+
+```python
+model = dict(
+    ...
+    roi_head=dict(
+        type='PartAggregationROIHead',
+        num_classes=3,
+        semantic_head=dict(
+            type='PointwiseSemanticHead',
+            in_channels=16,
+            extra_width=0.2,
+            seg_score_thr=0.3,
+            num_classes=3,
+            loss_seg=dict(
+                type='mmdet.FocalLoss',
+                use_sigmoid=True,
+                reduction='sum',
+                gamma=2.0,
+                alpha=0.25,
+                loss_weight=1.0),
+            loss_part=dict(
+                type='mmdet.CrossEntropyLoss',
+                use_sigmoid=True,
+                loss_weight=1.0)),
+        seg_roi_extractor=dict(
+            type='Single3DRoIAwareExtractor',
+            roi_layer=dict(
+                type='RoIAwarePool3d',
+                out_size=14,
+                max_pts_per_voxel=128,
+                mode='max')),
+        bbox_roi_extractor=dict(
+            type='Single3DRoIAwareExtractor',
+            roi_layer=dict(
+                type='RoIAwarePool3d',
+                out_size=14,
+                max_pts_per_voxel=128,
+                mode='avg')),
+        bbox_head=dict(
+            type='PartA2BboxHead',
+            num_classes=3,
+            seg_in_channels=16,
+            part_in_channels=4,
+            seg_conv_channels=[64, 64],
+            part_conv_channels=[64, 64],
+            merge_conv_channels=[128, 128],
+            down_conv_channels=[128, 256],
+            bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+            shared_fc_channels=[256, 512, 512, 512],
+            cls_channels=[256, 256],
+            reg_channels=[256, 256],
+            dropout_ratio=0.1,
+            roi_feat_size=14,
+            with_corner_loss=True,
+            loss_bbox=dict(
+                type='mmdet.SmoothL1Loss',
+                beta=1.0 / 9.0,
+                reduction='sum',
+                loss_weight=1.0),
+            loss_cls=dict(
+                type='mmdet.CrossEntropyLoss',
+                use_sigmoid=True,
+                reduction='sum',
+                loss_weight=1.0))),
+    ...
+)
+```
+
+Since MMDetection 2.0, the config system supports inheriting configs so that the users can focus on the modifications.
+The second stage of PartA2 Head mainly uses a new `PartAggregationROIHead` and a new
+`PartA2BboxHead`; the arguments are set according to the `__init__` function of each module.
+
+### Add a new loss
+
+Assume you want to add a new loss as `MyLoss` for bounding box regression.
+To add a new loss function, the users need to implement it in `mmdet3d/models/losses/my_loss.py`.
+The decorator `weighted_loss` enables the loss to be weighted for each element.
+
+```python
+import torch
+import torch.nn as nn
+from mmdet.models.losses.utils import weighted_loss
+
+from mmdet3d.registry import MODELS
+
+
+@weighted_loss
+def my_loss(pred, target):
+    assert pred.size() == target.size() and target.numel() > 0
+    loss = torch.abs(pred - target)
+    return loss
+
+
+@MODELS.register_module()
+class MyLoss(nn.Module):
+
+    def __init__(self, reduction='mean', loss_weight=1.0):
+        super(MyLoss, self).__init__()
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self,
+                pred,
+                target,
+                weight=None,
+                avg_factor=None,
+                reduction_override=None):
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        loss_bbox = self.loss_weight * my_loss(
+            pred, target, weight, reduction=reduction, avg_factor=avg_factor)
+        return loss_bbox
+```
+
+Then the users need to add it in the `mmdet3d/models/losses/__init__.py`.
+
+```python
+from .my_loss import MyLoss, my_loss
+```
+
+Alternatively, you can add
+
+```python
+custom_imports=dict(
+    imports=['mmdet3d.models.losses.my_loss'],
+    allow_failed_imports=False)
+```
+
+to the config file and achieve the same goal.
+
+To use it, users should modify the `loss_xxx` field.
+Since `MyLoss` is for regression, you need to modify the `loss_bbox` field in the head.
+
+```python
+loss_bbox=dict(type='MyLoss', loss_weight=1.0)
+```
--- a/mmdetection3d/docs/en/advanced_guides/customize_runtime.md
+++ b/mmdetection3d/docs/en/advanced_guides/customize_runtime.md
+# Customize Runtime Settings
+
+## Customize optimization settings
+
+Optimization related configuration is now all managed by `optim_wrapper` which usually has three fields: `optimizer`, `paramwise_cfg`, `clip_grad`. Please refer to [OptimWrapper](https://mmengine.readthedocs.io/en/latest/tutorials/optim_wrapper.html) for more details. See the example below, where `AdamW` is used as an `optimizer`, the learning rate of the backbone is reduced by a factor of 10, and gradient clipping is added.
+
+```python
+optim_wrapper = dict(
+    type='OptimWrapper',
+    # optimizer
+    optimizer=dict(
+        type='AdamW',
+        lr=0.0001,
+        weight_decay=0.05,
+        eps=1e-8,
+        betas=(0.9, 0.999)),
+
+    # Parameter-level learning rate and weight decay settings
+    paramwise_cfg=dict(
+        custom_keys={
+            'backbone': dict(lr_mult=0.1, decay_mult=1.0),
+        },
+        norm_decay_mult=0.0),
+
+    # gradient clipping
+    clip_grad=dict(max_norm=0.01, norm_type=2))
+```
+
+### Customize optimizer supported by PyTorch
+
+We already support all the optimizers implemented by PyTorch, and the only modification needed is to change the `optimizer` field in the `optim_wrapper` field of config files. For example, if you want to use `Adam` (note that the performance could drop a lot), the modification could be as follows:
+
+```python
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='Adam', lr=0.0003, weight_decay=0.0001))
+```
+
+To modify the learning rate of the model, the users only need to modify the `lr` in `optimizer`. The users can directly set arguments following the [API doc](https://pytorch.org/docs/stable/optim.html?highlight=optim#module-torch.optim) of PyTorch.
+
+### Customize self-implemented optimizer
+
+#### 1. Define a new optimizer
+
+A customized optimizer could be defined as follows:
+
+Assume you want to add an optimizer named `MyOptimizer`, which has arguments `a`, `b`, and `c`.
+You need to create a new directory named `mmdet3d/engine/optimizers`, and then implement the new optimizer in a file, e.g., in `mmdet3d/engine/optimizers/my_optimizer.py`:
+
+```python
+from torch.optim import Optimizer
+
+from mmdet3d.registry import OPTIMIZERS
+
+
+@OPTIMIZERS.register_module()
+class MyOptimizer(Optimizer):
+
+    def __init__(self, a, b, c):
+        pass
+```
+
+#### 2. Add the optimizer to registry
+
+For the registry to find the module defined above, the module must first be imported into the main namespace. There are two options to achieve this.
+
+- Modify `mmdet3d/engine/optimizers/__init__.py` to import it.
+
+  The newly defined module should be imported in `mmdet3d/engine/optimizers/__init__.py` so that the registry will find the new module and add it:
+
+  ```python
+  from .my_optimizer import MyOptimizer
+  ```
+
+- Use `custom_imports` in the config to manually import it.
+
+  ```python
+  custom_imports = dict(imports=['mmdet3d.engine.optimizers.my_optimizer'], allow_failed_imports=False)
+  ```
+
+  The module `mmdet3d.engine.optimizers.my_optimizer` will be imported at the beginning of the program and the class `MyOptimizer` is then automatically registered.
+  Note that only the package containing the class `MyOptimizer` should be imported.
+  `mmdet3d.engine.optimizers.my_optimizer.MyOptimizer` **cannot** be imported directly.
+
+  Actually users can use a totally different file directory structure with this importing method, as long as the module root is located in `PYTHONPATH`.
+
+#### 3. Specify the optimizer in the config file
+
+Then you can use `MyOptimizer` in `optimizer` field in `optim_wrapper` field of config files. In the configs, the optimizers are defined by the field `optimizer` like the following:
+
+```python
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001))
+```
+
+To use your own optimizer, the field can be changed to:
+
+```python
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='MyOptimizer', a=a_value, b=b_value, c=c_value))
+```
+
+### Customize optimizer wrapper constructor
+
+Some models may have some parameter-specific settings for optimization, e.g. weight decay for BatchNorm layers.
+The users can do this fine-grained parameter tuning by customizing the optimizer wrapper constructor.
+
+```python
+from mmengine.optim import DefaultOptimWrapperConstructor
+
+from mmdet3d.registry import OPTIM_WRAPPER_CONSTRUCTORS
+from .my_optimizer import MyOptimizer
+
+
+@OPTIM_WRAPPER_CONSTRUCTORS.register_module()
+class MyOptimizerWrapperConstructor(DefaultOptimWrapperConstructor):
+
+    def __init__(self,
+                 optim_wrapper_cfg: dict,
+                 paramwise_cfg: Optional[dict] = None):
+        pass
+
+    def __call__(self, model: nn.Module) -> OptimWrapper:
+
+        return optim_wrapper
+```
+
+The default optimizer wrapper constructor is implemented [here](https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/default_constructor.py#L18), which could also serve as a template for the new optimizer wrapper constructor.
+
+### Additional settings
+
+Tricks not implemented by the optimizer should be implemented through the optimizer wrapper constructor (e.g., setting parameter-wise learning rates) or hooks. We list some common settings that could stabilize or accelerate the training. Feel free to create a PR or an issue for more settings.
+
+- __Use gradient clip to stabilize training__:
+  Some models need gradient clip to clip the gradients to stabilize the training process. An example is as below:
+
+  ```python
+  optim_wrapper = dict(
+      _delete_=True, clip_grad=dict(max_norm=35, norm_type=2))
+  ```
+
+  If your config inherits the base config which already sets the `optim_wrapper`, you might need `_delete_=True` to override the unnecessary settings. See the [config documentation](https://mmdetection3d.readthedocs.io/en/dev-1.x/user_guides/config.html) for more details.
+
+- __Use momentum schedule to accelerate model convergence__:
+  We support momentum scheduler to modify model's momentum according to learning rate, which could make the model converge in a faster way.
+  Momentum scheduler is usually used with LR scheduler, for example, the following config is used in [3D detection](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/configs/_base_/schedules/cyclic-20e.py) to accelerate convergence.
+  For more details, please refer to the implementation of [CosineAnnealingLR](https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py#L43) and [CosineAnnealingMomentum](https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/momentum_scheduler.py#L71).
+
+  ```python
+  param_scheduler = [
+      # learning rate scheduler
+      # During the first 8 epochs, learning rate increases from lr to lr * 10
+      # during the next 12 epochs, learning rate decreases from lr * 10 to lr * 1e-4
+      dict(
+          type='CosineAnnealingLR',
+          T_max=8,
+          eta_min=lr * 10,
+          begin=0,
+          end=8,
+          by_epoch=True,
+          convert_to_iter_based=True),
+      dict(
+          type='CosineAnnealingLR',
+          T_max=12,
+          eta_min=lr * 1e-4,
+          begin=8,
+          end=20,
+          by_epoch=True,
+          convert_to_iter_based=True),
+      # momentum scheduler
+      # During the first 8 epochs, momentum increases from 0 to 0.85 / 0.95
+      # during the next 12 epochs, momentum increases from 0.85 / 0.95 to 1
+      dict(
+          type='CosineAnnealingMomentum',
+          T_max=8,
+          eta_min=0.85 / 0.95,
+          begin=0,
+          end=8,
+          by_epoch=True,
+          convert_to_iter_based=True),
+      dict(
+          type='CosineAnnealingMomentum',
+          T_max=12,
+          eta_min=1,
+          begin=8,
+          end=20,
+          by_epoch=True,
+          convert_to_iter_based=True)
+  ]
+  ```
+
+## Customize training schedules
+
+By default we use step learning rate with 1x schedule; this calls [`MultiStepLR`](https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py#L144) in MMEngine.
+We support many other learning rate schedules [here](https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py), such as `CosineAnnealingLR` and `PolyLR` schedules. Here are some examples:
+
+- Poly schedule:
+
+  ```python
+  param_scheduler = [
+      dict(
+          type='PolyLR',
+          power=0.9,
+          eta_min=1e-4,
+          begin=0,
+          end=8,
+          by_epoch=True)]
+  ```
+
+- CosineAnnealing schedule:
+
+  ```python
+  param_scheduler = [
+      dict(
+          type='CosineAnnealingLR',
+          T_max=8,
+          eta_min=lr * 1e-5,
+          begin=0,
+          end=8,
+          by_epoch=True)]
+  ```
+
+## Customize train loop
+
+By default, `EpochBasedTrainLoop` is used in `train_cfg` and validation is done after every train epoch, as follows:
+
+```python
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_begin=1, val_interval=1)
+```
+
+Actually, both [`IterBasedTrainLoop`](https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/loops.py#L185) and [`EpochBasedTrainLoop`](https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/loops.py#L18) support dynamic interval, see the following example:
+
+```python
+# Before 365001th iteration, we do evaluation every 5000 iterations.
+# After 365000th iteration, we do evaluation every 368750 iterations,
+# which means that we do evaluation at the end of training.
+
+interval = 5000
+max_iters = 368750
+dynamic_intervals = [(max_iters // interval * interval + 1, max_iters)]
+train_cfg = dict(
+    type='IterBasedTrainLoop',
+    max_iters=max_iters,
+    val_interval=interval,
+    dynamic_intervals=dynamic_intervals)
+```
+
+## Customize hooks
+
+### Customize self-implemented hooks
+
+#### 1. Implement a new hook
+
+MMEngine provides many useful [hooks](https://mmengine.readthedocs.io/en/latest/tutorials/hook.html), but there are some occasions when the users might need to implement a new hook. MMDetection3D supports customized hooks in training based on MMEngine after v1.1.0rc0. Thus the users could implement a hook directly in mmdet3d or their mmdet3d-based codebases and use the hook by only modifying the config in training.
+Here we give an example of creating a new hook in mmdet3d and using it in training.
+
+```python
+from mmengine.hooks import Hook
+
+from mmdet3d.registry import HOOKS
+
+
+@HOOKS.register_module()
+class MyHook(Hook):
+
+    def __init__(self, a, b):
+
+    def before_run(self, runner) -> None:
+
+    def after_run(self, runner) -> None:
+
+    def before_train(self, runner) -> None:
+
+    def after_train(self, runner) -> None:
+
+    def before_train_epoch(self, runner) -> None:
+
+    def after_train_epoch(self, runner) -> None:
+
+    def before_train_iter(self,
+                          runner,
+                          batch_idx: int,
+                          data_batch: DATA_BATCH = None) -> None:
+
+    def after_train_iter(self,
+                         runner,
+                         batch_idx: int,
+                         data_batch: DATA_BATCH = None,
+                         outputs: Optional[dict] = None) -> None:
+```
+
+Depending on the functionality of the hook, users need to specify what the hook will do at each stage of the training in `before_run`, `after_run`, `before_train`, `after_train`, `before_train_epoch`, `after_train_epoch`, `before_train_iter`, and `after_train_iter`. There are more points where hooks can be inserted, refer to [base hook class](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/hook.py#L9) for more details.
+
+#### 2. Register the new hook
+
+Then we need to make `MyHook` imported. Assuming the file is in `mmdet3d/engine/hooks/my_hook.py`, there are two ways to do that:
+
+- Modify `mmdet3d/engine/hooks/__init__.py` to import it.
+
+  The newly defined module should be imported in `mmdet3d/engine/hooks/__init__.py` so that the registry will find the new module and add it:
+
+  ```python
+  from .my_hook import MyHook
+  ```
+
+- Use `custom_imports` in the config to manually import it.
+
+  ```python
+  custom_imports = dict(imports=['mmdet3d.engine.hooks.my_hook'], allow_failed_imports=False)
+  ```
+
+#### 3. Modify the config
+
+```python
+custom_hooks = [
+    dict(type='MyHook', a=a_value, b=b_value)
+]
+```
+
+You can also set the priority of the hook by setting the key `priority` to `'NORMAL'` or `'HIGHEST'` as below:
+
+```python
+custom_hooks = [
+    dict(type='MyHook', a=a_value, b=b_value, priority='NORMAL')
+]
+```
+
+By default the hook's priority is set as `NORMAL` during registration.
+
+### Use hooks implemented in MMDetection3D
+
+If the hook is already implemented in MMDetection3D, you can directly modify the config to use the hook as below.
+
+#### Example: `DisableObjectSampleHook`
+
+We implement a customized hook named [DisableObjectSampleHook](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/engine/hooks/disable_object_sample_hook.py) to disable `ObjectSample` augmentation during training after specified epoch.
+
+We can set it in the config file if needed:
+
+```python
+custom_hooks = [dict(type='DisableObjectSampleHook', disable_after_epoch=15)]
+```
+
+### Modify default runtime hooks
+
+There are some common hooks that are registered through `default_hooks`. They are:
+
+- `IterTimerHook`: A hook that logs 'data_time' for loading data and 'time' for a model training step.
+- `LoggerHook`: A hook that collects logs from different components of `Runner` and writes them to terminal, json file, tensorboard and wandb etc.
+- `ParamSchedulerHook`: A hook that updates some hyper-parameters in optimizer, e.g., learning rate and momentum.
+- `CheckpointHook`: A hook that saves checkpoints periodically.
+- `DistSamplerSeedHook`: A hook that sets the seed for sampler and batch_sampler.
+- `Det3DVisualizationHook`: A hook used to visualize validation and testing process prediction results.
+
+`IterTimerHook`, `ParamSchedulerHook` and `DistSamplerSeedHook` are simple and usually do not need to be modified, so here we show what we can do with `LoggerHook`, `CheckpointHook` and `Det3DVisualizationHook`.
+
+#### CheckpointHook
+
+In addition to saving checkpoints periodically, [`CheckpointHook`](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/checkpoint_hook.py#L18) provides other options such as `max_keep_ckpts`, `save_optimizer`, etc. The users could set `max_keep_ckpts` to only save a small number of checkpoints or decide whether to store the state dict of the optimizer by `save_optimizer`. More details of the arguments are [here](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/checkpoint_hook.py#L18).
+
+```python
+default_hooks = dict(
+    checkpoint=dict(
+        type='CheckpointHook',
+        interval=1,
+        max_keep_ckpts=3,
+        save_optimizer=True))
+```
+
+#### LoggerHook
+
+The `LoggerHook` enables setting intervals. Detailed instructions can be found in the [docstring](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/logger_hook.py#L19).
+
+```python
+default_hooks = dict(logger=dict(type='LoggerHook', interval=50))
+```
+
+#### Det3DVisualizationHook
+
+`Det3DVisualizationHook` uses `Det3DLocalVisualizer` to visualize prediction results, and `Det3DLocalVisualizer` currently supports different backends, e.g., `TensorboardVisBackend` and `WandbVisBackend` (see [docstring](https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py) for more details). The users could add multiple backends to do visualization as follows.
+
+```python
+default_hooks = dict(
+    visualization=dict(type='Det3DVisualizationHook', draw=True))
+
+vis_backends = [dict(type='LocalVisBackend'),
+                dict(type='TensorboardVisBackend')]
+visualizer = dict(
+    type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+```
--- a/mmdetection3d/docs/en/advanced_guides/datasets/index.rst
+++ b/mmdetection3d/docs/en/advanced_guides/datasets/index.rst
+.. toctree::
+   :maxdepth: 3
+
+   kitti.md
+   nuscenes.md
+   lyft.md
+   waymo.md
+   sunrgbd.md
+   scannet.md
+   s3dis.md
+   semantickitti.md
--- a/mmdetection3d/docs/en/advanced_guides/datasets/kitti.md
+++ b/mmdetection3d/docs/en/advanced_guides/datasets/kitti.md
+# KITTI Dataset
+
+This page provides specific tutorials about the usage of MMDetection3D for KITTI dataset.
+
+## Prepare dataset
+
+You can download KITTI 3D detection data [HERE](http://www.cvlibs.net/datasets/kitti/eval_object.php?obj_benchmark=3d) and unzip all zip files. Besides, the road planes could be downloaded from [HERE](https://download.openmmlab.com/mmdetection3d/data/train_planes.zip), which are optional for data augmentation during training for better performance. The road planes are generated by [AVOD](https://github.com/kujason/avod), you can see more details [HERE](https://github.com/kujason/avod/issues/19).
+
+Like the general way to prepare dataset, it is recommended to symlink the dataset root to `$MMDETECTION3D/data`.
+
+The folder structure should be organized as follows before our processing.
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── kitti
+│   │   ├── ImageSets
+│   │   ├── testing
+│   │   │   ├── calib
+│   │   │   ├── image_2
+│   │   │   ├── velodyne
+│   │   ├── training
+│   │   │   ├── calib
+│   │   │   ├── image_2
+│   │   │   ├── label_2
+│   │   │   ├── velodyne
+│   │   │   ├── planes (optional)
+```
+
+### Create KITTI dataset
+
+To create KITTI point cloud data, we load the raw point cloud data and generate the relevant annotations including object labels and bounding boxes. We also generate all single training objects' point cloud in KITTI dataset and save them as `.bin` files in `data/kitti/kitti_gt_database`. Meanwhile, `.pkl` info files are also generated for training or validation. Subsequently, create KITTI data by running:
+
+```bash
+mkdir ./data/kitti/ && mkdir ./data/kitti/ImageSets
+
+# Download data split
+wget -c  https://raw.githubusercontent.com/traveller59/second.pytorch/master/second/data/ImageSets/test.txt --no-check-certificate --content-disposition -O ./data/kitti/ImageSets/test.txt
+wget -c  https://raw.githubusercontent.com/traveller59/second.pytorch/master/second/data/ImageSets/train.txt --no-check-certificate --content-disposition -O ./data/kitti/ImageSets/train.txt
+wget -c  https://raw.githubusercontent.com/traveller59/second.pytorch/master/second/data/ImageSets/val.txt --no-check-certificate --content-disposition -O ./data/kitti/ImageSets/val.txt
+wget -c  https://raw.githubusercontent.com/traveller59/second.pytorch/master/second/data/ImageSets/trainval.txt --no-check-certificate --content-disposition -O ./data/kitti/ImageSets/trainval.txt
+
+python tools/create_data.py kitti --root-path ./data/kitti --out-dir ./data/kitti --extra-tag kitti --with-plane
+```
+
+Note that if your local disk does not have enough space for saving converted data, you can change the `--out-dir` to anywhere else, and you need to remove the `--with-plane` flag if `planes` are not prepared.
+
+The folder structure after processing should be as below
+
+```
+kitti
+├── ImageSets
+│   ├── test.txt
+│   ├── train.txt
+│   ├── trainval.txt
+│   ├── val.txt
+├── testing
+│   ├── calib
+│   ├── image_2
+│   ├── velodyne
+│   ├── velodyne_reduced
+├── training
+│   ├── calib
+│   ├── image_2
+│   ├── label_2
+│   ├── velodyne
+│   ├── velodyne_reduced
+│   ├── planes (optional)
+├── kitti_gt_database
+│   ├── xxxxx.bin
+├── kitti_infos_train.pkl
+├── kitti_infos_val.pkl
+├── kitti_dbinfos_train.pkl
+├── kitti_infos_test.pkl
+├── kitti_infos_trainval.pkl
+```
+
+- `kitti_gt_database/xxxxx.bin`: point cloud data included in each 3D bounding box of the training dataset.
+- `kitti_infos_train.pkl`: training dataset, a dict contains two keys: `metainfo` and `data_list`.
+  `metainfo` contains the basic information for the dataset itself, such as `categories`, `dataset` and `info_version`, while `data_list` is a list of dict, each dict (hereinafter referred to as `info`) contains all the detailed information of single sample as follows:
+  - info\['sample_idx'\]: The index of this sample in the whole dataset.
+  - info\['images'\]: Information of images captured by multiple cameras. A dict contains five keys including: `CAM0`, `CAM1`, `CAM2`, `CAM3`, `R0_rect`.
+    - info\['images'\]\['R0_rect'\]: Rectifying rotation matrix with shape (4, 4).
+    - info\['images'\]\['CAM2'\]: Include some information about the `CAM2` camera sensor.
+      - info\['images'\]\['CAM2'\]\['img_path'\]: The filename of the image.
+      - info\['images'\]\['CAM2'\]\['height'\]: The height of the image.
+      - info\['images'\]\['CAM2'\]\['width'\]: The width of the image.
+      - info\['images'\]\['CAM2'\]\['cam2img'\]: Transformation matrix from camera to image with shape (4, 4).
+      - info\['images'\]\['CAM2'\]\['lidar2cam'\]: Transformation matrix from lidar to camera with shape (4, 4).
+      - info\['images'\]\['CAM2'\]\['lidar2img'\]: Transformation matrix from lidar to image with shape (4, 4).
+  - info\['lidar_points'\]: A dict containing all the information related to the lidar points.
+    - info\['lidar_points'\]\['lidar_path'\]: The filename of the lidar point cloud data.
+    - info\['lidar_points'\]\['num_pts_feats'\]: The feature dimension of point.
+    - info\['lidar_points'\]\['Tr_velo_to_cam'\]: Transformation from Velodyne coordinate to camera coordinate with shape (4, 4).
+    - info\['lidar_points'\]\['Tr_imu_to_velo'\]: Transformation from IMU coordinate to Velodyne coordinate with shape (4, 4).
+  - info\['instances'\]: It is a list of dict. Each dict contains all annotation information of single instance. For the i-th instance:
+    - info\['instances'\]\[i\]\['bbox'\]: List of 4 numbers representing the 2D bounding box of the instance, in (x1, y1, x2, y2) order.
+    - info\['instances'\]\[i\]\['bbox_3d'\]: List of 7 numbers representing the 3D bounding box of the instance, in (x, y, z, l, h, w, yaw) order.
+    - info\['instances'\]\[i\]\['bbox_label'\]: An int indicating the 2D label of the instance; -1 indicates ignore.
+    - info\['instances'\]\[i\]\['bbox_label_3d'\]: An int indicating the 3D label of the instance; -1 indicates ignore.
+    - info\['instances'\]\[i\]\['depth'\]: Projected center depth of the 3D bounding box with respect to the image plane.
+    - info\['instances'\]\[i\]\['num_lidar_pts'\]: The number of LiDAR points in the 3D bounding box.
+    - info\['instances'\]\[i\]\['center_2d'\]: Projected 2D center of the 3D bounding box.
+    - info\['instances'\]\[i\]\['difficulty'\]: KITTI difficulty: 'Easy', 'Moderate', 'Hard'.
+    - info\['instances'\]\[i\]\['truncated'\]: Float from 0 (non-truncated) to 1 (truncated), where truncated refers to the object leaving image boundaries.
+    - info\['instances'\]\[i\]\['occluded'\]: Integer (0,1,2,3) indicating occlusion state: 0 = fully visible, 1 = partly occluded, 2 = largely occluded, 3 = unknown.
+    - info\['instances'\]\[i\]\['group_ids'\]: Used for multi-part object.
+  - info\['plane'\](optional): Road level information.
+
+Please refer to [kitti_converter.py](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/tools/dataset_converters/kitti_converter.py) and [update_infos_to_v2.py](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/tools/dataset_converters/update_infos_to_v2.py) for more details.
+
+## Train pipeline
+
+A typical train pipeline of 3D detection on KITTI is as below:
+
+```python
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4, # x, y, z, intensity
+        use_dim=4),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(type='ObjectSample', db_sampler=db_sampler),
+    dict(
+        type='ObjectNoise',
+        num_try=100,
+        translation_std=[1.0, 1.0, 0.5],
+        global_rot_range=[0.0, 0.0],
+        rot_range=[-0.78539816, 0.78539816]),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05]),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+```
+
+- Data augmentation:
+  - `ObjectNoise`: apply noise to each GT object in the scene.
+  - `RandomFlip3D`: randomly flip input point cloud horizontally or vertically.
+  - `GlobalRotScaleTrans`: rotate and scale input point cloud.
+
+## Evaluation
+
+An example to evaluate PointPillars with 8 GPUs with kitti metrics is as follows:
+
+```shell
+bash tools/dist_test.sh configs/pointpillars/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class.py work_dirs/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class/latest.pth 8
+```
+
+## Metrics
+
+KITTI evaluates 3D object detection performance using mean Average Precision (mAP) and Average Orientation Similarity (AOS). Please refer to its [official website](http://www.cvlibs.net/datasets/kitti/eval_3dobject.php) and [original paper](http://www.cvlibs.net/publications/Geiger2012CVPR.pdf) for more details.
+
+We also adopt this approach for evaluation on KITTI. An example of printed evaluation results is as follows:
+
+```
+Car AP@0.70, 0.70, 0.70:
+bbox AP:97.9252, 89.6183, 88.1564
+bev  AP:90.4196, 87.9491, 85.1700
+3d   AP:88.3891, 77.1624, 74.4654
+aos  AP:97.70, 89.11, 87.38
+Car AP@0.70, 0.50, 0.50:
+bbox AP:97.9252, 89.6183, 88.1564
+bev  AP:98.3509, 90.2042, 89.6102
+3d   AP:98.2800, 90.1480, 89.4736
+aos  AP:97.70, 89.11, 87.38
+```
+
+## Testing and make a submission
+
+An example to test PointPillars on KITTI with 8 GPUs and generate a submission to the leaderboard is as follows:
+
+- First, you need to modify the `test_dataloader` and `test_evaluator` dict in your config file, just like:
+
+  ```python
+  data_root = 'data/kitti/'
+  test_dataloader = dict(
+      dataset=dict(
+          ann_file='kitti_infos_test.pkl',
+          load_eval_anns=False,
+          data_prefix=dict(pts='testing/velodyne_reduced')))
+  test_evaluator = dict(
+      ann_file=data_root + 'kitti_infos_test.pkl',
+      format_only=True,
+      pklfile_prefix='results/kitti-3class/kitti_results',
+      submission_prefix='results/kitti-3class/kitti_results')
+  ```
+
+- And then, you can run the test script.
+
+  ```shell
+  ./tools/dist_test.sh configs/pointpillars/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class.py work_dirs/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class/latest.pth 8
+  ```
+
+After generating `results/kitti-3class/kitti_results/xxxxx.txt` files, you can submit these files to KITTI benchmark. Please refer to the [KITTI official website](http://www.cvlibs.net/datasets/kitti/index.php) for more details.
--- a/mmdetection3d/docs/en/advanced_guides/datasets/lyft.md
+++ b/mmdetection3d/docs/en/advanced_guides/datasets/lyft.md
+# Lyft Dataset
+
+This page provides specific tutorials about the usage of MMDetection3D for Lyft dataset.
+
+## Before Preparation
+
+You can download Lyft 3D detection data [HERE](https://www.kaggle.com/c/3d-object-detection-for-autonomous-vehicles/data) and unzip all zip files.
+
+Like the general way to prepare a dataset, it is recommended to symlink the dataset root to `$MMDETECTION3D/data`.
+
+The folder structure should be organized as follows before our processing.
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── lyft
+│   │   ├── v1.01-train
+│   │   │   ├── v1.01-train (train_data)
+│   │   │   ├── lidar (train_lidar)
+│   │   │   ├── images (train_images)
+│   │   │   ├── maps (train_maps)
+│   │   ├── v1.01-test
+│   │   │   ├── v1.01-test (test_data)
+│   │   │   ├── lidar (test_lidar)
+│   │   │   ├── images (test_images)
+│   │   │   ├── maps (test_maps)
+│   │   ├── train.txt
+│   │   ├── val.txt
+│   │   ├── test.txt
+│   │   ├── sample_submission.csv
+```
+
+Here `v1.01-train` and `v1.01-test` contain the metafiles which are similar to those of nuScenes. `.txt` files contain the data split information.
+Lyft does not have an official split for training and validation set, so we provide a split considering the number of objects from different categories in different scenes.
+`sample_submission.csv` is the base file for submission on the Kaggle evaluation server.
+Note that we follow the original folder names for clear organization. Please rename the raw folders as shown above.
+
+## Dataset Preparation
+
+The way to organize Lyft dataset is similar to nuScenes. We also generate the `.pkl` files which share almost the same structure.
+Next, we will mainly focus on the difference between these two datasets. For a more detailed explanation of the info structure, please refer to [nuScenes tutorial](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/docs/en/advanced_guides/datasets/nuscenes.md).
+
+To prepare info files for Lyft, run the following commands:
+
+```bash
+python tools/create_data.py lyft --root-path ./data/lyft --out-dir ./data/lyft --extra-tag lyft --version v1.01
+python tools/dataset_converters/lyft_data_fixer.py --version v1.01 --root-folder ./data/lyft
+```
+
+Note that the second command serves the purpose of fixing a corrupted lidar data file. Please refer to the discussion [here](https://www.kaggle.com/c/3d-object-detection-for-autonomous-vehicles/discussion/110000) for more details.
+
+The folder structure after processing should be as below.
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── lyft
+│   │   ├── v1.01-train
+│   │   │   ├── v1.01-train (train_data)
+│   │   │   ├── lidar (train_lidar)
+│   │   │   ├── images (train_images)
+│   │   │   ├── maps (train_maps)
+│   │   ├── v1.01-test
+│   │   │   ├── v1.01-test (test_data)
+│   │   │   ├── lidar (test_lidar)
+│   │   │   ├── images (test_images)
+│   │   │   ├── maps (test_maps)
+│   │   ├── train.txt
+│   │   ├── val.txt
+│   │   ├── test.txt
+│   │   ├── sample_submission.csv
+│   │   ├── lyft_infos_train.pkl
+│   │   ├── lyft_infos_val.pkl
+│   │   ├── lyft_infos_test.pkl
+```
+
+- `lyft_infos_train.pkl`: training dataset, a dict contains two keys: `metainfo` and `data_list`.
+  `metainfo` contains the basic information for the dataset itself, such as `categories`, `dataset` and `info_version`, while `data_list` is a list of dict, each dict (hereinafter referred to as `info`) contains all the detailed information of single sample as follows:
+  - info\['sample_idx'\]: The index of this sample in the whole dataset.
+  - info\['token'\]: Sample data token.
+  - info\['timestamp'\]: Timestamp of the sample data.
+  - info\['lidar_points'\]: A dict containing all the information related to the lidar points.
+    - info\['lidar_points'\]\['lidar_path'\]: The filename of the lidar point cloud data.
+    - info\['lidar_points'\]\['num_pts_feats'\]: The feature dimension of point.
+    - info\['lidar_points'\]\['lidar2ego'\]: The transformation matrix from this lidar sensor to ego vehicle. (4x4 list)
+    - info\['lidar_points'\]\['ego2global'\]: The transformation matrix from the ego vehicle to global coordinates. (4x4 list)
+  - info\['lidar_sweeps'\]: A list contains sweeps information (The intermediate lidar frames without annotations).
+    - info\['lidar_sweeps'\]\[i\]\['lidar_points'\]\['data_path'\]: The lidar data path of i-th sweep.
+    - info\['lidar_sweeps'\]\[i\]\['lidar_points'\]\['lidar2ego'\]: The transformation matrix from this lidar sensor to ego vehicle in i-th sweep timestamp. (4x4 list)
+    - info\['lidar_sweeps'\]\[i\]\['lidar_points'\]\['ego2global'\]: The transformation matrix from the ego vehicle in i-th sweep timestamp to global coordinates. (4x4 list)
+    - info\['lidar_sweeps'\]\[i\]\['lidar2sensor'\]: The transformation matrix from the keyframe lidar to the i-th frame lidar. (4x4 list)
+    - info\['lidar_sweeps'\]\[i\]\['timestamp'\]: Timestamp of the sweep data.
+    - info\['lidar_sweeps'\]\[i\]\['sample_data_token'\]: The sweep sample data token.
+  - info\['images'\]: A dict contains six keys corresponding to each camera: `'CAM_FRONT'`, `'CAM_FRONT_RIGHT'`, `'CAM_FRONT_LEFT'`, `'CAM_BACK'`, `'CAM_BACK_LEFT'`, `'CAM_BACK_RIGHT'`. Each dict contains all data information related to  corresponding camera.
+    - info\['images'\]\['CAM_XXX'\]\['img_path'\]: The filename of the image.
+    - info\['images'\]\['CAM_XXX'\]\['cam2img'\]: The transformation matrix recording the intrinsic parameters when projecting 3D points to each image plane. (3x3 list)
+    - info\['images'\]\['CAM_XXX'\]\['sample_data_token'\]: Sample data token of image.
+    - info\['images'\]\['CAM_XXX'\]\['timestamp'\]: Timestamp of the image.
+    - info\['images'\]\['CAM_XXX'\]\['cam2ego'\]: The transformation matrix from this camera sensor to ego vehicle. (4x4 list)
+    - info\['images'\]\['CAM_XXX'\]\['lidar2cam'\]: The transformation matrix from lidar sensor to this camera. (4x4 list)
+  - info\['instances'\]: It is a list of dict. Each dict contains all annotation information of single instance. For the i-th instance:
+    - info\['instances'\]\[i\]\['bbox_3d'\]: List of 7 numbers representing the 3D bounding box in lidar coordinate system of the instance, in (x, y, z, l, w, h, yaw) order.
+    - info\['instances'\]\[i\]\['bbox_label_3d'\]: An int starting from 0 indicating the label of the instance, while -1 indicates the ignore class.
+    - info\['instances'\]\[i\]\['bbox_3d_isvalid'\]: Whether each bounding box is valid. In general, we only take the 3D boxes that include at least one lidar or radar point as valid boxes.
+
+Next, we will elaborate on the difference compared to nuScenes in terms of the details recorded in these info files.
+
+- Without `lyft_database/xxxxx.bin`: This folder and `.bin` files are not extracted on the Lyft dataset due to the negligible effect of ground-truth sampling in the experiments.
+
+- `lyft_infos_train.pkl`:
+
+  - Without info\['instances'\]\[i\]\['velocity'\]: There is no velocity measurement on Lyft.
+  - Without info\['instances'\]\[i\]\['num_lidar_pts'\] and info\['instances'\]\[i\]\['num_radar_pts'\].
+
+Here we only explain the data recorded in the training info files. The same applies to the validation set and test set (without instances).
+
+Please refer to [lyft_converter.py](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/tools/dataset_converters/lyft_converter.py) for more details about the structure of `lyft_infos_xxx.pkl`.
+
+## Training pipeline
+
+### LiDAR-Based Methods
+
+A typical training pipeline of LiDAR-based 3D detection (including multi-modality methods) on Lyft is almost the same as nuScenes as below.
+
+```python
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.3925, 0.3925],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0, 0, 0]),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+```
+
+Similar to nuScenes, models on Lyft also need the `'LoadPointsFromMultiSweeps'` pipeline to load point clouds from consecutive frames.
+In addition, considering the intensity of LiDAR points collected by Lyft is invalid, we also set the `use_dim` in `'LoadPointsFromMultiSweeps'` to `[0, 1, 2, 4]` by default,
+where the first 3 dimensions refer to point coordinates, and the last refers to timestamp differences.
+
+## Evaluation
+
+An example to evaluate PointPillars with 8 GPUs with Lyft metrics is as follows:
+
+```shell
+bash ./tools/dist_test.sh configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb2-2x_lyft-3d.py checkpoints/hv_pointpillars_fpn_sbn-all_2x8_2x_lyft-3d_20210517_202818-fc6904c3.pth 8
+```
+
+## Metrics
+
+Lyft proposes a more strict metric for evaluating the predicted 3D bounding boxes.
+The basic criterion to judge whether a predicted box is positive or not is the same as KITTI, i.e. the 3D Intersection over Union (IoU).
+However, it adopts a way similar to COCO to compute the mean average precision (mAP) -- compute the average precision under different thresholds of 3D IoU from 0.5-0.95.
+Actually, overlap more than 0.7 3D IoU is a quite strict criterion for 3D detection methods, so the overall performance seems a little low.
+The imbalance of annotations for different categories is another important reason for the finally lower results compared to other datasets.
+Please refer to its [official website](https://www.kaggle.com/c/3d-object-detection-for-autonomous-vehicles/overview/evaluation) for more details about the definition of this metric.
+
+We employ this official method for evaluation on Lyft. An example of printed evaluation results is as follows:
+
+```
+mAPs@0.5:0.95------+--------------+
+| class             | mAP@0.5:0.95 |
+-------------------+--------------+
+| animal            | 0.0          |
+| bicycle           | 0.099        |
+| bus               | 0.177        |
+| car               | 0.422        |
+| emergency_vehicle | 0.0          |
+| motorcycle        | 0.049        |
+| other_vehicle     | 0.359        |
+| pedestrian        | 0.066        |
+| truck             | 0.176        |
+| Overall           | 0.15         |
+-------------------+--------------+
+```
+
+## Testing and make a submission
+
+An example to test PointPillars on Lyft with 8 GPUs and generate a submission to the leaderboard is as follows.
+
+```shell
+./tools/dist_test.sh configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb2-2x_lyft-3d.py work_dirs/pp-lyft/latest.pth 8 --cfg-options test_evaluator.jsonfile_prefix=work_dirs/pp-lyft/results_challenge  test_evaluator.csv_savepath=results/pp-lyft/results_challenge.csv
+```
+
+After generating the `results/pp-lyft/results_challenge.csv`, you can submit it to the Kaggle evaluation server. Please refer to the [official website](https://www.kaggle.com/c/3d-object-detection-for-autonomous-vehicles) for more information.
+
+We can also visualize the prediction results with our developed visualization tools. Please refer to the [visualization doc](https://mmdetection3d.readthedocs.io/en/latest/useful_tools.html#visualization) for more details.
--- a/mmdetection3d/docs/en/advanced_guides/datasets/nuscenes.md
+++ b/mmdetection3d/docs/en/advanced_guides/datasets/nuscenes.md
+# NuScenes Dataset
+
+This page provides specific tutorials about the usage of MMDetection3D for nuScenes dataset.
+
+## Before Preparation
+
+You can download nuScenes 3D detection `Full dataset (v1.0)` [HERE](https://www.nuscenes.org/download) and unzip all zip files.
+
+If you want to implement 3D semantic segmentation task, you need to additionally download the `nuScenes-lidarseg` data annotation and place the extracted files in the nuScenes corresponding folder.
+
+**Note**: `v1.0-trainval(test)/category.json` in nuScenes-lidarseg will replace the original `v1.0-trainval(test)/category.json` of the Full dataset (v1.0), but will not affect the 3D object detection task.
+
+Like the general way to prepare dataset, it is recommended to symlink the dataset root to `$MMDETECTION3D/data`.
+
+The folder structure should be organized as follows before our processing.
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── nuscenes
+│   │   ├── maps
+│   │   ├── samples
+│   │   ├── sweeps
+│   │   ├── lidarseg (optional)
+│   │   ├── v1.0-test
+│   │   ├── v1.0-trainval
+```
+
+## Dataset Preparation
+
+We typically need to organize the useful data information with a `.pkl` file in a specific style.
+To prepare these files for nuScenes, run the following command:
+
+```bash
+python tools/create_data.py nuscenes --root-path ./data/nuscenes --out-dir ./data/nuscenes --extra-tag nuscenes
+```
+
+The folder structure after processing should be as below.
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── nuscenes
+│   │   ├── maps
+│   │   ├── samples
+│   │   ├── sweeps
+│   │   ├── lidarseg (optional)
+│   │   ├── v1.0-test
+│   │   ├── v1.0-trainval
+│   │   ├── nuscenes_database
+│   │   ├── nuscenes_infos_train.pkl
+│   │   ├── nuscenes_infos_val.pkl
+│   │   ├── nuscenes_infos_test.pkl
+│   │   ├── nuscenes_dbinfos_train.pkl
+```
+
+- `nuscenes_database/xxxxx.bin`: point cloud data included in each 3D bounding box of the training dataset
+- `nuscenes_infos_train.pkl`: training dataset, a dict containing two keys: `metainfo` and `data_list`.
+  `metainfo` contains the basic information for the dataset itself, such as `categories`, `dataset` and `info_version`, while `data_list` is a list of dict, each dict (hereinafter referred to as `info`) contains all the detailed information of single sample as follows:
+  - info\['sample_idx'\]: The index of this sample in the whole dataset.
+  - info\['token'\]: Sample data token.
+  - info\['timestamp'\]: Timestamp of the sample data.
+  - info\['ego2global'\]: The transformation matrix from the ego vehicle to global coordinates. (4x4 list)
+  - info\['lidar_points'\]: A dict containing all the information related to the lidar points.
+    - info\['lidar_points'\]\['lidar_path'\]: The filename of the lidar point cloud data.
+    - info\['lidar_points'\]\['num_pts_feats'\]: The feature dimension of point.
+    - info\['lidar_points'\]\['lidar2ego'\]: The transformation matrix from this lidar sensor to ego vehicle. (4x4 list)
+  - info\['lidar_sweeps'\]: A list contains sweeps information (The intermediate lidar frames without annotations)
+    - info\['lidar_sweeps'\]\[i\]\['lidar_points'\]\['data_path'\]: The lidar data path of i-th sweep.
+    - info\['lidar_sweeps'\]\[i\]\['lidar_points'\]\['lidar2ego'\]: The transformation matrix from this lidar sensor to ego vehicle. (4x4 list)
+    - info\['lidar_sweeps'\]\[i\]\['lidar_points'\]\['ego2global'\]: The transformation matrix from the ego vehicle to global coordinates. (4x4 list)
+    - info\['lidar_sweeps'\]\[i\]\['lidar2sensor'\]: The transformation matrix from the main lidar sensor to the current sensor (for collecting the sweep data). (4x4 list)
+    - info\['lidar_sweeps'\]\[i\]\['timestamp'\]: Timestamp of the sweep data.
+    - info\['lidar_sweeps'\]\[i\]\['sample_data_token'\]: The sweep sample data token.
+  - info\['images'\]: A dict contains six keys corresponding to each camera: `'CAM_FRONT'`, `'CAM_FRONT_RIGHT'`, `'CAM_FRONT_LEFT'`, `'CAM_BACK'`, `'CAM_BACK_LEFT'`, `'CAM_BACK_RIGHT'`. Each dict contains all data information related to  corresponding camera.
+    - info\['images'\]\['CAM_XXX'\]\['img_path'\]: The filename of the image.
+    - info\['images'\]\['CAM_XXX'\]\['cam2img'\]: The transformation matrix recording the intrinsic parameters when projecting 3D points to each image plane. (3x3 list)
+    - info\['images'\]\['CAM_XXX'\]\['sample_data_token'\]: Sample data token of image.
+    - info\['images'\]\['CAM_XXX'\]\['timestamp'\]: Timestamp of the image.
+    - info\['images'\]\['CAM_XXX'\]\['cam2ego'\]: The transformation matrix from this camera sensor to ego vehicle. (4x4 list)
+    - info\['images'\]\['CAM_XXX'\]\['lidar2cam'\]: The transformation matrix from lidar sensor to this camera. (4x4 list)
+  - info\['instances'\]: It is a list of dict. Each dict contains all annotation information of single instance. For the i-th instance:
+    - info\['instances'\]\[i\]\['bbox_3d'\]: List of 7 numbers representing the 3D bounding box of the instance, in (x, y, z, l, w, h, yaw) order.
+    - info\['instances'\]\[i\]\['bbox_label_3d'\]: An int indicating the label of the instance; -1 indicates that the instance is ignored.
+    - info\['instances'\]\[i\]\['velocity'\]: Velocities of 3D bounding boxes (no vertical measurements due to inaccuracy), a list with shape (2,).
+    - info\['instances'\]\[i\]\['num_lidar_pts'\]: Number of lidar points included in each 3D bounding box.
+    - info\['instances'\]\[i\]\['num_radar_pts'\]: Number of radar points included in each 3D bounding box.
+    - info\['instances'\]\[i\]\['bbox_3d_isvalid'\]: Whether each bounding box is valid. In general, we only take the 3D boxes that include at least one lidar or radar point as valid boxes.
+  - info\['cam_instances'\]: It is a dict containing keys `'CAM_FRONT'`, `'CAM_FRONT_RIGHT'`, `'CAM_FRONT_LEFT'`, `'CAM_BACK'`, `'CAM_BACK_LEFT'`, `'CAM_BACK_RIGHT'`. For vision-based 3D object detection task, we split 3D annotations of the whole scenes according to the camera they belong to. For the i-th instance:
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox_label'\]: Label of instance.
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox_label_3d'\]: Label of instance.
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox'\]: 2D bounding box annotation (exterior rectangle of the projected 3D box), a list arrange as \[x1, y1, x2, y2\].
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['center_2d'\]: Projected center location on the image, a list with shape (2,).
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['depth'\]: The depth of projected center.
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['velocity'\]: Velocities of 3D bounding boxes (no vertical measurements due to inaccuracy), a list has shape (2,).
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['attr_label'\]: The attr label of instance. We maintain a default attribute collection and mapping for attribute classification.
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox_3d'\]: List of 7 numbers representing the 3D bounding box of the instance, in (x, y, z, l, h, w, yaw) order.
+  - info\['pts_semantic_mask_path'\]: The filename of the lidar point cloud semantic segmentation annotation.
+
+Note:
+
+1. The differences between `bbox_3d` in `instances` and that in `cam_instances`.
+   Both `bbox_3d` have been converted to MMDet3D coordinate system, but `bbox_3d` in `instances` is in LiDAR coordinate format and `bbox_3d` in `cam_instances` is in Camera coordinate format. Mind the difference between them in 3D Box representation ('l, w, h' and 'l, h, w').
+
+2. Here we only explain the data recorded in the training info files. The same applies to validation and testing set (the `.pkl` file of test set does not contain `instances` and `cam_instances`).
+
+The core function to get `nuscenes_infos_xxx.pkl` is  [\_fill_trainval_infos](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/tools/dataset_converters/nuscenes_converter.py#L146).
+Please refer to [nuscenes_converter.py](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/tools/dataset_converters/nuscenes_converter.py) for more details.
+
+## Training pipeline
+
+### LiDAR-Based Methods
+
+A typical training pipeline of LiDAR-based 3D detection (including multi-modality methods) on nuScenes is as below.
+
+```python
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.3925, 0.3925],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0, 0, 0]),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+```
+
+Compared to general cases, nuScenes has a specific `'LoadPointsFromMultiSweeps'` pipeline to load point clouds from consecutive frames. This is a common practice used in this setting.
+Please refer to the nuScenes [original paper](https://arxiv.org/abs/1903.11027) for more details.
+The default `use_dim` in `'LoadPointsFromMultiSweeps'` is `[0, 1, 2, 4]`, where the first 3 dimensions refer to point coordinates and the last refers to timestamp differences.
+Intensity is not used by default due to its yielded noise when concatenating the points from different frames.
+
+### Vision-Based Methods
+
+#### Monocular-based
+
+In the NuScenes dataset, for multi-view images, this paradigm usually involves detecting and outputting 3D object detection results separately for each image, and then obtaining the final detection results through post-processing (such as NMS). Essentially, it directly extends monocular 3D detection to multi-view settings. A typical training pipeline of image-based monocular 3D detection on nuScenes is as below.
+
+```python
+train_pipeline = [
+    dict(type='LoadImageFromFileMono3D'),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox=True,
+        with_label=True,
+        with_attr_label=True,
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_bbox_depth=True),
+    dict(type='mmdet.Resize', scale=(1600, 900), keep_ratio=True),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(
+        type='Pack3DDetInputs',
+        keys=[
+            'img', 'gt_bboxes', 'gt_bboxes_labels', 'attr_labels', 'gt_bboxes_3d',
+            'gt_labels_3d', 'centers_2d', 'depths'
+        ]),
+]
+```
+
+It follows the general pipeline of 2D detection while differs in some details:
+
+- It uses monocular pipelines to load images, which includes additional required information like camera intrinsics.
+- It needs to load 3D annotations.
+- Some data augmentation techniques need to be adjusted, such as `RandomFlip3D`.
+  Currently we do not support more augmentation methods, because how to transfer and apply other techniques is still under-explored.
+
+#### BEV-based
+
+BEV, Bird's-Eye-View, is another popular 3D detection paradigm. It directly takes multi-view images to perform 3D detection, for nuScenes, they are `CAM_FRONT`, `CAM_FRONT_LEFT`, `CAM_FRONT_RIGHT`, `CAM_BACK`, `CAM_BACK_LEFT` and `CAM_BACK_RIGHT`. A basic training pipeline of bev-based 3D detection on nuScenes is as below.
+
+```python
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+train_transforms = [
+    dict(type='PhotoMetricDistortion3D'),
+    dict(
+        type='RandomResize3D',
+        scale=(1600, 900),
+        ratio_range=(1., 1.),
+        keep_ratio=True)
+]
+train_pipeline = [
+    dict(type='LoadMultiViewImageFromFiles',
+         to_float32=True,
+         num_views=6, ),
+    dict(type='LoadAnnotations3D',
+         with_bbox_3d=True,
+         with_label_3d=True,
+         with_attr_label=False),
+    # optional, data augmentation
+    dict(type='MultiViewWrapper', transforms=train_transforms),
+    # optional, filter object within specific point cloud range
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    # optional, filter object of specific classes
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='Pack3DDetInputs', keys=['img', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+```
+
+To load multi-view images, a small modification should be made to the dataset.
+
+```python
+data_prefix = dict(
+    CAM_FRONT='samples/CAM_FRONT',
+    CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT',
+    CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT',
+    CAM_BACK='samples/CAM_BACK',
+    CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
+    CAM_BACK_LEFT='samples/CAM_BACK_LEFT',
+)
+train_dataloader = dict(
+    batch_size=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type="NuScenesDataset",
+        data_root="./data/nuScenes",
+        ann_file="nuscenes_infos_train.pkl",
+        data_prefix=data_prefix,
+        modality=dict(use_camera=True, use_lidar=False, ),
+        pipeline=train_pipeline,
+        test_mode=False, )
+)
+```
+
+## Evaluation
+
+An example to evaluate PointPillars with 8 GPUs with nuScenes metrics is as follows.
+
+```shell
+bash ./tools/dist_test.sh configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb4-2x_nus-3d.py checkpoints/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d_20200620_230405-2fa62f3d.pth 8
+```
+
+## Metrics
+
+NuScenes proposes a comprehensive metric, namely nuScenes detection score (NDS), to evaluate different methods and set up the benchmark.
+It consists of mean Average Precision (mAP), Average Translation Error (ATE), Average Scale Error (ASE), Average Orientation Error (AOE), Average Velocity Error (AVE) and Average Attribute Error (AAE).
+Please refer to its [official website](https://www.nuscenes.org/object-detection?externalData=all&mapData=all&modalities=Any) for more details.
+
+We also adopt this approach for evaluation on nuScenes. An example of printed evaluation results is as follows:
+
+```
+mAP: 0.3197
+mATE: 0.7595
+mASE: 0.2700
+mAOE: 0.4918
+mAVE: 1.3307
+mAAE: 0.1724
+NDS: 0.3905
+Eval time: 170.8s
+
+Per-class results:
+Object Class    AP      ATE     ASE     AOE     AVE     AAE
+car     0.503   0.577   0.152   0.111   2.096   0.136
+truck   0.223   0.857   0.224   0.220   1.389   0.179
+bus     0.294   0.855   0.204   0.190   2.689   0.283
+trailer 0.081   1.094   0.243   0.553   0.742   0.167
+construction_vehicle    0.058   1.017   0.450   1.019   0.137   0.341
+pedestrian      0.392   0.687   0.284   0.694   0.876   0.158
+motorcycle      0.317   0.737   0.265   0.580   2.033   0.104
+bicycle 0.308   0.704   0.299   0.892   0.683   0.010
+traffic_cone    0.555   0.486   0.309   nan     nan     nan
+barrier 0.466   0.581   0.269   0.169   nan     nan
+```
+
+## Testing and make a submission
+
+An example to test PointPillars on nuScenes with 8 GPUs and generate a submission to the leaderboard is as follows.
+
+You should modify the `jsonfile_prefix` in the `test_evaluator` of the corresponding configuration. For example, adding `test_evaluator = dict(type='NuScenesMetric', jsonfile_prefix='work_dirs/pp-nus/results_eval.json')` or using `--cfg-options "test_evaluator.jsonfile_prefix=work_dirs/pp-nus/results_eval.json"` after the test command.
+
+```shell
+./tools/dist_test.sh configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb4-2x_nus-3d.py work_dirs/pp-nus/latest.pth 8 --cfg-options 'test_evaluator.jsonfile_prefix=work_dirs/pp-nus/results_eval'
+```
+
+Note that the testing info should be changed to that for testing set instead of validation set [here](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/configs/_base_/datasets/nus-3d.py#L132).
+
+After generating the `work_dirs/pp-nus/results_eval.json`, you can compress it and submit it to nuScenes benchmark. Please refer to the [nuScenes official website](https://www.nuscenes.org/object-detection?externalData=all&mapData=all&modalities=Any) for more information.
+
+We can also visualize the prediction results with our developed visualization tools. Please refer to the [visualization doc](https://mmdetection3d.readthedocs.io/en/latest/useful_tools.html#visualization) for more details.
+
+## Notes
+
+### Transformation between `NuScenesBox` and our `CameraInstanceBoxes`.
+
+In general, the main difference between `NuScenesBox` and our `CameraInstanceBoxes` lies in the yaw definition. `NuScenesBox` defines the rotation with a quaternion or three Euler angles while ours only defines one yaw angle due to the practical scenario. It requires us to add some additional rotations manually in the pre-processing and post-processing, such as [here](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/nuscenes_mono_dataset.py#L673).
+
+In addition, please note that the definition of corners and locations are detached in the `NuScenesBox`. For example, in monocular 3D detection, the definition of the box location is in its camera coordinate (see its official [illustration](https://www.nuscenes.org/nuscenes#data-collection) for car setup), which is consistent with [ours](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/core/bbox/structures/cam_box3d.py). In contrast, its corners are defined with the [convention](https://github.com/nutonomy/nuscenes-devkit/blob/02e9200218977193a1058dd7234f935834378319/python-sdk/nuscenes/utils/data_classes.py#L527) "x points forward, y to the left, z up". It results in different philosophy of dimension and rotation definitions from our `CameraInstanceBoxes`. An example to remove similar hacks is PR [#744](https://github.com/open-mmlab/mmdetection3d/pull/744). The same problem also exists in the LiDAR system. To deal with them, we typically add some transformation in the pre-processing and post-processing to guarantee the box will be in our coordinate system during the entire training and inference procedure.
--- a/mmdetection3d/docs/en/advanced_guides/datasets/s3dis.md
+++ b/mmdetection3d/docs/en/advanced_guides/datasets/s3dis.md
+# S3DIS Dataset
+
+## Dataset preparation
+
+For the overall process, please refer to the [README](https://github.com/open-mmlab/mmdetection3d/blob/master/data/s3dis/README.md/) page for S3DIS.
+
+### Export S3DIS data
+
+By exporting S3DIS data, we load the raw point cloud data and generate the relevant annotations including semantic labels and instance labels.
+
+The directory structure before exporting should be as below:
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── s3dis
+│   │   ├── meta_data
+│   │   ├── Stanford3dDataset_v1.2_Aligned_Version
+│   │   │   ├── Area_1
+│   │   │   │   ├── conferenceRoom_1
+│   │   │   │   ├── office_1
+│   │   │   │   ├── ...
+│   │   │   ├── Area_2
+│   │   │   ├── Area_3
+│   │   │   ├── Area_4
+│   │   │   ├── Area_5
+│   │   │   ├── Area_6
+│   │   ├── indoor3d_util.py
+│   │   ├── collect_indoor3d_data.py
+│   │   ├── README.md
+```
+
+Under folder `Stanford3dDataset_v1.2_Aligned_Version`, the rooms are split into 6 areas. We use 5 areas for training and 1 for evaluation (typically `Area_5`). Under the directory of each area, there are folders in which raw point cloud data and relevant annotations are saved. For instance, under folder `Area_1/office_1` the files are as below:
+
+- `office_1.txt`: A txt file storing coordinates and colors of each point in the raw point cloud data.
+
+- `Annotations/`: This folder contains txt files for different object instances. Each txt file represents one instance, e.g.
+
+  - `chair_1.txt`: A txt file storing raw point cloud data of one chair in this room.
+
+  If we concat all the txt files under `Annotations/`, we will get the same point cloud as denoted by `office_1.txt`.
+
+Export S3DIS data by running `python collect_indoor3d_data.py`. The main steps include:
+
+- Export original txt files to point cloud, instance label and semantic label.
+- Save point cloud data and relevant annotation files.
+
+And the core function `export` in `indoor3d_util.py` is as follows:
+
+```python
+def export(anno_path, out_filename):
+    """Convert original dataset files to points, instance mask and semantic
+    mask files. We aggregated all the points from each instance in the room.
+
+    Args:
+        anno_path (str): path to annotations. e.g. Area_1/office_2/Annotations/
+        out_filename (str): path to save collected points and labels.
+        file_format (str): txt or numpy, determines what file format to save.
+
+    Note:
+        the points are shifted before save, the most negative point is now
+            at origin.
+    """
+    points_list = []
+    ins_idx = 1  # instance ids should be indexed from 1, so 0 is unannotated
+
+    # an example of `anno_path`: Area_1/office_1/Annotations
+    # which contains all object instances in this room as txt files
+    for f in glob.glob(osp.join(anno_path, '*.txt')):
+        # get class name of this instance
+        one_class = osp.basename(f).split('_')[0]
+        if one_class not in class_names:  # some rooms have 'staris' class
+            one_class = 'clutter'
+        points = np.loadtxt(f)
+        labels = np.ones((points.shape[0], 1)) * class2label[one_class]
+        ins_labels = np.ones((points.shape[0], 1)) * ins_idx
+        ins_idx += 1
+        points_list.append(np.concatenate([points, labels, ins_labels], 1))
+
+    data_label = np.concatenate(points_list, 0)  # [N, 8], (pts, rgb, sem, ins)
+    # align point cloud to the origin
+    xyz_min = np.amin(data_label, axis=0)[0:3]
+    data_label[:, 0:3] -= xyz_min
+
+    np.save(f'{out_filename}_point.npy', data_label[:, :6].astype(np.float32))
+    np.save(f'{out_filename}_sem_label.npy', data_label[:, 6].astype(np.int64))
+    np.save(f'{out_filename}_ins_label.npy', data_label[:, 7].astype(np.int64))
+
+```
+
+where we load and concatenate all the point cloud instances under `Annotations/` to form raw point cloud and generate semantic/instance labels. After exporting each room, the point cloud data, semantic labels and instance labels should be saved in `.npy` files.
+
+### Create dataset
+
+```shell
+python tools/create_data.py s3dis --root-path ./data/s3dis \
+--out-dir ./data/s3dis --extra-tag s3dis
+```
+
+The above exported point cloud files, semantic label files and instance label files are further saved in `.bin` format. Meanwhile `.pkl` info files are also generated for each area.
+
+The directory structure after process should be as below:
+
+```
+s3dis
+├── meta_data
+├── indoor3d_util.py
+├── collect_indoor3d_data.py
+├── README.md
+├── Stanford3dDataset_v1.2_Aligned_Version
+├── s3dis_data
+├── points
+│   ├── xxxxx.bin
+├── instance_mask
+│   ├── xxxxx.bin
+├── semantic_mask
+│   ├── xxxxx.bin
+├── seg_info
+│   ├── Area_1_label_weight.npy
+│   ├── Area_1_resampled_scene_idxs.npy
+│   ├── Area_2_label_weight.npy
+│   ├── Area_2_resampled_scene_idxs.npy
+│   ├── Area_3_label_weight.npy
+│   ├── Area_3_resampled_scene_idxs.npy
+│   ├── Area_4_label_weight.npy
+│   ├── Area_4_resampled_scene_idxs.npy
+│   ├── Area_5_label_weight.npy
+│   ├── Area_5_resampled_scene_idxs.npy
+│   ├── Area_6_label_weight.npy
+│   ├── Area_6_resampled_scene_idxs.npy
+├── s3dis_infos_Area_1.pkl
+├── s3dis_infos_Area_2.pkl
+├── s3dis_infos_Area_3.pkl
+├── s3dis_infos_Area_4.pkl
+├── s3dis_infos_Area_5.pkl
+├── s3dis_infos_Area_6.pkl
+```
+
+- `points/xxxxx.bin`: The exported point cloud data.
+- `instance_mask/xxxxx.bin`: The instance label for each point, value range: \[0, ${NUM_INSTANCES}\], 0: unannotated.
+- `semantic_mask/xxxxx.bin`: The semantic label for each point, value range: \[0, 12\].
+- `s3dis_infos_Area_1.pkl`: Area 1 data infos, the detailed info of each room is as follows:
+  - info\['point_cloud'\]: {'num_features': 6, 'lidar_idx': sample_idx}.
+  - info\['pts_path'\]: The path of `points/xxxxx.bin`.
+  - info\['pts_instance_mask_path'\]: The path of `instance_mask/xxxxx.bin`.
+  - info\['pts_semantic_mask_path'\]: The path of `semantic_mask/xxxxx.bin`.
+- `seg_info`: The generated infos to support semantic segmentation model training.
+  - `Area_1_label_weight.npy`: Weighting factor for each semantic class. Since the number of points in different classes varies greatly, it's a common practice to use label re-weighting to get a better performance.
+  - `Area_1_resampled_scene_idxs.npy`: Re-sampling index for each scene. Different rooms will be sampled multiple times according to their number of points to balance training data.
+
+## Training pipeline
+
+A typical training pipeline of S3DIS for 3D semantic segmentation is as below.
+
+```python
+class_names = ('ceiling', 'floor', 'wall', 'beam', 'column', 'window', 'door',
+               'table', 'chair', 'sofa', 'bookcase', 'board', 'clutter')
+num_points = 4096
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5]),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_mask_3d=False,
+        with_seg_3d=True),
+    dict(
+        type='PointSegClassMapping'),
+    dict(
+        type='IndoorPatchPointSample',
+        num_points=num_points,
+        block_size=1.0,
+        ignore_index=None,
+        use_normalized_coord=True,
+        enlarge_size=None,
+        min_unique_num=num_points // 4,
+        eps=0.0),
+    dict(type='NormalizePointsColor', color_mean=None),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-3.141592653589793, 3.141592653589793],  # [-pi, pi]
+        scale_ratio_range=[0.8, 1.2],
+        translation_std=[0, 0, 0]),
+    dict(
+        type='RandomJitterPoints',
+        jitter_std=[0.01, 0.01, 0.01],
+        clip_range=[-0.05, 0.05]),
+    dict(type='RandomDropPointsColor', drop_ratio=0.2),
+    dict(type='Pack3DDetInputs', keys=['points', 'pts_semantic_mask'])
+]
+```
+
+- `PointSegClassMapping`: Only the valid category ids will be mapped to class label ids like \[0, 13) during training. Other class ids will be converted to `ignore_index`, which equals `13`.
+- `IndoorPatchPointSample`: Crop a patch containing a fixed number of points from input point cloud. `block_size` indicates the size of the cropped block, typically `1.0` for S3DIS.
+- `NormalizePointsColor`: Normalize the RGB color values of input point cloud by dividing `255`.
+- Data augmentation:
+  - `GlobalRotScaleTrans`: randomly rotate and scale input point cloud.
+  - `RandomJitterPoints`: randomly jitter point cloud by adding different noise vector to each point.
+  - `RandomDropPointsColor`: set the colors of point cloud to all zeros by a probability `drop_ratio`.
+
+## Metrics
+
+Typically mean intersection over union (mIoU) is used for evaluation on S3DIS. In detail, we first compute IoU for multiple classes and then average them to get mIoU, please refer to [seg_eval.py](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/evaluation/functional/seg_eval.py).
+
+As introduced in section `Export S3DIS data`, S3DIS trains on 5 areas and evaluates on the remaining 1 area. But there are also other area split schemes in different papers.
+To enable flexible combination of train-val splits, we use sub-dataset to represent one area, and concatenate them to form a larger training set. An example of training on area 1, 2, 3, 4, 6 and evaluating on area 5 is shown as below:
+
+```python
+dataset_type = 'S3DISSegDataset'
+data_root = './data/s3dis/'
+class_names = ('ceiling', 'floor', 'wall', 'beam', 'column', 'window', 'door',
+               'table', 'chair', 'sofa', 'bookcase', 'board', 'clutter')
+train_area = [1, 2, 3, 4, 6]
+test_area = 5
+train_dataloader = dict(
+    batch_size=8,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_files=[f's3dis_infos_Area_{i}.pkl' for i in train_area],
+        metainfo=metainfo,
+        data_prefix=data_prefix,
+        pipeline=train_pipeline,
+        modality=input_modality,
+        ignore_index=len(class_names),
+        scene_idxs=[
+            f'seg_info/Area_{i}_resampled_scene_idxs.npy' for i in train_area
+        ],
+        test_mode=False))
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_files=f's3dis_infos_Area_{test_area}.pkl',
+        metainfo=metainfo,
+        data_prefix=data_prefix,
+        pipeline=test_pipeline,
+        modality=input_modality,
+        ignore_index=len(class_names),
+        scene_idxs=f'seg_info/Area_{test_area}_resampled_scene_idxs.npy',
+        test_mode=True))
+val_dataloader = test_dataloader
+```
+
+where we specify the areas used for training/validation by setting `ann_files` and `scene_idxs` with lists that include corresponding paths. The train-val split can be simply modified via changing the `train_area` and `test_area` variables.
--- a/mmdetection3d/docs/en/advanced_guides/datasets/scannet.md
+++ b/mmdetection3d/docs/en/advanced_guides/datasets/scannet.md
+# ScanNet Dataset
+
+MMDetection3D supports LiDAR-based detection and segmentation on ScanNet dataset. This page provides specific tutorials about the usage.
+
+## Dataset preparation
+
+For the overall process, please refer to the [README](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/data/scannet/README.md) page for ScanNet.
+
+### Export ScanNet point cloud data
+
+By exporting ScanNet data, we load the raw point cloud data and generate the relevant annotations including semantic labels, instance labels and ground truth bounding boxes.
+
+```shell
+python batch_load_scannet_data.py
+```
+
+The directory structure before data preparation should be as below
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── scannet
+│   │   ├── meta_data
+│   │   ├── scans
+│   │   │   ├── scenexxxx_xx
+│   │   ├── batch_load_scannet_data.py
+│   │   ├── load_scannet_data.py
+│   │   ├── scannet_utils.py
+│   │   ├── README.md
+```
+
+Under folder `scans` there are overall 1201 train and 312 validation folders in which raw point cloud data and relevant annotations are saved. For instance, under folder `scene0001_01` the files are as below:
+
+- `scene0001_01_vh_clean_2.ply`: Mesh file storing coordinates and colors of each vertex. The mesh's vertices are taken as raw point cloud data.
+- `scene0001_01.aggregation.json`: Aggregation file including object ID, segments ID and label.
+- `scene0001_01_vh_clean_2.0.010000.segs.json`: Segmentation file including segments ID and vertex.
+- `scene0001_01.txt`: Meta file including axis-aligned matrix, etc.
+- `scene0001_01_vh_clean_2.labels.ply`: Annotation file containing the category of each vertex.
+
+The procedure of exporting ScanNet data by running `python batch_load_scannet_data.py` mainly includes the following 3 steps:
+
+- Export original files to point cloud, instance label, semantic label and bounding box file.
+- Downsample raw point cloud and filter invalid classes.
+- Save point cloud data and relevant annotation files.
+
+And the core function `export` in `load_scannet_data.py` is as follows:
+
+```python
+def export(mesh_file,
+           agg_file,
+           seg_file,
+           meta_file,
+           label_map_file,
+           output_file=None,
+           test_mode=False):
+
+    # label map file: ./data/scannet/meta_data/scannetv2-labels.combined.tsv
+    # the various label standards in the label map file, e.g. 'nyu40id'
+    label_map = scannet_utils.read_label_mapping(
+        label_map_file, label_from='raw_category', label_to='nyu40id')
+    # load raw point cloud data, 6-dims feature: XYZRGB
+    mesh_vertices = scannet_utils.read_mesh_vertices_rgb(mesh_file)
+
+    # Load scene axis alignment matrix: a 4x4 transformation matrix
+    # transform raw points in sensor coordinate system to a coordinate system
+    # which is axis-aligned with the length/width of the room
+    lines = open(meta_file).readlines()
+    # test set data doesn't have align_matrix
+    axis_align_matrix = np.eye(4)
+    for line in lines:
+        if 'axisAlignment' in line:
+            axis_align_matrix = [
+                float(x)
+                for x in line.rstrip().strip('axisAlignment = ').split(' ')
+            ]
+            break
+    axis_align_matrix = np.array(axis_align_matrix).reshape((4, 4))
+
+    # perform global alignment of mesh vertices
+    pts = np.ones((mesh_vertices.shape[0], 4))
+    # raw point cloud in homogeneous coordinates, each row: [x, y, z, 1]
+    pts[:, 0:3] = mesh_vertices[:, 0:3]
+    # transform raw mesh vertices to aligned mesh vertices
+    pts = np.dot(pts, axis_align_matrix.transpose())  # Nx4
+    aligned_mesh_vertices = np.concatenate([pts[:, 0:3], mesh_vertices[:, 3:]],
+                                           axis=1)
+
+    # Load semantic and instance labels
+    if not test_mode:
+        # each object has one semantic label and consists of several segments
+        object_id_to_segs, label_to_segs = read_aggregation(agg_file)
+        # many points may belong to the same segment
+        seg_to_verts, num_verts = read_segmentation(seg_file)
+        label_ids = np.zeros(shape=(num_verts), dtype=np.uint32)
+        object_id_to_label_id = {}
+        for label, segs in label_to_segs.items():
+            label_id = label_map[label]
+            for seg in segs:
+                verts = seg_to_verts[seg]
+                # each point has one semantic label
+                label_ids[verts] = label_id
+        instance_ids = np.zeros(
+            shape=(num_verts), dtype=np.uint32)  # 0: unannotated
+        for object_id, segs in object_id_to_segs.items():
+            for seg in segs:
+                verts = seg_to_verts[seg]
+                # object_id is 1-indexed, i.e. 1, 2, 3, ..., NUM_INSTANCES
+                # each point belongs to one object
+                instance_ids[verts] = object_id
+                if object_id not in object_id_to_label_id:
+                    object_id_to_label_id[object_id] = label_ids[verts][0]
+        # bbox format is [x, y, z, x_size, y_size, z_size, label_id]
+        # [x, y, z] is gravity center of bbox, [x_size, y_size, z_size] is axis-aligned
+        # [label_id] is semantic label id in 'nyu40id' standard
+        # Note: since 3D bbox is axis-aligned, the yaw is 0.
+        unaligned_bboxes = extract_bbox(mesh_vertices, object_id_to_segs,
+                                        object_id_to_label_id, instance_ids)
+        aligned_bboxes = extract_bbox(aligned_mesh_vertices, object_id_to_segs,
+                                      object_id_to_label_id, instance_ids)
+    ...
+
+    return mesh_vertices, label_ids, instance_ids, unaligned_bboxes, \
+        aligned_bboxes, object_id_to_label_id, axis_align_matrix
+
+```
+
+After exporting each scan, the raw point cloud could be downsampled, e.g. to 50000, if the number of points is too large (the raw point cloud won't be downsampled if it's also used in 3D semantic segmentation task). In addition, invalid semantic labels outside of `nyu40id` standard or optional `DONOT CARE` classes should be filtered. Finally, the point cloud data, semantic labels, instance labels and ground truth bounding boxes should be saved in `.npy` files.
+
+### Export ScanNet RGB data (optional)
+
+By exporting ScanNet RGB data, for each scene we load a set of RGB images with corresponding 4x4 pose matrices, and a single 4x4 camera intrinsic matrix. Note that this step is optional and can be skipped if you do not plan to use multi-view detection.
+
+```shell
+python extract_posed_images.py
+```
+
+Each of the 1201 train, 312 validation and 100 test scenes contains a single `.sens` file. For instance, for scene `0001_01` we have `data/scannet/scans/scene0001_01/0001_01.sens`. For this scene all images and poses are extracted to `data/scannet/posed_images/scene0001_01`. Specifically, there will be 300 image files xxxxx.jpg, 300 camera pose files xxxxx.txt and a single `intrinsic.txt` file. Typically, a single scene contains several thousand images. By default, we extract only 300 of them, with a resulting space occupation of \<100 Gb. To extract more images, use the `--max-images-per-scene` parameter.
+
+### Create dataset
+
+```shell
+python tools/create_data.py scannet --root-path ./data/scannet \
+--out-dir ./data/scannet --extra-tag scannet
+```
+
+The above exported point cloud file, semantic label file and instance label file are further saved in `.bin` format. Meanwhile `.pkl` info files are also generated for train or validation. The core function `process_single_scene` of getting data infos is as follows.
+
+```python
+def process_single_scene(sample_idx):
+
+    # save point cloud, instance label and semantic label in .bin file respectively, get info['pts_path'], info['pts_instance_mask_path'] and info['pts_semantic_mask_path']
+    ...
+
+    # get annotations
+    if has_label:
+        annotations = {}
+        # box is of shape [k, 6 + class]
+        aligned_box_label = self.get_aligned_box_label(sample_idx)
+        unaligned_box_label = self.get_unaligned_box_label(sample_idx)
+        annotations['gt_num'] = aligned_box_label.shape[0]
+        if annotations['gt_num'] != 0:
+            aligned_box = aligned_box_label[:, :-1]  # k, 6
+            unaligned_box = unaligned_box_label[:, :-1]
+            classes = aligned_box_label[:, -1]  # k
+            annotations['name'] = np.array([
+                self.label2cat[self.cat_ids2class[classes[i]]]
+                for i in range(annotations['gt_num'])
+            ])
+            # default names are given to aligned bbox for compatibility
+            # we also save unaligned bbox info with marked names
+            annotations['location'] = aligned_box[:, :3]
+            annotations['dimensions'] = aligned_box[:, 3:6]
+            annotations['gt_boxes_upright_depth'] = aligned_box
+            annotations['unaligned_location'] = unaligned_box[:, :3]
+            annotations['unaligned_dimensions'] = unaligned_box[:, 3:6]
+            annotations[
+                'unaligned_gt_boxes_upright_depth'] = unaligned_box
+            annotations['index'] = np.arange(
+                annotations['gt_num'], dtype=np.int32)
+            annotations['class'] = np.array([
+                self.cat_ids2class[classes[i]]
+                for i in range(annotations['gt_num'])
+            ])
+        axis_align_matrix = self.get_axis_align_matrix(sample_idx)
+        annotations['axis_align_matrix'] = axis_align_matrix  # 4x4
+        info['annos'] = annotations
+    return info
+```
+
+The directory structure after processing should be as below:
+
+```
+scannet
+├── meta_data
+├── batch_load_scannet_data.py
+├── load_scannet_data.py
+├── scannet_utils.py
+├── README.md
+├── scans
+├── scans_test
+├── scannet_instance_data
+├── points
+│   ├── xxxxx.bin
+├── instance_mask
+│   ├── xxxxx.bin
+├── semantic_mask
+│   ├── xxxxx.bin
+├── seg_info
+│   ├── train_label_weight.npy
+│   ├── train_resampled_scene_idxs.npy
+│   ├── val_label_weight.npy
+│   ├── val_resampled_scene_idxs.npy
+├── posed_images
+│   ├── scenexxxx_xx
+│   │   ├── xxxxxx.txt
+│   │   ├── xxxxxx.jpg
+│   │   ├── intrinsic.txt
+├── scannet_infos_train.pkl
+├── scannet_infos_val.pkl
+├── scannet_infos_test.pkl
+```
+
+- `points/xxxxx.bin`: The `axis-unaligned` point cloud data after downsampling. Since the ScanNet 3D detection task takes axis-aligned point clouds as input, while the ScanNet 3D semantic segmentation task takes unaligned points, we choose to store unaligned points and their axis-align transform matrix. Note: the points would be axis-aligned in the pre-processing pipeline [`GlobalAlignment`](https://github.com/open-mmlab/mmdetection3d/blob/9f0b01caf6aefed861ef4c3eb197c09362d26b32/mmdet3d/datasets/pipelines/transforms_3d.py#L423) of the 3D detection task.
+- `instance_mask/xxxxx.bin`: The instance label for each point, value range: \[0, NUM_INSTANCES\], 0: unannotated.
+- `semantic_mask/xxxxx.bin`: The semantic label for each point, value range: \[1, 40\], i.e. `nyu40id` standard. Note: the `nyu40id` ID will be mapped to train ID in train pipeline `PointSegClassMapping`.
+- `seg_info`: The generated infos to support semantic segmentation model training.
+  - `train_label_weight.npy`: Weighting factor for each semantic class. Since the number of points in different classes varies greatly, it's a common practice to use label re-weighting to get a better performance.
+  - `train_resampled_scene_idxs.npy`: Re-sampling index for each scene. Different rooms will be sampled multiple times according to their number of points to balance training data.
+- `posed_images/scenexxxx_xx`: The set of `.jpg` images with `.txt` 4x4 poses and the single `.txt` file with camera intrinsic matrix.
+- `scannet_infos_train.pkl`: The train data infos, the detailed info of each scan is as follows:
+  - info\['lidar_points'\]: A dict containing all information related to the lidar points.
+    - info\['lidar_points'\]\['lidar_path'\]: The filename of the lidar point cloud data.
+    - info\['lidar_points'\]\['num_pts_feats'\]: The feature dimension of point.
+    - info\['lidar_points'\]\['axis_align_matrix'\]: The transformation matrix to align the axis.
+  - info\['pts_semantic_mask_path'\]: The filename of the semantic mask annotation.
+  - info\['pts_instance_mask_path'\]: The filename of the instance mask annotation.
+  - info\['instances'\]: A list of dicts containing all annotations; each dict contains all annotation information of a single instance. For the i-th instance:
+    - info\['instances'\]\[i\]\['bbox_3d'\]: List of 6 numbers representing the axis-aligned 3D bounding box of the instance in depth coordinate system, in (x, y, z, l, w, h) order.
+    - info\['instances'\]\[i\]\['bbox_label_3d'\]: The label of the 3D bounding box.
+- `scannet_infos_val.pkl`: The val data infos, which shares the same format as `scannet_infos_train.pkl`.
+- `scannet_infos_test.pkl`: The test data infos, which share almost the same format as `scannet_infos_train.pkl` except for the lack of annotations.
+
+## Training pipeline
+
+A typical training pipeline of ScanNet for 3D detection is as follows.
+
+```python
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=True,
+        load_dim=6,
+        use_dim=[0, 1, 2]),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_mask_3d=True,
+        with_seg_3d=True),
+    dict(type='GlobalAlignment', rotation_axis=2),
+    dict(type='PointSegClassMapping'),
+    dict(type='PointSample', num_points=40000),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.087266, 0.087266],
+        scale_ratio_range=[1.0, 1.0],
+        shift_height=True),
+    dict(
+        type='Pack3DDetInputs',
+        keys=[
+            'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask',
+            'pts_instance_mask'
+        ])
+]
+```
+
+- `GlobalAlignment`: The previous point cloud would be axis-aligned using the axis-aligned matrix.
+- `PointSegClassMapping`: Only the valid category IDs will be mapped to class label IDs like \[0, 18) during training.
+- Data augmentation:
+  - `PointSample`: downsample the input point cloud.
+  - `RandomFlip3D`: randomly flip the input point cloud horizontally or vertically.
+  - `GlobalRotScaleTrans`: rotate the input point cloud, usually in the range of \[-5, 5\] (degrees) for ScanNet; then scale the input point cloud, usually by 1.0 for ScanNet (which means no scaling); finally translate the input point cloud, usually by 0 for ScanNet (which means no translation).
+
+A typical training pipeline of ScanNet for 3D semantic segmentation is as below:
+
+```python
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5]),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_mask_3d=False,
+        with_seg_3d=True),
+    dict(
+        type='PointSegClassMapping'),
+    dict(
+        type='IndoorPatchPointSample',
+        num_points=num_points,
+        block_size=1.5,
+        ignore_index=len(class_names),
+        use_normalized_coord=False,
+        enlarge_size=0.2,
+        min_unique_num=None),
+    dict(type='NormalizePointsColor', color_mean=None),
+    dict(type='Pack3DDetInputs', keys=['points', 'pts_semantic_mask'])
+]
+```
+
+- `PointSegClassMapping`: Only the valid category ids will be mapped to class label ids like \[0, 20) during training. Other class ids will be converted to `ignore_index`, which equals `20`.
+- `IndoorPatchPointSample`: Crop a patch containing a fixed number of points from input point cloud. `block_size` indicates the size of the cropped block, typically `1.5` for ScanNet.
+- `NormalizePointsColor`: Normalize the RGB color values of the input point cloud by dividing by `255`.
+
+## Metrics
+
+- **Object Detection**: Typically mean Average Precision (mAP) is used for evaluation on ScanNet, e.g. `mAP@0.25` and `mAP@0.5`. In detail, a generic function to compute precision and recall for 3D object detection for multiple classes is called. Please refer to [indoor_eval](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/evaluation/functional/indoor_eval.py) for more details.
+
+  **Note**: As introduced in section `Export ScanNet point cloud data`, all ground truth 3D bounding boxes are axis-aligned, i.e. the yaw is zero. So the yaw target of the network-predicted 3D bounding boxes is also zero, and axis-aligned 3D Non-Maximum Suppression (NMS), which disregards rotation, is adopted during post-processing.
+
+- **Semantic Segmentation**: Typically mean Intersection over Union (mIoU) is used for evaluation on ScanNet. In detail, we first compute IoU for multiple classes and then average them to get mIoU, please refer to [seg_eval](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/evaluation/functional/seg_eval.py).
+
+## Testing and Making a Submission
+
+By default, our codebase evaluates semantic segmentation results on the validation set.
+If you would like to test the model performance on the online benchmark, add `--format-only` flag in the evaluation script and change `ann_file=data_root + 'scannet_infos_val.pkl'` to `ann_file=data_root + 'scannet_infos_test.pkl'` in the ScanNet dataset's [config](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/_base_/datasets/scannet-seg.py#L126). Remember to specify the `txt_prefix` as the directory to save the testing results.
+
+Taking PointNet++ (SSG) on ScanNet for example, the following command can be used to do inference on test set:
+
+```
+./tools/dist_test.sh configs/pointnet2/pointnet2_ssg_16x2_cosine_200e_scannet-seg.py \
+    work_dirs/pointnet2_ssg/latest.pth --format-only \
+    --eval-options txt_prefix=work_dirs/pointnet2_ssg/test_submission
+```
+
+After generating the results, you can basically compress the folder and upload to the [ScanNet evaluation server](http://kaldir.vc.in.tum.de/scannet_benchmark/semantic_label_3d).
--- a/mmdetection3d/docs/en/advanced_guides/datasets/semantickitti.md
+++ b/mmdetection3d/docs/en/advanced_guides/datasets/semantickitti.md
+# SemanticKITTI Dataset
+
+This page provides specific tutorials about the usage of MMDetection3D for SemanticKITTI dataset.
+
+## Prepare dataset
+
+You can download SemanticKITTI dataset [HERE](http://semantic-kitti.org/dataset.html#download) and unzip all zip files.
+
+Like the general way to prepare dataset, it is recommended to symlink the dataset root to `$MMDETECTION3D/data`.
+
+The folder structure should be organized as follows before our processing.
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── semantickitti
+│   │   ├── sequences
+│   │   │   ├── 00
+│   │   │   │   ├── labels
+│   │   │   │   ├── velodyne
+│   │   │   ├── 01
+│   │   │   ├── ..
+│   │   │   ├── 22
+```
+
+The SemanticKITTI dataset contains 23 sequences, where sequences \[0-7\] and \[9-10\] are used as the training set (about 19k training samples), sequence 8 as the validation set (about 4k validation samples) and \[11-22\] as the test set (about 20k test samples). Each sequence contains `velodyne` and `labels` folders for LiDAR point cloud data and segmentation annotations (where the high 16 bits store the instance segmentation annotations and the low 16 bits store the semantic segmentation annotations), respectively.
+
+### Create SemanticKITTI Dataset
+
+We support scripts that generate dataset information for training and testing. Create `.pkl` info by running:
+
+```bash
+python ./tools/create_data.py semantickitti --root-path ./data/semantickitti --out-dir ./data/semantickitti --extra-tag semantickitti
+```
+
+The folder structure after processing should be as below
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── semantickitti
+│   │   ├── sequences
+│   │   │   ├── 00
+│   │   │   │   ├── labels
+│   │   │   │   ├── velodyne
+│   │   │   ├── 01
+│   │   │   ├── ..
+│   │   │   ├── 22
+│   │   ├── semantickitti_infos_test.pkl
+│   │   ├── semantickitti_infos_train.pkl
+│   │   ├── semantickitti_infos_val.pkl
+```
+
+- `semantickitti_infos_train.pkl`: training dataset, a dict containing two keys: `metainfo` and `data_list`.
+  `metainfo` contains the basic information for the dataset itself, while `data_list` is a list of dicts; each dict (hereinafter referred to as `info`) contains all the detailed information of a single sample as follows:
+  - info\['sample_id'\]: The index of this sample in the whole dataset.
+  - info\['lidar_points'\]: A dict containing all the information related to the lidar points.
+    - info\['lidar_points'\]\['lidar_path'\]: The filename of the lidar point cloud data.
+    - info\['lidar_points'\]\['num_pts_feats'\]: The feature dimension of point.
+  - info\['pts_semantic_mask_path'\]: The path of the 3D semantic segmentation annotation file.
+
+Please refer to [semantickitti_converter.py](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/tools/dataset_converters/semantickitti_converter.py) and [update_infos_to_v2.py](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/tools/dataset_converters/update_infos_to_v2.py) for more details.
+
+## Train pipeline
+
+A typical train pipeline of 3D segmentation on SemanticKITTI is as below:
+
+```python
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_seg_3d=True,
+        seg_3d_dtype='np.int32',
+        seg_offset=2**16,
+        dataset_type='semantickitti',
+        backend_args=backend_args),
+    dict(type='PointSegClassMapping'),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0.1, 0.1, 0.1],
+    ),
+    dict(type='Pack3DDetInputs', keys=['points', 'pts_semantic_mask'])
+]
+```
+
+- Data augmentation:
+  - `RandomFlip3D`: randomly flip input point cloud horizontally or vertically.
+  - `GlobalRotScaleTrans`: rotate/scale/translate the input point cloud.
+
+## Evaluation
+
+An example to evaluate MinkUNet with 8 GPUs with semantickitti metrics is as follows:
+
+```shell
+bash tools/dist_test.sh configs/minkunet/minkunet_w32_8xb2-15e_semantickitti.py checkpoints/minkunet_w32_8xb2-15e_semantickitti_20230309_160710-7fa0a6f1.pth 8
+```
+
+## Metrics
+
+Typically mean Intersection over Union (mIoU) is used for evaluation on SemanticKITTI. In detail, we first compute IoU for multiple classes and then average them to get mIoU, please refer to [seg_eval.py](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/evaluation/functional/seg_eval.py).
+
+An example of printed evaluation results is as follows:
+
+| classes | car    | bicycle | motorcycle | truck  | bus    | person | bicyclist | motorcyclist | road   | parking | sidewalk | other-ground | building | fence  | vegetation | trunck | terrian | pole   | traffic-sign | miou   | acc    | acc_cls |
+| ------- | ------ | ------- | ---------- | ------ | ------ | ------ | --------- | ------------ | ------ | ------- | -------- | ------------ | -------- | ------ | ---------- | ------ | ------- | ------ | ------------ | ------ | ------ | ------- |
+| results | 0.9687 | 0.1908  | 0.6313     | 0.8580 | 0.6359 | 0.6818 | 0.8444    | 0.0002       | 0.9353 | 0.4854  | 0.8106   | 0.0024       | 0.9050   | 0.6111 | 0.8822     | 0.6605 | 0.7493  | 0.6442 | 0.4837       | 0.6306 | 0.9202 | 0.6924  |