Commit 7aa442d5 authored by raojy's avatar raojy
Browse files

raw_mmdetection

parent 9c03eaa8
# Copyright (c) OpenMMLab. All rights reserved.
import logging
import os
from argparse import ArgumentParser
from mmengine.logging import print_log
from mmdet3d.apis import MonoDet3DInferencer
def parse_args():
parser = ArgumentParser()
parser.add_argument('img', help='Image file')
parser.add_argument('infos', help='Infos file with annotations')
parser.add_argument('model', help='Config file')
parser.add_argument('weights', help='Checkpoint file')
parser.add_argument(
'--device', default='cuda:0', help='Device used for inference')
parser.add_argument(
'--cam-type',
type=str,
default='CAM_BACK',
help='choose camera type to inference')
parser.add_argument(
'--pred-score-thr',
type=float,
default=0.3,
help='bbox score threshold')
parser.add_argument(
'--out-dir',
type=str,
default='outputs',
help='Output directory of prediction and visualization results.')
parser.add_argument(
'--show',
action='store_true',
help='Show online visualization results')
parser.add_argument(
'--wait-time',
type=float,
default=-1,
help='The interval of show (s). Demo will be blocked in showing'
'results, if wait_time is -1. Defaults to -1.')
parser.add_argument(
'--no-save-vis',
action='store_true',
help='Do not save detection visualization results')
parser.add_argument(
'--no-save-pred',
action='store_true',
help='Do not save detection prediction results')
parser.add_argument(
'--print-result',
action='store_true',
help='Whether to print the results.')
call_args = vars(parser.parse_args())
call_args['inputs'] = dict(
img=call_args.pop('img'), infos=call_args.pop('infos'))
call_args.pop('cam_type')
if call_args['no_save_vis'] and call_args['no_save_pred']:
call_args['out_dir'] = ''
init_kws = ['model', 'weights', 'device']
init_args = {}
for init_kw in init_kws:
init_args[init_kw] = call_args.pop(init_kw)
# NOTE: If your operating environment does not have a display device,
# (e.g. a remote server), you can save the predictions and visualize
# them in local devices.
if os.environ.get('DISPLAY') is None and call_args['show']:
print_log(
'Display device not found. `--show` is forced to False',
logger='current',
level=logging.WARNING)
call_args['show'] = False
return init_args, call_args
def main():
# TODO: Support inference of point cloud numpy file.
init_args, call_args = parse_args()
inferencer = MonoDet3DInferencer(**init_args)
inferencer(**call_args)
if call_args['out_dir'] != '' and not (call_args['no_save_vis']
and call_args['no_save_pred']):
print_log(
f'results have been saved at {call_args["out_dir"]}',
logger='current')
if __name__ == '__main__':
main()
# Copyright (c) OpenMMLab. All rights reserved.
import logging
import os
from argparse import ArgumentParser
from mmengine.logging import print_log
from mmdet3d.apis import MultiModalityDet3DInferencer
def parse_args():
parser = ArgumentParser()
parser.add_argument('pcd', help='Point cloud file')
parser.add_argument('img', help='Image file')
parser.add_argument('infos', help='Infos file with annotations')
parser.add_argument('model', help='Config file')
parser.add_argument('weights', help='Checkpoint file')
parser.add_argument(
'--device', default='cuda:0', help='Device used for inference')
parser.add_argument(
'--cam-type',
type=str,
default='CAM_FRONT',
help='choose camera type to inference')
parser.add_argument(
'--pred-score-thr',
type=float,
default=0.3,
help='bbox score threshold')
parser.add_argument(
'--out-dir',
type=str,
default='outputs',
help='Output directory of prediction and visualization results.')
parser.add_argument(
'--show',
action='store_true',
help='Show online visualization results')
parser.add_argument(
'--wait-time',
type=float,
default=-1,
help='The interval of show (s). Demo will be blocked in showing'
'results, if wait_time is -1. Defaults to -1.')
parser.add_argument(
'--no-save-vis',
action='store_true',
help='Do not save detection visualization results')
parser.add_argument(
'--no-save-pred',
action='store_true',
help='Do not save detection prediction results')
parser.add_argument(
'--print-result',
action='store_true',
help='Whether to print the results.')
call_args = vars(parser.parse_args())
call_args['inputs'] = dict(
points=call_args.pop('pcd'),
img=call_args.pop('img'),
infos=call_args.pop('infos'))
call_args.pop('cam_type')
if call_args['no_save_vis'] and call_args['no_save_pred']:
call_args['out_dir'] = ''
init_kws = ['model', 'weights', 'device']
init_args = {}
for init_kw in init_kws:
init_args[init_kw] = call_args.pop(init_kw)
# NOTE: If your operating environment does not have a display device,
# (e.g. a remote server), you can save the predictions and visualize
# them in local devices.
if os.environ.get('DISPLAY') is None and call_args['show']:
print_log(
'Display device not found. `--show` is forced to False',
logger='current',
level=logging.WARNING)
call_args['show'] = False
return init_args, call_args
def main():
# TODO: Support inference of point cloud numpy file.
init_args, call_args = parse_args()
inferencer = MultiModalityDet3DInferencer(**init_args)
inferencer(**call_args)
if call_args['out_dir'] != '' and not (call_args['no_save_vis']
and call_args['no_save_pred']):
print_log(
f'results have been saved at {call_args["out_dir"]}',
logger='current')
if __name__ == '__main__':
main()
# Copyright (c) OpenMMLab. All rights reserved.
import logging
import os
from argparse import ArgumentParser
from mmengine.logging import print_log
from mmdet3d.apis import LidarDet3DInferencer
def parse_args():
parser = ArgumentParser()
parser.add_argument('pcd', help='Point cloud file')
parser.add_argument('model', help='Config file')
parser.add_argument('weights', help='Checkpoint file')
parser.add_argument(
'--device', default='cuda:0', help='Device used for inference')
parser.add_argument(
'--pred-score-thr',
type=float,
default=0.3,
help='bbox score threshold')
parser.add_argument(
'--out-dir',
type=str,
default='outputs',
help='Output directory of prediction and visualization results.')
parser.add_argument(
'--show',
action='store_true',
help='Show online visualization results')
parser.add_argument(
'--wait-time',
type=float,
default=-1,
help='The interval of show (s). Demo will be blocked in showing'
'results, if wait_time is -1. Defaults to -1.')
parser.add_argument(
'--no-save-vis',
action='store_true',
help='Do not save detection visualization results')
parser.add_argument(
'--no-save-pred',
action='store_true',
help='Do not save detection prediction results')
parser.add_argument(
'--print-result',
action='store_true',
help='Whether to print the results.')
call_args = vars(parser.parse_args())
call_args['inputs'] = dict(points=call_args.pop('pcd'))
if call_args['no_save_vis'] and call_args['no_save_pred']:
call_args['out_dir'] = ''
init_kws = ['model', 'weights', 'device']
init_args = {}
for init_kw in init_kws:
init_args[init_kw] = call_args.pop(init_kw)
# NOTE: If your operating environment does not have a display device,
# (e.g. a remote server), you can save the predictions and visualize
# them in local devices.
if os.environ.get('DISPLAY') is None and call_args['show']:
print_log(
'Display device not found. `--show` is forced to False',
logger='current',
level=logging.WARNING)
call_args['show'] = False
return init_args, call_args
def main():
# TODO: Support inference of point cloud numpy file.
init_args, call_args = parse_args()
inferencer = LidarDet3DInferencer(**init_args)
inferencer(**call_args)
if call_args['out_dir'] != '' and not (call_args['no_save_vis']
and call_args['no_save_pred']):
print_log(
f'results have been saved at {call_args["out_dir"]}',
logger='current')
if __name__ == '__main__':
main()
# Copyright (c) OpenMMLab. All rights reserved.
import logging
import os
from argparse import ArgumentParser
from mmengine.logging import print_log
from mmdet3d.apis import LidarSeg3DInferencer
def parse_args():
parser = ArgumentParser()
parser.add_argument('pcd', help='Point cloud file')
parser.add_argument('model', help='Config file')
parser.add_argument('weights', help='Checkpoint file')
parser.add_argument(
'--device', default='cuda:0', help='Device used for inference')
parser.add_argument(
'--out-dir',
type=str,
default='outputs',
help='Output directory of prediction and visualization results.')
parser.add_argument(
'--show',
action='store_true',
help='Show online visualization results')
parser.add_argument(
'--wait-time',
type=float,
default=-1,
help='The interval of show (s). Demo will be blocked in showing'
'results, if wait_time is -1. Defaults to -1.')
parser.add_argument(
'--no-save-vis',
action='store_true',
help='Do not save detection visualization results')
parser.add_argument(
'--no-save-pred',
action='store_true',
help='Do not save detection prediction results')
parser.add_argument(
'--print-result',
action='store_true',
help='Whether to print the results.')
call_args = vars(parser.parse_args())
call_args['inputs'] = dict(points=call_args.pop('pcd'))
if call_args['no_save_vis'] and call_args['no_save_pred']:
call_args['out_dir'] = ''
init_kws = ['model', 'weights', 'device']
init_args = {}
for init_kw in init_kws:
init_args[init_kw] = call_args.pop(init_kw)
# NOTE: If your operating environment does not have a display device,
# (e.g. a remote server), you can save the predictions and visualize
# them in local devices.
if os.environ.get('DISPLAY') is None and call_args['show']:
print_log(
'Display device not found. `--show` is forced to False',
logger='current',
level=logging.WARNING)
call_args['show'] = False
return init_args, call_args
def main():
# TODO: Support inference of point cloud numpy file.
init_args, call_args = parse_args()
inferencer = LidarSeg3DInferencer(**init_args)
inferencer(**call_args)
if call_args['out_dir'] != '' and not (call_args['no_save_vis']
and call_args['no_save_pred']):
print_log(
f'results have been saved at {call_args["out_dir"]}',
logger='current')
if __name__ == '__main__':
main()
ARG PYTORCH="1.9.0"
ARG CUDA="11.1"
ARG CUDNN="8"
FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel
ENV TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6+PTX" \
TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \
FORCE_CUDA="1"
# Avoid Public GPG key error
# https://github.com/NVIDIA/nvidia-docker/issues/1631
RUN rm /etc/apt/sources.list.d/cuda.list \
&& rm /etc/apt/sources.list.d/nvidia-ml.list \
&& apt-key del 7fa2af80 \
&& apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub \
&& apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub
# (Optional, use Mirror to speed up downloads)
# RUN sed -i 's/http:\/\/archive.ubuntu.com\/ubuntu\//http:\/\/mirrors.aliyun.com\/ubuntu\//g' /etc/apt/sources.list && \
# pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
# Install the required packages
RUN apt-get update \
&& apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
# Install MMEngine, MMCV and MMDetection
RUN pip install openmim && \
mim install "mmengine" "mmcv>=2.0.0rc4" "mmdet>=3.0.0"
# Install MMDetection3D
RUN conda clean --all \
&& git clone https://github.com/open-mmlab/mmdetection3d.git -b dev-1.x /mmdetection3d \
&& cd /mmdetection3d \
&& pip install --no-cache-dir -e .
WORKDIR /mmdetection3d
ARG PYTORCH="1.9.0"
ARG CUDA="11.1"
ARG CUDNN="8"
FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel
ARG MMCV="2.0.0rc4"
ARG MMDET="3.3.0"
ARG MMDET3D="1.4.0"
ENV PYTHONUNBUFFERED TRUE
# Avoid Public GPG key error
# https://github.com/NVIDIA/nvidia-docker/issues/1631
RUN rm /etc/apt/sources.list.d/cuda.list \
&& rm /etc/apt/sources.list.d/nvidia-ml.list \
&& apt-key del 7fa2af80 \
&& apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub \
&& apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub
# (Optional, use Mirror to speed up downloads)
# RUN sed -i 's/http:\/\/archive.ubuntu.com\/ubuntu\//http:\/\/mirrors.aliyun.com\/ubuntu\//g' /etc/apt/sources.list
# Install the required packages
RUN apt-get update && \
DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \
ca-certificates \
g++ \
openjdk-11-jre-headless \
# MMDet3D Requirements
ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 \
&& rm -rf /var/lib/apt/lists/*
ENV PATH="/opt/conda/bin:$PATH" \
FORCE_CUDA="1"
# TORCHSEVER
RUN pip install torchserve torch-model-archiver
# MMLAB
ARG PYTORCH
ARG CUDA
RUN pip install openmim
RUN mim install mmengine
RUN mim install mmcv==${MMCV}
RUN mim install mmdet==${MMDET}
RUN mim install mmdet3d==${MMDET3D}
RUN useradd -m model-server \
&& mkdir -p /home/model-server/tmp
COPY entrypoint.sh /usr/local/bin/entrypoint.sh
RUN chmod +x /usr/local/bin/entrypoint.sh \
&& chown -R model-server /home/model-server
COPY config.properties /home/model-server/config.properties
RUN mkdir /home/model-server/model-store && chown -R model-server /home/model-server/model-store
EXPOSE 8080 8081 8082
USER model-server
WORKDIR /home/model-server
ENV TEMP=/home/model-server/tmp
ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
CMD ["serve"]
inference_address=http://0.0.0.0:8080
management_address=http://0.0.0.0:8081
metrics_address=http://0.0.0.0:8082
model_store=/home/model-server/model-store
load_models=all
#!/bin/bash
set -e
if [[ "$1" = "serve" ]]; then
shift 1
torchserve --start --ts-config /home/model-server/config.properties
else
eval "$@"
fi
# prevent docker exit
tail -f /dev/null
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = .
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.header-logo {
background-image: url("../image/mmdet3d-logo.png");
background-size: 182.5px 40px;
height: 40px;
width: 182.5px;
}
# Customize Datasets
In this note, you will know how to train and test predefined models with customized datasets.
The basic steps are as below:
1. Prepare data
2. Prepare a config
3. Train, test and inference models on the customized dataset
## Data Preparation
The ideal situation is that we can reorganize the customized raw data and convert the annotation format into KITTI style. However, considering some calibration files and 3D annotations in KITTI format are difficult to obtain for customized datasets, we introduce the basic data format in the doc.
### Basic Data Format
#### Point cloud Format
Currently, we only support `.bin` format point cloud for training and inference. Before training on your own datasets, you need to convert your point cloud files with other formats to `.bin` files. The common point cloud data formats include `.pcd` and `.las`, we list some open-source tools for reference.
1. Convert `.pcd` to `.bin`: https://github.com/DanielPollithy/pypcd
- You can install `pypcd` with the following command:
```bash
pip install git+https://github.com/DanielPollithy/pypcd.git
```
- You can use the following script to read the `.pcd` file and convert it to `.bin` format for saving:
```python
import numpy as np
from pypcd import pypcd
pcd_data = pypcd.PointCloud.from_path('point_cloud_data.pcd')
points = np.zeros([pcd_data.width, 4], dtype=np.float32)
points[:, 0] = pcd_data.pc_data['x'].copy()
points[:, 1] = pcd_data.pc_data['y'].copy()
points[:, 2] = pcd_data.pc_data['z'].copy()
points[:, 3] = pcd_data.pc_data['intensity'].copy().astype(np.float32)
with open('point_cloud_data.bin', 'wb') as f:
f.write(points.tobytes())
```
2. Convert `.las` to `.bin`: The common conversion path is `.las -> .pcd -> .bin`, and the conversion path `.las -> .pcd` can be achieved through [this tool](https://github.com/Hitachi-Automotive-And-Industry-Lab/semantic-segmentation-editor).
#### Label Format
The most basic information: 3D bounding box and category label of each scene need to be contained in the `.txt` annotation file. Each line represents a 3D box in a certain scene as follow:
```
# format: [x, y, z, dx, dy, dz, yaw, category_name]
1.23 1.42 0.23 3.96 1.65 1.55 1.56 Car
3.51 2.15 0.42 1.05 0.87 1.86 1.23 Pedestrian
...
```
**Note**: Currently we only support KITTI Metric evaluation for customized datasets evaluation.
The 3D Box should be stored in unified 3D coordinates.
#### Calibration Format
For the point cloud data collected by each LiDAR, they are usually fused and converted to a certain LiDAR coordinate. So typically the calibration information file should contain the intrinsic matrix of each camera and the transformation extrinsic matrix from the LiDAR to each camera in `.txt` calibration file, while `Px` represents the intrinsic matrix of `camera_x` and `lidar2camx` represents the transformation extrinsic matrix from the `lidar` to `camera_x`.
```
P0
P1
P2
P3
P4
...
lidar2cam0
lidar2cam1
lidar2cam2
lidar2cam3
lidar2cam4
...
```
### Raw Data Structure
#### LiDAR-Based 3D Detection
The raw data for LiDAR-based 3D object detection are typically organized as follows, where `ImageSets` contains split files indicating which files belong to training/validation set, `points` includes point cloud data which are supposed to be stored in `.bin` format and `labels` includes label files for 3D detection.
```
mmdetection3d
├── mmdet3d
├── tools
├── configs
├── data
│ ├── custom
│ │ ├── ImageSets
│ │ │ ├── train.txt
│ │ │ ├── val.txt
│ │ ├── points
│ │ │ ├── 000000.bin
│ │ │ ├── 000001.bin
│ │ │ ├── ...
│ │ ├── labels
│ │ │ ├── 000000.txt
│ │ │ ├── 000001.txt
│ │ │ ├── ...
```
#### Vision-Based 3D Detection
The raw data for vision-based 3D object detection are typically organized as follows, where `ImageSets` contains split files indicating which files belong to training/validation set, `images` contains the images from different cameras, for example, images from `camera_x` need to be placed in `images/images_x`, `calibs` contains calibration information files which store the camera intrinsic matrix of each camera, and `labels` includes label files for 3D detection.
```
mmdetection3d
├── mmdet3d
├── tools
├── configs
├── data
│ ├── custom
│ │ ├── ImageSets
│ │ │ ├── train.txt
│ │ │ ├── val.txt
│ │ ├── calibs
│ │ │ ├── 000000.txt
│ │ │ ├── 000001.txt
│ │ │ ├── ...
│ │ ├── images
│ │ │ ├── images_0
│ │ │ │ ├── 000000.png
│ │ │ │ ├── 000001.png
│ │ │ │ ├── ...
│ │ │ ├── images_1
│ │ │ ├── images_2
│ │ │ ├── ...
│ │ ├── labels
│ │ │ ├── 000000.txt
│ │ │ ├── 000001.txt
│ │ │ ├── ...
```
#### Multi-Modality 3D Detection
The raw data for multi-modality 3D object detection are typically organized as follows. Different from vision-based 3D object detection, calibration information files in `calibs` store the camera intrinsic matrix of each camera and extrinsic matrix.
```
mmdetection3d
├── mmdet3d
├── tools
├── configs
├── data
│ ├── custom
│ │ ├── ImageSets
│ │ │ ├── train.txt
│ │ │ ├── val.txt
│ │ ├── calibs
│ │ │ ├── 000000.txt
│ │ │ ├── 000001.txt
│ │ │ ├── ...
│ │ ├── points
│ │ │ ├── 000000.bin
│ │ │ ├── 000001.bin
│ │ │ ├── ...
│ │ ├── images
│ │ │ ├── images_0
│ │ │ │ ├── 000000.png
│ │ │ │ ├── 000001.png
│ │ │ │ ├── ...
│ │ │ ├── images_1
│ │ │ ├── images_2
│ │ │ ├── ...
│ │ ├── labels
│ │ │ ├── 000000.txt
│ │ │ ├── 000001.txt
│ │ │ ├── ...
```
#### LiDAR-Based 3D Semantic Segmentation
The raw data for LiDAR-based 3D semantic segmentation are typically organized as follows, where `ImageSets` contains split files indicating which files belong to training/validation set, `points` includes point cloud data, and `semantic_mask` includes point-level label.
```
mmdetection3d
├── mmdet3d
├── tools
├── configs
├── data
│ ├── custom
│ │ ├── ImageSets
│ │ │ ├── train.txt
│ │ │ ├── val.txt
│ │ ├── points
│ │ │ ├── 000000.bin
│ │ │ ├── 000001.bin
│ │ │ ├── ...
│ │ ├── semantic_mask
│ │ │ ├── 000000.bin
│ │ │ ├── 000001.bin
│ │ │ ├── ...
```
### Data Converter
Once you prepared the raw data following our instruction, you can directly use the following command to generate training/validation information files.
```bash
python tools/create_data.py custom --root-path ./data/custom --out-dir ./data/custom --extra-tag custom
```
## An example of customized dataset
Once we finish data preparation, we can create a new dataset in `mmdet3d/datasets/my_dataset.py` to load the data.
```python
import mmengine
from mmdet3d.registry import DATASETS
from .det3d_dataset import Det3DDataset
@DATASETS.register_module()
class MyDataset(Det3DDataset):
# replace with all the classes in customized pkl info file
METAINFO = {
'classes': ('Pedestrian', 'Cyclist', 'Car')
}
def parse_ann_info(self, info):
"""Process the `instances` in data info to `ann_info`.
Args:
info (dict): Data information of single data sample.
Returns:
dict: Annotation information consists of the following keys:
- gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`):
3D ground truth bboxes.
- gt_labels_3d (np.ndarray): Labels of ground truths.
"""
ann_info = super().parse_ann_info(info)
if ann_info is None:
ann_info = dict()
# empty instance
ann_info['gt_bboxes_3d'] = np.zeros((0, 7), dtype=np.float32)
ann_info['gt_labels_3d'] = np.zeros(0, dtype=np.int64)
# filter the gt classes not used in training
ann_info = self._remove_dontcare(ann_info)
gt_bboxes_3d = LiDARInstance3DBoxes(ann_info['gt_bboxes_3d'])
ann_info['gt_bboxes_3d'] = gt_bboxes_3d
return ann_info
```
After the data pre-processing, there are two steps for users to train the customized new dataset:
1. Modify the config file for using the customized dataset.
2. Check the annotations of the customized dataset.
Here we take training PointPillars on customized dataset as an example:
### Prepare a config
Here we demonstrate a config sample for pure point cloud training.
#### Prepare dataset config
In `configs/_base_/datasets/custom.py`:
```python
# dataset settings
dataset_type = 'MyDataset'
data_root = 'data/custom/'
class_names = ['Pedestrian', 'Cyclist', 'Car'] # replace with your dataset class
point_cloud_range = [0, -40, -3, 70.4, 40, 1] # adjust according to your dataset
input_modality = dict(use_lidar=True, use_camera=False)
metainfo = dict(classes=class_names)
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=4, # replace with your point cloud data dimension
use_dim=4), # replace with the actual dimension used in training and inference
dict(
type='LoadAnnotations3D',
with_bbox_3d=True,
with_label_3d=True),
dict(
type='ObjectNoise',
num_try=100,
translation_std=[1.0, 1.0, 0.5],
global_rot_range=[0.0, 0.0],
rot_range=[-0.78539816, 0.78539816]),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.78539816, 0.78539816],
scale_ratio_range=[0.95, 1.05]),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='PointShuffle'),
dict(
type='Pack3DDetInputs',
keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=4, # replace with your point cloud data dimension
use_dim=4),
dict(type='Pack3DDetInputs', keys=['points'])
]
# construct a pipeline for data and gt loading in show function
eval_pipeline = [
dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
dict(type='Pack3DDetInputs', keys=['points']),
]
train_dataloader = dict(
batch_size=6,
num_workers=4,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='RepeatDataset',
times=2,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='custom_infos_train.pkl', # specify your training pkl info
data_prefix=dict(pts='points'),
pipeline=train_pipeline,
modality=input_modality,
test_mode=False,
metainfo=metainfo,
box_type_3d='LiDAR')))
val_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_prefix=dict(pts='points'),
ann_file='custom_infos_val.pkl', # specify your validation pkl info
pipeline=test_pipeline,
modality=input_modality,
test_mode=True,
metainfo=metainfo,
box_type_3d='LiDAR'))
val_evaluator = dict(
type='KittiMetric',
ann_file=data_root + 'custom_infos_val.pkl', # specify your validation pkl info
metric='bbox')
```
#### Prepare model config
For voxel-based detectors such as SECOND, PointPillars and CenterPoint, the point cloud range and voxel size should be adjusted according to your dataset.
Theoretically, `voxel_size` is linked to the setting of `point_cloud_range`. Setting a smaller `voxel_size` will increase the voxel num and the corresponding memory consumption. In addition, the following issues need to be noted:
If the `point_cloud_range` and `voxel_size` are set to be `[0, -40, -3, 70.4, 40, 1]` and `[0.05, 0.05, 0.1]` respectively, then the shape of intermediate feature map should be `[(1-(-3))/0.1+1, (40-(-40))/0.05, (70.4-0)/0.05]=[41, 1600, 1408]`. When changing `point_cloud_range`, remember to change the shape of intermediate feature map in `middle_encoder` according to the `voxel_size`.
Regarding the setting of `anchor_range`, it is generally adjusted according to dataset. Note that `z` value needs to be adjusted accordingly to the position of the point cloud, please refer to this [issue](https://github.com/open-mmlab/mmdetection3d/issues/986).
Regarding the setting of `anchor_size`, it is usually necessary to count the average length, width and height of objects in the entire training dataset as `anchor_size` to obtain the best results.
In `configs/_base_/models/pointpillars_hv_secfpn_custom.py`:
```python
voxel_size = [0.16, 0.16, 4] # adjust according to your dataset
point_cloud_range = [0, -39.68, -3, 69.12, 39.68, 1] # adjust according to your dataset
model = dict(
type='VoxelNet',
data_preprocessor=dict(
type='Det3DDataPreprocessor',
voxel=True,
voxel_layer=dict(
max_num_points=32,
point_cloud_range=point_cloud_range,
voxel_size=voxel_size,
max_voxels=(16000, 40000))),
voxel_encoder=dict(
type='PillarFeatureNet',
in_channels=4,
feat_channels=[64],
with_distance=False,
voxel_size=voxel_size,
point_cloud_range=point_cloud_range),
# the `output_shape` should be adjusted according to `point_cloud_range`
# and `voxel_size`
middle_encoder=dict(
type='PointPillarsScatter', in_channels=64, output_shape=[496, 432]),
backbone=dict(
type='SECOND',
in_channels=64,
layer_nums=[3, 5, 5],
layer_strides=[2, 2, 2],
out_channels=[64, 128, 256]),
neck=dict(
type='SECONDFPN',
in_channels=[64, 128, 256],
upsample_strides=[1, 2, 4],
out_channels=[128, 128, 128]),
bbox_head=dict(
type='Anchor3DHead',
num_classes=3,
in_channels=384,
feat_channels=384,
use_direction_classifier=True,
assign_per_class=True,
# adjust the `ranges` and `sizes` according to your dataset
anchor_generator=dict(
type='AlignedAnchor3DRangeGenerator',
ranges=[
[0, -39.68, -0.6, 69.12, 39.68, -0.6],
[0, -39.68, -0.6, 69.12, 39.68, -0.6],
[0, -39.68, -1.78, 69.12, 39.68, -1.78],
],
sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]],
rotations=[0, 1.57],
reshape_out=False),
diff_rad_by_sin=True,
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
loss_cls=dict(
type='mmdet.FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(
type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
loss_dir=dict(
type='mmdet.CrossEntropyLoss', use_sigmoid=False,
loss_weight=0.2)),
# model training and testing settings
train_cfg=dict(
assigner=[
dict( # for Pedestrian
type='Max3DIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.5,
neg_iou_thr=0.35,
min_pos_iou=0.35,
ignore_iof_thr=-1),
dict( # for Cyclist
type='Max3DIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.5,
neg_iou_thr=0.35,
min_pos_iou=0.35,
ignore_iof_thr=-1),
dict( # for Car
type='Max3DIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.6,
neg_iou_thr=0.45,
min_pos_iou=0.45,
ignore_iof_thr=-1),
],
allowed_border=0,
pos_weight=-1,
debug=False),
test_cfg=dict(
use_rotate_nms=True,
nms_across_levels=False,
nms_thr=0.01,
score_thr=0.1,
min_bbox_size=0,
nms_pre=100,
max_num=50))
```
#### Prepare overall config
We combine all the configs above in `configs/pointpillars/pointpillars_hv_secfpn_8xb6_custom.py`:
```python
_base_ = [
'../_base_/models/pointpillars_hv_secfpn_custom.py',
'../_base_/datasets/custom.py',
'../_base_/schedules/cyclic-40e.py', '../_base_/default_runtime.py'
]
```
#### Visualize your dataset (optional)
To validate whether your prepared data and config are correct, it's highly recommended to use `tools/misc/browse_dataset.py` script
to visualize your dataset and annotations before training and validation. Please refer to [visualization doc](https://mmdetection3d.readthedocs.io/en/dev-1.x/user_guides/visualization.html) for more details.
## Evaluation
Once the data and config have been prepared, you can directly run the training/testing script following our doc.
**Note**: We only provide an implementation for KITTI style evaluation for the customized dataset. It should be included in the dataset config:
```python
val_evaluator = dict(
type='KittiMetric',
ann_file=data_root + 'custom_infos_val.pkl', # specify your validation pkl info
metric='bbox')
```
# Customize Models
We basically categorize model components into 6 types:
- encoder: Including voxel encoder and middle encoder used in voxel-based methods before backbone, e.g., `HardVFE` and `PointPillarsScatter`.
- backbone: Usually an FCN network to extract feature maps, e.g., `ResNet`, `SECOND`.
- neck: The component between backbones and heads, e.g., `FPN`, `SECONDFPN`.
- head: The component for specific tasks, e.g., `bbox prediction` and `mask prediction`.
- RoI extractor: The part for extracting RoI features from feature maps, e.g., `H3DRoIHead` and `PartAggregationROIHead`.
- loss: The component in heads for calculating losses, e.g., `FocalLoss`, `L1Loss`, and `GHMLoss`.
## Develop new components
### Add a new encoder
Here we show how to develop new components with an example of HardVFE.
#### 1. Define a new voxel encoder (e.g. HardVFE: Voxel feature encoder used in HV-SECOND)
Create a new file `mmdet3d/models/voxel_encoders/voxel_encoder.py`.
```python
import torch.nn as nn
from mmdet3d.registry import MODELS
@MODELS.register_module()
class HardVFE(nn.Module):
def __init__(self, arg1, arg2):
pass
def forward(self, x): # should return a tuple
pass
```
#### 2. Import the module
You can either add the following line to `mmdet3d/models/voxel_encoders/__init__.py`:
```python
from .voxel_encoder import HardVFE
```
or alternatively add
```python
custom_imports = dict(
imports=['mmdet3d.models.voxel_encoders.voxel_encoder'],
allow_failed_imports=False)
```
to the config file to avoid modifying the original code.
#### 3. Use the voxel encoder in your config file
```python
model = dict(
...
voxel_encoder=dict(
type='HardVFE',
arg1=xxx,
arg2=yyy),
...
)
```
### Add a new backbone
Here we show how to develop new components with an example of [SECOND](https://www.mdpi.com/1424-8220/18/10/3337) (Sparsely Embedded Convolutional Detection).
#### 1. Define a new backbone (e.g. SECOND)
Create a new file `mmdet3d/models/backbones/second.py`.
```python
from mmengine.model import BaseModule
from mmdet3d.registry import MODELS
@MODELS.register_module()
class SECOND(BaseModule):
def __init__(self, arg1, arg2):
pass
def forward(self, x): # should return a tuple
pass
```
#### 2. Import the module
You can either add the following line to `mmdet3d/models/backbones/__init__.py`:
```python
from .second import SECOND
```
or alternatively add
```python
custom_imports = dict(
imports=['mmdet3d.models.backbones.second'],
allow_failed_imports=False)
```
to the config file to avoid modifying the original code.
#### 3. Use the backbone in your config file
```python
model = dict(
...
backbone=dict(
type='SECOND',
arg1=xxx,
arg2=yyy),
...
)
```
### Add a new neck
#### 1. Define a new neck (e.g. SECONDFPN)
Create a new file `mmdet3d/models/necks/second_fpn.py`.
```python
from mmengine.model import BaseModule
from mmdet3d.registry import MODELS
@MODELS.register_module()
class SECONDFPN(BaseModule):
def __init__(self,
in_channels=[128, 128, 256],
out_channels=[256, 256, 256],
upsample_strides=[1, 2, 4],
norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
upsample_cfg=dict(type='deconv', bias=False),
conv_cfg=dict(type='Conv2d', bias=False),
use_conv_for_no_stride=False,
init_cfg=None):
pass
def forward(self, x):
# implementation is ignored
pass
```
#### 2. Import the module
You can either add the following line to `mmdet3d/models/necks/__init__.py`:
```python
from .second_fpn import SECONDFPN
```
or alternatively add
```python
custom_imports = dict(
imports=['mmdet3d.models.necks.second_fpn'],
allow_failed_imports=False)
```
to the config file to avoid modifying the original code.
#### 3. Use the neck in your config file
```python
model = dict(
...
neck=dict(
type='SECONDFPN',
in_channels=[64, 128, 256],
upsample_strides=[1, 2, 4],
out_channels=[128, 128, 128]),
...
)
```
### Add a new head
Here we show how to develop a new head with the example of [PartA2 Head](https://arxiv.org/abs/1907.03670) as the following.
**Note**: Here the example of `PartA2 RoI Head` is used in the second stage. For one-stage heads, please refer to examples in `mmdet3d/models/dense_heads/`. They are more commonly used in 3D detection for autonomous driving due to its simplicity and high efficiency.
First, add a new bbox head in `mmdet3d/models/roi_heads/bbox_heads/parta2_bbox_head.py`.
`PartA2 RoI Head` implements a new bbox head for object detection.
To implement a bbox head, basically we need to implement two functions of the new module as the following. Sometimes other related functions like `loss` and `get_targets` are also required.
```python
from mmengine.model import BaseModule
from mmdet3d.registry import MODELS
@MODELS.register_module()
class PartA2BboxHead(BaseModule):
"""PartA2 RoI head."""
def __init__(self,
num_classes,
seg_in_channels,
part_in_channels,
seg_conv_channels=None,
part_conv_channels=None,
merge_conv_channels=None,
down_conv_channels=None,
shared_fc_channels=None,
cls_channels=None,
reg_channels=None,
dropout_ratio=0.1,
roi_feat_size=14,
with_corner_loss=True,
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
loss_bbox=dict(
type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
loss_cls=dict(
type='CrossEntropyLoss',
use_sigmoid=True,
reduction='none',
loss_weight=1.0),
init_cfg=None):
super(PartA2BboxHead, self).__init__(init_cfg=init_cfg)
def forward(self, seg_feats, part_feats):
pass
```
Second, implement a new RoI Head if it is necessary. We plan to inherit the new `PartAggregationROIHead` from `Base3DRoIHead`. We can find that a `Base3DRoIHead` already implements the following functions.
```python
from mmdet.models.roi_heads import BaseRoIHead
from mmdet3d.registry import MODELS, TASK_UTILS
class Base3DRoIHead(BaseRoIHead):
"""Base class for 3d RoIHeads."""
def __init__(self,
bbox_head=None,
bbox_roi_extractor=None,
mask_head=None,
mask_roi_extractor=None,
train_cfg=None,
test_cfg=None,
init_cfg=None):
super(Base3DRoIHead, self).__init__(
bbox_head=bbox_head,
bbox_roi_extractor=bbox_roi_extractor,
mask_head=mask_head,
mask_roi_extractor=mask_roi_extractor,
train_cfg=train_cfg,
test_cfg=test_cfg,
init_cfg=init_cfg)
def init_bbox_head(self, bbox_roi_extractor: dict,
bbox_head: dict) -> None:
"""Initialize box head and box roi extractor.
Args:
bbox_roi_extractor (dict or ConfigDict): Config of box
roi extractor.
bbox_head (dict or ConfigDict): Config of box in box head.
"""
self.bbox_roi_extractor = MODELS.build(bbox_roi_extractor)
self.bbox_head = MODELS.build(bbox_head)
def init_assigner_sampler(self):
"""Initialize assigner and sampler."""
self.bbox_assigner = None
self.bbox_sampler = None
if self.train_cfg:
if isinstance(self.train_cfg.assigner, dict):
self.bbox_assigner = TASK_UTILS.build(self.train_cfg.assigner)
elif isinstance(self.train_cfg.assigner, list):
self.bbox_assigner = [
TASK_UTILS.build(res) for res in self.train_cfg.assigner
]
self.bbox_sampler = TASK_UTILS.build(self.train_cfg.sampler)
def init_mask_head(self):
"""Initialize mask head, skip since ``PartAggregationROIHead`` does not
have one."""
pass
```
Double Head's modification is mainly in the bbox_forward logic, and it inherits other logics from the `Base3DRoIHead`.
In the `mmdet3d/models/roi_heads/part_aggregation_roi_head.py`, we implement the new RoI Head as the following:
```python
from typing import Dict, List, Tuple
from mmdet.models.task_modules import AssignResult, SamplingResult
from mmengine import ConfigDict
from torch import Tensor
from torch.nn import functional as F
from mmdet3d.registry import MODELS
from mmdet3d.structures import bbox3d2roi
from mmdet3d.utils import InstanceList
from ...structures.det3d_data_sample import SampleList
from .base_3droi_head import Base3DRoIHead
@MODELS.register_module()
class PartAggregationROIHead(Base3DRoIHead):
"""Part aggregation roi head for PartA2.
Args:
semantic_head (ConfigDict): Config of semantic head.
num_classes (int): The number of classes.
seg_roi_extractor (ConfigDict): Config of seg_roi_extractor.
bbox_roi_extractor (ConfigDict): Config of part_roi_extractor.
bbox_head (ConfigDict): Config of bbox_head.
train_cfg (ConfigDict): Training config.
test_cfg (ConfigDict): Testing config.
"""
def __init__(self,
semantic_head: dict,
num_classes: int = 3,
seg_roi_extractor: dict = None,
bbox_head: dict = None,
bbox_roi_extractor: dict = None,
train_cfg: dict = None,
test_cfg: dict = None,
init_cfg: dict = None) -> None:
super(PartAggregationROIHead, self).__init__(
bbox_head=bbox_head,
bbox_roi_extractor=bbox_roi_extractor,
train_cfg=train_cfg,
test_cfg=test_cfg,
init_cfg=init_cfg)
self.num_classes = num_classes
assert semantic_head is not None
self.init_seg_head(seg_roi_extractor, semantic_head)
def init_seg_head(self, seg_roi_extractor: dict,
semantic_head: dict) -> None:
"""Initialize semantic head and seg roi extractor.
Args:
seg_roi_extractor (dict): Config of seg
roi extractor.
semantic_head (dict): Config of semantic head.
"""
self.semantic_head = MODELS.build(semantic_head)
self.seg_roi_extractor = MODELS.build(seg_roi_extractor)
@property
def with_semantic(self):
"""bool: whether the head has semantic branch"""
return hasattr(self,
'semantic_head') and self.semantic_head is not None
def predict(self,
feats_dict: Dict,
rpn_results_list: InstanceList,
batch_data_samples: SampleList,
rescale: bool = False,
**kwargs) -> InstanceList:
"""Perform forward propagation of the roi head and predict detection
results on the features of the upstream network.
Args:
feats_dict (dict): Contains features from the first stage.
rpn_results_list (List[:obj:`InstanceData`]): Detection results
of rpn head.
batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
samples. It usually includes information such as
`gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
rescale (bool): If True, return boxes in original image space.
Defaults to False.
Returns:
list[:obj:`InstanceData`]: Detection results of each sample
after the post process.
Each item usually contains following keys.
- scores_3d (Tensor): Classification scores, has a shape
(num_instances, )
- labels_3d (Tensor): Labels of bboxes, has a shape
(num_instances, ).
- bboxes_3d (BaseInstance3DBoxes): Prediction of bboxes,
contains a tensor with shape (num_instances, C), where
C >= 7.
"""
assert self.with_bbox, 'Bbox head must be implemented in PartA2.'
assert self.with_semantic, 'Semantic head must be implemented' \
' in PartA2.'
batch_input_metas = [
data_samples.metainfo for data_samples in batch_data_samples
]
voxels_dict = feats_dict.pop('voxels_dict')
# TODO: Split predict semantic and bbox
results_list = self.predict_bbox(feats_dict, voxels_dict,
batch_input_metas, rpn_results_list,
self.test_cfg)
return results_list
def predict_bbox(self, feats_dict: Dict, voxel_dict: Dict,
batch_input_metas: List[dict],
rpn_results_list: InstanceList,
test_cfg: ConfigDict) -> InstanceList:
"""Perform forward propagation of the bbox head and predict detection
results on the features of the upstream network.
Args:
feats_dict (dict): Contains features from the first stage.
voxel_dict (dict): Contains information of voxels.
batch_input_metas (list[dict], Optional): Batch image meta info.
Defaults to None.
rpn_results_list (List[:obj:`InstanceData`]): Detection results
of rpn head.
test_cfg (Config): Test config.
Returns:
list[:obj:`InstanceData`]: Detection results of each sample
after the post process.
Each item usually contains following keys.
- scores_3d (Tensor): Classification scores, has a shape
(num_instances, )
- labels_3d (Tensor): Labels of bboxes, has a shape
(num_instances, ).
- bboxes_3d (BaseInstance3DBoxes): Prediction of bboxes,
contains a tensor with shape (num_instances, C), where
C >= 7.
"""
...
def loss(self, feats_dict: Dict, rpn_results_list: InstanceList,
batch_data_samples: SampleList, **kwargs) -> dict:
"""Perform forward propagation and loss calculation of the detection
roi on the features of the upstream network.
Args:
feats_dict (dict): Contains features from the first stage.
rpn_results_list (List[:obj:`InstanceData`]): Detection results
of rpn head.
batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
samples. It usually includes information such as
`gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
Returns:
dict[str, Tensor]: A dictionary of loss components
"""
assert len(rpn_results_list) == len(batch_data_samples)
losses = dict()
batch_gt_instances_3d = []
batch_gt_instances_ignore = []
voxels_dict = feats_dict.pop('voxels_dict')
for data_sample in batch_data_samples:
batch_gt_instances_3d.append(data_sample.gt_instances_3d)
if 'ignored_instances' in data_sample:
batch_gt_instances_ignore.append(data_sample.ignored_instances)
else:
batch_gt_instances_ignore.append(None)
if self.with_semantic:
semantic_results = self._semantic_forward_train(
feats_dict, voxels_dict, batch_gt_instances_3d)
losses.update(semantic_results.pop('loss_semantic'))
sample_results = self._assign_and_sample(rpn_results_list,
batch_gt_instances_3d)
if self.with_bbox:
feats_dict.update(semantic_results)
bbox_results = self._bbox_forward_train(feats_dict, voxels_dict,
sample_results)
losses.update(bbox_results['loss_bbox'])
return losses
```
Here we omit more details related to other functions. Please see the [code](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/models/roi_heads/part_aggregation_roi_head.py) for more details.
Last, the users need to add the module in
`mmdet3d/models/roi_heads/bbox_heads/__init__.py` and `mmdet3d/models/roi_heads/__init__.py` thus the corresponding registry could find and load them.
Alternatively, the users can add
```python
custom_imports=dict(
imports=['mmdet3d.models.roi_heads.part_aggregation_roi_head', 'mmdet3d.models.roi_heads.bbox_heads.parta2_bbox_head'],
allow_failed_imports=False)
```
to the config file and achieve the same goal.
The config file of `PartAggregationROIHead` is as the following:
```python
model = dict(
...
roi_head=dict(
type='PartAggregationROIHead',
num_classes=3,
semantic_head=dict(
type='PointwiseSemanticHead',
in_channels=16,
extra_width=0.2,
seg_score_thr=0.3,
num_classes=3,
loss_seg=dict(
type='mmdet.FocalLoss',
use_sigmoid=True,
reduction='sum',
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_part=dict(
type='mmdet.CrossEntropyLoss',
use_sigmoid=True,
loss_weight=1.0)),
seg_roi_extractor=dict(
type='Single3DRoIAwareExtractor',
roi_layer=dict(
type='RoIAwarePool3d',
out_size=14,
max_pts_per_voxel=128,
mode='max')),
bbox_roi_extractor=dict(
type='Single3DRoIAwareExtractor',
roi_layer=dict(
type='RoIAwarePool3d',
out_size=14,
max_pts_per_voxel=128,
mode='avg')),
bbox_head=dict(
type='PartA2BboxHead',
num_classes=3,
seg_in_channels=16,
part_in_channels=4,
seg_conv_channels=[64, 64],
part_conv_channels=[64, 64],
merge_conv_channels=[128, 128],
down_conv_channels=[128, 256],
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
shared_fc_channels=[256, 512, 512, 512],
cls_channels=[256, 256],
reg_channels=[256, 256],
dropout_ratio=0.1,
roi_feat_size=14,
with_corner_loss=True,
loss_bbox=dict(
type='mmdet.SmoothL1Loss',
beta=1.0 / 9.0,
reduction='sum',
loss_weight=1.0),
loss_cls=dict(
type='mmdet.CrossEntropyLoss',
use_sigmoid=True,
reduction='sum',
loss_weight=1.0))),
...
)
```
Since MMDetection 2.0, the config system supports to inherit configs such that the users can focus on the modification.
The second stage of PartA2 Head mainly uses a new `PartAggregationROIHead` and a new
`PartA2BboxHead`, the arguments are set according to the `__init__` function of each module.
### Add a new loss
Assume you want to add a new loss as `MyLoss` for bounding box regression.
To add a new loss function, the users need to implement it in `mmdet3d/models/losses/my_loss.py`.
The decorator `weighted_loss` enables the loss to be weighted for each element.
```python
import torch
import torch.nn as nn
from mmdet.models.losses.utils import weighted_loss
from mmdet3d.registry import MODELS
@weighted_loss
def my_loss(pred, target):
assert pred.size() == target.size() and target.numel() > 0
loss = torch.abs(pred - target)
return loss
@MODELS.register_module()
class MyLoss(nn.Module):
def __init__(self, reduction='mean', loss_weight=1.0):
super(MyLoss, self).__init__()
self.reduction = reduction
self.loss_weight = loss_weight
def forward(self,
pred,
target,
weight=None,
avg_factor=None,
reduction_override=None):
assert reduction_override in (None, 'none', 'mean', 'sum')
reduction = (
reduction_override if reduction_override else self.reduction)
loss_bbox = self.loss_weight * my_loss(
pred, target, weight, reduction=reduction, avg_factor=avg_factor)
return loss_bbox
```
Then the users need to add it in the `mmdet3d/models/losses/__init__.py`.
```python
from .my_loss import MyLoss, my_loss
```
Alternatively, you can add
```python
custom_imports=dict(
imports=['mmdet3d.models.losses.my_loss'],
allow_failed_imports=False)
```
to the config file and achieve the same goal.
To use it, users should modify the `loss_xxx` field.
Since `MyLoss` is for regression, you need to modify the `loss_bbox` field in the head.
```python
loss_bbox=dict(type='MyLoss', loss_weight=1.0)
```
# Customize Runtime Settings
## Customize optimization settings
Optimization related configuration is now all managed by `optim_wrapper` which usually has three fields: `optimizer`, `paramwise_cfg`, `clip_grad`. Please refer to [OptimWrapper](https://mmengine.readthedocs.io/en/latest/tutorials/optim_wrapper.html) for more details. See the example below, where `AdamW` is used as an `optimizer`, the learning rate of the backbone is reduced by a factor of 10, and gradient clipping is added.
```python
optim_wrapper = dict(
type='OptimWrapper',
# optimizer
optimizer=dict(
type='AdamW',
lr=0.0001,
weight_decay=0.05,
eps=1e-8,
betas=(0.9, 0.999)),
# Parameter-level learning rate and weight decay settings
paramwise_cfg=dict(
custom_keys={
'backbone': dict(lr_mult=0.1, decay_mult=1.0),
},
norm_decay_mult=0.0),
# gradient clipping
clip_grad=dict(max_norm=0.01, norm_type=2))
```
### Customize optimizer supported by PyTorch
We already support to use all the optimizers implemented by PyTorch, and the only modification is to change the `optimizer` field in `optim_wrapper` field of config files. For example, if you want to use `Adam` (note that the performance could drop a lot), the modification could be as the following:
```python
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='Adam', lr=0.0003, weight_decay=0.0001))
```
To modify the learning rate of the model, the users only need to modify the `lr` in `optimizer`. The users can directly set arguments following the [API doc](https://pytorch.org/docs/stable/optim.html?highlight=optim#module-torch.optim) of PyTorch.
### Customize self-implemented optimizer
#### 1. Define a new optimizer
A customized optimizer could be defined as following:
Assume you want to add a optimizer named `MyOptimizer`, which has arguments `a`, `b`, and `c`.
You need to create a new directory named `mmdet3d/engine/optimizers`, and then implement the new optimizer in a file, e.g., in `mmdet3d/engine/optimizers/my_optimizer.py`:
```python
from torch.optim import Optimizer
from mmdet3d.registry import OPTIMIZERS
@OPTIMIZERS.register_module()
class MyOptimizer(Optimizer):
def __init__(self, a, b, c):
pass
```
#### 2. Add the optimizer to registry
To find the above module defined above, this module should be imported into the main namespace at first. There are two options to achieve it.
- Modify `mmdet3d/engine/optimizers/__init__.py` to import it.
The newly defined module should be imported in `mmdet3d/engine/optimizers/__init__.py` so that the registry will find the new module and add it:
```python
from .my_optimizer import MyOptimizer
```
- Use `custom_imports` in the config to manually import it.
```python
custom_imports = dict(imports=['mmdet3d.engine.optimizers.my_optimizer'], allow_failed_imports=False)
```
The module `mmdet3d.engine.optimizers.my_optimizer` will be imported at the beginning of the program and the class `MyOptimizer` is then automatically registered.
Note that only the package containing the class `MyOptimizer` should be imported.
`mmdet3d.engine.optimizers.my_optimizer.MyOptimizer` **cannot** be imported directly.
Actually users can use a totally different file directory structure with this importing method, as long as the module root is located in `PYTHONPATH`.
#### 3. Specify the optimizer in the config file
Then you can use `MyOptimizer` in `optimizer` field in `optim_wrapper` field of config files. In the configs, the optimizers are defined by the field `optimizer` like the following:
```python
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001))
```
To use your own optimizer, the field can be changed to:
```python
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='MyOptimizer', a=a_value, b=b_value, c=c_value))
```
### Customize optimizer wrapper constructor
Some models may have some parameter-specific settings for optimization, e.g. weight decay for BatchNorm layers.
The users can do those fine-grained parameter tuning through customizing optimizer wrapper constructor.
```python
from mmengine.optim import DefaultOptimWrapperConstructor
from mmdet3d.registry import OPTIM_WRAPPER_CONSTRUCTORS
from .my_optimizer import MyOptimizer
@OPTIM_WRAPPER_CONSTRUCTORS.register_module()
class MyOptimizerWrapperConstructor(DefaultOptimWrapperConstructor):
def __init__(self,
optim_wrapper_cfg: dict,
paramwise_cfg: Optional[dict] = None):
pass
def __call__(self, model: nn.Module) -> OptimWrapper:
return optim_wrapper
```
The default optimizer wrapper constructor is implemented [here](https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/default_constructor.py#L18), which could also serve as a template for the new optimizer wrapper constructor.
### Additional settings
Tricks not implemented by the optimizer should be implemented through optimizer wrapper constructor (e.g., set parameter-wise learning rates) or hooks. We list some common settings that could stabilize the training or accelerate the training. Feel free to create PR, issue for more settings.
- __Use gradient clip to stabilize training__:
Some models need gradient clip to clip the gradients to stabilize the training process. An example is as below:
```python
optim_wrapper = dict(
_delete_=True, clip_grad=dict(max_norm=35, norm_type=2))
```
If your config inherits the base config which already sets the `optim_wrapper`, you might need `_delete_=True` to override the unnecessary settings. See the [config documentation](https://mmdetection3d.readthedocs.io/en/dev-1.x/user_guides/config.html) for more details.
- __Use momentum schedule to accelerate model convergence__:
We support momentum scheduler to modify model's momentum according to learning rate, which could make the model converge in a faster way.
Momentum scheduler is usually used with LR scheduler, for example, the following config is used in [3D detection](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/configs/_base_/schedules/cyclic-20e.py) to accelerate convergence.
For more details, please refer to the implementation of [CosineAnnealingLR](https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py#L43) and [CosineAnnealingMomentum](https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/momentum_scheduler.py#L71).
```python
param_scheduler = [
# learning rate scheduler
# During the first 8 epochs, learning rate increases from 0 to lr * 10
# during the next 12 epochs, learning rate decreases from lr * 10 to lr * 1e-4
dict(
type='CosineAnnealingLR',
T_max=8,
eta_min=lr * 10,
begin=0,
end=8,
by_epoch=True,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=12,
eta_min=lr * 1e-4,
begin=8,
end=20,
by_epoch=True,
convert_to_iter_based=True),
# momentum scheduler
# During the first 8 epochs, momentum increases from 0 to 0.85 / 0.95
# during the next 12 epochs, momentum increases from 0.85 / 0.95 to 1
dict(
type='CosineAnnealingMomentum',
T_max=8,
eta_min=0.85 / 0.95,
begin=0,
end=8,
by_epoch=True,
convert_to_iter_based=True),
dict(
type='CosineAnnealingMomentum',
T_max=12,
eta_min=1,
begin=8,
end=20,
by_epoch=True,
convert_to_iter_based=True)
]
```
## Customize training schedules
By default we use step learning rate with 1x schedule, this calls [`MultiStepLR`](https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py#L144) in MMEngine.
We support many other learning rate schedule [here](https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py), such as `CosineAnnealingLR` and `PolyLR` schedules. Here are some examples:
- Poly schedule:
```python
param_scheduler = [
dict(
type='PolyLR',
power=0.9,
eta_min=1e-4,
begin=0,
end=8,
by_epoch=True)]
```
- CosineAnnealing schedule:
```python
param_scheduler = [
dict(
type='CosineAnnealingLR',
T_max=8,
eta_min=lr * 1e-5,
begin=0,
end=8,
by_epoch=True)]
```
## Customize train loop
By default, `EpochBasedTrainLoop` is used in `train_cfg` and validation is done after every train epoch, as follows:
```python
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_begin=1, val_interval=1)
```
Actually, both [`IterBasedTrainLoop`](https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/loops.py#L185) and [`EpochBasedTrainLoop`](https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/loops.py#L18) support dynamic interval, see the following example:
```python
# Before 365001th iteration, we do evaluation every 5000 iterations.
# After 365000th iteration, we do evaluation every 368750 iterations,
# which means that we do evaluation at the end of training.
interval = 5000
max_iters = 368750
dynamic_intervals = [(max_iters // interval * interval + 1, max_iters)]
train_cfg = dict(
type='IterBasedTrainLoop',
max_iters=max_iters,
val_interval=interval,
dynamic_intervals=dynamic_intervals)
```
## Customize hooks
### Customize self-implemented hooks
#### 1. Implement a new hook
MMEngine provides many useful [hooks](https://mmengine.readthedocs.io/en/latest/tutorials/hook.html), but there are some occasions when the users might need to implement a new hook. MMDetection3D supports customized hooks in training based on MMEngine after v1.1.0rc0. Thus the users could implement a hook directly in mmdet3d or their mmdet3d-based codebases and use the hook by only modifying the config in training.
Here we give an example of creating a new hook in mmdet3d and using it in training.
```python
from mmengine.hooks import Hook
from mmdet3d.registry import HOOKS
@HOOKS.register_module()
class MyHook(Hook):
def __init__(self, a, b):
def before_run(self, runner) -> None:
def after_run(self, runner) -> None:
def before_train(self, runner) -> None:
def after_train(self, runner) -> None:
def before_train_epoch(self, runner) -> None:
def after_train_epoch(self, runner) -> None:
def before_train_iter(self,
runner,
batch_idx: int,
data_batch: DATA_BATCH = None) -> None:
def after_train_iter(self,
runner,
batch_idx: int,
data_batch: DATA_BATCH = None,
outputs: Optional[dict] = None) -> None:
```
Depending on the functionality of the hook, users need to specify what the hook will do at each stage of the training in `before_run`, `after_run`, `before_train`, `after_train`, `before_train_epoch`, `after_train_epoch`, `before_train_iter`, and `after_train_iter`. There are more points where hooks can be inserted, refer to [base hook class](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/hook.py#L9) for more details.
#### 2. Register the new hook
Then we need to make `MyHook` imported. Assuming the file is in `mmdet3d/engine/hooks/my_hook.py`, there are two ways to do that:
- Modify `mmdet3d/engine/hooks/__init__.py` to import it.
The newly defined module should be imported in `mmdet3d/engine/hooks/__init__.py` so that the registry will find the new module and add it:
```python
from .my_hook import MyHook
```
- Use `custom_imports` in the config to manually import it.
```python
custom_imports = dict(imports=['mmdet3d.engine.hooks.my_hook'], allow_failed_imports=False)
```
#### 3. Modify the config
```python
custom_hooks = [
dict(type='MyHook', a=a_value, b=b_value)
]
```
You can also set the priority of the hook by adding key `priority` to `'NORMAL'` or `'HIGHEST'` as below:
```python
custom_hooks = [
dict(type='MyHook', a=a_value, b=b_value, priority='NORMAL')
]
```
By default the hook's priority is set as `NORMAL` during registration.
### Use hooks implemented in MMDetection3D
If the hook is already implemented in MMDetection3D, you can directly modify the config to use the hook as below.
#### Example: `DisableObjectSampleHook`
We implement a customized hook named [DisableObjectSampleHook](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/engine/hooks/disable_object_sample_hook.py) to disable `ObjectSample` augmentation during training after specified epoch.
We can set it in the config file if needed:
```python
custom_hooks = [dict(type='DisableObjectSampleHook', disable_after_epoch=15)]
```
### Modify default runtime hooks
There are some common hooks that are registered through `default_hooks`, they are
- `IterTimerHook`: A hook that logs 'data_time' for loading data and 'time' for a model training step.
- `LoggerHook`: A hook that collects logs from different components of `Runner` and writes them to terminal, json file, tensorboard and wandb etc.
- `ParamSchedulerHook`: A hook that updates some hyper-parameters in optimizer, e.g., learning rate and momentum.
- `CheckpointHook`: A hook that saves checkpoints periodically.
- `DistSamplerSeedHook`: A hook that sets the seed for sampler and batch_sampler.
- `Det3DVisualizationHook`: A hook used to visualize validation and testing process prediction results.
`IterTimerHook`, `ParamSchedulerHook` and `DistSamplerSeedHook` are simple and no need to be modified usually, so here we reveal what we can do with `LoggerHook`, `CheckpointHook` and `Det3DVisualizationHook`.
#### CheckpointHook
Except saving checkpoints periodically, [`CheckpointHook`](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/checkpoint_hook.py#L18) provides other options such as `max_keep_ckpts`, `save_optimizer` and etc. The users could set `max_keep_ckpts` to only save small number of checkpoints or decide whether to store state dict of optimizer by `save_optimizer`. More details of the arguments are [here](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/checkpoint_hook.py#L18).
```python
default_hooks = dict(
checkpoint=dict(
type='CheckpointHook',
interval=1,
max_keep_ckpts=3,
save_optimizer=True))
```
#### LoggerHook
The `LoggerHook` enables setting intervals. Detailed instructions can be found in the [docstring](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/logger_hook.py#L19).
```python
default_hooks = dict(logger=dict(type='LoggerHook', interval=50))
```
#### Det3DVisualizationHook
`Det3DVisualizationHook` use `DetLocalVisualizer` to visualize prediction results, and `Det3DLocalVisualizer` current supports different backends, e.g., `TensorboardVisBackend` and `WandbVisBackend` (see [docstring](https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py) for more details). The users could add multi backends to do visualization as follows.
```python
default_hooks = dict(
visualization=dict(type='Det3DVisualizationHook', draw=True))
vis_backends = [dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend')]
visualizer = dict(
type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
```
.. toctree::
:maxdepth: 3
kitti.md
nuscenes.md
lyft.md
waymo.md
sunrgbd.md
scannet.md
s3dis.md
semantickitti.md
# KITTI Dataset
This page provides specific tutorials about the usage of MMDetection3D for KITTI dataset.
## Prepare dataset
You can download KITTI 3D detection data [HERE](http://www.cvlibs.net/datasets/kitti/eval_object.php?obj_benchmark=3d) and unzip all zip files. Besides, the road planes could be downloaded from [HERE](https://download.openmmlab.com/mmdetection3d/data/train_planes.zip), which are optional for data augmentation during training for better performance. The road planes are generated by [AVOD](https://github.com/kujason/avod), you can see more details [HERE](https://github.com/kujason/avod/issues/19).
Like the general way to prepare dataset, it is recommended to symlink the dataset root to `$MMDETECTION3D/data`.
The folder structure should be organized as follows before our processing.
```
mmdetection3d
├── mmdet3d
├── tools
├── configs
├── data
│ ├── kitti
│ │ ├── ImageSets
│ │ ├── testing
│ │ │ ├── calib
│ │ │ ├── image_2
│ │ │ ├── velodyne
│ │ ├── training
│ │ │ ├── calib
│ │ │ ├── image_2
│ │ │ ├── label_2
│ │ │ ├── velodyne
│ │ │ ├── planes (optional)
```
### Create KITTI dataset
To create KITTI point cloud data, we load the raw point cloud data and generate the relevant annotations including object labels and bounding boxes. We also generate all single training objects' point cloud in KITTI dataset and save them as `.bin` files in `data/kitti/kitti_gt_database`. Meanwhile, `.pkl` info files are also generated for training or validation. Subsequently, create KITTI data by running:
```bash
mkdir ./data/kitti/ && mkdir ./data/kitti/ImageSets
# Download data split
wget -c https://raw.githubusercontent.com/traveller59/second.pytorch/master/second/data/ImageSets/test.txt --no-check-certificate --content-disposition -O ./data/kitti/ImageSets/test.txt
wget -c https://raw.githubusercontent.com/traveller59/second.pytorch/master/second/data/ImageSets/train.txt --no-check-certificate --content-disposition -O ./data/kitti/ImageSets/train.txt
wget -c https://raw.githubusercontent.com/traveller59/second.pytorch/master/second/data/ImageSets/val.txt --no-check-certificate --content-disposition -O ./data/kitti/ImageSets/val.txt
wget -c https://raw.githubusercontent.com/traveller59/second.pytorch/master/second/data/ImageSets/trainval.txt --no-check-certificate --content-disposition -O ./data/kitti/ImageSets/trainval.txt
python tools/create_data.py kitti --root-path ./data/kitti --out-dir ./data/kitti --extra-tag kitti --with-plane
```
Note that if your local disk does not have enough space for saving converted data, you can change the `--out-dir` to anywhere else, and you need to remove the `--with-plane` flag if `planes` are not prepared.
The folder structure after processing should be as below
```
kitti
├── ImageSets
│ ├── test.txt
│ ├── train.txt
│ ├── trainval.txt
│ ├── val.txt
├── testing
│ ├── calib
│ ├── image_2
│ ├── velodyne
│ ├── velodyne_reduced
├── training
│ ├── calib
│ ├── image_2
│ ├── label_2
│ ├── velodyne
│ ├── velodyne_reduced
│ ├── planes (optional)
├── kitti_gt_database
│ ├── xxxxx.bin
├── kitti_infos_train.pkl
├── kitti_infos_val.pkl
├── kitti_dbinfos_train.pkl
├── kitti_infos_test.pkl
├── kitti_infos_trainval.pkl
```
- `kitti_gt_database/xxxxx.bin`: point cloud data included in each 3D bounding box of the training dataset.
- `kitti_infos_train.pkl`: training dataset, a dict contains two keys: `metainfo` and `data_list`.
`metainfo` contains the basic information for the dataset itself, such as `categories`, `dataset` and `info_version`, while `data_list` is a list of dict, each dict (hereinafter referred to as `info`) contains all the detailed information of single sample as follows:
- info\['sample_idx'\]: The index of this sample in the whole dataset.
- info\['images'\]: Information of images captured by multiple cameras. A dict contains five keys including: `CAM0`, `CAM1`, `CAM2`, `CAM3`, `R0_rect`.
- info\['images'\]\['R0_rect'\]: Rectifying rotation matrix with shape (4, 4).
- info\['images'\]\['CAM2'\]: Include some information about the `CAM2` camera sensor.
- info\['images'\]\['CAM2'\]\['img_path'\]: The filename of the image.
- info\['images'\]\['CAM2'\]\['height'\]: The height of the image.
- info\['images'\]\['CAM2'\]\['width'\]: The width of the image.
- info\['images'\]\['CAM2'\]\['cam2img'\]: Transformation matrix from camera to image with shape (4, 4).
- info\['images'\]\['CAM2'\]\['lidar2cam'\]: Transformation matrix from lidar to camera with shape (4, 4).
- info\['images'\]\['CAM2'\]\['lidar2img'\]: Transformation matrix from lidar to image with shape (4, 4).
- info\['lidar_points'\]: A dict containing all the information related to the lidar points.
- info\['lidar_points'\]\['lidar_path'\]: The filename of the lidar point cloud data.
- info\['lidar_points'\]\['num_pts_feats'\]: The feature dimension of point.
- info\['lidar_points'\]\['Tr_velo_to_cam'\]: Transformation from Velodyne coordinate to camera coordinate with shape (4, 4).
- info\['lidar_points'\]\['Tr_imu_to_velo'\]: Transformation from IMU coordinate to Velodyne coordinate with shape (4, 4).
- info\['instances'\]: It is a list of dict. Each dict contains all annotation information of single instance. For the i-th instance:
- info\['instances'\]\[i\]\['bbox'\]: List of 4 numbers representing the 2D bounding box of the instance, in (x1, y1, x2, y2) order.
- info\['instances'\]\[i\]\['bbox_3d'\]: List of 7 numbers representing the 3D bounding box of the instance, in (x, y, z, l, h, w, yaw) order.
- info\['instances'\]\[i\]\['bbox_label'\]: An int indicate the 2D label of instance and the -1 indicating ignore.
- info\['instances'\]\[i\]\['bbox_label_3d'\]: An int indicate the 3D label of instance and the -1 indicating ignore.
- info\['instances'\]\[i\]\['depth'\]: Projected center depth of the 3D bounding box with respect to the image plane.
- info\['instances'\]\[i\]\['num_lidar_pts'\]: The number of LiDAR points in the 3D bounding box.
- info\['instances'\]\[i\]\['center_2d'\]: Projected 2D center of the 3D bounding box.
- info\['instances'\]\[i\]\['difficulty'\]: KITTI difficulty: 'Easy', 'Moderate', 'Hard'.
- info\['instances'\]\[i\]\['truncated'\]: Float from 0 (non-truncated) to 1 (truncated), where truncated refers to the object leaving image boundaries.
- info\['instances'\]\[i\]\['occluded'\]: Integer (0,1,2,3) indicating occlusion state: 0 = fully visible, 1 = partly occluded, 2 = largely occluded, 3 = unknown.
- info\['instances'\]\[i\]\['group_ids'\]: Used for multi-part object.
- info\['plane'\](optional): Road level information.
Please refer to [kitti_converter.py](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/tools/dataset_converters/kitti_converter.py) and [update_infos_to_v2.py ](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/tools/dataset_converters/update_infos_to_v2.py) for more details.
## Train pipeline
A typical train pipeline of 3D detection on KITTI is as below:
```python
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=4, # x, y, z, intensity
use_dim=4),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(type='ObjectSample', db_sampler=db_sampler),
dict(
type='ObjectNoise',
num_try=100,
translation_std=[1.0, 1.0, 0.5],
global_rot_range=[0.0, 0.0],
rot_range=[-0.78539816, 0.78539816]),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.78539816, 0.78539816],
scale_ratio_range=[0.95, 1.05]),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='PointShuffle'),
dict(
type='Pack3DDetInputs',
keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
```
- Data augmentation:
- `ObjectNoise`: apply noise to each GT objects in the scene.
- `RandomFlip3D`: randomly flip input point cloud horizontally or vertically.
- `GlobalRotScaleTrans`: rotate input point cloud.
## Evaluation
An example to evaluate PointPillars with 8 GPUs with kitti metrics is as follows:
```shell
bash tools/dist_test.sh configs/pointpillars/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class.py work_dirs/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class/latest.pth 8
```
## Metrics
KITTI evaluates 3D object detection performance using mean Average Precision (mAP) and Average Orientation Similarity (AOS), Please refer to its [official website](http://www.cvlibs.net/datasets/kitti/eval_3dobject.php) and [original paper](http://www.cvlibs.net/publications/Geiger2012CVPR.pdf) for more details.
We also adopt this approach for evaluation on KITTI. An example of printed evaluation results is as follows:
```
Car AP@0.70, 0.70, 0.70:
bbox AP:97.9252, 89.6183, 88.1564
bev AP:90.4196, 87.9491, 85.1700
3d AP:88.3891, 77.1624, 74.4654
aos AP:97.70, 89.11, 87.38
Car AP@0.70, 0.50, 0.50:
bbox AP:97.9252, 89.6183, 88.1564
bev AP:98.3509, 90.2042, 89.6102
3d AP:98.2800, 90.1480, 89.4736
aos AP:97.70, 89.11, 87.38
```
## Testing and make a submission
An example to test PointPillars on KITTI with 8 GPUs and generate a submission to the leaderboard is as follows:
- First, you need to modify the `test_dataloader` and `test_evaluator` dict in your config file, just like:
```python
data_root = 'data/kitti/'
test_dataloader = dict(
dataset=dict(
ann_file='kitti_infos_test.pkl',
load_eval_anns=False,
data_prefix=dict(pts='testing/velodyne_reduced')))
test_evaluator = dict(
ann_file=data_root + 'kitti_infos_test.pkl',
format_only=True,
pklfile_prefix='results/kitti-3class/kitti_results',
submission_prefix='results/kitti-3class/kitti_results')
```
- And then, you can run the test script.
```shell
./tools/dist_test.sh configs/pointpillars/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class.py work_dirs/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class/latest.pth 8
```
After generating `results/kitti-3class/kitti_results/xxxxx.txt` files, you can submit these files to KITTI benchmark. Please refer to the [KITTI official website](http://www.cvlibs.net/datasets/kitti/index.php) for more details.
# Lyft Dataset
This page provides specific tutorials about the usage of MMDetection3D for Lyft dataset.
## Before Preparation
You can download Lyft 3D detection data [HERE](https://www.kaggle.com/c/3d-object-detection-for-autonomous-vehicles/data) and unzip all zip files.
Like the general way to prepare a dataset, it is recommended to symlink the dataset root to `$MMDETECTION3D/data`.
The folder structure should be organized as follows before our processing.
```
mmdetection3d
├── mmdet3d
├── tools
├── configs
├── data
│ ├── lyft
│ │ ├── v1.01-train
│ │ │ ├── v1.01-train (train_data)
│ │ │ ├── lidar (train_lidar)
│ │ │ ├── images (train_images)
│ │ │ ├── maps (train_maps)
│ │ ├── v1.01-test
│ │ │ ├── v1.01-test (test_data)
│ │ │ ├── lidar (test_lidar)
│ │ │ ├── images (test_images)
│ │ │ ├── maps (test_maps)
│ │ ├── train.txt
│ │ ├── val.txt
│ │ ├── test.txt
│ │ ├── sample_submission.csv
```
Here `v1.01-train` and `v1.01-test` contain the metafiles which are similar to those of nuScenes. `.txt` files contain the data split information.
Lyft does not have an official split for training and validation set, so we provide a split considering the number of objects from different categories in different scenes.
`sample_submission.csv` is the base file for submission on the Kaggle evaluation server.
Note that we follow the original folder names for clear organization. Please rename the raw folders as shown above.
## Dataset Preparation
The way to organize Lyft dataset is similar to nuScenes. We also generate the `.pkl` files which share almost the same structure.
Next, we will mainly focus on the difference between these two datasets. For a more detailed explanation of the info structure, please refer to [nuScenes tutorial](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/docs/en/advanced_guides/datasets/nuscenes_det.md).
To prepare info files for Lyft, run the following commands:
```bash
python tools/create_data.py lyft --root-path ./data/lyft --out-dir ./data/lyft --extra-tag lyft --version v1.01
python tools/dataset_converters/lyft_data_fixer.py --version v1.01 --root-folder ./data/lyft
```
Note that the second command serves the purpose of fixing a corrupted lidar data file. Please refer to the discussion [here](https://www.kaggle.com/c/3d-object-detection-for-autonomous-vehicles/discussion/110000) for more details.
The folder structure after processing should be as below.
```
mmdetection3d
├── mmdet3d
├── tools
├── configs
├── data
│ ├── lyft
│ │ ├── v1.01-train
│ │ │ ├── v1.01-train (train_data)
│ │ │ ├── lidar (train_lidar)
│ │ │ ├── images (train_images)
│ │ │ ├── maps (train_maps)
│ │ ├── v1.01-test
│ │ │ ├── v1.01-test (test_data)
│ │ │ ├── lidar (test_lidar)
│ │ │ ├── images (test_images)
│ │ │ ├── maps (test_maps)
│ │ ├── train.txt
│ │ ├── val.txt
│ │ ├── test.txt
│ │ ├── sample_submission.csv
│ │ ├── lyft_infos_train.pkl
│ │ ├── lyft_infos_val.pkl
│ │ ├── lyft_infos_test.pkl
```
- `lyft_infos_train.pkl`: training dataset, a dict contains two keys: `metainfo` and `data_list`.
`metainfo` contains the basic information for the dataset itself, such as `categories`, `dataset` and `info_version`, while `data_list` is a list of dict, each dict (hereinafter referred to as `info`) contains all the detailed information of single sample as follows:
- info\['sample_idx'\]: The index of this sample in the whole dataset.
- info\['token'\]: Sample data token.
- info\['timestamp'\]: Timestamp of the sample data.
- info\['lidar_points'\]: A dict containing all the information related to the lidar points.
- info\['lidar_points'\]\['lidar_path'\]: The filename of the lidar point cloud data.
- info\['lidar_points'\]\['num_pts_feats'\]: The feature dimension of point.
- info\['lidar_points'\]\['lidar2ego'\]: The transformation matrix from this lidar sensor to ego vehicle. (4x4 list)
- info\['lidar_points'\]\['ego2global'\]: The transformation matrix from the ego vehicle to global coordinates. (4x4 list)
- info\['lidar_sweeps'\]: A list contains sweeps information (The intermediate lidar frames without annotations).
- info\['lidar_sweeps'\]\[i\]\['lidar_points'\]\['data_path'\]: The lidar data path of i-th sweep.
- info\['lidar_sweeps'\]\[i\]\['lidar_points'\]\['lidar2ego'\]: The transformation matrix from this lidar sensor to ego vehicle in i-th sweep timestamp
- info\['lidar_sweeps'\]\[i\]\['lidar_points'\]\['ego2global'\]: The transformation matrix from the ego vehicle in i-th sweep timestamp to global coordinates. (4x4 list)
- info\['lidar_sweeps'\]\[i\]\['lidar2sensor'\]: The transformation matrix from the keyframe lidar to the i-th frame lidar. (4x4 list)
- info\['lidar_sweeps'\]\[i\]\['timestamp'\]: Timestamp of the sweep data.
- info\['lidar_sweeps'\]\[i\]\['sample_data_token'\]: The sweep sample data token.
- info\['images'\]: A dict contains six keys corresponding to each camera: `'CAM_FRONT'`, `'CAM_FRONT_RIGHT'`, `'CAM_FRONT_LEFT'`, `'CAM_BACK'`, `'CAM_BACK_LEFT'`, `'CAM_BACK_RIGHT'`. Each dict contains all data information related to corresponding camera.
- info\['images'\]\['CAM_XXX'\]\['img_path'\]: The filename of the image.
- info\['images'\]\['CAM_XXX'\]\['cam2img'\]: The transformation matrix recording the intrinsic parameters when projecting 3D points to each image plane. (3x3 list)
- info\['images'\]\['CAM_XXX'\]\['sample_data_token'\]: Sample data token of image.
- info\['images'\]\['CAM_XXX'\]\['timestamp'\]: Timestamp of the image.
- info\['images'\]\['CAM_XXX'\]\['cam2ego'\]: The transformation matrix from this camera sensor to ego vehicle. (4x4 list)
- info\['images'\]\['CAM_XXX'\]\['lidar2cam'\]: The transformation matrix from lidar sensor to this camera. (4x4 list)
- info\['instances'\]: It is a list of dict. Each dict contains all annotation information of single instance. For the i-th instance:
- info\['instances'\]\[i\]\['bbox_3d'\]: List of 7 numbers representing the 3D bounding box in lidar coordinate system of the instance, in (x, y, z, l, w, h, yaw) order.
- info\['instances'\]\[i\]\['bbox_label_3d'\]: A int starting from 0 indicates the label of instance, while the -1 indicates ignore class.
- info\['instances'\]\[i\]\['bbox_3d_isvalid'\]: Whether each bounding box is valid. In general, we only take the 3D boxes that include at least one lidar or radar point as valid boxes.
Next, we will elaborate on the difference compared to nuScenes in terms of the details recorded in these info files.
- Without `lyft_database/xxxxx.bin`: This folder and `.bin` files are not extracted on the Lyft dataset due to the negligible effect of ground-truth sampling in the experiments.
- `lyft_infos_train.pkl`:
- Without info\['instances'\]\[i\]\['velocity'\]: There is no velocity measurement on Lyft.
- Without info\['instances'\]\[i\]\['num_lidar_pts'\] and info\['instances'\]\['num_radar_pts'\]
Here we only explain the data recorded in the training info files. The same applies to the validation set and test set (without instances).
Please refer to [lyft_converter.py](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/tools/dataset_converters/lyft_converter.py) for more details about the structure of `lyft_infos_xxx.pkl`.
## Training pipeline
### LiDAR-Based Methods
A typical training pipeline of LiDAR-based 3D detection (including multi-modality methods) on Lyft is almost the same as nuScenes as below.
```python
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.3925, 0.3925],
scale_ratio_range=[0.95, 1.05],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='PointShuffle'),
dict(
type='Pack3DDetInputs',
keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
```
Similar to nuScenes, models on Lyft also need the `'LoadPointsFromMultiSweeps'` pipeline to load point clouds from consecutive frames.
In addition, considering the intensity of LiDAR points collected by Lyft is invalid, we also set the `use_dim` in `'LoadPointsFromMultiSweeps'` to `[0, 1, 2, 4]` by default,
where the first 3 dimensions refer to point coordinates, and the last refers to timestamp differences.
## Evaluation
An example to evaluate PointPillars with 8 GPUs with Lyft metrics is as follows:
```shell
bash ./tools/dist_test.sh configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb2-2x_lyft-3d.py checkpoints/hv_pointpillars_fpn_sbn-all_2x8_2x_lyft-3d_20210517_202818-fc6904c3.pth 8
```
## Metrics
Lyft proposes a more strict metric for evaluating the predicted 3D bounding boxes.
The basic criteria to judge whether a predicted box is positive or not is the same as KITTI, i.e. the 3D Intersection over Union (IoU).
However, it adopts a way similar to COCO to compute the mean average precision (mAP) -- compute the average precision under different thresholds of 3D IoU from 0.5-0.95.
Actually, overlap more than 0.7 3D IoU is a quite strict criterion for 3D detection methods, so the overall performance seems a little low.
The imbalance of annotations for different categories is another important reason for the finally lower results compared to other datasets.
Please refer to its [official website](https://www.kaggle.com/c/3d-object-detection-for-autonomous-vehicles/overview/evaluation) for more details about the definition of this metric.
We employ this official method for evaluation on Lyft. An example of printed evaluation results is as follows:
```
+mAPs@0.5:0.95------+--------------+
| class | mAP@0.5:0.95 |
+-------------------+--------------+
| animal | 0.0 |
| bicycle | 0.099 |
| bus | 0.177 |
| car | 0.422 |
| emergency_vehicle | 0.0 |
| motorcycle | 0.049 |
| other_vehicle | 0.359 |
| pedestrian | 0.066 |
| truck | 0.176 |
| Overall | 0.15 |
+-------------------+--------------+
```
## Testing and make a submission
An example to test PointPillars on Lyft with 8 GPUs and generate a submission to the leaderboard is as follows.
```shell
./tools/dist_test.sh configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb2-2x_lyft-3d.py work_dirs/pp-lyft/latest.pth 8 --cfg-options test_evaluator.jsonfile_prefix=work_dirs/pp-lyft/results_challenge test_evaluator.csv_savepath=results/pp-lyft/results_challenge.csv
```
After generating the `work_dirs/pp-lyft/results_challenge.csv`, you can submit it to the Kaggle evaluation server. Please refer to the [official website](https://www.kaggle.com/c/3d-object-detection-for-autonomous-vehicles) for more information.
We can also visualize the prediction results with our developed visualization tools. Please refer to the [visualization doc](https://mmdetection3d.readthedocs.io/en/latest/useful_tools.html#visualization) for more details.
# NuScenes Dataset
This page provides specific tutorials about the usage of MMDetection3D for nuScenes dataset.
## Before Preparation
You can download nuScenes 3D detection `Full dataset (v1.0)` [HERE](https://www.nuscenes.org/download) and unzip all zip files.
If you want to implement 3D semantic segmentation task, you need to additionally download the `nuScenes-lidarseg` data annotation and place the extracted files in the nuScenes corresponding folder.
**Note**: `v1.0trainval(test)/categroy.json` in nuScenes-lidarseg will replace the original `v1.0trainval(test)/categroy.json` of the Full dataset (v1.0), but will not affect the 3D object detection task.
Like the general way to prepare dataset, it is recommended to symlink the dataset root to `$MMDETECTION3D/data`.
The folder structure should be organized as follows before our processing.
```
mmdetection3d
├── mmdet3d
├── tools
├── configs
├── data
│ ├── nuscenes
│ │ ├── maps
│ │ ├── samples
│ │ ├── sweeps
│ │ ├── lidarseg (optional)
│ │ ├── v1.0-test
| | ├── v1.0-trainval
```
## Dataset Preparation
We typically need to organize the useful data information with a `.pkl` file in a specific style.
To prepare these files for nuScenes, run the following command:
```bash
python tools/create_data.py nuscenes --root-path ./data/nuscenes --out-dir ./data/nuscenes --extra-tag nuscenes
```
The folder structure after processing should be as below.
```
mmdetection3d
├── mmdet3d
├── tools
├── configs
├── data
│ ├── nuscenes
│ │ ├── maps
│ │ ├── samples
│ │ ├── sweeps
│ │ ├── lidarseg (optional)
│ │ ├── v1.0-test
| | ├── v1.0-trainval
│ │ ├── nuscenes_database
│ │ ├── nuscenes_infos_train.pkl
│ │ ├── nuscenes_infos_val.pkl
│ │ ├── nuscenes_infos_test.pkl
│ │ ├── nuscenes_dbinfos_train.pkl
```
- `nuscenes_database/xxxxx.bin`: point cloud data included in each 3D bounding box of the training dataset
- `nuscenes_infos_train.pkl`: training dataset, a dict contains two keys: `metainfo` and `data_list`.
`metainfo` contains the basic information for the dataset itself, such as `categories`, `dataset` and `info_version`, while `data_list` is a list of dict, each dict (hereinafter referred to as `info`) contains all the detailed information of single sample as follows:
- info\['sample_idx'\]: The index of this sample in the whole dataset.
- info\['token'\]: Sample data token.
- info\['timestamp'\]: Timestamp of the sample data.
- info\['ego2global'\]: The transformation matrix from the ego vehicle to global coordinates. (4x4 list)
- info\['lidar_points'\]: A dict containing all the information related to the lidar points.
- info\['lidar_points'\]\['lidar_path'\]: The filename of the lidar point cloud data.
- info\['lidar_points'\]\['num_pts_feats'\]: The feature dimension of point.
- info\['lidar_points'\]\['lidar2ego'\]: The transformation matrix from this lidar sensor to ego vehicle. (4x4 list)
- info\['lidar_sweeps'\]: A list contains sweeps information (The intermediate lidar frames without annotations)
- info\['lidar_sweeps'\]\[i\]\['lidar_points'\]\['data_path'\]: The lidar data path of i-th sweep.
- info\['lidar_sweeps'\]\[i\]\['lidar_points'\]\['lidar2ego'\]: The transformation matrix from this lidar sensor to ego vehicle. (4x4 list)
- info\['lidar_sweeps'\]\[i\]\['lidar_points'\]\['ego2global'\]: The transformation matrix from the ego vehicle to global coordinates. (4x4 list)
- info\['lidar_sweeps'\]\[i\]\['lidar2sensor'\]: The transformation matrix from the main lidar sensor to the current sensor (for collecting the sweep data). (4x4 list)
- info\['lidar_sweeps'\]\[i\]\['timestamp'\]: Timestamp of the sweep data.
- info\['lidar_sweeps'\]\[i\]\['sample_data_token'\]: The sweep sample data token.
- info\['images'\]: A dict contains six keys corresponding to each camera: `'CAM_FRONT'`, `'CAM_FRONT_RIGHT'`, `'CAM_FRONT_LEFT'`, `'CAM_BACK'`, `'CAM_BACK_LEFT'`, `'CAM_BACK_RIGHT'`. Each dict contains all data information related to corresponding camera.
- info\['images'\]\['CAM_XXX'\]\['img_path'\]: The filename of the image.
- info\['images'\]\['CAM_XXX'\]\['cam2img'\]: The transformation matrix recording the intrinsic parameters when projecting 3D points to each image plane. (3x3 list)
- info\['images'\]\['CAM_XXX'\]\['sample_data_token'\]: Sample data token of image.
- info\['images'\]\['CAM_XXX'\]\['timestamp'\]: Timestamp of the image.
- info\['images'\]\['CAM_XXX'\]\['cam2ego'\]: The transformation matrix from this camera sensor to ego vehicle. (4x4 list)
- info\['images'\]\['CAM_XXX'\]\['lidar2cam'\]: The transformation matrix from lidar sensor to this camera. (4x4 list)
- info\['instances'\]: It is a list of dict. Each dict contains all annotation information of single instance. For the i-th instance:
- info\['instances'\]\[i\]\['bbox_3d'\]: List of 7 numbers representing the 3D bounding box of the instance, in (x, y, z, l, w, h, yaw) order.
- info\['instances'\]\[i\]\['bbox_label_3d'\]: A int indicate the label of instance and the -1 indicate ignore.
- info\['instances'\]\[i\]\['velocity'\]: Velocities of 3D bounding boxes (no vertical measurements due to inaccuracy), a list has shape (2.).
- info\['instances'\]\[i\]\['num_lidar_pts'\]: Number of lidar points included in each 3D bounding box.
- info\['instances'\]\[i\]\['num_radar_pts'\]: Number of radar points included in each 3D bounding box.
- info\['instances'\]\[i\]\['bbox_3d_isvalid'\]: Whether each bounding box is valid. In general, we only take the 3D boxes that include at least one lidar or radar point as valid boxes.
- info\['cam_instances'\]: It is a dict containing keys `'CAM_FRONT'`, `'CAM_FRONT_RIGHT'`, `'CAM_FRONT_LEFT'`, `'CAM_BACK'`, `'CAM_BACK_LEFT'`, `'CAM_BACK_RIGHT'`. For vision-based 3D object detection task, we split 3D annotations of the whole scenes according to the camera they belong to. For the i-th instance:
- info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox_label'\]: Label of instance.
- info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox_label_3d'\]: Label of instance.
- info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox'\]: 2D bounding box annotation (exterior rectangle of the projected 3D box), a list arrange as \[x1, y1, x2, y2\].
- info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['center_2d'\]: Projected center location on the image, a list has shape (2,), .
- info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['depth'\]: The depth of projected center.
- info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['velocity'\]: Velocities of 3D bounding boxes (no vertical measurements due to inaccuracy), a list has shape (2,).
- info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['attr_label'\]: The attr label of instance. We maintain a default attribute collection and mapping for attribute classification.
- info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox_3d'\]: List of 7 numbers representing the 3D bounding box of the instance, in (x, y, z, l, h, w, yaw) order.
- info\['pts_semantic_mask_path'\]:The filename of the lidar point cloud semantic segmentation annotation.
Note:
1. The differences between `bbox_3d` in `instances` and that in `cam_instances`.
Both `bbox_3d` have been converted to MMDet3D coordinate system, but `bboxes_3d` in `instances` is in LiDAR coordinate format and `bboxes_3d` in `cam_instances` is in Camera coordinate format. Mind the difference between them in 3D Box representation ('l, w, h' and 'l, h, w').
2. Here we only explain the data recorded in the training info files. The same applies to validation and testing set (the `.pkl` file of test set does not contains `instances` and `cam_instances`).
The core function to get `nuscenes_infos_xxx.pkl` is [\_fill_trainval_infos](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/tools/dataset_converters/nuscenes_converter.py#L146).
Please refer to [nuscenes_converter.py](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/tools/dataset_converters/nuscenes_converter.py) for more details.
## Training pipeline
### LiDAR-Based Methods
A typical training pipeline of LiDAR-based 3D detection (including multi-modality methods) on nuScenes is as below.
```python
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.3925, 0.3925],
scale_ratio_range=[0.95, 1.05],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectNameFilter', classes=class_names),
dict(type='PointShuffle'),
dict(
type='Pack3DDetInputs',
keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
```
Compared to general cases, nuScenes has a specific `'LoadPointsFromMultiSweeps'` pipeline to load point clouds from consecutive frames. This is a common practice used in this setting.
Please refer to the nuScenes [original paper](https://arxiv.org/abs/1903.11027) for more details.
The default `use_dim` in `'LoadPointsFromMultiSweeps'` is `[0, 1, 2, 4]`, where the first 3 dimensions refer to point coordinates and the last refers to timestamp differences.
Intensity is not used by default due to its yielded noise when concatenating the points from different frames.
### Vision-Based Methods
#### Monocular-based
In the NuScenes dataset, for multi-view images, this paradigm usually involves detecting and outputting 3D object detection results separately for each image, and then obtaining the final detection results through post-processing (such as NMS). Essentially, it directly extends monocular 3D detection to multi-view settings. A typical training pipeline of image-based monocular 3D detection on nuScenes is as below.
```python
train_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='LoadAnnotations3D',
with_bbox=True,
with_label=True,
with_attr_label=True,
with_bbox_3d=True,
with_label_3d=True,
with_bbox_depth=True),
dict(type='mmdet.Resize', scale=(1600, 900), keep_ratio=True),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(
type='Pack3DDetInputs',
keys=[
'img', 'gt_bboxes', 'gt_bboxes_labels', 'attr_labels', 'gt_bboxes_3d',
'gt_labels_3d', 'centers_2d', 'depths'
]),
]
```
It follows the general pipeline of 2D detection while differs in some details:
- It uses monocular pipelines to load images, which includes additional required information like camera intrinsics.
- It needs to load 3D annotations.
- Some data augmentation techniques need to be adjusted, such as `RandomFlip3D`.
Currently we do not support more augmentation methods, because how to transfer and apply other techniques is still under explored.
#### BEV-based
BEV, Bird's-Eye-View, is another popular 3D detection paradigm. It directly takes multi-view images to perform 3D detection, for nuScenes, they are `CAM_FRONT`, `CAM_FRONT_LEFT`, `CAM_FRONT_RIGHT`, `CAM_BACK`, `CAM_BACK_LEFT` and `CAM_BACK_RIGHT`. A basic training pipeline of bev-based 3D detection on nuScenes is as below.
```python
class_names = [
'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
train_transforms = [
dict(type='PhotoMetricDistortion3D'),
dict(
type='RandomResize3D',
scale=(1600, 900),
ratio_range=(1., 1.),
keep_ratio=True)
]
train_pipeline = [
dict(type='LoadMultiViewImageFromFiles',
to_float32=True,
num_views=6, ),
dict(type='LoadAnnotations3D',
with_bbox_3d=True,
with_label_3d=True,
with_attr_label=False),
# optional, data augmentation
dict(type='MultiViewWrapper', transforms=train_transforms),
# optional, filter object within specific point cloud range
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
# optional, filter object of specific classes
dict(type='ObjectNameFilter', classes=class_names),
dict(type='Pack3DDetInputs', keys=['img', 'gt_bboxes_3d', 'gt_labels_3d'])
]
```
To load multiple view of images, a little modification should be made to the dataset.
```python
data_prefix = dict(
CAM_FRONT='samples/CAM_FRONT',
CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT',
CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT',
CAM_BACK='samples/CAM_BACK',
CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
CAM_BACK_LEFT='samples/CAM_BACK_LEFT',
)
train_dataloader = dict(
batch_size=4,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type="NuScenesDataset",
data_root="./data/nuScenes",
ann_file="nuscenes_infos_train.pkl",
data_prefix=data_prefix,
modality=dict(use_camera=True, use_lidar=False, ),
pipeline=train_pipeline,
test_mode=False, )
)
```
## Evaluation
An example to evaluate PointPillars with 8 GPUs with nuScenes metrics is as follows.
```shell
bash ./tools/dist_test.sh configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb4-2x_nus-3d.py checkpoints/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d_20200620_230405-2fa62f3d.pth 8
```
## Metrics
NuScenes proposes a comprehensive metric, namely nuScenes detection score (NDS), to evaluate different methods and set up the benchmark.
It consists of mean Average Precision (mAP), Average Translation Error (ATE), Average Scale Error (ASE), Average Orientation Error (AOE), Average Velocity Error (AVE) and Average Attribute Error (AAE).
Please refer to its [official website](https://www.nuscenes.org/object-detection?externalData=all&mapData=all&modalities=Any) for more details.
We also adopt this approach for evaluation on nuScenes. An example of printed evaluation results is as follows:
```
mAP: 0.3197
mATE: 0.7595
mASE: 0.2700
mAOE: 0.4918
mAVE: 1.3307
mAAE: 0.1724
NDS: 0.3905
Eval time: 170.8s
Per-class results:
Object Class AP ATE ASE AOE AVE AAE
car 0.503 0.577 0.152 0.111 2.096 0.136
truck 0.223 0.857 0.224 0.220 1.389 0.179
bus 0.294 0.855 0.204 0.190 2.689 0.283
trailer 0.081 1.094 0.243 0.553 0.742 0.167
construction_vehicle 0.058 1.017 0.450 1.019 0.137 0.341
pedestrian 0.392 0.687 0.284 0.694 0.876 0.158
motorcycle 0.317 0.737 0.265 0.580 2.033 0.104
bicycle 0.308 0.704 0.299 0.892 0.683 0.010
traffic_cone 0.555 0.486 0.309 nan nan nan
barrier 0.466 0.581 0.269 0.169 nan nan
```
## Testing and make a submission
An example to test PointPillars on nuScenes with 8 GPUs and generate a submission to the leaderboard is as follows.
You should modify the `jsonfile_prefix` in the `test_evaluator` of corresponding configuration. For example, adding `test_evaluator = dict(type='NuScenesMetric', jsonfile_prefix='work_dirs/pp-nus/results_eval.json')` or using `--cfg-options "test_evaluator.jsonfile_prefix=work_dirs/pp-nus/results_eval.json)` after the test command.
```shell
./tools/dist_test.sh configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb4-2x_nus-3d.py work_dirs/pp-nus/latest.pth 8 --cfg-options 'test_evaluator.jsonfile_prefix=work_dirs/pp-nus/results_eval'
```
Note that the testing info should be changed to that for testing set instead of validation set [here](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/configs/_base_/datasets/nus-3d.py#L132).
After generating the `work_dirs/pp-nus/results_eval.json`, you can compress it and submit it to nuScenes benchmark. Please refer to the [nuScenes official website](https://www.nuscenes.org/object-detection?externalData=all&mapData=all&modalities=Any) for more information.
We can also visualize the prediction results with our developed visualization tools. Please refer to the [visualization doc](https://mmdetection3d.readthedocs.io/en/latest/useful_tools.html#visualization) for more details.
## Notes
### Transformation between `NuScenesBox` and our `CameraInstanceBoxes`.
In general, the main difference of `NuScenesBox` and our `CameraInstanceBoxes` is mainly reflected in the yaw definition. `NuScenesBox` defines the rotation with a quaternion or three Euler angles while ours only defines one yaw angle due to the practical scenario. It requires us to add some additional rotations manually in the pre-processing and post-processing, such as [here](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/nuscenes_mono_dataset.py#L673).
In addition, please note that the definition of corners and locations are detached in the `NuScenesBox`. For example, in monocular 3D detection, the definition of the box location is in its camera coordinate (see its official [illustration](https://www.nuscenes.org/nuscenes#data-collection) for car setup), which is consistent with [ours](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/core/bbox/structures/cam_box3d.py). In contrast, its corners are defined with the [convention](https://github.com/nutonomy/nuscenes-devkit/blob/02e9200218977193a1058dd7234f935834378319/python-sdk/nuscenes/utils/data_classes.py#L527) "x points forward, y to the left, z up". It results in different philosophy of dimension and rotation definitions from our `CameraInstanceBoxes`. An example to remove similar hacks is PR [#744](https://github.com/open-mmlab/mmdetection3d/pull/744). The same problem also exists in the LiDAR system. To deal with them, we typically add some transformation in the pre-processing and post-processing to guarantee the box will be in our coordinate system during the entire training and inference procedure.
# S3DIS Dataset
## Dataset preparation
For the overall process, please refer to the [README](https://github.com/open-mmlab/mmdetection3d/blob/master/data/s3dis/README.md/) page for S3DIS.
### Export S3DIS data
By exporting S3DIS data, we load the raw point cloud data and generate the relevant annotations including semantic labels and instance labels.
The directory structure before exporting should be as below:
```
mmdetection3d
├── mmdet3d
├── tools
├── configs
├── data
│ ├── s3dis
│ │ ├── meta_data
│ │ ├── Stanford3dDataset_v1.2_Aligned_Version
│ │ │ ├── Area_1
│ │ │ │ ├── conferenceRoom_1
│ │ │ │ ├── office_1
│ │ │ │ ├── ...
│ │ │ ├── Area_2
│ │ │ ├── Area_3
│ │ │ ├── Area_4
│ │ │ ├── Area_5
│ │ │ ├── Area_6
│ │ ├── indoor3d_util.py
│ │ ├── collect_indoor3d_data.py
│ │ ├── README.md
```
Under folder `Stanford3dDataset_v1.2_Aligned_Version`, the rooms are spilted into 6 areas. We use 5 areas for training and 1 for evaluation (typically `Area_5`). Under the directory of each area, there are folders in which raw point cloud data and relevant annotations are saved. For instance, under folder `Area_1/office_1` the files are as below:
- `office_1.txt`: A txt file storing coordinates and colors of each point in the raw point cloud data.
- `Annotations/`: This folder contains txt files for different object instances. Each txt file represents one instance, e.g.
- `chair_1.txt`: A txt file storing raw point cloud data of one chair in this room.
If we concat all the txt files under `Annotations/`, we will get the same point cloud as denoted by `office_1.txt`.
Export S3DIS data by running `python collect_indoor3d_data.py`. The main steps include:
- Export original txt files to point cloud, instance label and semantic label.
- Save point cloud data and relevant annotation files.
And the core function `export` in `indoor3d_util.py` is as follows:
```python
def export(anno_path, out_filename):
"""Convert original dataset files to points, instance mask and semantic
mask files. We aggregated all the points from each instance in the room.
Args:
anno_path (str): path to annotations. e.g. Area_1/office_2/Annotations/
out_filename (str): path to save collected points and labels.
file_format (str): txt or numpy, determines what file format to save.
Note:
the points are shifted before save, the most negative point is now
at origin.
"""
points_list = []
ins_idx = 1 # instance ids should be indexed from 1, so 0 is unannotated
# an example of `anno_path`: Area_1/office_1/Annotations
# which contains all object instances in this room as txt files
for f in glob.glob(osp.join(anno_path, '*.txt')):
# get class name of this instance
one_class = osp.basename(f).split('_')[0]
if one_class not in class_names: # some rooms have 'staris' class
one_class = 'clutter'
points = np.loadtxt(f)
labels = np.ones((points.shape[0], 1)) * class2label[one_class]
ins_labels = np.ones((points.shape[0], 1)) * ins_idx
ins_idx += 1
points_list.append(np.concatenate([points, labels, ins_labels], 1))
data_label = np.concatenate(points_list, 0) # [N, 8], (pts, rgb, sem, ins)
# align point cloud to the origin
xyz_min = np.amin(data_label, axis=0)[0:3]
data_label[:, 0:3] -= xyz_min
np.save(f'{out_filename}_point.npy', data_label[:, :6].astype(np.float32))
np.save(f'{out_filename}_sem_label.npy', data_label[:, 6].astype(np.int64))
np.save(f'{out_filename}_ins_label.npy', data_label[:, 7].astype(np.int64))
```
where we load and concatenate all the point cloud instances under `Annotations/` to form raw point cloud and generate semantic/instance labels. After exporting each room, the point cloud data, semantic labels and instance labels should be saved in `.npy` files.
### Create dataset
```shell
python tools/create_data.py s3dis --root-path ./data/s3dis \
--out-dir ./data/s3dis --extra-tag s3dis
```
The above exported point cloud files, semantic label files and instance label files are further saved in `.bin` format. Meanwhile `.pkl` info files are also generated for each area.
The directory structure after process should be as below:
```
s3dis
├── meta_data
├── indoor3d_util.py
├── collect_indoor3d_data.py
├── README.md
├── Stanford3dDataset_v1.2_Aligned_Version
├── s3dis_data
├── points
│ ├── xxxxx.bin
├── instance_mask
│ ├── xxxxx.bin
├── semantic_mask
│ ├── xxxxx.bin
├── seg_info
│ ├── Area_1_label_weight.npy
│ ├── Area_1_resampled_scene_idxs.npy
│ ├── Area_2_label_weight.npy
│ ├── Area_2_resampled_scene_idxs.npy
│ ├── Area_3_label_weight.npy
│ ├── Area_3_resampled_scene_idxs.npy
│ ├── Area_4_label_weight.npy
│ ├── Area_4_resampled_scene_idxs.npy
│ ├── Area_5_label_weight.npy
│ ├── Area_5_resampled_scene_idxs.npy
│ ├── Area_6_label_weight.npy
│ ├── Area_6_resampled_scene_idxs.npy
├── s3dis_infos_Area_1.pkl
├── s3dis_infos_Area_2.pkl
├── s3dis_infos_Area_3.pkl
├── s3dis_infos_Area_4.pkl
├── s3dis_infos_Area_5.pkl
├── s3dis_infos_Area_6.pkl
```
- `points/xxxxx.bin`: The exported point cloud data.
- `instance_mask/xxxxx.bin`: The instance label for each point, value range: \[0, ${NUM_INSTANCES}\], 0: unannotated.
- `semantic_mask/xxxxx.bin`: The semantic label for each point, value range: \[0, 12\].
- `s3dis_infos_Area_1.pkl`: Area 1 data infos, the detailed info of each room is as follows:
- info\['point_cloud'\]: {'num_features': 6, 'lidar_idx': sample_idx}.
- info\['pts_path'\]: The path of `points/xxxxx.bin`.
- info\['pts_instance_mask_path'\]: The path of `instance_mask/xxxxx.bin`.
- info\['pts_semantic_mask_path'\]: The path of `semantic_mask/xxxxx.bin`.
- `seg_info`: The generated infos to support semantic segmentation model training.
- `Area_1_label_weight.npy`: Weighting factor for each semantic class. Since the number of points in different classes varies greatly, it's a common practice to use label re-weighting to get a better performance.
- `Area_1_resampled_scene_idxs.npy`: Re-sampling index for each scene. Different rooms will be sampled multiple times according to their number of points to balance training data.
## Training pipeline
A typical training pipeline of S3DIS for 3D semantic segmentation is as below.
```python
class_names = ('ceiling', 'floor', 'wall', 'beam', 'column', 'window', 'door',
'table', 'chair', 'sofa', 'bookcase', 'board', 'clutter')
num_points = 4096
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='DEPTH',
shift_height=False,
use_color=True,
load_dim=6,
use_dim=[0, 1, 2, 3, 4, 5]),
dict(
type='LoadAnnotations3D',
with_bbox_3d=False,
with_label_3d=False,
with_mask_3d=False,
with_seg_3d=True),
dict(
type='PointSegClassMapping'),
dict(
type='IndoorPatchPointSample',
num_points=num_points,
block_size=1.0,
ignore_index=None,
use_normalized_coord=True,
enlarge_size=None,
min_unique_num=num_points // 4,
eps=0.0),
dict(type='NormalizePointsColor', color_mean=None),
dict(
type='GlobalRotScaleTrans',
rot_range=[-3.141592653589793, 3.141592653589793], # [-pi, pi]
scale_ratio_range=[0.8, 1.2],
translation_std=[0, 0, 0]),
dict(
type='RandomJitterPoints',
jitter_std=[0.01, 0.01, 0.01],
clip_range=[-0.05, 0.05]),
dict(type='RandomDropPointsColor', drop_ratio=0.2),
dict(type='Pack3DDetInputs', keys=['points', 'pts_semantic_mask'])
]
```
- `PointSegClassMapping`: Only the valid category ids will be mapped to class label ids like \[0, 13) during training. Other class ids will be converted to `ignore_index` which equals to `13`.
- `IndoorPatchPointSample`: Crop a patch containing a fixed number of points from input point cloud. `block_size` indicates the size of the cropped block, typically `1.0` for S3DIS.
- `NormalizePointsColor`: Normalize the RGB color values of input point cloud by dividing `255`.
- Data augmentation:
- `GlobalRotScaleTrans`: randomly rotate and scale input point cloud.
- `RandomJitterPoints`: randomly jitter point cloud by adding different noise vector to each point.
- `RandomDropPointsColor`: set the colors of point cloud to all zeros by a probability `drop_ratio`.
## Metrics
Typically mean intersection over union (mIoU) is used for evaluation on S3DIS. In detail, we first compute IoU for multiple classes and then average them to get mIoU, please refer to [seg_eval.py](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/evaluation/functional/seg_eval.py).
As introduced in section `Export S3DIS data`, S3DIS trains on 5 areas and evaluates on the remaining 1 area. But there are also other area split schemes in different papers.
To enable flexible combination of train-val splits, we use sub-dataset to represent one area, and concatenate them to form a larger training set. An example of training on area 1, 2, 3, 4, 6 and evaluating on area 5 is shown as below:
```python
dataset_type = 'S3DISSegDataset'
data_root = './data/s3dis/'
class_names = ('ceiling', 'floor', 'wall', 'beam', 'column', 'window', 'door',
'table', 'chair', 'sofa', 'bookcase', 'board', 'clutter')
train_area = [1, 2, 3, 4, 6]
test_area = 5
train_dataloader = dict(
batch_size=8,
num_workers=4,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_files=[f's3dis_infos_Area_{i}.pkl' for i in train_area],
metainfo=metainfo,
data_prefix=data_prefix,
pipeline=train_pipeline,
modality=input_modality,
ignore_index=len(class_names),
scene_idxs=[
f'seg_info/Area_{i}_resampled_scene_idxs.npy' for i in train_area
],
test_mode=False))
test_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_files=f's3dis_infos_Area_{test_area}.pkl',
metainfo=metainfo,
data_prefix=data_prefix,
pipeline=test_pipeline,
modality=input_modality,
ignore_index=len(class_names),
scene_idxs=f'seg_info/Area_{test_area}_resampled_scene_idxs.npy',
test_mode=True))
val_dataloader = test_dataloader
```
where we specify the areas used for training/validation by setting `ann_files` and `scene_idxs` with lists that include corresponding paths. The train-val split can be simply modified via changing the `train_area` and `test_area` variables.
# ScanNet Dataset
MMDetection3D supports LiDAR-based detection and segmentation on ScanNet dataset. This page provides specific tutorials about the usage.
## Dataset preparation
For the overall process, please refer to the [README](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/data/scannet/README.md) page for ScanNet.
### Export ScanNet point cloud data
By exporting ScanNet data, we load the raw point cloud data and generate the relevant annotations including semantic labels, instance labels and ground truth bounding boxes.
```shell
python batch_load_scannet_data.py
```
The directory structure before data preparation should be as below
```
mmdetection3d
├── mmdet3d
├── tools
├── configs
├── data
│ ├── scannet
│ │ ├── meta_data
│ │ ├── scans
│ │ │ ├── scenexxxx_xx
│ │ ├── batch_load_scannet_data.py
│ │ ├── load_scannet_data.py
│ │ ├── scannet_utils.py
│ │ ├── README.md
```
Under folder `scans` there are overall 1201 train and 312 validation folders in which raw point cloud data and relevant annotations are saved. For instance, under folder `scene0001_01` the files are as below:
- `scene0001_01_vh_clean_2.ply`: Mesh file storing coordinates and colors of each vertex. The mesh's vertices are taken as raw point cloud data.
- `scene0001_01.aggregation.json`: Aggregation file including object ID, segments ID and label.
- `scene0001_01_vh_clean_2.0.010000.segs.json`: Segmentation file including segments ID and vertex.
- `scene0001_01.txt`: Meta file including axis-aligned matrix, etc.
- `scene0001_01_vh_clean_2.labels.ply`: Annotation file containing the category of each vertex.
The procedure of exporting ScanNet data by running `python batch_load_scannet_data.py` mainly includes the following 3 steps:
- Export original files to point cloud, instance label, semantic label and bounding box file.
- Downsample raw point cloud and filter invalid classes.
- Save point cloud data and relevant annotation files.
And the core function `export` in `load_scannet_data.py` is as follows:
```python
def export(mesh_file,
agg_file,
seg_file,
meta_file,
label_map_file,
output_file=None,
test_mode=False):
# label map file: ./data/scannet/meta_data/scannetv2-labels.combined.tsv
# the various label standards in the label map file, e.g. 'nyu40id'
label_map = scannet_utils.read_label_mapping(
label_map_file, label_from='raw_category', label_to='nyu40id')
# load raw point cloud data, 6-dims feature: XYZRGB
mesh_vertices = scannet_utils.read_mesh_vertices_rgb(mesh_file)
# Load scene axis alignment matrix: a 4x4 transformation matrix
# transform raw points in sensor coordinate system to a coordinate system
# which is axis-aligned with the length/width of the room
lines = open(meta_file).readlines()
# test set data doesn't have align_matrix
axis_align_matrix = np.eye(4)
for line in lines:
if 'axisAlignment' in line:
axis_align_matrix = [
float(x)
for x in line.rstrip().strip('axisAlignment = ').split(' ')
]
break
axis_align_matrix = np.array(axis_align_matrix).reshape((4, 4))
# perform global alignment of mesh vertices
pts = np.ones((mesh_vertices.shape[0], 4))
# raw point cloud in homogeneous coordinates, each row: [x, y, z, 1]
pts[:, 0:3] = mesh_vertices[:, 0:3]
# transform raw mesh vertices to aligned mesh vertices
pts = np.dot(pts, axis_align_matrix.transpose()) # Nx4
aligned_mesh_vertices = np.concatenate([pts[:, 0:3], mesh_vertices[:, 3:]],
axis=1)
# Load semantic and instance labels
if not test_mode:
# each object has one semantic label and consists of several segments
object_id_to_segs, label_to_segs = read_aggregation(agg_file)
# many points may belong to the same segment
seg_to_verts, num_verts = read_segmentation(seg_file)
label_ids = np.zeros(shape=(num_verts), dtype=np.uint32)
object_id_to_label_id = {}
for label, segs in label_to_segs.items():
label_id = label_map[label]
for seg in segs:
verts = seg_to_verts[seg]
# each point has one semantic label
label_ids[verts] = label_id
instance_ids = np.zeros(
shape=(num_verts), dtype=np.uint32) # 0: unannotated
for object_id, segs in object_id_to_segs.items():
for seg in segs:
verts = seg_to_verts[seg]
# object_id is 1-indexed, i.e. 1,2,3,.,,,.NUM_INSTANCES
# each point belongs to one object
instance_ids[verts] = object_id
if object_id not in object_id_to_label_id:
object_id_to_label_id[object_id] = label_ids[verts][0]
# bbox format is [x, y, z, x_size, y_size, z_size, label_id]
# [x, y, z] is gravity center of bbox, [x_size, y_size, z_size] is axis-aligned
# [label_id] is semantic label id in 'nyu40id' standard
# Note: since 3D bbox is axis-aligned, the yaw is 0.
unaligned_bboxes = extract_bbox(mesh_vertices, object_id_to_segs,
object_id_to_label_id, instance_ids)
aligned_bboxes = extract_bbox(aligned_mesh_vertices, object_id_to_segs,
object_id_to_label_id, instance_ids)
...
return mesh_vertices, label_ids, instance_ids, unaligned_bboxes, \
aligned_bboxes, object_id_to_label_id, axis_align_matrix
```
After exporting each scan, the raw point cloud could be downsampled, e.g. to 50000, if the number of points is too large (the raw point cloud won't be downsampled if it's also used in 3D semantic segmentation task). In addition, invalid semantic labels outside of `nyu40id` standard or optional `DONOT CARE` classes should be filtered. Finally, the point cloud data, semantic labels, instance labels and ground truth bounding boxes should be saved in `.npy` files.
### Export ScanNet RGB data (optional)
By exporting ScanNet RGB data, for each scene we load a set of RGB images with corresponding 4x4 pose matrices, and a single 4x4 camera intrinsic matrix. Note, that this step is optional and can be skipped if multi-view detection is not planned to use.
```shell
python extract_posed_images.py
```
Each of 1201 train, 312 validation and 100 test scenes contains a single `.sens` file. For instance, for scene `0001_01` we have `data/scannet/scans/scene0001_01/0001_01.sens`. For this scene all images and poses are extracted to `data/scannet/posed_images/scene0001_01`. Specifically, there will be 300 image files xxxxx.jpg, 300 camera pose files xxxxx.txt and a single `intrinsic.txt` file. Typically, single scene contains several thousand images. By default, we extract only 300 of them with resulting space occupation of \<100 Gb. To extract more images, use `--max-images-per-scene` parameter.
### Create dataset
```shell
python tools/create_data.py scannet --root-path ./data/scannet \
--out-dir ./data/scannet --extra-tag scannet
```
The above exported point cloud file, semantic label file and instance label file are further saved in `.bin` format. Meanwhile `.pkl` info files are also generated for train or validation. The core function `process_single_scene` of getting data infos is as follows.
```python
def process_single_scene(sample_idx):
# save point cloud, instance label and semantic label in .bin file respectively, get info['pts_path'], info['pts_instance_mask_path'] and info['pts_semantic_mask_path']
...
# get annotations
if has_label:
annotations = {}
# box is of shape [k, 6 + class]
aligned_box_label = self.get_aligned_box_label(sample_idx)
unaligned_box_label = self.get_unaligned_box_label(sample_idx)
annotations['gt_num'] = aligned_box_label.shape[0]
if annotations['gt_num'] != 0:
aligned_box = aligned_box_label[:, :-1] # k, 6
unaligned_box = unaligned_box_label[:, :-1]
classes = aligned_box_label[:, -1] # k
annotations['name'] = np.array([
self.label2cat[self.cat_ids2class[classes[i]]]
for i in range(annotations['gt_num'])
])
# default names are given to aligned bbox for compatibility
# we also save unaligned bbox info with marked names
annotations['location'] = aligned_box[:, :3]
annotations['dimensions'] = aligned_box[:, 3:6]
annotations['gt_boxes_upright_depth'] = aligned_box
annotations['unaligned_location'] = unaligned_box[:, :3]
annotations['unaligned_dimensions'] = unaligned_box[:, 3:6]
annotations[
'unaligned_gt_boxes_upright_depth'] = unaligned_box
annotations['index'] = np.arange(
annotations['gt_num'], dtype=np.int32)
annotations['class'] = np.array([
self.cat_ids2class[classes[i]]
for i in range(annotations['gt_num'])
])
axis_align_matrix = self.get_axis_align_matrix(sample_idx)
annotations['axis_align_matrix'] = axis_align_matrix # 4x4
info['annos'] = annotations
return info
```
The directory structure after process should be as below:
```
scannet
├── meta_data
├── batch_load_scannet_data.py
├── load_scannet_data.py
├── scannet_utils.py
├── README.md
├── scans
├── scans_test
├── scannet_instance_data
├── points
│ ├── xxxxx.bin
├── instance_mask
│ ├── xxxxx.bin
├── semantic_mask
│ ├── xxxxx.bin
├── seg_info
│ ├── train_label_weight.npy
│ ├── train_resampled_scene_idxs.npy
│ ├── val_label_weight.npy
│ ├── val_resampled_scene_idxs.npy
├── posed_images
│ ├── scenexxxx_xx
│ │ ├── xxxxxx.txt
│ │ ├── xxxxxx.jpg
│ │ ├── intrinsic.txt
├── scannet_infos_train.pkl
├── scannet_infos_val.pkl
├── scannet_infos_test.pkl
```
- `points/xxxxx.bin`: The `axis-unaligned` point cloud data after downsample. Since ScanNet 3D detection task takes axis-aligned point clouds as input, while ScanNet 3D semantic segmentation task takes unaligned points, we choose to store unaligned points and their axis-align transform matrix. Note: the points would be axis-aligned in pre-processing pipeline [`GlobalAlignment`](https://github.com/open-mmlab/mmdetection3d/blob/9f0b01caf6aefed861ef4c3eb197c09362d26b32/mmdet3d/datasets/pipelines/transforms_3d.py#L423) of 3D detection task.
- `instance_mask/xxxxx.bin`: The instance label for each point, value range: \[0, NUM_INSTANCES\], 0: unannotated.
- `semantic_mask/xxxxx.bin`: The semantic label for each point, value range: \[1, 40\], i.e. `nyu40id` standard. Note: the `nyu40id` ID will be mapped to train ID in train pipeline `PointSegClassMapping`.
- `seg_info`: The generated infos to support semantic segmentation model training.
- `train_label_weight.npy`: Weighting factor for each semantic class. Since the number of points in different classes varies greatly, it's a common practice to use label re-weighting to get a better performance.
- `train_resampled_scene_idxs.npy`: Re-sampling index for each scene. Different rooms will be sampled multiple times according to their number of points to balance training data.
- `posed_images/scenexxxx_xx`: The set of `.jpg` images with `.txt` 4x4 poses and the single `.txt` file with camera intrinsic matrix.
- `scannet_infos_train.pkl`: The train data infos, the detailed info of each scan is as follows:
- info\['lidar_points'\]: A dict containing all information related to the lidar points.
- info\['lidar_points'\]\['lidar_path'\]: The filename of the lidar point cloud data.
- info\['lidar_points'\]\['num_pts_feats'\]: The feature dimension of point.
- info\['lidar_points'\]\['axis_align_matrix'\]: The transformation matrix to align the axis.
- info\['pts_semantic_mask_path'\]: The filename of the semantic mask annotation.
- info\['pts_instance_mask_path'\]: The filename of the instance mask annotation.
- info\['instances'\]: A list of dict contains all annotations, each dict contains all annotation information of single instance. For the i-th instance:
- info\['instances'\]\[i\]\['bbox_3d'\]: List of 6 numbers representing the axis-aligned 3D bounding box of the instance in depth coordinate system, in (x, y, z, l, w, h) order.
- info\['instances'\]\[i\]\['bbox_label_3d'\]: The label of each 3d bounding boxes.
- `scannet_infos_val.pkl`: The val data infos, which shares the same format as `scannet_infos_train.pkl`.
- `scannet_infos_test.pkl`: The test data infos, which almost shares the same format as `scannet_infos_train.pkl` except for the lack of annotation.
## Training pipeline
A typical training pipeline of ScanNet for 3D detection is as follows.
```python
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='DEPTH',
shift_height=True,
load_dim=6,
use_dim=[0, 1, 2]),
dict(
type='LoadAnnotations3D',
with_bbox_3d=True,
with_label_3d=True,
with_mask_3d=True,
with_seg_3d=True),
dict(type='GlobalAlignment', rotation_axis=2),
dict(type='PointSegClassMapping'),
dict(type='PointSample', num_points=40000),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.087266, 0.087266],
scale_ratio_range=[1.0, 1.0],
shift_height=True),
dict(
type='Pack3DDetInputs',
keys=[
'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask',
'pts_instance_mask'
])
]
```
- `GlobalAlignment`: The previous point cloud would be axis-aligned using the axis-aligned matrix.
- `PointSegClassMapping`: Only the valid category IDs will be mapped to class label IDs like \[0, 18) during training.
- Data augmentation:
- `PointSample`: downsample the input point cloud.
- `RandomFlip3D`: randomly flip the input point cloud horizontally or vertically.
- `GlobalRotScaleTrans`: rotate the input point cloud, usually in the range of \[-5, 5\] (degrees) for ScanNet; then scale the input point cloud, usually by 1.0 for ScanNet (which means no scaling); finally translate the input point cloud, usually by 0 for ScanNet (which means no translation).
A typical training pipeline of ScanNet for 3D semantic segmentation is as below:
```python
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='DEPTH',
shift_height=False,
use_color=True,
load_dim=6,
use_dim=[0, 1, 2, 3, 4, 5]),
dict(
type='LoadAnnotations3D',
with_bbox_3d=False,
with_label_3d=False,
with_mask_3d=False,
with_seg_3d=True),
dict(
type='PointSegClassMapping'),
dict(
type='IndoorPatchPointSample',
num_points=num_points,
block_size=1.5,
ignore_index=len(class_names),
use_normalized_coord=False,
enlarge_size=0.2,
min_unique_num=None),
dict(type='NormalizePointsColor', color_mean=None),
dict(type='Pack3DDetInputs', keys=['points', 'pts_semantic_mask'])
]
```
- `PointSegClassMapping`: Only the valid category ids will be mapped to class label ids like \[0, 20) during training. Other class ids will be converted to `ignore_index` which equals to `20`.
- `IndoorPatchPointSample`: Crop a patch containing a fixed number of points from input point cloud. `block_size` indicates the size of the cropped block, typically `1.5` for ScanNet.
- `NormalizePointsColor`: Normalize the RGB color values of input point cloud by dividing `255`.
## Metrics
- **Object Detection**: Typically mean Average Precision (mAP) is used for evaluation on ScanNet, e.g. `mAP@0.25` and `mAP@0.5`. In detail, a generic function to compute precision and recall for 3D object detection for multiple classes is called. Please refer to [indoor_eval](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/evaluation/functional/indoor_eval.py) for more details.
**Note**: As introduced in section `Export ScanNet data`, all ground truth 3D bounding box are axis-aligned, i.e. the yaw is zero. So the yaw target of network predicted 3D bounding box is also zero and axis-aligned 3D Non-Maximum Suppression (NMS), which is regardless of rotation, is adopted during post-processing .
- **Semantic Segmentation**: Typically mean Intersection over Union (mIoU) is used for evaluation on ScanNet. In detail, we first compute IoU for multiple classes and then average them to get mIoU, please refer to [seg_eval](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/evaluation/functional/seg_eval.py).
## Testing and Making a Submission
By default, our codebase evaluates semantic segmentation results on the validation set.
If you would like to test the model performance on the online benchmark, add `--format-only` flag in the evaluation script and change `ann_file=data_root + 'scannet_infos_val.pkl'` to `ann_file=data_root + 'scannet_infos_test.pkl'` in the ScanNet dataset's [config](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/_base_/datasets/scannet-seg.py#L126). Remember to specify the `txt_prefix` as the directory to save the testing results.
Taking PointNet++ (SSG) on ScanNet for example, the following command can be used to do inference on test set:
```
./tools/dist_test.sh configs/pointnet2/pointnet2_ssg_16x2_cosine_200e_scannet-seg.py \
work_dirs/pointnet2_ssg/latest.pth --format-only \
--eval-options txt_prefix=work_dirs/pointnet2_ssg/test_submission
```
After generating the results, you can basically compress the folder and upload to the [ScanNet evaluation server](http://kaldir.vc.in.tum.de/scannet_benchmark/semantic_label_3d).
# SemanticKITTI Dataset
This page provides specific tutorials about the usage of MMDetection3D for SemanticKITTI dataset.
## Prepare dataset
You can download SemanticKITTI dataset [HERE](http://semantic-kitti.org/dataset.html#download) and unzip all zip files.
Like the general way to prepare dataset, it is recommended to symlink the dataset root to `$MMDETECTION3D/data`.
The folder structure should be organized as follows before our processing.
```
mmdetection3d
├── mmdet3d
├── tools
├── configs
├── data
│ ├── semantickitti
│ │ ├── sequences
│ │ │ ├── 00
│ │ │ │   ├── labels
│ │ │ │   ├── velodyne
│ │ │ ├── 01
│ │ │ ├── ..
│ │ │ ├── 22
```
SemanticKITTI dataset contains 23 sequences, where \[0-7\], \[9-10\] are used as training set (about 19k training samples), sequence 8 as validation set (about 4k validation samples) and \[11-22\] as test set (about 20k test samples). Each sequence contains velodyne and labels folders for LIDAR point cloud data and segmentation annotations (where the high 16 bits store the instance segmentation annotations and the low 16 bits store the semantic segmentation annotations), respectively.
### Create SemanticKITTI Dataset
We support scripts that generate dataset information for training and testing. Create `.pkl` info by running:
```bash
python ./tools/create_data.py semantickitti --root-path ./data/semantickitti --out-dir ./data/semantickitti --extra-tag semantickitti
```
The folder structure after processing should be as below
```
mmdetection3d
├── mmdet3d
├── tools
├── configs
├── data
│ ├── semantickitti
│ │ ├── sequences
│ │ │ ├── 00
│ │ │ │   ├── labels
│ │ │ │   ├── velodyne
│ │ │ ├── 01
│ │ │ ├── ..
│ │ │ ├── 22
│ │ ├── semantickitti_infos_test.pkl
│ │ ├── semantickitti_infos_train.pkl
│ │ ├── semantickitti_infos_val.pkl
```
- `semantickitti_infos_train.pkl`: training dataset, a dict contains two keys: `metainfo` and `data_list`.
`metainfo` contains the basic information for the dataset itself, while `data_list` is a list of dict, each dict (hereinafter referred to as `info`) contains all the detailed information of single sample as follows:
- info\['sample_id'\]: The index of this sample in the whole dataset.
- info\['lidar_points'\]: A dict containing all the information related to the lidar points.
- info\['lidar_points'\]\['lidar_path'\]: The filename of the lidar point cloud data.
- info\['lidar_points'\]\['num_pts_feats'\]: The feature dimension of point.
- info\['pts_semantic_mask_pth'\]: The path of 3D semantic segmentation annotation file.
Please refer to [semantickitti_converter.py](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/tools/dataset_converters/semantickitti_converter.py) and [update_infos_to_v2.py ](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/tools/dataset_converters/update_infos_to_v2.py) for more details.
## Train pipeline
A typical train pipeline of 3D segmentation on SemanticKITTI is as below:
```python
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=4,
use_dim=4,
backend_args=backend_args),
dict(
type='LoadAnnotations3D',
with_bbox_3d=False,
with_label_3d=False,
with_seg_3d=True,
seg_3d_dtype='np.int32',
seg_offset=2**16,
dataset_type='semantickitti',
backend_args=backend_args),
dict(type='PointSegClassMapping'),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.78539816, 0.78539816],
scale_ratio_range=[0.95, 1.05],
translation_std=[0.1, 0.1, 0.1],
),
dict(type='Pack3DDetInputs', keys=['points', 'pts_semantic_mask'])
]
```
- Data augmentation:
- `RandomFlip3D`: randomly flip input point cloud horizontally or vertically.
- `GlobalRotScaleTrans`: rotate/scale/transform input point cloud.
## Evaluation
An example to evaluate MinkUNet with 8 GPUs with semantickitti metrics is as follows:
```shell
bash tools/dist_test.sh configs/minkunet/minkunet_w32_8xb2-15e_semantickitti.py checkpoints/minkunet_w32_8xb2-15e_semantickitti_20230309_160710-7fa0a6f1.pth 8
```
## Metrics
Typically mean intersection over union (mIoU) is used for evaluation on Semantickitti. In detail, we first compute IoU for multiple classes and then average them to get mIoU, please refer to [seg_eval.py](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/evaluation/functional/seg_eval.py).
An example of printed evaluation results is as follows:
| classes | car | bicycle | motorcycle | truck | bus | person | bicyclist | motorcyclist | road | parking | sidewalk | other-ground | building | fence | vegetation | trunck | terrian | pole | traffic-sign | miou | acc | acc_cls |
| ------- | ------ | ------- | ---------- | ------ | ------ | ------ | --------- | ------------ | ------ | ------- | -------- | ------------ | -------- | ------ | ---------- | ------ | ------- | ------ | ------------ | ------ | ------ | ------- |
| results | 0.9687 | 0.1908 | 0.6313 | 0.8580 | 0.6359 | 0.6818 | 0.8444 | 0.0002 | 0.9353 | 0.4854 | 0.8106 | 0.0024 | 0.9050 | 0.6111 | 0.8822 | 0.6605 | 0.7493 | 0.6442 | 0.4837 | 0.6306 | 0.9202 | 0.6924 |
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment