_base_ = [
'../_base_/datasets/kitti-3d-3class.py',
'../_base_/schedules/cyclic-40e.py', '../_base_/default_runtime.py'
]
voxel_size = [0.05, 0.05, 0.1]
point_cloud_range = [0, -40, -3, 70.4, 40, 1]
data_root = 'data/kitti/'
class_names = ['Pedestrian', 'Cyclist', 'Car']
metainfo = dict(classes=class_names)
backend_args = None
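# GT-sampling augmentation: paste ground-truth objects sampled from a
# pre-built database of the train split into each scene (consumed by the
# `ObjectSample` transform in `train_pipeline` below)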
db_sampler = dict(
data_root=data_root,
info_path=data_root + 'kitti_dbinfos_train.pkl',
rate=1.0,
prepare=dict(
filter_by_difficulty=[-1],
filter_by_min_points=dict(Car=5, Pedestrian=5, Cyclist=5)),
classes=class_names,
sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10),
points_loader=dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=4,
use_dim=4,
backend_args=backend_args),
backend_args=backend_args)
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=4,
use_dim=4,
backend_args=backend_args),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(type='ObjectSample', db_sampler=db_sampler, use_ground_plane=True),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.78539816, 0.78539816],
scale_ratio_range=[0.95, 1.05]),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='PointShuffle'),
dict(
type='Pack3DDetInputs',
keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=4,
use_dim=4,
backend_args=backend_args),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D'),
dict(
type='PointsRangeFilter', point_cloud_range=point_cloud_range)
]),
dict(type='Pack3DDetInputs', keys=['points'])
]
model = dict(
type='PointVoxelRCNN',
data_preprocessor=dict(
type='Det3DDataPreprocessor',
voxel=True,
voxel_layer=dict(
max_num_points=5, # max_points_per_voxel
point_cloud_range=point_cloud_range,
voxel_size=voxel_size,
max_voxels=(16000, 40000))),
voxel_encoder=dict(type='HardSimpleVFE'),
middle_encoder=dict(
type='SparseEncoder',
in_channels=4,
sparse_shape=[41, 1600, 1408],
order=('conv', 'norm', 'act'),
encoder_paddings=((0, 0, 0), ((1, 1, 1), 0, 0), ((1, 1, 1), 0, 0),
((0, 1, 1), 0, 0)),
return_middle_feats=True),
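# Voxel Set Abstraction (PV-RCNN): aggregates raw points, multi-scale
# sparse-conv voxel features and BEV features at 2048 sampled keypoints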
points_encoder=dict(
type='VoxelSetAbstraction',
num_keypoints=2048,
fused_out_channel=128,
voxel_size=voxel_size,
point_cloud_range=point_cloud_range,
voxel_sa_cfgs_list=[
dict(
type='StackedSAModuleMSG',
in_channels=16,
scale_factor=1,
radius=(0.4, 0.8),
sample_nums=(16, 16),
mlp_channels=((16, 16), (16, 16)),
use_xyz=True),
dict(
type='StackedSAModuleMSG',
in_channels=32,
scale_factor=2,
radius=(0.8, 1.2),
sample_nums=(16, 32),
mlp_channels=((32, 32), (32, 32)),
use_xyz=True),
dict(
type='StackedSAModuleMSG',
in_channels=64,
scale_factor=4,
radius=(1.2, 2.4),
sample_nums=(16, 32),
mlp_channels=((64, 64), (64, 64)),
use_xyz=True),
dict(
type='StackedSAModuleMSG',
in_channels=64,
scale_factor=8,
radius=(2.4, 4.8),
sample_nums=(16, 32),
mlp_channels=((64, 64), (64, 64)),
use_xyz=True)
],
rawpoints_sa_cfgs=dict(
type='StackedSAModuleMSG',
in_channels=1,
radius=(0.4, 0.8),
sample_nums=(16, 16),
mlp_channels=((16, 16), (16, 16)),
use_xyz=True),
bev_feat_channel=256,
bev_scale_factor=8),
backbone=dict(
type='SECOND',
in_channels=256,
layer_nums=[5, 5],
layer_strides=[1, 2],
out_channels=[128, 256]),
neck=dict(
type='SECONDFPN',
in_channels=[128, 256],
upsample_strides=[1, 2],
out_channels=[256, 256]),
rpn_head=dict(
type='PartA2RPNHead',
num_classes=3,
in_channels=512,
feat_channels=512,
use_direction_classifier=True,
dir_offset=0.78539,
anchor_generator=dict(
type='Anchor3DRangeGenerator',
ranges=[[0, -40.0, -0.6, 70.4, 40.0, -0.6],
[0, -40.0, -0.6, 70.4, 40.0, -0.6],
[0, -40.0, -1.78, 70.4, 40.0, -1.78]],
sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]],
rotations=[0, 1.57],
reshape_out=False),
diff_rad_by_sin=True,
assigner_per_size=True,
assign_per_class=True,
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
loss_cls=dict(
type='mmdet.FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(
type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
loss_dir=dict(
type='mmdet.CrossEntropyLoss', use_sigmoid=False,
loss_weight=0.2)),
roi_head=dict(
type='PVRCNNRoiHead',
num_classes=3,
semantic_head=dict(
type='ForegroundSegmentationHead',
in_channels=640,
extra_width=0.1,
loss_seg=dict(
type='mmdet.FocalLoss',
use_sigmoid=True,
reduction='sum',
gamma=2.0,
alpha=0.25,
activated=True,
loss_weight=1.0)),
bbox_roi_extractor=dict(
type='Batch3DRoIGridExtractor',
grid_size=6,
roi_layer=dict(
type='StackedSAModuleMSG',
in_channels=128,
radius=(0.8, 1.6),
sample_nums=(16, 16),
mlp_channels=((64, 64), (64, 64)),
use_xyz=True,
pool_mod='max'),
),
bbox_head=dict(
type='PVRCNNBBoxHead',
in_channels=128,
grid_size=6,
num_classes=3,
class_agnostic=True,
shared_fc_channels=(256, 256),
reg_channels=(256, 256),
cls_channels=(256, 256),
dropout_ratio=0.3,
with_corner_loss=True,
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
loss_bbox=dict(
type='mmdet.SmoothL1Loss',
beta=1.0 / 9.0,
reduction='sum',
loss_weight=1.0),
loss_cls=dict(
type='mmdet.CrossEntropyLoss',
use_sigmoid=True,
reduction='sum',
loss_weight=1.0))),
# model training and testing settings
train_cfg=dict(
rpn=dict(
assigner=[
dict( # for Pedestrian
type='Max3DIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.5,
neg_iou_thr=0.35,
min_pos_iou=0.35,
ignore_iof_thr=-1),
dict( # for Cyclist
type='Max3DIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.5,
neg_iou_thr=0.35,
min_pos_iou=0.35,
ignore_iof_thr=-1),
dict( # for Car
type='Max3DIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.6,
neg_iou_thr=0.45,
min_pos_iou=0.45,
ignore_iof_thr=-1)
],
allowed_border=0,
pos_weight=-1,
debug=False),
rpn_proposal=dict(
nms_pre=9000,
nms_post=512,
max_num=512,
nms_thr=0.8,
score_thr=0,
use_rotate_nms=True),
rcnn=dict(
assigner=[
dict( # for Pedestrian
type='Max3DIoUAssigner',
iou_calculator=dict(
type='BboxOverlaps3D', coordinate='lidar'),
pos_iou_thr=0.55,
neg_iou_thr=0.55,
min_pos_iou=0.55,
ignore_iof_thr=-1),
dict( # for Cyclist
type='Max3DIoUAssigner',
iou_calculator=dict(
type='BboxOverlaps3D', coordinate='lidar'),
pos_iou_thr=0.55,
neg_iou_thr=0.55,
min_pos_iou=0.55,
ignore_iof_thr=-1),
dict( # for Car
type='Max3DIoUAssigner',
iou_calculator=dict(
type='BboxOverlaps3D', coordinate='lidar'),
pos_iou_thr=0.55,
neg_iou_thr=0.55,
min_pos_iou=0.55,
ignore_iof_thr=-1)
],
sampler=dict(
type='IoUNegPiecewiseSampler',
num=128,
pos_fraction=0.5,
neg_piece_fractions=[0.8, 0.2],
neg_iou_piece_thrs=[0.55, 0.1],
neg_pos_ub=-1,
add_gt_as_proposals=False,
return_iou=True),
cls_pos_thr=0.75,
cls_neg_thr=0.25)),
test_cfg=dict(
rpn=dict(
nms_pre=1024,
nms_post=100,
max_num=100,
nms_thr=0.7,
score_thr=0,
use_rotate_nms=True),
rcnn=dict(
use_rotate_nms=True,
use_raw_score=True,
nms_thr=0.1,
score_thr=0.1)))
train_dataloader = dict(
batch_size=2,
num_workers=2,
dataset=dict(dataset=dict(pipeline=train_pipeline, metainfo=metainfo)))
test_dataloader = dict(dataset=dict(pipeline=test_pipeline, metainfo=metainfo))
eval_dataloader = dict(dataset=dict(pipeline=test_pipeline, metainfo=metainfo))
lr = 0.001
optim_wrapper = dict(optimizer=dict(lr=lr))
param_scheduler = [
# learning rate scheduler
# During the first 15 epochs, learning rate increases from lr to lr * 10
# during the next 25 epochs, learning rate decreases from lr * 10 to
# lr * 1e-4
dict(
type='CosineAnnealingLR',
T_max=15,
eta_min=lr * 10,
begin=0,
end=15,
by_epoch=True,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=25,
eta_min=lr * 1e-4,
begin=15,
end=40,
by_epoch=True,
convert_to_iter_based=True),
# momentum scheduler
# During the first 15 epochs, momentum is annealed down towards 0.85 / 0.95
# during the next 25 epochs, momentum is annealed back up towards 1
dict(
type='CosineAnnealingMomentum',
T_max=15,
eta_min=0.85 / 0.95,
begin=0,
end=15,
by_epoch=True,
convert_to_iter_based=True),
dict(
type='CosineAnnealingMomentum',
T_max=25,
eta_min=1,
begin=15,
end=40,
by_epoch=True,
convert_to_iter_based=True)
]
# Designing Network Design Spaces
> [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678)
<!-- [BACKBONE] -->
## Abstract
In this work, we present a new network design paradigm. Our goal is to help advance the understanding of network design and discover design principles that generalize across settings. Instead of focusing on designing individual network instances, we design network design spaces that parametrize populations of networks. The overall process is analogous to classic manual design of networks, but elevated to the design space level. Using our methodology we explore the structure aspect of network design and arrive at a low-dimensional design space consisting of simple, regular networks that we call RegNet. The core insight of the RegNet parametrization is surprisingly simple: widths and depths of good networks can be explained by a quantized linear function. We analyze the RegNet design space and arrive at interesting findings that do not match the current practice of network design. The RegNet design space provides simple and fast networks that work well across a wide range of flop regimes. Under comparable training settings and flops, the RegNet models outperform the popular EfficientNet models while being up to 5x faster on GPUs.
<div align=center>
<img src="https://user-images.githubusercontent.com/79644370/144025148-b73002cb-3c82-42e4-8da4-65df97aead9c.png" width="800"/>
</div>
## Introduction
We implement RegNetX models in 3D detection systems and provide the first results of using them with PointPillars on the nuScenes and Lyft datasets.
The pre-trained models are converted from the [pycls model zoo](https://github.com/facebookresearch/pycls/blob/master/MODEL_ZOO.md) and maintained in [mmcv](https://github.com/open-mmlab/mmcv).
## Usage
To use a RegNet model, there are two steps:
1. Convert the model to ResNet-style supported by MMDetection
2. Modify backbone and neck in config accordingly
### Convert model
We already provide models with FLOPs ranging from 800M to 12G in our model zoo.
For more general usage, we also provide the script `regnet2mmdet.py` in the tools directory to convert the keys of models pre-trained by [pycls](https://github.com/facebookresearch/pycls/) to the
ResNet-style checkpoints used in MMDetection.
```bash
python -u tools/model_converters/regnet2mmdet.py ${PRETRAIN_PATH} ${STORE_PATH}
```
This script converts the model from `PRETRAIN_PATH` and stores the converted model in `STORE_PATH`.
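If in doubt, you can sanity-check the converted checkpoint before pointing a config at it. The following is a minimal sketch: the path is hypothetical, and the assumption that the converter wraps the weights under a `state_dict` key is ours.

```python
# Hedged sanity check for a converted checkpoint (hypothetical path).
import torch

ckpt = torch.load('checkpoints/regnetx_400mf_mmdet.pth', map_location='cpu')
# We assume the converter stores weights under 'state_dict'; fall back to a
# bare state dict otherwise.
state = ckpt.get('state_dict', ckpt) if isinstance(ckpt, dict) else ckpt
# ResNet-style keys should look like 'conv1.weight', 'layer1.0.conv1.weight', ...
print(sorted(state.keys())[:5])
```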
### Modify config
Users can modify the backbone's `depth` and the corresponding keys in `arch` in the config according to the configs in the [pycls model zoo](https://github.com/facebookresearch/pycls/blob/master/MODEL_ZOO.md).
The `in_channels` parameter of the FPN can be found in Figures 15 & 16 of the paper (`wi` in the legend).
This directory already provides some configs and their performance, covering RegNetX models from the 800MF to the 12GF level.
For other pre-trained models or self-implemented RegNet models, users are responsible for checking these parameters themselves.
**Note**: Although Figs. 15 & 16 also provide `w0`, `wa`, `wm`, `group_w`, and `bot_mul` for `arch`, they are quantized and thus inaccurate; using them sometimes produces a backbone whose keys do not match those of the pre-trained model.
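To see how `arch` determines the stage widths (and hence the FPN's `in_channels`), the sketch below reproduces the quantized linear rule from the paper (widths `u_j = w0 + wa * j` snapped to powers of `wm`, then rounded to a multiple of 8 and to the group width). This is our illustrative reimplementation, not code shipped with MMDetection; with the RegNetX-400MF parameters used by the configs in this directory it yields stage widths `(32, 64, 160, 384)`, consistent with `pts_neck=dict(in_channels=[64, 160, 384])` for `out_indices=(1, 2, 3)`.

```python
import numpy as np

def regnet_stage_widths(w0, wa, wm, depth, group_w, q=8):
    """Per-stage widths/depths from the RegNet quantized linear rule."""
    u = w0 + wa * np.arange(depth)                      # u_j = w0 + wa * j
    s = np.round(np.log(u / w0) / np.log(wm))           # quantized exponents
    w = np.round(w0 * np.power(wm, s) / q) * q          # snap to multiple of q
    w = (np.round(w / group_w) * group_w).astype(int)   # respect group width
    widths, depths = np.unique(w, return_counts=True)   # blocks -> stages
    return list(widths), list(depths)

# RegNetX-400MF: expected ([32, 64, 160, 384], [1, 2, 7, 12])
print(regnet_stage_widths(w0=24, wa=24.48, wm=2.54, depth=22, group_w=16))
```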
## Results and models
### nuScenes
| Backbone | Lr schd | Mem (GB) | Inf time (fps) | mAP | NDS | Download |
| :-------------------------------------------------------------------------------------: | :-----: | :------: | :------------: | :---: | :--: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| [SECFPN](../pointpillars/pointpillars_hv_secfpn_sbn-all_8xb4-2x_nus-3d.py) | 2x | 16.4 | | 35.17 | 49.7 | [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d_20200620_230725-0817d270.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d_20200620_230725.log.json) |
| [RegNetX-400MF-SECFPN](./pointpillars_hv_regnet-400mf_secfpn_sbn-all_8xb4-2x_nus-3d.py) | 2x | 16.4 | | 41.2 | 55.2 | [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d_20200620_230334-53044f32.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d_20200620_230334.log.json) |
| [FPN](../pointpillars/pointpillars_hv_fpn_sbn-all_8xb4-2x_nus-3d.py) | 2x | 17.1 | | 40.0 | 53.3 | [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d_20200620_230405-2fa62f3d.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d_20200620_230405.log.json) |
| [RegNetX-400MF-FPN](./pointpillars_hv_regnet-400mf_fpn_sbn-all_8xb4-2x_nus-3d.py) | 2x | 17.3 | | 44.8 | 56.4 | [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d_20200620_230239-c694dce7.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d_20200620_230239.log.json) |
| [RegNetX-1.6gF-FPN](./pointpillars_hv_regnet-1.6gf_fpn_sbn-all_8xb4-2x_nus-3d.py) | 2x | 24.0 | | 48.2 | 59.3 | [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_4x8_2x_nus-3d_20200629_050311-dcd4e090.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_4x8_2x_nus-3d_20200629_050311.log.json) |
### Lyft
| Backbone | Lr schd | Mem (GB) | Inf time (fps) | Private Score | Public Score | Download |
| :-------------------------------------------------------------------------------------: | :-----: | :------: | :------------: | :-----------: | :----------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| [SECFPN](../pointpillars/pointpillars_hv_secfpn_sbn-all_8xb2-2x_lyft-3d.py) | 2x | 12.2 | | 13.9 | 14.1 | [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn-all_2x8_2x_lyft-3d/hv_pointpillars_secfpn_sbn-all_2x8_2x_lyft-3d_20210517_204807-2518e3de.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn-all_2x8_2x_lyft-3d/hv_pointpillars_secfpn_sbn-all_2x8_2x_lyft-3d_20210517_204807.log.json) |
| [RegNetX-400MF-SECFPN](./pointpillars_hv_regnet-400mf_secfpn_sbn-all_8xb2-2x_lyft-3d.py) | 2x | 15.9 | | 14.9 | 15.1 | [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_2x8_2x_lyft-3d/hv_pointpillars_regnet-400mf_secfpn_sbn-all_2x8_2x_lyft-3d_20210524_092151-42513826.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_2x8_2x_lyft-3d/hv_pointpillars_regnet-400mf_secfpn_sbn-all_2x8_2x_lyft-3d_20210524_092151.log.json) |
| [FPN](../pointpillars/pointpillars_hv_fpn_sbn-all_8xb2-2x_lyft-3d.py) | 2x | 9.2 | | 14.9 | 15.1 | [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_fpn_sbn-all_2x8_2x_lyft-3d/hv_pointpillars_fpn_sbn-all_2x8_2x_lyft-3d_20210517_202818-fc6904c3.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_fpn_sbn-all_2x8_2x_lyft-3d/hv_pointpillars_fpn_sbn-all_2x8_2x_lyft-3d_20210517_202818.log.json) |
| [RegNetX-400MF-FPN](./pointpillars_hv_regnet-400mf_fpn_sbn-all_8xb2-2x_lyft-3d.py) | 2x | 13.0 | | 16.0 | 16.1 | [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_fpn_sbn-all_2x8_2x_lyft-3d/hv_pointpillars_regnet-400mf_fpn_sbn-all_2x8_2x_lyft-3d_20210521_115618-823dcf18.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_fpn_sbn-all_2x8_2x_lyft-3d/hv_pointpillars_regnet-400mf_fpn_sbn-all_2x8_2x_lyft-3d_20210521_115618.log.json) |
## Citation
```latex
@article{radosavovic2020designing,
title={Designing Network Design Spaces},
author={Ilija Radosavovic and Raj Prateek Kosaraju and Ross Girshick and Kaiming He and Piotr Dollár},
year={2020},
eprint={2003.13678},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
```
Models:
- Name: pointpillars_hv_regnet-400mf_secfpn_sbn-all_8xb4-2x_nus-3d
In Collection: PointPillars
Config: configs/regnet/pointpillars_hv_regnet-400mf_secfpn_sbn-all_8xb4-2x_nus-3d.py
Metadata:
Training Data: nuScenes
Training Memory (GB): 16.4
Architecture:
- RegNetX
- Hard Voxelization
Results:
- Task: 3D Object Detection
Dataset: nuScenes
Metrics:
mAP: 41.2
NDS: 55.2
Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d_20200620_230334-53044f32.pth
- Name: pointpillars_hv_regnet-400mf_fpn_sbn-all_8xb4-2x_nus-3d
In Collection: PointPillars
Config: configs/regnet/pointpillars_hv_regnet-400mf_fpn_sbn-all_8xb4-2x_nus-3d.py
Metadata:
Training Data: nuScenes
Training Memory (GB): 17.3
Architecture:
- RegNetX
- Hard Voxelization
Results:
- Task: 3D Object Detection
Dataset: nuScenes
Metrics:
mAP: 44.8
NDS: 56.4
Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d_20200620_230239-c694dce7.pth
- Name: pointpillars_hv_regnet-1.6gf_fpn_sbn-all_8xb4-2x_nus-3d
In Collection: PointPillars
Config: configs/regnet/pointpillars_hv_regnet-1.6gf_fpn_sbn-all_8xb4-2x_nus-3d.py
Metadata:
Training Data: nuScenes
Training Memory (GB): 24.0
Architecture:
- RegNetX
- Hard Voxelization
Results:
- Task: 3D Object Detection
Dataset: nuScenes
Metrics:
mAP: 48.2
NDS: 59.3
Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_4x8_2x_nus-3d_20200629_050311-dcd4e090.pth
- Name: pointpillars_hv_regnet-400mf_secfpn_sbn-all_8xb2-2x_lyft-3d
In Collection: PointPillars
Config: configs/regnet/pointpillars_hv_regnet-400mf_secfpn_sbn-all_8xb2-2x_lyft-3d.py
Metadata:
Training Data: Lyft
Training Memory (GB): 15.9
Architecture:
- RegNetX
- Hard Voxelization
Results:
- Task: 3D Object Detection
Dataset: Lyft
Metrics:
Private Score: 14.9
Public Score: 15.1
Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_2x8_2x_lyft-3d/hv_pointpillars_regnet-400mf_secfpn_sbn-all_2x8_2x_lyft-3d_20210524_092151-42513826.pth
- Name: pointpillars_hv_regnet-400mf_fpn_sbn-all_8xb2-2x_lyft-3d
In Collection: PointPillars
Config: configs/regnet/pointpillars_hv_regnet-400mf_fpn_sbn-all_8xb2-2x_lyft-3d.py
Metadata:
Training Data: Lyft
Training Memory (GB): 13.0
Architecture:
- RegNetX
- Hard Voxelization
Results:
- Task: 3D Object Detection
Dataset: Lyft
Metrics:
Private Score: 16.0
Public Score: 16.1
Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_fpn_sbn-all_2x8_2x_lyft-3d/hv_pointpillars_regnet-400mf_fpn_sbn-all_2x8_2x_lyft-3d_20210521_115618-823dcf18.pth
_base_ = [
'../_base_/models/pointpillars_hv_fpn_nus.py',
'../_base_/datasets/nus-3d.py',
'../_base_/schedules/schedule-2x.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='MVXFasterRCNN',
pts_backbone=dict(
_delete_=True,
type='NoStemRegNet',
arch='regnetx_1.6gf',
init_cfg=dict(
type='Pretrained', checkpoint='open-mmlab://regnetx_1.6gf'),
out_indices=(1, 2, 3),
frozen_stages=-1,
strides=(1, 2, 2, 2),
base_channels=64,
stem_channels=64,
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
norm_eval=False,
style='pytorch'),
pts_neck=dict(in_channels=[168, 408, 912]))
_base_ = [
'../_base_/models/pointpillars_hv_fpn_lyft.py',
'../_base_/datasets/lyft-3d.py',
'../_base_/schedules/schedule-2x.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='MVXFasterRCNN',
pts_backbone=dict(
_delete_=True,
type='NoStemRegNet',
arch=dict(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22, bot_mul=1.0),
init_cfg=dict(
type='Pretrained', checkpoint='open-mmlab://regnetx_400mf'),
out_indices=(1, 2, 3),
frozen_stages=-1,
strides=(1, 2, 2, 2),
base_channels=64,
stem_channels=64,
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
norm_eval=False,
style='pytorch'),
pts_neck=dict(in_channels=[64, 160, 384]))
# Default setting for scaling LR automatically
# - `enable` means enable scaling LR automatically
# or not by default.
# - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
auto_scale_lr = dict(enable=False, base_batch_size=16)
_base_ = [
'../_base_/models/pointpillars_hv_fpn_nus.py',
'../_base_/datasets/nus-3d.py',
'../_base_/schedules/schedule-2x.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='MVXFasterRCNN',
pts_backbone=dict(
_delete_=True,
type='NoStemRegNet',
arch=dict(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22, bot_mul=1.0),
init_cfg=dict(
type='Pretrained', checkpoint='open-mmlab://regnetx_400mf'),
out_indices=(1, 2, 3),
frozen_stages=-1,
strides=(1, 2, 2, 2),
base_channels=64,
stem_channels=64,
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
norm_eval=False,
style='pytorch'),
pts_neck=dict(in_channels=[64, 160, 384]))
_base_ = [
'../_base_/models/pointpillars_hv_fpn_range100_lyft.py',
'../_base_/datasets/lyft-3d-range100.py',
'../_base_/schedules/schedule-2x.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='MVXFasterRCNN',
pts_backbone=dict(
_delete_=True,
type='NoStemRegNet',
arch=dict(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22, bot_mul=1.0),
init_cfg=dict(
type='Pretrained', checkpoint='open-mmlab://regnetx_400mf'),
out_indices=(1, 2, 3),
frozen_stages=-1,
strides=(1, 2, 2, 2),
base_channels=64,
stem_channels=64,
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
norm_eval=False,
style='pytorch'),
pts_neck=dict(in_channels=[64, 160, 384]))
# Default setting for scaling LR automatically
# - `enable` means enable scaling LR automatically
# or not by default.
# - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
auto_scale_lr = dict(enable=False, base_batch_size=16)
_base_ = './pointpillars_hv_regnet-400mf_fpn_sbn-all_8xb2-2x_lyft-3d.py'
# model settings
model = dict(
pts_neck=dict(
type='SECONDFPN',
_delete_=True,
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
in_channels=[64, 160, 384],
upsample_strides=[1, 2, 4],
out_channels=[128, 128, 128]),
pts_bbox_head=dict(
type='Anchor3DHead',
in_channels=384,
feat_channels=384,
anchor_generator=dict(
_delete_=True,
type='AlignedAnchor3DRangeGenerator',
ranges=[[-80, -80, -1.0715024, 80, 80, -1.0715024],
[-80, -80, -0.3033737, 80, 80, -0.3033737],
[-80, -80, -0.3519405, 80, 80, -0.3519405],
[-80, -80, -0.8871424, 80, 80, -0.8871424],
[-80, -80, -0.6276341, 80, 80, -0.6276341],
[-80, -80, -1.3220503, 80, 80, -1.3220503],
[-80, -80, -1.0709302, 80, 80, -1.0709302],
[-80, -80, -0.9122268, 80, 80, -0.9122268],
[-80, -80, -1.8012227, 80, 80, -1.8012227]],
sizes=[
[4.75, 1.92, 1.71], # car
[10.24, 2.84, 3.44], # truck
[12.70, 2.92, 3.42], # bus
[6.52, 2.42, 2.34], # emergency vehicle
[8.17, 2.75, 3.20], # other vehicle
[2.35, 0.96, 1.59], # motorcycle
[1.76, 0.63, 1.44], # bicycle
[0.80, 0.76, 1.76], # pedestrian
[0.73, 0.35, 0.50] # animal
],
rotations=[0, 1.57],
reshape_out=True)))
_base_ = './pointpillars_hv_regnet-400mf_fpn_sbn-all_8xb4-2x_nus-3d.py'
# model settings
model = dict(
pts_neck=dict(
type='SECONDFPN',
_delete_=True,
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
in_channels=[64, 160, 384],
upsample_strides=[1, 2, 4],
out_channels=[128, 128, 128]),
pts_bbox_head=dict(
type='Anchor3DHead',
in_channels=384,
feat_channels=384,
anchor_generator=dict(
_delete_=True,
type='AlignedAnchor3DRangeGenerator',
ranges=[
[-49.6, -49.6, -1.80032795, 49.6, 49.6, -1.80032795],
[-49.6, -49.6, -1.74440365, 49.6, 49.6, -1.74440365],
[-49.6, -49.6, -1.68526504, 49.6, 49.6, -1.68526504],
[-49.6, -49.6, -1.67339111, 49.6, 49.6, -1.67339111],
[-49.6, -49.6, -1.61785072, 49.6, 49.6, -1.61785072],
[-49.6, -49.6, -1.80984986, 49.6, 49.6, -1.80984986],
[-49.6, -49.6, -1.763965, 49.6, 49.6, -1.763965],
],
sizes=[
[4.60718145, 1.95017717, 1.72270761], # car
[6.73778078, 2.4560939, 2.73004906], # truck
[12.01320693, 2.87427237, 3.81509561], # trailer
[1.68452161, 0.60058911, 1.27192197], # bicycle
[0.7256437, 0.66344886, 1.75748069], # pedestrian
[0.40359262, 0.39694519, 1.06232151], # traffic_cone
[0.48578221, 2.49008838, 0.98297065], # barrier
],
custom_values=[0, 0],
rotations=[0, 1.57],
reshape_out=True)))
_base_ = \
'./pointpillars_hv_regnet-400mf_fpn_sbn-all_range100_8xb2-2x_lyft-3d.py'
# model settings
model = dict(
pts_neck=dict(
type='SECONDFPN',
_delete_=True,
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
in_channels=[64, 160, 384],
upsample_strides=[1, 2, 4],
out_channels=[128, 128, 128]),
pts_bbox_head=dict(
type='Anchor3DHead',
in_channels=384,
feat_channels=384,
anchor_generator=dict(
_delete_=True,
type='AlignedAnchor3DRangeGenerator',
ranges=[[-100, -100, -1.0715024, 100, 100, -1.0715024],
[-100, -100, -0.3033737, 100, 100, -0.3033737],
[-100, -100, -0.3519405, 100, 100, -0.3519405],
[-100, -100, -0.8871424, 100, 100, -0.8871424],
[-100, -100, -0.6276341, 100, 100, -0.6276341],
[-100, -100, -1.3220503, 100, 100, -1.3220503],
[-100, -100, -1.0709302, 100, 100, -1.0709302],
[-100, -100, -0.9122268, 100, 100, -0.9122268],
[-100, -100, -1.8012227, 100, 100, -1.8012227]],
sizes=[
[4.75, 1.92, 1.71], # car
[10.24, 2.84, 3.44], # truck
[12.70, 2.92, 3.42], # bus
[6.52, 2.42, 2.34], # emergency vehicle
[8.17, 2.75, 3.20], # other vehicle
[2.35, 0.96, 1.59], # motorcycle
[1.76, 0.63, 1.44], # bicycle
[0.80, 0.76, 1.76], # pedestrian
[0.73, 0.35, 0.50] # animal
],
rotations=[0, 1.57],
reshape_out=True)))
# Structure Aware Single-stage 3D Object Detection from Point Cloud
> [Structure Aware Single-stage 3D Object Detection from Point Cloud](https://openaccess.thecvf.com/content_CVPR_2020/papers/He_Structure_Aware_Single-Stage_3D_Object_Detection_From_Point_Cloud_CVPR_2020_paper.pdf)
<!-- [ALGORITHM] -->
## Abstract
3D object detection from point cloud data plays an essential role in autonomous driving. Current single-stage detectors are efficient by progressively downscaling the 3D point clouds in a fully convolutional manner. However, the downscaled features inevitably lose spatial information and cannot make full use of the structure information of 3D point cloud, degrading their localization precision. In this work, we propose to improve the localization precision of single-stage detectors by explicitly leveraging the structure information of 3D point cloud. Specifically, we design an auxiliary network which converts the convolutional features in the backbone network back to point-level representations. The auxiliary network is jointly optimized, by two point-level supervisions, to guide the convolutional features in the backbone network to be aware of the object structure. The auxiliary network can be detached after training and therefore introduces no extra computation in the inference stage. Besides, considering that single-stage detectors suffer from the discordance between the predicted bounding boxes and corresponding classification confidences, we develop an efficient part-sensitive warping operation to align the confidences to the predicted bounding boxes. Our proposed detector ranks at the top of KITTI 3D/BEV detection leaderboards and runs at 25 FPS for inference.
<div align=center>
<img src="https://user-images.githubusercontent.com/30491025/172526367-c8b9bdf7-f901-4f2f-8855-bfd55c39f8d1.png" width="800"/>
</div>
## Introduction
We implement SA-SSD and provide the results and checkpoints on the KITTI dataset.
## Citation
```latex
@InProceedings{he2020sassd,
title={Structure Aware Single-stage 3D Object Detection from Point Cloud},
author={He, Chenhang and Zeng, Hui and Huang, Jianqiang and Hua, Xian-Sheng and Zhang, Lei},
booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
year={2020}
}
```
_base_ = [
'../_base_/datasets/kitti-3d-3class.py',
'../_base_/schedules/cyclic-40e.py', '../_base_/default_runtime.py'
]
voxel_size = [0.05, 0.05, 0.1]
model = dict(
type='SASSD',
data_preprocessor=dict(
type='Det3DDataPreprocessor',
voxel=True,
voxel_layer=dict(
max_num_points=5,
point_cloud_range=[0, -40, -3, 70.4, 40, 1],
voxel_size=voxel_size,
max_voxels=(16000, 40000))),
voxel_encoder=dict(type='HardSimpleVFE'),
middle_encoder=dict(
type='SparseEncoderSASSD',
in_channels=4,
sparse_shape=[41, 1600, 1408],
order=('conv', 'norm', 'act')),
backbone=dict(
type='SECOND',
in_channels=256,
layer_nums=[5, 5],
layer_strides=[1, 2],
out_channels=[128, 256]),
neck=dict(
type='SECONDFPN',
in_channels=[128, 256],
upsample_strides=[1, 2],
out_channels=[256, 256]),
bbox_head=dict(
type='Anchor3DHead',
num_classes=3,
in_channels=512,
feat_channels=512,
use_direction_classifier=True,
anchor_generator=dict(
type='Anchor3DRangeGenerator',
ranges=[
[0, -40.0, -0.6, 70.4, 40.0, -0.6],
[0, -40.0, -0.6, 70.4, 40.0, -0.6],
[0, -40.0, -1.78, 70.4, 40.0, -1.78],
],
sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]],
rotations=[0, 1.57],
reshape_out=False),
diff_rad_by_sin=True,
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
loss_cls=dict(
type='mmdet.FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(
type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
loss_dir=dict(
type='mmdet.CrossEntropyLoss', use_sigmoid=False,
loss_weight=0.2)),
# model training and testing settings
train_cfg=dict(
assigner=[
dict( # for Pedestrian
type='Max3DIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.35,
neg_iou_thr=0.2,
min_pos_iou=0.2,
ignore_iof_thr=-1),
dict( # for Cyclist
type='Max3DIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.35,
neg_iou_thr=0.2,
min_pos_iou=0.2,
ignore_iof_thr=-1),
dict( # for Car
type='Max3DIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.6,
neg_iou_thr=0.45,
min_pos_iou=0.45,
ignore_iof_thr=-1),
],
allowed_border=0,
pos_weight=-1,
debug=False),
test_cfg=dict(
use_rotate_nms=True,
nms_across_levels=False,
nms_thr=0.01,
score_thr=0.1,
min_bbox_size=0,
nms_pre=100,
max_num=50))
# SECOND: Sparsely Embedded Convolutional Detection
> [SECOND: Sparsely Embedded Convolutional Detection](https://www.mdpi.com/1424-8220/18/10/3337)
<!-- [ALGORITHM] -->
## Abstract
LiDAR-based or RGB-D-based object detection is used in numerous applications, ranging from autonomous driving to robot vision. Voxel-based 3D convolutional networks have been used for some time to enhance the retention of information when processing point cloud LiDAR data. However, problems remain, including a slow inference speed and low orientation estimation performance. We therefore investigate an improved sparse convolution method for such networks, which significantly increases the speed of both training and inference. We also introduce a new form of angle loss regression to improve the orientation estimation performance and a new data augmentation approach that can enhance the convergence speed and performance. The proposed network produces state-of-the-art results on the KITTI 3D object detection benchmarks while maintaining a fast inference speed.
<div align=center>
<img src="https://user-images.githubusercontent.com/79644370/143889364-10be11c3-838e-4fc9-9613-184f0cd08907.png" width="800"/>
</div>
## Introduction
We implement SECOND and provide the results and checkpoints on the KITTI dataset.
## Results and models
### KITTI
| Backbone | Class | Lr schd | Mem (GB) | Inf time (fps) | mAP | Download |
| :-----------------------------------------------------------------: | :-----: | :--------: | :------: | :------------: | :---: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| [SECFPN](./second_hv_secfpn_8xb6-80e_kitti-3d-car.py) | Car | cyclic 80e | 5.4 | | 78.2 | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/second/second_hv_secfpn_8xb6-80e_kitti-3d-car/second_hv_secfpn_8xb6-80e_kitti-3d-car-75d9305e.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/second/second_hv_secfpn_8xb6-80e_kitti-3d-car/second_hv_secfpn_8xb6-80e_kitti-3d-car-20230420_191750.log) |
| [SECFPN (FP16)](./second_hv_secfpn_8xb6-amp-80e_kitti-3d-car.py) | Car | cyclic 80e | 2.9 | | 78.72 | [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car_20200924_211301-1f5ad833.pth)\| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car_20200924_211301.log.json) |
| [SECFPN](./second_hv_secfpn_8xb6-80e_kitti-3d-3class.py) | 3 Class | cyclic 80e | 5.4 | | 65.3 | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/second/second_hv_secfpn_8xb6-80e_kitti-3d-3class/second_hv_secfpn_8xb6-80e_kitti-3d-3class-b086d0a3.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/second/second_hv_secfpn_8xb6-80e_kitti-3d-3class/second_hv_secfpn_8xb6-80e_kitti-3d-3class-20230420_221130.log) |
| [SECFPN (FP16)](./second_hv_secfpn_8xb6-amp-80e_kitti-3d-3class.py) | 3 Class | cyclic 80e | 2.9 | | 67.4 | [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class_20200925_110059-05f67bdf.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class_20200925_110059.log.json) |
### Waymo
| Backbone | Load Interval | Class | Lr schd | Mem (GB) | Inf time (fps) | mAP@L1 | mAPH@L1 | mAP@L2 | **mAPH@L2** | Download |
| :----------------------------------------------------------------: | :-----------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :----: | :---------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| [SECFPN](./second_hv_secfpn_sbn-all_16xb2-2x_waymoD5-3d-3class.py) | 5 | 3 Class | 2x | 8.12 | | 65.3 | 61.7 | 58.9 | 55.7 | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/second/hv_second_secfpn_sbn_4x8_2x_waymoD5-3d-3class/hv_second_secfpn_sbn_4x8_2x_waymoD5-3d-3class_20201115_112448.log.json) |
| above @ Car | | | 2x | 8.12 | | 67.1 | 66.6 | 58.7 | 58.2 | |
| above @ Pedestrian | | | 2x | 8.12 | | 68.1 | 59.1 | 59.5 | 51.5 | |
| above @ Cyclist | | | 2x | 8.12 | | 60.7 | 59.5 | 58.4 | 57.3 | |
Note:
- See more details about metrics and the data split on Waymo [HERE](https://github.com/open-mmlab/mmdetection3d/tree/main/configs/pointpillars). For implementation details, we basically follow the original settings. All of these results are achieved without bells and whistles, e.g., ensembling, multi-scale training, and test-time augmentation.
- `FP16` means Mixed Precision (FP16) is adopted in training.
## Citation
```latex
@article{yan2018second,
title={Second: Sparsely embedded convolutional detection},
author={Yan, Yan and Mao, Yuxing and Li, Bo},
journal={Sensors},
year={2018},
publisher={Multidisciplinary Digital Publishing Institute}
}
```
Collections:
- Name: SECOND
Metadata:
Training Techniques:
- AdamW
Architecture:
- Hard Voxelization
Paper:
URL: https://www.mdpi.com/1424-8220/18/10/3337
Title: 'SECOND: Sparsely Embedded Convolutional Detection'
README: configs/second/README.md
Code:
URL: https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/models/backbones/second.py#L11
Version: v0.5.0
Models:
- Name: second_hv_secfpn_8xb6-80e_kitti-3d-car
In Collection: SECOND
Config: configs/second/second_hv_secfpn_8xb6-80e_kitti-3d-car.py
Metadata:
Training Data: KITTI
Training Memory (GB): 5.4
Training Resources: 8x V100 GPUs
Results:
- Task: 3D Object Detection
Dataset: KITTI
Metrics:
mAP: 78.2
Weights: https://download.openmmlab.com/mmdetection3d/v1.1.0_models/second/second_hv_secfpn_8xb6-80e_kitti-3d-car/second_hv_secfpn_8xb6-80e_kitti-3d-car-75d9305e.pth
- Name: second_hv_secfpn_8xb6-80e_kitti-3d-3class
In Collection: SECOND
Config: configs/second/second_hv_secfpn_8xb6-80e_kitti-3d-3class.py
Metadata:
Training Data: KITTI
Training Memory (GB): 5.4
Training Resources: 8x V100 GPUs
Results:
- Task: 3D Object Detection
Dataset: KITTI
Metrics:
mAP: 65.3
Weights: https://download.openmmlab.com/mmdetection3d/v1.1.0_models/second/second_hv_secfpn_8xb6-80e_kitti-3d-3class/second_hv_secfpn_8xb6-80e_kitti-3d-3class-b086d0a3.pth
- Name: second_hv_secfpn_sbn-all_16xb2-2x_waymoD5-3d-3class
In Collection: SECOND
Config: configs/second/second_hv_secfpn_sbn-all_16xb2-2x_waymoD5-3d-3class.py
Metadata:
Training Data: Waymo
Training Memory (GB): 8.12
Training Resources: 8x GeForce GTX 1080 Ti
Results:
- Task: 3D Object Detection
Dataset: Waymo
Metrics:
mAP@L1: 65.3
mAPH@L1: 61.7
mAP@L2: 58.9
mAPH@L2: 55.7
- Name: second_hv_secfpn_8xb6-amp-80e_kitti-3d-car
In Collection: SECOND
Config: configs/second/second_hv_secfpn_8xb6-amp-80e_kitti-3d-car.py
Metadata:
Training Techniques:
- AdamW
- Mixed Precision Training
Training Resources: 8x TITAN Xp
Training Data: KITTI
Training Memory (GB): 2.9
Results:
- Task: 3D Object Detection
Dataset: KITTI
Metrics:
mAP: 78.72
Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car_20200924_211301-1f5ad833.pth
Code:
Version: v0.7.0
- Name: second_hv_secfpn_8xb6-amp-80e_kitti-3d-3class
In Collection: SECOND
Config: configs/second/second_hv_secfpn_8xb6-amp-80e_kitti-3d-3class.py
Metadata:
Training Techniques:
- AdamW
- Mixed Precision Training
Training Resources: 8x TITAN Xp
Training Data: KITTI
Training Memory (GB): 2.9
Results:
- Task: 3D Object Detection
Dataset: KITTI
Metrics:
mAP: 67.4
Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class_20200925_110059-05f67bdf.pth
Code:
Version: v0.7.0
_base_ = [
'../_base_/models/second_hv_secfpn_kitti.py',
'../_base_/datasets/kitti-3d-3class.py',
'../_base_/schedules/cyclic-40e.py', '../_base_/default_runtime.py'
]
_base_ = [
'../_base_/models/second_hv_secfpn_kitti.py',
'../_base_/datasets/kitti-3d-car.py', '../_base_/schedules/cyclic-40e.py',
'../_base_/default_runtime.py'
]
point_cloud_range = [0, -40, -3, 70.4, 40, 1]
model = dict(
bbox_head=dict(
type='Anchor3DHead',
num_classes=1,
anchor_generator=dict(
_delete_=True,
type='Anchor3DRangeGenerator',
ranges=[[0, -40.0, -1.78, 70.4, 40.0, -1.78]],
sizes=[[3.9, 1.6, 1.56]],
rotations=[0, 1.57],
reshape_out=True)),
# model training and testing settings
train_cfg=dict(
_delete_=True,
assigner=dict(
type='Max3DIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.6,
neg_iou_thr=0.45,
min_pos_iou=0.45,
ignore_iof_thr=-1),
allowed_border=0,
pos_weight=-1,
debug=False))
_base_ = 'second_hv_secfpn_8xb6-80e_kitti-3d-3class.py'
# schedule settings
optim_wrapper = dict(type='AmpOptimWrapper', loss_scale=4096.)
_base_ = 'second_hv_secfpn_8xb6-80e_kitti-3d-car.py'
# schedule settings
optim_wrapper = dict(type='AmpOptimWrapper', loss_scale=4096.)
_base_ = [
'../_base_/models/second_hv_secfpn_waymo.py',
'../_base_/datasets/waymoD5-3d-3class.py',
'../_base_/schedules/schedule-2x.py',
'../_base_/default_runtime.py',
]
dataset_type = 'WaymoDataset'
data_root = 'data/waymo/kitti_format/'
class_names = ['Car', 'Pedestrian', 'Cyclist']
metainfo = dict(classes=class_names)
point_cloud_range = [-76.8, -51.2, -2, 76.8, 51.2, 4]
input_modality = dict(use_lidar=True, use_camera=False)
backend_args = None
db_sampler = dict(
data_root=data_root,
info_path=data_root + 'waymo_dbinfos_train.pkl',
rate=1.0,
prepare=dict(
filter_by_difficulty=[-1],
filter_by_min_points=dict(Car=5, Pedestrian=5, Cyclist=5)),
classes=class_names,
sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10),
points_loader=dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=6,
use_dim=[0, 1, 2, 3, 4],
backend_args=backend_args),
backend_args=backend_args)
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=6,
use_dim=5,
backend_args=backend_args),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
# dict(type='ObjectSample', db_sampler=db_sampler),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.78539816, 0.78539816],
scale_ratio_range=[0.95, 1.05]),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='PointShuffle'),
dict(
type='Pack3DDetInputs',
keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=6,
use_dim=5,
backend_args=backend_args),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D'),
dict(
type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='Pack3DDetInputs', keys=['points']),
])
]
train_dataloader = dict(
batch_size=4,
num_workers=4,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='RepeatDataset',
times=2,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='waymo_infos_train.pkl',
data_prefix=dict(pts='training/velodyne'),
pipeline=train_pipeline,
modality=input_modality,
test_mode=False,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='LiDAR',
# load one frame every five frames
load_interval=5,
backend_args=backend_args)))
val_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_prefix=dict(pts='training/velodyne'),
ann_file='waymo_infos_val.pkl',
pipeline=test_pipeline,
modality=input_modality,
test_mode=True,
metainfo=metainfo,
box_type_3d='LiDAR',
backend_args=backend_args))
test_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_prefix=dict(pts='training/velodyne'),
ann_file='waymo_infos_val.pkl',
pipeline=test_pipeline,
modality=input_modality,
test_mode=True,
metainfo=metainfo,
box_type_3d='LiDAR',
backend_args=backend_args))
# Default setting for scaling LR automatically
# - `enable` means enable scaling LR automatically
# or not by default.
# - `base_batch_size` = (16 GPUs) x (2 samples per GPU).
auto_scale_lr = dict(enable=False, base_batch_size=32)
# SMOKE: Single-Stage Monocular 3D Object Detection via Keypoint Estimation
> [SMOKE: Single-Stage Monocular 3D Object Detection via Keypoint Estimation](https://arxiv.org/abs/2002.10111)
<!-- [ALGORITHM] -->
## Abstract
Estimating 3D orientation and translation of objects is essential for infrastructure-less autonomous navigation and driving. In the case of monocular vision, successful methods have been mainly based on two ingredients: (i) a network generating 2D region proposals, (ii) an R-CNN structure predicting 3D object pose by utilizing the acquired regions of interest. We argue that the 2D detection network is redundant and introduces non-negligible noise for 3D detection. Hence, we propose a novel 3D object detection method, named SMOKE, in this paper that predicts a 3D bounding box for each detected object by combining a single keypoint estimate with regressed 3D variables. As a second contribution, we propose a multi-step disentangling approach for constructing the 3D bounding box, which significantly improves both training convergence and detection accuracy. In contrast to previous 3D detection techniques, our method does not require complicated pre/post-processing, extra data, and a refinement stage. Despite its structural simplicity, our proposed SMOKE network outperforms all existing monocular 3D detection methods on the KITTI dataset, giving the best state-of-the-art result on both 3D object detection and Bird's eye view evaluation.
<div align=center>
<img src="https://user-images.githubusercontent.com/79644370/143886681-52cb72b9-6635-4624-a728-1c243b046517.png" width="800"/>
</div>
## Introduction
We implement SMOKE and provide the results and checkpoints on the KITTI dataset.
## Results and models
### KITTI
| Backbone | Lr schd | Mem (GB) | Inf time (fps) | mAP | Download |
| :-----------------------------------------------------------: | :-----: | :------: | :------------: | :---: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| [DLA34](./smoke_dla34_dlaneck_gn-all_4xb8-6x_kitti-mono3d.py) | 6x | 9.64 | | 13.85 | [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/smoke/smoke_dla34_pytorch_dlaneck_gn-all_8x4_6x_kitti-mono3d_20210929_015553-d46d9bb0.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/smoke/smoke_dla34_pytorch_dlaneck_gn-all_8x4_6x_kitti-mono3d_20210929_015553.log.json) |
Note: mAP represents the Car moderate 3D strict AP11 result.
Detailed performance on KITTI 3D detection (3D/BEV) is as follows, evaluated by the AP11 metric:
| | Easy | Moderate | Hard |
| ---------- | :-----------: | :-----------: | :-----------: |
| Car | 16.92 / 22.97 | 13.85 / 18.32 | 11.90 / 15.88 |
| Pedestrian | 11.13 / 12.61 | 11.10 / 11.32 | 10.67 / 11.14 |
| Cyclist | 0.99 / 1.47 | 0.54 / 0.65 | 0.55 / 0.67 |
## Citation
```latex
@inproceedings{liu2020smoke,
title={Smoke: Single-stage monocular 3d object detection via keypoint estimation},
author={Liu, Zechen and Wu, Zizhang and T{\'o}th, Roland},
booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops},
pages={996--997},
year={2020}
}
```