"segmentation/configs/_base_/datasets/cityscapes_extra.py" did not exist on "c4552f794aab15e56a00ccb06747e3fa6b8bec38"
Commit 7aa442d5 authored by raojy's avatar raojy
Browse files

raw_mmdetection

parent 9c03eaa8
_base_ = [
'../_base_/datasets/scannet-3d.py', '../_base_/models/groupfree3d.py',
'../_base_/schedules/schedule-3x.py', '../_base_/default_runtime.py'
]
# model settings
model = dict(
backbone=dict(
type='PointNet2SASSG',
in_channels=3,
num_points=(2048, 1024, 512, 256),
radius=(0.2, 0.4, 0.8, 1.2),
num_samples=(64, 32, 16, 16),
sa_channels=((128, 128, 256), (256, 256, 512), (256, 256, 512),
(256, 256, 512)),
fp_channels=((512, 512), (512, 288)),
norm_cfg=dict(type='BN2d'),
sa_cfg=dict(
type='PointSAModule',
pool_mod='max',
use_xyz=True,
normalize_xyz=True)),
bbox_head=dict(
num_classes=18,
num_decoder_layers=12,
num_proposal=512,
size_cls_agnostic=False,
bbox_coder=dict(
type='GroupFree3DBBoxCoder',
num_sizes=18,
num_dir_bins=1,
with_rot=False,
size_cls_agnostic=False,
mean_sizes=[[0.76966727, 0.8116021, 0.92573744],
[1.876858, 1.8425595, 1.1931566],
[0.61328, 0.6148609, 0.7182701],
[1.3955007, 1.5121545, 0.83443564],
[0.97949594, 1.0675149, 0.6329687],
[0.531663, 0.5955577, 1.7500148],
[0.9624706, 0.72462326, 1.1481868],
[0.83221924, 1.0490936, 1.6875663],
[0.21132214, 0.4206159, 0.5372846],
[1.4440073, 1.8970833, 0.26985747],
[1.0294262, 1.4040797, 0.87554324],
[1.3766412, 0.65521795, 1.6813129],
[0.6650819, 0.71111923, 1.298853],
[0.41999173, 0.37906948, 1.7513971],
[0.59359556, 0.5912492, 0.73919016],
[0.50867593, 0.50656086, 0.30136237],
[1.1511526, 1.0546296, 0.49706793],
[0.47535285, 0.49249494, 0.5802117]]),
sampling_objectness_loss=dict(
type='mmdet.FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=8.0),
objectness_loss=dict(
type='mmdet.FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
center_loss=dict(
type='mmdet.SmoothL1Loss',
beta=0.04,
reduction='sum',
loss_weight=10.0),
dir_class_loss=dict(
type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0),
dir_res_loss=dict(
type='mmdet.SmoothL1Loss', reduction='sum', loss_weight=10.0),
size_class_loss=dict(
type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0),
size_res_loss=dict(
type='mmdet.SmoothL1Loss',
beta=1.0 / 9.0,
reduction='sum',
loss_weight=10.0 / 9.0),
semantic_loss=dict(
type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
test_cfg=dict(
sample_mode='kps',
nms_thr=0.25,
score_thr=0.0,
per_class_proposal=True,
prediction_stages='last_three'))
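# Note: with size_cls_agnostic=False, `GroupFree3DBBoxCoder` decodes a box
# size as mean_sizes[size_class] + size_res, i.e. a per-class residual on top
# of the class mean extents listed above (a sketch of the decoding rule).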
# dataset settings
dataset_type = 'ScanNetDataset'
data_root = './data/scannet/'
class_names = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
'bookshelf', 'picture', 'counter', 'desk', 'curtain',
'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub',
'garbagebin')
metainfo = dict(classes=class_names)
backend_args = None
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='DEPTH',
load_dim=6,
use_dim=[0, 1, 2],
backend_args=backend_args),
dict(
type='LoadAnnotations3D',
with_bbox_3d=True,
with_label_3d=True,
with_mask_3d=True,
with_seg_3d=True,
backend_args=backend_args),
dict(type='GlobalAlignment', rotation_axis=2),
dict(type='PointSegClassMapping'),
dict(type='PointSample', num_points=50000),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.087266, 0.087266],
scale_ratio_range=[1.0, 1.0]),
dict(
type='Pack3DDetInputs',
keys=[
'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask',
'pts_instance_mask'
])
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='DEPTH',
load_dim=6,
use_dim=[0, 1, 2],
backend_args=backend_args),
dict(type='GlobalAlignment', rotation_axis=2),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
dict(type='PointSample', num_points=50000),
]),
dict(type='Pack3DDetInputs', keys=['points'])
]
train_dataloader = dict(
batch_size=8,
num_workers=4,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='RepeatDataset',
times=5,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='scannet_infos_train.pkl',
pipeline=train_pipeline,
filter_empty_gt=False,
metainfo=metainfo,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='Depth',
backend_args=backend_args)))
val_dataloader = dict(
batch_size=1,
num_workers=1,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='scannet_infos_val.pkl',
pipeline=test_pipeline,
metainfo=metainfo,
test_mode=True,
box_type_3d='Depth',
backend_args=backend_args))
test_dataloader = dict(
batch_size=1,
num_workers=1,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='scannet_infos_val.pkl',
pipeline=test_pipeline,
metainfo=metainfo,
test_mode=True,
box_type_3d='Depth',
backend_args=backend_args))
val_evaluator = dict(type='IndoorMetric')
test_evaluator = val_evaluator
# optimizer
lr = 0.006
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='AdamW', lr=lr, weight_decay=0.0005),
clip_grad=dict(max_norm=0.1, norm_type=2),
paramwise_cfg=dict(
custom_keys={
'bbox_head.decoder_layers': dict(lr_mult=0.1, decay_mult=1.0),
'bbox_head.decoder_self_posembeds': dict(
lr_mult=0.1, decay_mult=1.0),
'bbox_head.decoder_cross_posembeds': dict(
lr_mult=0.1, decay_mult=1.0),
'bbox_head.decoder_query_proj': dict(lr_mult=0.1, decay_mult=1.0),
'bbox_head.decoder_key_proj': dict(lr_mult=0.1, decay_mult=1.0)
}))
# learning rate
param_scheduler = [
dict(
type='MultiStepLR',
begin=0,
end=80,
by_epoch=True,
milestones=[56, 68],
gamma=0.1)
]
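# With lr = 0.006 this steps the base LR to 6e-4 at epoch 56 and 6e-5 at
# epoch 68 (the decoder parameters additionally use lr_mult=0.1).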
# training schedule
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=80, val_interval=1)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=10))
Collections:
- Name: Group-Free-3D
Metadata:
Training Techniques:
- AdamW
Training Resources: 4x V100 GPUs
Architecture:
- PointNet++
Paper:
URL: https://arxiv.org/abs/2104.00678
Title: 'Group-Free 3D Object Detection via Transformers'
README: configs/groupfree3d/README.md
Code:
URL: https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/models/detectors/groupfree3dnet.py#L10
Version: v0.15.0
Models:
- Name: groupfree3d_head-L6-O256_4xb8_scannet-seg.py
In Collection: Group-Free-3D
Config: configs/groupfree3d/groupfree3d_head-L6-O256_4xb8_scannet-seg.py
Metadata:
Training Data: ScanNet
Training Memory (GB): 6.7
Results:
- Task: 3D Object Detection
Dataset: ScanNet
Metrics:
AP@0.25: 66.17
AP@0.5: 48.47
Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/groupfree3d/groupfree3d_8x4_scannet-3d-18class-L6-O256/groupfree3d_8x4_scannet-3d-18class-L6-O256_20210702_145347-3499eb55.pth
- Name: groupfree3d_head-L12-O256_4xb8_scannet-seg.py
In Collection: Group-Free-3D
Config: configs/groupfree3d/groupfree3d_head-L12-O256_4xb8_scannet-seg.py
Metadata:
Training Data: ScanNet
Training Memory (GB): 9.4
Results:
- Task: 3D Object Detection
Dataset: ScanNet
Metrics:
AP@0.25: 66.57
AP@0.5: 48.21
Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/groupfree3d/groupfree3d_8x4_scannet-3d-18class-L12-O256/groupfree3d_8x4_scannet-3d-18class-L12-O256_20210702_150907-1c5551ad.pth
- Name: groupfree3d_w2x-head-L12-O256_4xb8_scannet-seg.py
In Collection: Group-Free-3D
Config: configs/groupfree3d/groupfree3d_w2x-head-L12-O256_4xb8_scannet-seg.py
Metadata:
Training Data: ScanNet
Training Memory (GB): 13.3
Results:
- Task: 3D Object Detection
Dataset: ScanNet
Metrics:
AP@0.25: 68.20
AP@0.5: 51.02
Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/groupfree3d/groupfree3d_8x4_scannet-3d-18class-w2x-L12-O256/groupfree3d_8x4_scannet-3d-18class-w2x-L12-O256_20210702_200301-944f0ac0.pth
- Name: groupfree3d_w2x-head-L12-O512_4xb8_scannet-seg.py
In Collection: Group-Free-3D
Config: configs/groupfree3d/groupfree3d_w2x-head-L12-O512_4xb8_scannet-seg.py
Metadata:
Training Data: ScanNet
Training Memory (GB): 18.8
Results:
- Task: 3D Object Detection
Dataset: ScanNet
Metrics:
AP@0.25: 68.22
AP@0.5: 52.61
Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/groupfree3d/groupfree3d_8x4_scannet-3d-18class-w2x-L12-O512/groupfree3d_8x4_scannet-3d-18class-w2x-L12-O512_20210702_220204-187b71c7.pth
# H3DNet: 3D Object Detection Using Hybrid Geometric Primitives
> [H3DNet: 3D Object Detection Using Hybrid Geometric Primitives](https://arxiv.org/abs/2006.05682)
<!-- [ALGORITHM] -->
## Abstract
We introduce H3DNet, which takes a colorless 3D point cloud as input and outputs a collection of oriented object bounding boxes (or BB) and their semantic labels. The critical idea of H3DNet is to predict a hybrid set of geometric primitives, i.e., BB centers, BB face centers, and BB edge centers. We show how to convert the predicted geometric primitives into object proposals by defining a distance function between an object and the geometric primitives. This distance function enables continuous optimization of object proposals, and its local minimums provide high-fidelity object proposals. H3DNet then utilizes a matching and refinement module to classify object proposals into detected objects and fine-tune the geometric parameters of the detected objects. The hybrid set of geometric primitives not only provides more accurate signals for object detection than using a single type of geometric primitives, but it also provides an overcomplete set of constraints on the resulting 3D layout. Therefore, H3DNet can tolerate outliers in predicted geometric primitives. Our model achieves state-of-the-art 3D detection results on two large datasets with real 3D scans, ScanNet and SUN RGB-D.
<div align=center>
<img src="https://user-images.githubusercontent.com/36950400/143868884-26f7fc63-93fd-48cb-a469-e2f55fda5550.png" width="800"/>
</div>
## Introduction
We implement H3DNet and provide the results and checkpoints on the ScanNet dataset.
## Results and models
### ScanNet
| Backbone | Lr schd | Mem (GB) | Inf time (fps) | AP@0.25 | AP@0.5 | Download |
| :-------------------------------------------: | :-----: | :------: | :------------: | :-----: | :----: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| [MultiBackbone](./h3dnet_8xb3_scannet-seg.py) | 3x | 7.9 | | 66.07 | 47.68 | [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/h3dnet/h3dnet_3x8_scannet-3d-18class/h3dnet_3x8_scannet-3d-18class_20210824_003149-414bd304.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/h3dnet/h3dnet_scannet-3d-18class/h3dnet_3x8_scannet-3d-18class_20210824_003149.log.json) |
**Notice**: If your current mmdetection3d version >= 0.6.0, and you are using the checkpoints downloaded from the above links or using checkpoints trained with mmdetection3d version \< 0.6.0, the checkpoints have to be first converted via [tools/model_converters/convert_h3dnet_checkpoints.py](../../tools/model_converters/convert_h3dnet_checkpoints.py):
```
python ./tools/model_converters/convert_h3dnet_checkpoints.py ${ORIGINAL_CHECKPOINT_PATH} --out=${NEW_CHECKPOINT_PATH}
```
Then you can use the converted checkpoints following [get_started.md](../../docs/en/get_started.md).
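For example, a hypothetical end-to-end run, assuming the checkpoint from the table above was downloaded to `checkpoints/` (paths are illustrative):
```
# convert the pre-0.6.0 checkpoint in place of ${ORIGINAL_CHECKPOINT_PATH}
python ./tools/model_converters/convert_h3dnet_checkpoints.py \
    checkpoints/h3dnet_3x8_scannet-3d-18class_20210824_003149-414bd304.pth \
    --out=checkpoints/h3dnet_converted.pth
# evaluate with the converted checkpoint
python ./tools/test.py configs/h3dnet/h3dnet_8xb3_scannet-seg.py \
    checkpoints/h3dnet_converted.pth
```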
## Citation
```latex
@inproceedings{zhang2020h3dnet,
author = {Zhang, Zaiwei and Sun, Bo and Yang, Haitao and Huang, Qixing},
title = {H3DNet: 3D Object Detection Using Hybrid Geometric Primitives},
booktitle = {Proceedings of the European Conference on Computer Vision},
year = {2020}
}
```
_base_ = [
'../_base_/datasets/scannet-3d.py', '../_base_/models/h3dnet.py',
'../_base_/schedules/schedule-3x.py', '../_base_/default_runtime.py'
]
# model settings
model = dict(
rpn_head=dict(
num_classes=18,
bbox_coder=dict(
type='PartialBinBasedBBoxCoder',
num_sizes=18,
num_dir_bins=24,
with_rot=False,
mean_sizes=[[0.76966727, 0.8116021, 0.92573744],
[1.876858, 1.8425595, 1.1931566],
[0.61328, 0.6148609, 0.7182701],
[1.3955007, 1.5121545, 0.83443564],
[0.97949594, 1.0675149, 0.6329687],
[0.531663, 0.5955577, 1.7500148],
[0.9624706, 0.72462326, 1.1481868],
[0.83221924, 1.0490936, 1.6875663],
[0.21132214, 0.4206159, 0.5372846],
[1.4440073, 1.8970833, 0.26985747],
[1.0294262, 1.4040797, 0.87554324],
[1.3766412, 0.65521795, 1.6813129],
[0.6650819, 0.71111923, 1.298853],
[0.41999173, 0.37906948, 1.7513971],
[0.59359556, 0.5912492, 0.73919016],
[0.50867593, 0.50656086, 0.30136237],
[1.1511526, 1.0546296, 0.49706793],
[0.47535285, 0.49249494, 0.5802117]])),
roi_head=dict(
bbox_head=dict(
num_classes=18,
bbox_coder=dict(
type='PartialBinBasedBBoxCoder',
num_sizes=18,
num_dir_bins=24,
with_rot=False,
mean_sizes=[[0.76966727, 0.8116021, 0.92573744],
[1.876858, 1.8425595, 1.1931566],
[0.61328, 0.6148609, 0.7182701],
[1.3955007, 1.5121545, 0.83443564],
[0.97949594, 1.0675149, 0.6329687],
[0.531663, 0.5955577, 1.7500148],
[0.9624706, 0.72462326, 1.1481868],
[0.83221924, 1.0490936, 1.6875663],
[0.21132214, 0.4206159, 0.5372846],
[1.4440073, 1.8970833, 0.26985747],
[1.0294262, 1.4040797, 0.87554324],
[1.3766412, 0.65521795, 1.6813129],
[0.6650819, 0.71111923, 1.298853],
[0.41999173, 0.37906948, 1.7513971],
[0.59359556, 0.5912492, 0.73919016],
[0.50867593, 0.50656086, 0.30136237],
[1.1511526, 1.0546296, 0.49706793],
[0.47535285, 0.49249494, 0.5802117]]))))
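# Note: both stages share the same 18-class ScanNet mean sizes; the RPN
# proposes boxes from predicted geometric primitives and the RoI head matches
# and refines them into the final detections.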
train_dataloader = dict(
batch_size=3,
num_workers=2,
)
# yapf:disable
default_hooks = dict(
logger=dict(type='LoggerHook', interval=30)
)
# yapf:enable
# Default setting for scaling LR automatically.
# - `enable`: whether to enable automatic LR scaling by default.
# - `base_batch_size` = (8 GPUs) x (3 samples per GPU).
auto_scale_lr = dict(enable=False, base_batch_size=24)
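# When enabled, mmengine scales the LR linearly by
# (actual total batch size) / base_batch_size, e.g. training on
# 4 GPUs x 3 samples per GPU would use lr * 12 / 24.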
Collections:
- Name: H3DNet
Metadata:
Training Data: ScanNet
Training Techniques:
- AdamW
Training Resources: 8x GeForce GTX 1080 Ti
Architecture:
Paper:
URL: https://arxiv.org/abs/2006.05682
Title: 'H3DNet: 3D Object Detection Using Hybrid Geometric Primitives'
README: configs/h3dnet/README.md
Code:
URL: https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/models/detectors/h3dnet.py#L10
Version: v0.6.0
Models:
- Name: h3dnet_3x8_scannet-3d-18class
In Collection: H3DNet
Config: configs/h3dnet/h3dnet_8xb3_scannet-seg.py
Metadata:
Training Memory (GB): 7.9
Results:
- Task: 3D Object Detection
Dataset: ScanNet
Metrics:
AP@0.25: 66.07
AP@0.5: 47.68
Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/h3dnet/h3dnet_3x8_scannet-3d-18class/h3dnet_3x8_scannet-3d-18class_20210824_003149-414bd304.pth
# ImVoteNet: Boosting 3D Object Detection in Point Clouds with Image Votes
> [ImVoteNet: Boosting 3D Object Detection in Point Clouds with Image Votes](https://arxiv.org/abs/2001.10692)
<!-- [ALGORITHM] -->
## Abstract
3D object detection has seen quick progress thanks to advances in deep learning on point clouds. A few recent works have even shown state-of-the-art performance with just point clouds input (e.g. VOTENET). However, point cloud data have inherent limitations. They are sparse, lack color information and often suffer from sensor noise. Images, on the other hand, have high resolution and rich texture. Thus they can complement the 3D geometry provided by point clouds. Yet how to effectively use image information to assist point cloud based detection is still an open question. In this work, we build on top of VOTENET and propose a 3D detection architecture called IMVOTENET specialized for RGB-D scenes. IMVOTENET is based on fusing 2D votes in images and 3D votes in point clouds. Compared to prior work on multi-modal detection, we explicitly extract both geometric and semantic features from the 2D images. We leverage camera parameters to lift these features to 3D. To improve the synergy of 2D-3D feature fusion, we also propose a multi-tower training scheme. We validate our model on the challenging SUN RGB-D dataset, advancing state-of-the-art results by 5.7 mAP. We also provide rich ablation studies to analyze the contribution of each design choice.
<div align=center>
<img src="https://user-images.githubusercontent.com/36950400/143869878-a2ae7f43-55c3-4b95-af09-8f97dfd975f4.png" width="800"/>
</div>
## Introduction
We implement ImVoteNet and provide the results and checkpoints on SUN RGB-D.
## Results and models
### SUNRGBD-2D (Stage 1, image branch pre-train)
| Backbone | Lr schd | Mem (GB) | Inf time (fps) | AP@0.25 | AP@0.5 | Download |
| :--------------------------------------------------------------: | :-----: | :------: | :------------: | :-----: | :----: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| [PointNet++](./imvotenet_faster-rcnn-r50_fpn_4xb2_sunrgbd-3d.py) | | 2.1 | | | 62.70 | [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvotenet/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class_20210819_225618-62eba6ce.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvotenet/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class_20210819_225618.json) |
### SUNRGBD-3D (Stage 2)
| Backbone | Lr schd | Mem (GB) | Inf time (fps) | AP@0.25 | AP@0.5 | Download |
| :--------------------------------------------------: | :-----: | :------: | :------------: | :-----: | :----: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| [PointNet++](./imvotenet_stage2_8xb16_sunrgbd-3d.py) | 3x | 9.4 | | 64.48 | | [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvotenet/imvotenet_stage2_16x8_sunrgbd-3d-10class/imvotenet_stage2_16x8_sunrgbd-3d-10class_20210819_192851-1bcd1b97.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvotenet/imvotenet_stage2_16x8_sunrgbd-3d-10class/imvotenet_stage2_16x8_sunrgbd-3d-10class_20210819_192851.log.json) |
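Training follows the two stages above: pre-train the image branch first, then train the full model, whose config loads the stage-1 weights via `load_from` (you may also point it at your own pre-trained image branch). A sketch with the standard train script (single-GPU; paths are illustrative):
```
# stage 1: pre-train the 2D image branch
python ./tools/train.py configs/imvotenet/imvotenet_faster-rcnn-r50_fpn_4xb2_sunrgbd-3d.py
# stage 2: train the full ImVoteNet model
python ./tools/train.py configs/imvotenet/imvotenet_stage2_8xb16_sunrgbd-3d.py
```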
## Citation
```latex
@inproceedings{qi2020imvotenet,
title={Imvotenet: Boosting 3D object detection in point clouds with image votes},
author={Qi, Charles R and Chen, Xinlei and Litany, Or and Guibas, Leonidas J},
booktitle={Proceedings of the IEEE/CVF conference on computer vision and pattern recognition},
pages={4404--4413},
year={2020}
}
```
_base_ = [
'../_base_/datasets/sunrgbd-3d.py', '../_base_/default_runtime.py',
'../_base_/models/imvotenet.py'
]
backend_args = None
train_pipeline = [
dict(type='LoadImageFromFile', backend_args=backend_args),
dict(
type='LoadAnnotations3D',
with_bbox=True,
with_label=True,
with_bbox_3d=False,
with_label_3d=False),
dict(
type='RandomChoiceResize',
scales=[(1333, 480), (1333, 504), (1333, 528), (1333, 552),
(1333, 576), (1333, 600)],
keep_ratio=True),
dict(type='RandomFlip', prob=0.5),
dict(
type='Pack3DDetInputs', keys=['img', 'gt_bboxes', 'gt_bboxes_labels']),
]
test_pipeline = [
dict(type='LoadImageFromFile', backend_args=backend_args),
dict(type='Resize', scale=(1333, 600), keep_ratio=True),
dict(
type='Pack3DDetInputs',
keys=(['img']),
meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
'scale_factor'))
]
train_dataloader = dict(
batch_size=2,
num_workers=2,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='RepeatDataset', times=1, dataset=dict(pipeline=train_pipeline)))
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=8, val_interval=1)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
# learning rate
param_scheduler = [
dict(
type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
dict(
type='MultiStepLR',
begin=0,
end=8,
by_epoch=True,
milestones=[6],
gamma=0.1)
]
val_evaluator = dict(type='Indoor2DMetric')
test_evaluator = val_evaluator
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
load_from = 'http://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_bbox_mAP-0.408__segm_mAP-0.37_20200504_163245-42aa3d00.pth' # noqa
_base_ = [
'../_base_/datasets/sunrgbd-3d.py', '../_base_/schedules/schedule-3x.py',
'../_base_/default_runtime.py', '../_base_/models/imvotenet.py'
]
class_names = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser',
'night_stand', 'bookshelf', 'bathtub')
backend_args = None
model = dict(
pts_backbone=dict(
type='PointNet2SASSG',
in_channels=4,
num_points=(2048, 1024, 512, 256),
radius=(0.2, 0.4, 0.8, 1.2),
num_samples=(64, 32, 16, 16),
sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256),
(128, 128, 256)),
fp_channels=((256, 256), (256, 256)),
norm_cfg=dict(type='BN2d'),
sa_cfg=dict(
type='PointSAModule',
pool_mod='max',
use_xyz=True,
normalize_xyz=True)),
pts_bbox_heads=dict(
common=dict(
type='VoteHead',
num_classes=10,
bbox_coder=dict(
type='PartialBinBasedBBoxCoder',
num_sizes=10,
num_dir_bins=12,
with_rot=True,
mean_sizes=[[2.114256, 1.620300, 0.927272],
[0.791118, 1.279516, 0.718182],
[0.923508, 1.867419, 0.845495],
[0.591958, 0.552978, 0.827272],
[0.699104, 0.454178, 0.75625],
[0.69519, 1.346299, 0.736364],
[0.528526, 1.002642, 1.172878],
[0.500618, 0.632163, 0.683424],
[0.404671, 1.071108, 1.688889],
[0.76584, 1.398258, 0.472728]]),
pred_layer_cfg=dict(
in_channels=128, shared_conv_channels=(128, 128), bias=True),
objectness_loss=dict(
type='mmdet.CrossEntropyLoss',
class_weight=[0.2, 0.8],
reduction='sum',
loss_weight=5.0),
center_loss=dict(
type='ChamferDistance',
mode='l2',
reduction='sum',
loss_src_weight=10.0,
loss_dst_weight=10.0),
dir_class_loss=dict(
type='mmdet.CrossEntropyLoss',
reduction='sum',
loss_weight=1.0),
dir_res_loss=dict(
type='mmdet.SmoothL1Loss', reduction='sum', loss_weight=10.0),
size_class_loss=dict(
type='mmdet.CrossEntropyLoss',
reduction='sum',
loss_weight=1.0),
size_res_loss=dict(
type='mmdet.SmoothL1Loss',
reduction='sum',
loss_weight=10.0 / 3.0),
semantic_loss=dict(
type='mmdet.CrossEntropyLoss',
reduction='sum',
loss_weight=1.0)),
joint=dict(
vote_module_cfg=dict(
in_channels=512,
vote_per_seed=1,
gt_per_seed=3,
conv_channels=(512, 256),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
norm_feats=True,
vote_loss=dict(
type='ChamferDistance',
mode='l1',
reduction='none',
loss_dst_weight=10.0)),
vote_aggregation_cfg=dict(
type='PointSAModule',
num_point=256,
radius=0.3,
num_sample=16,
mlp_channels=[512, 128, 128, 128],
use_xyz=True,
normalize_xyz=True)),
pts=dict(
vote_module_cfg=dict(
in_channels=256,
vote_per_seed=1,
gt_per_seed=3,
conv_channels=(256, 256),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
norm_feats=True,
vote_loss=dict(
type='ChamferDistance',
mode='l1',
reduction='none',
loss_dst_weight=10.0)),
vote_aggregation_cfg=dict(
type='PointSAModule',
num_point=256,
radius=0.3,
num_sample=16,
mlp_channels=[256, 128, 128, 128],
use_xyz=True,
normalize_xyz=True)),
img=dict(
vote_module_cfg=dict(
in_channels=256,
vote_per_seed=1,
gt_per_seed=3,
conv_channels=(256, 256),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
norm_feats=True,
vote_loss=dict(
type='ChamferDistance',
mode='l1',
reduction='none',
loss_dst_weight=10.0)),
vote_aggregation_cfg=dict(
type='PointSAModule',
num_point=256,
radius=0.3,
num_sample=16,
mlp_channels=[256, 128, 128, 128],
use_xyz=True,
normalize_xyz=True)),
loss_weights=[0.4, 0.3, 0.3]),
img_mlp=dict(
in_channel=18,
conv_channels=(256, 256),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
act_cfg=dict(type='ReLU')),
fusion_layer=dict(
type='VoteFusion',
num_classes=len(class_names),
max_imvote_per_pixel=3),
num_sampled_seed=1024,
freeze_img_branch=True,
# model training and testing settings
train_cfg=dict(
pts=dict(
pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mode='vote')),
test_cfg=dict(
img_rcnn=dict(score_thr=0.1),
pts=dict(
sample_mode='seed',
nms_thr=0.25,
score_thr=0.05,
per_class_proposal=True)))
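# Note: the three vote towers above (joint fused features, point-only and
# image-only) realize the paper's multi-tower training scheme, weighted by
# loss_weights=[0.4, 0.3, 0.3]; the joint tower produces the final detections.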
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='DEPTH',
shift_height=True,
load_dim=6,
use_dim=[0, 1, 2],
backend_args=backend_args),
dict(type='LoadImageFromFile', backend_args=backend_args),
dict(
type='LoadAnnotations3D',
with_bbox=True,
with_label=True,
with_bbox_3d=True,
with_label_3d=True),
dict(type='Resize', scale=(1333, 600), keep_ratio=True),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.523599, 0.523599],
scale_ratio_range=[0.85, 1.15],
shift_height=True),
dict(type='PointSample', num_points=20000),
dict(
type='Pack3DDetInputs',
keys=([
'img', 'gt_bboxes', 'gt_bboxes_labels', 'points', 'gt_bboxes_3d',
'gt_labels_3d'
]))
]
test_pipeline = [
dict(type='LoadImageFromFile', backend_args=backend_args),
dict(
type='LoadPointsFromFile',
coord_type='DEPTH',
shift_height=True,
load_dim=6,
use_dim=[0, 1, 2],
backend_args=backend_args),
dict(type='Resize', scale=(1333, 600), keep_ratio=True),
dict(type='PointSample', num_points=20000),
dict(type='Pack3DDetInputs', keys=['img', 'points'])
]
train_dataloader = dict(dataset=dict(dataset=dict(pipeline=train_pipeline)))
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
# may also use your own pre-trained image branch
load_from = 'https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvotenet/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class_20210819_225618-62eba6ce.pth' # noqa
# Default setting for scaling LR automatically.
# - `enable`: whether to enable automatic LR scaling by default.
# - `base_batch_size` = (8 GPUs) x (16 samples per GPU).
auto_scale_lr = dict(enable=False, base_batch_size=128)
randomness = dict(seed=8)
Collections:
- Name: ImVoteNet
Metadata:
Training Data: SUNRGBD
Training Techniques:
- AdamW
Training Resources: 8x TITAN Xp
Architecture:
- Faster R-CNN
- VoteNet
- Feature Pyramid Network
Paper:
URL: https://arxiv.org/abs/2001.10692
Title: 'ImVoteNet: Boosting 3D Object Detection in Point Clouds with Image Votes'
README: configs/imvotenet/README.md
Code:
URL: https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/models/detectors/imvotenet.py#L56
Version: v0.12.0
Models:
- Name: imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class
In Collection: ImVoteNet
Config: configs/imvotenet/imvotenet_faster-rcnn-r50_fpn_4xb2_sunrgbd-3d.py
Metadata:
Training Memory (GB): 2.1
Results:
- Task: Object Detection
Dataset: SUNRGBD-2D
Metrics:
AP@0.5: 62.70
Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvotenet/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class_20210819_225618-62eba6ce.pth
- Name: imvotenet_stage2_16x8_sunrgbd-3d-10class
In Collection: ImVoteNet
Config: configs/imvotenet/imvotenet_stage2_8xb16_sunrgbd-3d.py
Metadata:
Training Memory (GB): 9.4
Results:
- Task: 3D Object Detection
Dataset: SUNRGBD-3D
Metrics:
AP@0.25: 64.48
Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvotenet/imvotenet_stage2_16x8_sunrgbd-3d-10class/imvotenet_stage2_16x8_sunrgbd-3d-10class_20210819_192851-1bcd1b97.pth
# ImVoxelNet: Image to Voxels Projection for Monocular and Multi-View General-Purpose 3D Object Detection
> [ImVoxelNet: Image to Voxels Projection for Monocular and Multi-View General-Purpose 3D Object Detection](https://arxiv.org/abs/2106.01178)
<!-- [ALGORITHM] -->
## Abstract
In this paper, we introduce the task of multi-view RGB-based 3D object detection as an end-to-end optimization problem. To address this problem, we propose ImVoxelNet, a novel fully convolutional method of 3D object detection based on posed monocular or multi-view RGB images. The number of monocular images in each multi-view input can vary during training and inference; actually, this number might be unique for each multi-view input. ImVoxelNet successfully handles both indoor and outdoor scenes, which makes it general-purpose. Specifically, it achieves state-of-the-art results in car detection on KITTI (monocular) and nuScenes (multi-view) benchmarks among all methods that accept RGB images. Moreover, it surpasses existing RGB-based 3D object detection methods on the SUN RGB-D dataset. On ScanNet, ImVoxelNet sets a new benchmark for multi-view 3D object detection.
<div align=center>
<img src="https://user-images.githubusercontent.com/36950400/143871445-38a55168-b8cd-4520-8ed6-f5c8c8ea304a.png" width="800"/>
</div>
## Introduction
We implement a monocular 3D detector ImVoxelNet and provide its results and checkpoints on the KITTI dataset.
Results for SUN RGB-D, ScanNet and nuScenes are currently available in the ImVoxelNet authors'
[repo](https://github.com/saic-vul/imvoxelnet) (based on mmdetection3d).
## Results and models
### KITTI
| Backbone | Class | Lr schd | Mem (GB) | Inf time (fps) | mAP | Download |
| :--------------------------------------------: | :---: | :-----: | :------: | :------------: | :---: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| [ResNet-50](./imvoxelnet_8xb4_kitti-3d-car.py) | Car | 3x | | | 17.26 | [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvoxelnet/imvoxelnet_4x8_kitti-3d-car/imvoxelnet_4x8_kitti-3d-car_20210830_003014-3d0ffdf4.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvoxelnet/imvoxelnet_4x8_kitti-3d-car/imvoxelnet_4x8_kitti-3d-car_20210830_003014.log.json) |
### SUN RGB-D
| Backbone | Lr schd | Mem (GB) | Inf time (fps) | mAP@0.25 | mAP@0.5 | Download |
| :-------------------------------------------------: | :-----: | :------: | :------------: | :------: | :-----: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| [ResNet-50](./imvoxelnet_4x2_sunrgbd-3d-10class.py) | 2x | 7.2 | 22.5 | 40.96 | 13.50 | [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvoxelnet/imvoxelnet_4x2_sunrgbd-3d-10class/imvoxelnet_4x2_sunrgbd-3d-10class_20220809_184416-29ca7d2e.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvoxelnet/imvoxelnet_4x2_sunrgbd-3d-10class/imvoxelnet_4x2_sunrgbd-3d-10class_20220809_184416.log.json) |
## Citation
```latex
@article{rukhovich2021imvoxelnet,
title={ImVoxelNet: Image to Voxels Projection for Monocular and Multi-View General-Purpose 3D Object Detection},
author={Danila Rukhovich and Anna Vorontsova and Anton Konushin},
journal={arXiv preprint arXiv:2106.01178},
year={2021}
}
```
_base_ = [
'../_base_/schedules/mmdet-schedule-1x.py', '../_base_/default_runtime.py'
]
prior_generator = dict(
type='AlignedAnchor3DRangeGenerator',
ranges=[[-3.2, -0.2, -2.28, 3.2, 6.2, 0.28]],
rotations=[.0])
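# Combined with n_voxels=[40, 40, 16] in the model below, this range implies
# cubic 0.16 m voxels: (3.2 + 3.2) / 40 = (6.2 + 0.2) / 40 = (0.28 + 2.28) / 16.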
model = dict(
type='ImVoxelNet',
data_preprocessor=dict(
type='Det3DDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True,
pad_size_divisor=32),
backbone=dict(
type='mmdet.ResNet',
depth=50,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type='BN', requires_grad=False),
norm_eval=True,
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
style='pytorch'),
neck=dict(
type='mmdet.FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=4),
neck_3d=dict(
type='IndoorImVoxelNeck',
in_channels=256,
out_channels=128,
n_blocks=[1, 1, 1]),
bbox_head=dict(
type='ImVoxelHead',
n_classes=10,
n_levels=3,
n_channels=128,
n_reg_outs=7,
pts_assign_threshold=27,
pts_center_threshold=18,
prior_generator=prior_generator),
prior_generator=prior_generator,
n_voxels=[40, 40, 16],
coord_type='DEPTH',
train_cfg=dict(),
test_cfg=dict(nms_pre=1000, iou_thr=.25, score_thr=.01))
dataset_type = 'SUNRGBDDataset'
data_root = 'data/sunrgbd/'
class_names = [
'bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser',
'night_stand', 'bookshelf', 'bathtub'
]
metainfo = dict(classes=class_names)
backend_args = None
train_pipeline = [
dict(type='LoadAnnotations3D', backend_args=backend_args),
dict(type='LoadImageFromFile', backend_args=backend_args),
dict(type='RandomResize', scale=[(512, 384), (768, 576)], keep_ratio=True),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(type='Pack3DDetInputs', keys=['img', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(type='LoadImageFromFile', backend_args=backend_args),
dict(type='Resize', scale=(640, 480), keep_ratio=True),
dict(type='Pack3DDetInputs', keys=['img'])
]
train_dataloader = dict(
batch_size=4,
num_workers=4,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='RepeatDataset',
times=2,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='sunrgbd_infos_train.pkl',
pipeline=train_pipeline,
test_mode=False,
filter_empty_gt=True,
box_type_3d='Depth',
metainfo=metainfo,
backend_args=backend_args)))
val_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='sunrgbd_infos_val.pkl',
pipeline=test_pipeline,
test_mode=True,
box_type_3d='Depth',
metainfo=metainfo,
backend_args=backend_args))
test_dataloader = val_dataloader
val_evaluator = dict(
type='IndoorMetric',
ann_file=data_root + 'sunrgbd_infos_val.pkl',
metric='bbox')
test_evaluator = val_evaluator
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(
_delete_=True, type='AdamW', lr=0.0001, weight_decay=0.0001),
paramwise_cfg=dict(
custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)}),
clip_grad=dict(max_norm=35., norm_type=2))
param_scheduler = [
dict(
type='MultiStepLR',
begin=0,
end=12,
by_epoch=True,
milestones=[8, 11],
gamma=0.1)
]
# hooks
default_hooks = dict(checkpoint=dict(type='CheckpointHook', max_keep_ckpts=1))
# runtime
find_unused_parameters = True # only 1 of 4 FPN outputs is used
_base_ = [
'../_base_/schedules/mmdet-schedule-1x.py', '../_base_/default_runtime.py'
]
model = dict(
type='ImVoxelNet',
data_preprocessor=dict(
type='Det3DDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True,
pad_size_divisor=32),
backbone=dict(
type='mmdet.ResNet',
depth=50,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type='BN', requires_grad=False),
norm_eval=True,
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
style='pytorch'),
neck=dict(
type='mmdet.FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=64,
num_outs=4),
neck_3d=dict(type='OutdoorImVoxelNeck', in_channels=64, out_channels=256),
bbox_head=dict(
type='Anchor3DHead',
num_classes=1,
in_channels=256,
feat_channels=256,
use_direction_classifier=True,
anchor_generator=dict(
type='AlignedAnchor3DRangeGenerator',
ranges=[[-0.16, -39.68, -1.78, 68.96, 39.68, -1.78]],
sizes=[[3.9, 1.6, 1.56]],
rotations=[0, 1.57],
reshape_out=True),
diff_rad_by_sin=True,
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
loss_cls=dict(
type='mmdet.FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(
type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
loss_dir=dict(
type='mmdet.CrossEntropyLoss', use_sigmoid=False,
loss_weight=0.2)),
n_voxels=[216, 248, 12],
coord_type='LIDAR',
prior_generator=dict(
type='AlignedAnchor3DRangeGenerator',
ranges=[[-0.16, -39.68, -3.08, 68.96, 39.68, 0.76]],
rotations=[.0]),
train_cfg=dict(
assigner=dict(
type='Max3DIoUAssigner',
iou_calculator=dict(type='mmdet3d.BboxOverlapsNearest3D'),
pos_iou_thr=0.6,
neg_iou_thr=0.45,
min_pos_iou=0.45,
ignore_iof_thr=-1),
allowed_border=0,
pos_weight=-1,
debug=False),
test_cfg=dict(
use_rotate_nms=True,
nms_across_levels=False,
nms_thr=0.01,
score_thr=0.1,
min_bbox_size=0,
nms_pre=100,
max_num=50))
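# n_voxels=[216, 248, 12] over the prior ranges above implies cubic 0.32 m
# voxels: (68.96 + 0.16) / 216 = (39.68 + 39.68) / 248 = (0.76 + 3.08) / 12.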
dataset_type = 'KittiDataset'
data_root = 'data/kitti/'
class_names = ['Car']
input_modality = dict(use_lidar=False, use_camera=True)
point_cloud_range = [0, -39.68, -3, 69.12, 39.68, 1]
metainfo = dict(classes=class_names)
backend_args = None
train_pipeline = [
dict(type='LoadAnnotations3D', backend_args=backend_args),
dict(type='LoadImageFromFileMono3D', backend_args=backend_args),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(
type='RandomResize', scale=[(1173, 352), (1387, 416)],
keep_ratio=True),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='Pack3DDetInputs', keys=['img', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(type='LoadImageFromFileMono3D', backend_args=backend_args),
dict(type='Resize', scale=(1280, 384), keep_ratio=True),
dict(type='Pack3DDetInputs', keys=['img'])
]
train_dataloader = dict(
batch_size=4,
num_workers=4,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='RepeatDataset',
times=3,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='kitti_infos_train.pkl',
data_prefix=dict(img='training/image_2'),
pipeline=train_pipeline,
modality=input_modality,
test_mode=False,
metainfo=metainfo,
box_type_3d='LiDAR',
backend_args=backend_args)))
val_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='kitti_infos_val.pkl',
data_prefix=dict(img='training/image_2'),
pipeline=test_pipeline,
modality=input_modality,
test_mode=True,
metainfo=metainfo,
box_type_3d='LiDAR',
backend_args=backend_args))
test_dataloader = val_dataloader
val_evaluator = dict(
type='KittiMetric',
ann_file=data_root + 'kitti_infos_val.pkl',
metric='bbox',
backend_args=backend_args)
test_evaluator = val_evaluator
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(
_delete_=True, type='AdamW', lr=0.0001, weight_decay=0.0001),
paramwise_cfg=dict(
custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)}),
clip_grad=dict(max_norm=35., norm_type=2))
param_scheduler = [
dict(
type='MultiStepLR',
begin=0,
end=12,
by_epoch=True,
milestones=[8, 11],
gamma=0.1)
]
# hooks
default_hooks = dict(checkpoint=dict(type='CheckpointHook', max_keep_ckpts=1))
# runtime
find_unused_parameters = True # only 1 of 4 FPN outputs is used
vis_backends = [dict(type='LocalVisBackend')]
visualizer = dict(
type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
Collections:
- Name: ImVoxelNet
Metadata:
Training Data: KITTI
Training Techniques:
- AdamW
Training Resources: 8x Tesla P40
Architecture:
- Anchor3DHead
Paper:
URL: https://arxiv.org/abs/2106.01178
Title: 'ImVoxelNet: Image to Voxels Projection for Monocular and Multi-View General-Purpose 3D Object Detection'
README: configs/imvoxelnet/README.md
Code:
URL: https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/models/detectors/imvoxelnet.py#L11
Version: v0.15.0
Models:
- Name: imvoxelnet_kitti-3d-car
In Collection: ImVoxelNet
Config: configs/imvoxelnet/imvoxelnet_8xb4_kitti-3d-car.py
Metadata:
Training Memory (GB): 15.0
Results:
- Task: 3D Object Detection
Dataset: KITTI
Metrics:
mAP: 17.26
Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvoxelnet/imvoxelnet_4x8_kitti-3d-car/imvoxelnet_4x8_kitti-3d-car_20210830_003014-3d0ffdf4.pth
# 4D Spatio-Temporal ConvNets: Minkowski Convolutional Neural Networks
> [4D Spatio-Temporal ConvNets: Minkowski Convolutional Neural Networks](https://arxiv.org/abs/1904.08755)
<!-- [ALGORITHM] -->
## Abstract
In many robotics and VR/AR applications, 3D-videos are readily-available sources of input (a continuous sequence of depth images, or LIDAR scans). However, those 3D-videos are processed frame-by-frame either through 2D convnets or 3D perception algorithms. In this work, we propose 4-dimensional convolutional neural networks for spatio-temporal perception that can directly process such 3D-videos using high-dimensional convolutions. For this, we adopt sparse tensors and propose the generalized sparse convolution that encompasses all discrete convolutions. To implement the generalized sparse convolution, we create an open-source auto-differentiation library for sparse tensors that provides extensive functions for high-dimensional convolutional neural networks. We create 4D spatio-temporal convolutional neural networks using the library and validate them on various 3D semantic segmentation benchmarks and proposed 4D datasets for 3D-video perception. To overcome challenges in the 4D space, we propose the hybrid kernel, a special case of the generalized sparse convolution, and the trilateral-stationary conditional random field that enforces spatio-temporal consistency in the 7D space-time-chroma space. Experimentally, we show that convolutional neural networks with only generalized 3D sparse convolutions can outperform 2D or 2D-3D hybrid methods by a large margin. Also, we show that on 3D-videos, 4D spatio-temporal convolutional neural networks are robust to noise, outperform 3D convolutional neural networks and are faster than the 3D counterpart in some cases.
<div align=center>
<img src="https://user-images.githubusercontent.com/72679458/225243534-cd0ed738-4224-4e7c-bcac-4f4c8d89f3a9.png" width="800"/>
</div>
## Introduction
We implement MinkUNet with [TorchSparse](https://github.com/mit-han-lab/torchsparse) / [Minkowski Engine](https://github.com/NVIDIA/MinkowskiEngine) / [Spconv](https://github.com/traveller59/spconv) backend and provide the results and checkpoints on the SemanticKITTI dataset.
## Results and models
### SemanticKITTI
| Method | Backend | Lr schd | Amp | Laser-Polar Mix | Mem (GB) | Training Time (hours) | FPS | mIoU | Download |
| :-------------------------------------------------------------------------------------------: | :--------------: | :-----: | :-: | :-------------: | :------: | :-------------------: | :----: | :--: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| [MinkUNet18-W16](./minkunet18_w16_torchsparse_8xb2-amp-15e_semantickitti.py) | torchsparse | 15e | ✔ | ✗ | 3.4 | - | - | 60.3 | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet_w16_8xb2-15e_semantickitti/minkunet_w16_8xb2-15e_semantickitti_20230309_160737-0d8ec25b.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet_w16_8xb2-15e_semantickitti/minkunet_w16_8xb2-15e_semantickitti_20230309_160737.log) |
| [MinkUNet18-W20](./minkunet18_w20_torchsparse_8xb2-amp-15e_semantickitti.py) | torchsparse | 15e | ✔ | ✗ | 3.7 | - | - | 61.6 | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet_w20_8xb2-15e_semantickitti/minkunet_w20_8xb2-15e_semantickitti_20230309_160718-c3b92e6e.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet_w20_8xb2-15e_semantickitti/minkunet_w20_8xb2-15e_semantickitti_20230309_160718.log) |
| [MinkUNet18-W32](./minkunet18_w32_torchsparse_8xb2-amp-15e_semantickitti.py) | torchsparse | 15e | ✔ | ✗ | 4.9 | - | - | 63.1 | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet_w32_8xb2-15e_semantickitti/minkunet_w32_8xb2-15e_semantickitti_20230309_160710-7fa0a6f1.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet_w32_8xb2-15e_semantickitti/minkunet_w32_8xb2-15e_semantickitti_20230309_160710.log) |
| [MinkUNet34-W32](./minkunet34_w32_minkowski_8xb2-laser-polar-mix-3x_semantickitti.py) | minkowski engine | 3x | ✗ | ✔ | 11.5 | 6.5 | 12.2 | 69.2 | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_minkowski_8xb2-laser-polar-mix-3x_semantickitti_20230514_202236-839847a8.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_minkowski_8xb2-laser-polar-mix-3x_semantickitti_20230514_202236.log) |
| [MinkUNet34-W32](./minkunet34_w32_spconv_8xb2-amp-laser-polar-mix-3x_semantickitti.py) | spconv | 3x | ✔ | ✔ | 6.7 | 2 | 14.6\* | 68.3 | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_spconv_8xb2-amp-laser-polar-mix-3x_semantickitti_20230512_233152-e0698a0f.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_spconv_8xb2-amp-laser-polar-mix-3x_semantickitti_20230512_233152.log) |
| [MinkUNet34-W32](./minkunet34_w32_spconv_8xb2-laser-polar-mix-3x_semantickitti.py) | spconv | 3x | ✗ | ✔ | 10.5 | 6 | 14.5 | 69.3 | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_spconv_8xb2-laser-polar-mix-3x_semantickitti_20230512_233817-72b200d8.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_spconv_8xb2-laser-polar-mix-3x_semantickitti_20230512_233817.log) |
| [MinkUNet34-W32](./minkunet34_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti.py) | torchsparse | 3x | ✔ | ✔ | 6.6 | 3 | 12.8 | 69.3 | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti_20230512_233511-bef6cad0.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti_20230512_233511.log) |
| [MinkUNet34-W32](./minkunet34_w32_torchsparse_8xb2-laser-polar-mix-3x_semantickitti.py) | torchsparse | 3x | ✗ | ✔ | 11.8 | 5.5 | 15.9 | 68.7 | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_torchsparse_8xb2-laser-polar-mix-3x_semantickitti_20230512_233601-2b61b0ab.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_torchsparse_8xb2-laser-polar-mix-3x_semantickitti_20230512_233601.log) |
| [MinkUNet34v2-W32](minkunet34v2_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti.py) | torchsparse | 3x | ✔ | ✔ | 8.9 | - | - | 70.3 | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34v2_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti_20230510_221853-b14a68b3.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34v2_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti_20230510_221853.log) |
**Note:** We follow the implementation in the original SPVNAS [repo](https://github.com/mit-han-lab/spvnas); W16\\W20\\W32 indicate different numbers of channels.
**Note:** With the TorchSparse backend, model performance is unstable and may fluctuate by about 1.5 mIoU across random seeds.
**Note:** Following [PCSeg](https://github.com/PJLab-ADG/PCSeg), MinkUNet34v2 is a modified version of MinkUNet34.
**Note\*:** Training time and FPS are measured on an NVIDIA A100. The versions of TorchSparse, Minkowski Engine and Spconv are 0.5.4, 1.4.0 and 2.3.6, respectively. Since spconv 2.3.6 has a bug with fp16 in the inference stage, the FPS was actually measured with fp32.
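For reference, switching the sparse-convolution backend only requires overriding a single field on the backbone; e.g., the Minkowski Engine variant shipped with this README is essentially (mirroring the variant configs below):
```python
_base_ = ['./minkunet34_w32_torchsparse_8xb2-laser-polar-mix-3x_semantickitti.py']
# one of 'torchsparse' (the base config), 'minkowski' or 'spconv'
model = dict(
    data_preprocessor=dict(batch_first=True),
    backbone=dict(sparseconv_backend='minkowski'))
```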
## Citation
```latex
@inproceedings{choy20194d,
title={4d spatio-temporal convnets: Minkowski convolutional neural networks},
author={Choy, Christopher and Gwak, JunYoung and Savarese, Silvio},
booktitle={Proceedings of the IEEE/CVF conference on computer vision and pattern recognition},
pages={3075--3084},
year={2019}
}
```
Collections:
- Name: MinkUNet
Metadata:
Training Techniques:
- AdamW
Architecture:
- MinkUNet
Paper:
URL: https://arxiv.org/abs/1904.08755
Title: '4D Spatio-Temporal ConvNets: Minkowski Convolutional Neural Networks'
README: configs/minkunet/README.md
Code:
URL: https://github.com/open-mmlab/mmdetection3d/blob/1.1/mmdet3d/models/segmentors/minkunet.py#L13
Version: v1.1.0
Models:
- Name: minkunet18_w16_torchsparse_8xb2-amp-15e_semantickitti
In Collection: MinkUNet
Config: configs/minkunet/minkunet18_w16_torchsparse_8xb2-amp-15e_semantickitti.py
Metadata:
Training Data: SemanticKITTI
Training Memory (GB): 3.4
Training Resources: 8x A100 GPUs
Results:
- Task: 3D Semantic Segmentation
Dataset: SemanticKITTI
Metrics:
mIoU: 60.3
Weights: https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet_w16_8xb2-15e_semantickitti/minkunet_w16_8xb2-15e_semantickitti_20230309_160737-0d8ec25b.pth
- Name: minkunet18_w20_torchsparse_8xb2-amp-15e_semantickitti
In Collection: MinkUNet
Config: configs/minkunet/minkunet18_w20_torchsparse_8xb2-amp-15e_semantickitti.py
Metadata:
Training Data: SemanticKITTI
Training Memory (GB): 3.7
Training Resources: 8x A100 GPUs
Results:
- Task: 3D Semantic Segmentation
Dataset: SemanticKITTI
Metrics:
mIoU: 61.6
Weights: https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet_w20_8xb2-15e_semantickitti/minkunet_w20_8xb2-15e_semantickitti_20230309_160718-c3b92e6e.pth
- Name: minkunet18_w32_torchsparse_8xb2-amp-15e_semantickitti
In Collection: MinkUNet
Config: configs/minkunet/minkunet18_w32_torchsparse_8xb2-amp-15e_semantickitti.py
Metadata:
Training Data: SemanticKITTI
Training Memory (GB): 4.9
Training Resources: 8x A100 GPUs
Results:
- Task: 3D Semantic Segmentation
Dataset: SemanticKITTI
Metrics:
mIoU: 63.1
Weights: https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet_w32_8xb2-15e_semantickitti/minkunet_w32_8xb2-15e_semantickitti_20230309_160710-7fa0a6f1.pth
- Name: minkunet34_w32_minkowski_8xb2-laser-polar-mix-3x_semantickitti
In Collection: MinkUNet
Config: configs/minkunet/minkunet34_w32_minkowski_8xb2-laser-polar-mix-3x_semantickitti.py
Metadata:
Training Data: SemanticKITTI
Training Memory (GB): 11.5
Training Resources: 8x A100 GPUs
Results:
- Task: 3D Semantic Segmentation
Dataset: SemanticKITTI
Metrics:
mIoU: 69.2
Weights: https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_minkowski_8xb2-laser-polar-mix-3x_semantickitti_20230514_202236-839847a8.pth
- Name: minkunet34_w32_spconv_8xb2-amp-laser-polar-mix-3x_semantickitti
In Collection: MinkUNet
Config: configs/minkunet/minkunet34_w32_spconv_8xb2-amp-laser-polar-mix-3x_semantickitti.py
Metadata:
Training Data: SemanticKITTI
Training Memory (GB): 6.7
Training Resources: 8x A100 GPUs
Results:
- Task: 3D Semantic Segmentation
Dataset: SemanticKITTI
Metrics:
mIoU: 68.3
Weights: https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_spconv_8xb2-amp-laser-polar-mix-3x_semantickitti_20230512_233152-e0698a0f.pth
- Name: minkunet34_w32_spconv_8xb2-laser-polar-mix-3x_semantickitti
In Collection: MinkUNet
Config: configs/minkunet/minkunet34_w32_spconv_8xb2-laser-polar-mix-3x_semantickitti.py
Metadata:
Training Data: SemanticKITTI
Training Memory (GB): 10.5
Training Resources: 8x A100 GPUs
Results:
- Task: 3D Semantic Segmentation
Dataset: SemanticKITTI
Metrics:
mIoU: 69.3
Weights: https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_spconv_8xb2-laser-polar-mix-3x_semantickitti_20230512_233817-72b200d8.pth
- Name: minkunet34_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti
In Collection: MinkUNet
Config: configs/minkunet/minkunet34_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti.py
Metadata:
Training Data: SemanticKITTI
Training Memory (GB): 6.6
Training Resources: 8x A100 GPUs
Results:
- Task: 3D Semantic Segmentation
Dataset: SemanticKITTI
Metrics:
mIoU: 69.3
Weights: https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti_20230512_233511-bef6cad0.pth
- Name: minkunet34_w32_torchsparse_8xb2-laser-polar-mix-3x_semantickitti
In Collection: MinkUNet
Config: configs/minkunet/minkunet34_w32_torchsparse_8xb2-laser-polar-mix-3x_semantickitti.py
Metadata:
Training Data: SemanticKITTI
Training Memory (GB): 11.8
Training Resources: 8x A100 GPUs
Results:
- Task: 3D Semantic Segmentation
Dataset: SemanticKITTI
Metrics:
mIoU: 68.7
Weights: https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_torchsparse_8xb2-laser-polar-mix-3x_semantickitti_20230512_233601-2b61b0ab.pth
- Name: minkunet34v2_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti
In Collection: MinkUNet
Config: configs/minkunet/minkunet34v2_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti.py
Metadata:
Training Data: SemanticKITTI
Training Memory (GB): 8.9
Training Resources: 8x A100 GPUs
Results:
- Task: 3D Semantic Segmentation
Dataset: SemanticKITTI
Metrics:
mIoU: 70.3
Weights: https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34v2_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti_20230510_221853-b14a68b3.pth
_base_ = ['./minkunet18_w32_torchsparse_8xb2-amp-15e_semantickitti.py']
model = dict(
backbone=dict(
base_channels=16,
encoder_channels=[16, 32, 64, 128],
decoder_channels=[128, 64, 48, 48]),
decode_head=dict(channels=48))
# NOTE: With the TorchSparse backend, model performance depends noticeably on
# the random seed; if no seed is specified, results may vary by about
# ±1.5 mIoU.
randomness = dict(seed=1588147245)
_base_ = ['./minkunet18_w32_torchsparse_8xb2-amp-15e_semantickitti.py']
model = dict(
backbone=dict(
base_channels=20,
encoder_channels=[20, 40, 81, 163],
decoder_channels=[163, 81, 61, 61]),
decode_head=dict(channels=61))
_base_ = [
'../_base_/datasets/semantickitti.py', '../_base_/models/minkunet.py',
'../_base_/default_runtime.py'
]
train_pipeline = [
dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
dict(
type='LoadAnnotations3D',
with_bbox_3d=False,
with_label_3d=False,
with_seg_3d=True,
seg_3d_dtype='np.int32',
seg_offset=2**16,
dataset_type='semantickitti'),
dict(type='PointSegClassMapping'),
dict(
type='GlobalRotScaleTrans',
rot_range=[0., 6.28318531],
scale_ratio_range=[0.95, 1.05],
translation_std=[0, 0, 0],
),
dict(type='Pack3DDetInputs', keys=['points', 'pts_semantic_mask'])
]
train_dataloader = dict(
sampler=dict(seed=0), dataset=dict(pipeline=train_pipeline))
lr = 0.24
optim_wrapper = dict(
type='AmpOptimWrapper',
loss_scale='dynamic',
optimizer=dict(
type='SGD', lr=lr, weight_decay=0.0001, momentum=0.9, nesterov=True))
param_scheduler = [
dict(
type='LinearLR', start_factor=0.008, by_epoch=False, begin=0, end=125),
dict(
type='CosineAnnealingLR',
begin=0,
T_max=15,
by_epoch=True,
eta_min=1e-5,
convert_to_iter_based=True)
]
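# i.e. a 125-iteration linear warmup starting from 0.008 * lr, followed by
# cosine decay from lr = 0.24 down to eta_min = 1e-5 over the 15 epochs.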
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=15, val_interval=1)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
default_hooks = dict(checkpoint=dict(type='CheckpointHook', interval=1))
randomness = dict(seed=0, deterministic=False, diff_rank_seed=True)
env_cfg = dict(cudnn_benchmark=True)
_base_ = [
'./minkunet34_w32_torchsparse_8xb2-laser-polar-mix-3x_semantickitti.py'
]
model = dict(
data_preprocessor=dict(batch_first=True),
backbone=dict(sparseconv_backend='minkowski'))
_base_ = [
'./minkunet34_w32_torchsparse_8xb2-laser-polar-mix-3x_semantickitti.py'
]
model = dict(
data_preprocessor=dict(batch_first=True),
backbone=dict(sparseconv_backend='spconv'))
optim_wrapper = dict(type='AmpOptimWrapper', loss_scale='dynamic')
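# Re-wraps the inherited optimizer with AMP for mixed-precision training; per
# the README note, spconv 2.3.6 fp16 is buggy at inference, so the FPS for
# this variant was measured in fp32.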