_base_ = [
'../_base_/schedules/mmdet-schedule-1x.py', '../_base_/default_runtime.py'
]
prior_generator = dict(
type='AlignedAnchor3DRangeGenerator',
ranges=[[-3.2, -0.2, -2.28, 3.2, 6.2, 0.28]],
rotations=[.0])
model = dict(
type='ImVoxelNet',
data_preprocessor=dict(
type='Det3DDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True,
pad_size_divisor=32),
backbone=dict(
type='mmdet.ResNet',
depth=50,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type='BN', requires_grad=False),
norm_eval=True,
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
style='pytorch'),
neck=dict(
type='mmdet.FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=4),
neck_3d=dict(
type='IndoorImVoxelNeck',
in_channels=256,
out_channels=128,
n_blocks=[1, 1, 1]),
bbox_head=dict(
type='ImVoxelHead',
n_classes=10,
n_levels=3,
n_channels=128,
n_reg_outs=7,
pts_assign_threshold=27,
pts_center_threshold=18,
prior_generator=prior_generator),
prior_generator=prior_generator,
n_voxels=[40, 40, 16],
coord_type='DEPTH',
train_cfg=dict(),
test_cfg=dict(nms_pre=1000, iou_thr=.25, score_thr=.01))
dataset_type = 'SUNRGBDDataset'
data_root = 'data/sunrgbd/'
class_names = [
'bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser',
'night_stand', 'bookshelf', 'bathtub'
]
metainfo = dict(classes=class_names)
backend_args = None
train_pipeline = [
dict(type='LoadAnnotations3D', backend_args=backend_args),
dict(type='LoadImageFromFile', backend_args=backend_args),
dict(type='RandomResize', scale=[(512, 384), (768, 576)], keep_ratio=True),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(type='Pack3DDetInputs', keys=['img', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(type='LoadImageFromFile', backend_args=backend_args),
dict(type='Resize', scale=(640, 480), keep_ratio=True),
dict(type='Pack3DDetInputs', keys=['img'])
]
train_dataloader = dict(
batch_size=4,
num_workers=4,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='RepeatDataset',
times=2,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='sunrgbd_infos_train.pkl',
pipeline=train_pipeline,
test_mode=False,
filter_empty_gt=True,
box_type_3d='Depth',
metainfo=metainfo,
backend_args=backend_args)))
val_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='sunrgbd_infos_val.pkl',
pipeline=test_pipeline,
test_mode=True,
box_type_3d='Depth',
metainfo=metainfo,
backend_args=backend_args))
test_dataloader = val_dataloader
val_evaluator = dict(
type='IndoorMetric',
ann_file=data_root + 'sunrgbd_infos_val.pkl',
metric='bbox')
test_evaluator = val_evaluator
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(
_delete_=True, type='AdamW', lr=0.0001, weight_decay=0.0001),
paramwise_cfg=dict(
custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)}),
clip_grad=dict(max_norm=35., norm_type=2))
param_scheduler = [
dict(
type='MultiStepLR',
begin=0,
end=12,
by_epoch=True,
milestones=[8, 11],
gamma=0.1)
]
# hooks
default_hooks = dict(checkpoint=dict(type='CheckpointHook', max_keep_ckpts=1))
# runtime
find_unused_parameters = True # only 1 of 4 FPN outputs is used
_base_ = [
'../_base_/schedules/mmdet-schedule-1x.py', '../_base_/default_runtime.py'
]
model = dict(
type='ImVoxelNet',
data_preprocessor=dict(
type='Det3DDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True,
pad_size_divisor=32),
backbone=dict(
type='mmdet.ResNet',
depth=50,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type='BN', requires_grad=False),
norm_eval=True,
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
style='pytorch'),
neck=dict(
type='mmdet.FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=64,
num_outs=4),
neck_3d=dict(type='OutdoorImVoxelNeck', in_channels=64, out_channels=256),
bbox_head=dict(
type='Anchor3DHead',
num_classes=1,
in_channels=256,
feat_channels=256,
use_direction_classifier=True,
anchor_generator=dict(
type='AlignedAnchor3DRangeGenerator',
ranges=[[-0.16, -39.68, -1.78, 68.96, 39.68, -1.78]],
sizes=[[3.9, 1.6, 1.56]],
rotations=[0, 1.57],
reshape_out=True),
diff_rad_by_sin=True,
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
loss_cls=dict(
type='mmdet.FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(
type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
loss_dir=dict(
type='mmdet.CrossEntropyLoss', use_sigmoid=False,
loss_weight=0.2)),
n_voxels=[216, 248, 12],
coord_type='LIDAR',
prior_generator=dict(
type='AlignedAnchor3DRangeGenerator',
ranges=[[-0.16, -39.68, -3.08, 68.96, 39.68, 0.76]],
rotations=[.0]),
train_cfg=dict(
assigner=dict(
type='Max3DIoUAssigner',
iou_calculator=dict(type='mmdet3d.BboxOverlapsNearest3D'),
pos_iou_thr=0.6,
neg_iou_thr=0.45,
min_pos_iou=0.45,
ignore_iof_thr=-1),
allowed_border=0,
pos_weight=-1,
debug=False),
test_cfg=dict(
use_rotate_nms=True,
nms_across_levels=False,
nms_thr=0.01,
score_thr=0.1,
min_bbox_size=0,
nms_pre=100,
max_num=50))
dataset_type = 'KittiDataset'
data_root = 'data/kitti/'
class_names = ['Car']
input_modality = dict(use_lidar=False, use_camera=True)
point_cloud_range = [0, -39.68, -3, 69.12, 39.68, 1]
metainfo = dict(classes=class_names)
backend_args = None
train_pipeline = [
dict(type='LoadAnnotations3D', backend_args=backend_args),
dict(type='LoadImageFromFileMono3D', backend_args=backend_args),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(
type='RandomResize', scale=[(1173, 352), (1387, 416)],
keep_ratio=True),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='Pack3DDetInputs', keys=['img', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(type='LoadImageFromFileMono3D', backend_args=backend_args),
dict(type='Resize', scale=(1280, 384), keep_ratio=True),
dict(type='Pack3DDetInputs', keys=['img'])
]
train_dataloader = dict(
batch_size=4,
num_workers=4,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='RepeatDataset',
times=3,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='kitti_infos_train.pkl',
data_prefix=dict(img='training/image_2'),
pipeline=train_pipeline,
modality=input_modality,
test_mode=False,
metainfo=metainfo,
box_type_3d='LiDAR',
backend_args=backend_args)))
val_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='kitti_infos_val.pkl',
data_prefix=dict(img='training/image_2'),
pipeline=test_pipeline,
modality=input_modality,
test_mode=True,
metainfo=metainfo,
box_type_3d='LiDAR',
backend_args=backend_args))
test_dataloader = val_dataloader
val_evaluator = dict(
type='KittiMetric',
ann_file=data_root + 'kitti_infos_val.pkl',
metric='bbox',
backend_args=backend_args)
test_evaluator = val_evaluator
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(
_delete_=True, type='AdamW', lr=0.0001, weight_decay=0.0001),
paramwise_cfg=dict(
custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)}),
clip_grad=dict(max_norm=35., norm_type=2))
param_scheduler = [
dict(
type='MultiStepLR',
begin=0,
end=12,
by_epoch=True,
milestones=[8, 11],
gamma=0.1)
]
# hooks
default_hooks = dict(checkpoint=dict(type='CheckpointHook', max_keep_ckpts=1))
# runtime
find_unused_parameters = True # only 1 of 4 FPN outputs is used
vis_backends = [dict(type='LocalVisBackend')]
visualizer = dict(
type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
Collections:
- Name: ImVoxelNet
Metadata:
Training Data: KITTI
Training Techniques:
- AdamW
Training Resources: 8x Tesla P40
Architecture:
- Anchor3DHead
Paper:
URL: https://arxiv.org/abs/2106.01178
Title: 'ImVoxelNet: Image to Voxels Projection for Monocular and Multi-View General-Purpose 3D Object Detection'
README: configs/imvoxelnet/README.md
Code:
URL: https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/models/detectors/imvoxelnet.py#L11
Version: v0.15.0
Models:
- Name: imvoxelnet_kitti-3d-car
In Collection: ImVoxelNet
Config: configs/imvoxelnet/imvoxelnet_8xb4_kitti-3d-car.py
Metadata:
Training Memory (GB): 15.0
Results:
- Task: 3D Object Detection
Dataset: KITTI
Metrics:
mAP: 17.26
Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvoxelnet/imvoxelnet_4x8_kitti-3d-car/imvoxelnet_4x8_kitti-3d-car_20210830_003014-3d0ffdf4.pth
# 4D Spatio-Temporal ConvNets: Minkowski Convolutional Neural Networks
> [4D Spatio-Temporal ConvNets: Minkowski Convolutional Neural Networks](https://arxiv.org/abs/1904.08755)
<!-- [ALGORITHM] -->
## Abstract
In many robotics and VR/AR applications, 3D-videos are readily-available sources of input (a continuous sequence of depth images, or LIDAR scans). However, those 3D-videos are processed frame-by-frame either through 2D convnets or 3D perception algorithms. In this work, we propose 4-dimensional convolutional neural networks for spatio-temporal perception that can directly process such 3D-videos using high-dimensional convolutions. For this, we adopt sparse tensors and propose the generalized sparse convolution that encompasses all discrete convolutions. To implement the generalized sparse convolution, we create an open-source auto-differentiation library for sparse tensors that provides extensive functions for high-dimensional convolutional neural networks. We create 4D spatio-temporal convolutional neural networks using the library and validate them on various 3D semantic segmentation benchmarks and proposed 4D datasets for 3D-video perception. To overcome challenges in the 4D space, we propose the hybrid kernel, a special case of the generalized sparse convolution, and the trilateral-stationary conditional random field that enforces spatio-temporal consistency in the 7D space-time-chroma space. Experimentally, we show that convolutional neural networks with only generalized 3D sparse convolutions can outperform 2D or 2D-3D hybrid methods by a large margin. Also, we show that on 3D-videos, 4D spatio-temporal convolutional neural networks are robust to noise, outperform 3D convolutional neural networks and are faster than the 3D counterpart in some cases.
<div align=center>
<img src="https://user-images.githubusercontent.com/72679458/225243534-cd0ed738-4224-4e7c-bcac-4f4c8d89f3a9.png" width="800"/>
</div>
## Introduction
We implement MinkUNet with [TorchSparse](https://github.com/mit-han-lab/torchsparse) / [Minkowski Engine](https://github.com/NVIDIA/MinkowskiEngine) / [Spconv](https://github.com/traveller59/spconv) backends and provide the results and checkpoints on the SemanticKITTI dataset.
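Switching between these backends does not require touching the model code: the configs in this folder only override `sparseconv_backend` (plus `batch_first` in the data preprocessor for the spconv and Minkowski Engine variants) on top of the torchsparse baseline. A minimal sketch mirroring the shipped spconv variant:
```python
# A minimal derived config, mirroring the spconv variant shipped in this
# folder: reuse the torchsparse baseline and only swap the sparse-convolution
# backend ('torchsparse', 'minkowski' or 'spconv').
_base_ = [
    './minkunet34_w32_torchsparse_8xb2-laser-polar-mix-3x_semantickitti.py'
]
model = dict(
    data_preprocessor=dict(batch_first=True),  # spconv/minkowski batch layout
    backbone=dict(sparseconv_backend='spconv'))
```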
## Results and models
### SemanticKITTI
| Method | Backend | Lr schd | Amp | Laser-Polar Mix | Mem (GB) | Training Time (hours) | FPS | mIoU | Download |
| :-------------------------------------------------------------------------------------------: | :--------------: | :-----: | :-: | :-------------: | :------: | :-------------------: | :----: | :--: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| [MinkUNet18-W16](./minkunet18_w16_torchsparse_8xb2-amp-15e_semantickitti.py) | torchsparse | 15e | ✔ | ✗ | 3.4 | - | - | 60.3 | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet_w16_8xb2-15e_semantickitti/minkunet_w16_8xb2-15e_semantickitti_20230309_160737-0d8ec25b.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet_w16_8xb2-15e_semantickitti/minkunet_w16_8xb2-15e_semantickitti_20230309_160737.log) |
| [MinkUNet18-W20](./minkunet18_w20_torchsparse_8xb2-amp-15e_semantickitti.py) | torchsparse | 15e | ✔ | ✗ | 3.7 | - | - | 61.6 | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet_w20_8xb2-15e_semantickitti/minkunet_w20_8xb2-15e_semantickitti_20230309_160718-c3b92e6e.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet_w20_8xb2-15e_semantickitti/minkunet_w20_8xb2-15e_semantickitti_20230309_160718.log) |
| [MinkUNet18-W32](./minkunet18_w32_torchsparse_8xb2-amp-15e_semantickitti.py) | torchsparse | 15e | ✔ | ✗ | 4.9 | - | - | 63.1 | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet_w32_8xb2-15e_semantickitti/minkunet_w32_8xb2-15e_semantickitti_20230309_160710-7fa0a6f1.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet_w32_8xb2-15e_semantickitti/minkunet_w32_8xb2-15e_semantickitti_20230309_160710.log) |
| [MinkUNet34-W32](./minkunet34_w32_minkowski_8xb2-laser-polar-mix-3x_semantickitti.py) | minkowski engine | 3x | ✗ | ✔ | 11.5 | 6.5 | 12.2 | 69.2 | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_minkowski_8xb2-laser-polar-mix-3x_semantickitti_20230514_202236-839847a8.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_minkowski_8xb2-laser-polar-mix-3x_semantickitti_20230514_202236.log) |
| [MinkUNet34-W32](./minkunet34_w32_spconv_8xb2-amp-laser-polar-mix-3x_semantickitti.py) | spconv | 3x | ✔ | ✔ | 6.7 | 2 | 14.6\* | 68.3 | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_spconv_8xb2-amp-laser-polar-mix-3x_semantickitti_20230512_233152-e0698a0f.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_spconv_8xb2-amp-laser-polar-mix-3x_semantickitti_20230512_233152.log) |
| [MinkUNet34-W32](./minkunet34_w32_spconv_8xb2-laser-polar-mix-3x_semantickitti.py) | spconv | 3x | ✗ | ✔ | 10.5 | 6 | 14.5 | 69.3 | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_spconv_8xb2-laser-polar-mix-3x_semantickitti_20230512_233817-72b200d8.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_spconv_8xb2-laser-polar-mix-3x_semantickitti_20230512_233817.log) |
| [MinkUNet34-W32](./minkunet34_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti.py) | torchsparse | 3x | ✔ | ✔ | 6.6 | 3 | 12.8 | 69.3 | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti_20230512_233511-bef6cad0.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti_20230512_233511.log) |
| [MinkUNet34-W32](./minkunet34_w32_torchsparse_8xb2-laser-polar-mix-3x_semantickitti.py) | torchsparse | 3x | ✗ | ✔ | 11.8 | 5.5 | 15.9 | 68.7 | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_torchsparse_8xb2-laser-polar-mix-3x_semantickitti_20230512_233601-2b61b0ab.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_torchsparse_8xb2-laser-polar-mix-3x_semantickitti_20230512_233601.log) |
| [MinkUNet34v2-W32](minkunet34v2_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti.py) | torchsparse | 3x | ✔ | ✔ | 8.9 | - | - | 70.3 | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34v2_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti_20230510_221853-b14a68b3.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34v2_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti_20230510_221853.log) |
**Note:** We follow the implementation in the original SPVNAS [repo](https://github.com/mit-han-lab/spvnas), and W16/W20/W32 indicate different numbers of channels.
**Note:** With the TorchSparse backend, model performance is unstable and may fluctuate by about 1.5 mIoU across different random seeds.
**Note:** Referring to [PCSeg](https://github.com/PJLab-ADG/PCSeg), MinkUNet34v2 is modified based on MinkUNet34.
**Note\*:** Training time and FPS are measured on an NVIDIA A100. The versions of TorchSparse, Minkowski Engine and Spconv are 0.5.4, 1.4.0 and 2.3.6 respectively. Since spconv 2.3.6 has a bug with fp16 at the inference stage, the starred FPS was actually measured using fp32.
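Given this seed sensitivity, a run can be pinned to a fixed seed through the `randomness` field, as the W16 config in this folder does. A minimal sketch (the exact seed value is only an example):
```python
# Sketch: pin the random seed so TorchSparse results are comparable across
# runs; without a fixed seed, mIoU may move by roughly 1.5.
randomness = dict(seed=1588147245, deterministic=False, diff_rank_seed=True)
```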
## Citation
```latex
@inproceedings{choy20194d,
title={4d spatio-temporal convnets: Minkowski convolutional neural networks},
author={Choy, Christopher and Gwak, JunYoung and Savarese, Silvio},
booktitle={Proceedings of the IEEE/CVF conference on computer vision and pattern recognition},
pages={3075--3084},
year={2019}
}
```
Collections:
- Name: MinkUNet
Metadata:
Training Techniques:
- AdamW
Architecture:
- MinkUNet
Paper:
URL: https://arxiv.org/abs/1904.08755
Title: '4D Spatio-Temporal ConvNets: Minkowski Convolutional Neural Networks'
README: configs/minkunet/README.md
Code:
URL: https://github.com/open-mmlab/mmdetection3d/blob/1.1/mmdet3d/models/segmentors/minkunet.py#L13
Version: v1.1.0
Models:
- Name: minkunet18_w16_torchsparse_8xb2-amp-15e_semantickitti
In Collection: MinkUNet
Config: configs/minkunet/minkunet18_w16_torchsparse_8xb2-amp-15e_semantickitti.py
Metadata:
Training Data: SemanticKITTI
Training Memory (GB): 3.4
Training Resources: 8x A100 GPUs
Results:
- Task: 3D Semantic Segmentation
Dataset: SemanticKITTI
Metrics:
mIoU: 60.3
Weights: https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet_w16_8xb2-15e_semantickitti/minkunet_w16_8xb2-15e_semantickitti_20230309_160737-0d8ec25b.pth
- Name: minkunet18_w20_torchsparse_8xb2-amp-15e_semantickitti
In Collection: MinkUNet
Config: configs/minkunet/minkunet18_w20_torchsparse_8xb2-amp-15e_semantickitti.py
Metadata:
Training Data: SemanticKITTI
Training Memory (GB): 3.7
Training Resources: 8x A100 GPUs
Results:
- Task: 3D Semantic Segmentation
Dataset: SemanticKITTI
Metrics:
mIoU: 61.6
Weights: https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet_w20_8xb2-15e_semantickitti/minkunet_w20_8xb2-15e_semantickitti_20230309_160718-c3b92e6e.pth
- Name: minkunet18_w32_torchsparse_8xb2-amp-15e_semantickitti
In Collection: MinkUNet
Config: configs/minkunet/minkunet18_w32_torchsparse_8xb2-amp-15e_semantickitti.py
Metadata:
Training Data: SemanticKITTI
Training Memory (GB): 4.9
Training Resources: 8x A100 GPUs
Results:
- Task: 3D Semantic Segmentation
Dataset: SemanticKITTI
Metrics:
mIoU: 63.1
Weights: https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet_w32_8xb2-15e_semantickitti/minkunet_w32_8xb2-15e_semantickitti_20230309_160710-7fa0a6f1.pth
- Name: minkunet34_w32_minkowski_8xb2-laser-polar-mix-3x_semantickitti
In Collection: MinkUNet
Config: configs/minkunet/minkunet34_w32_minkowski_8xb2-laser-polar-mix-3x_semantickitti.py
Metadata:
Training Data: SemanticKITTI
Training Memory (GB): 11.5
Training Resources: 8x A100 GPUs
Results:
- Task: 3D Semantic Segmentation
Dataset: SemanticKITTI
Metrics:
mIoU: 69.2
Weights: https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_minkowski_8xb2-laser-polar-mix-3x_semantickitti_20230514_202236-839847a8.pth
- Name: minkunet34_w32_spconv_8xb2-amp-laser-polar-mix-3x_semantickitti
In Collection: MinkUNet
Config: configs/minkunet/minkunet34_w32_spconv_8xb2-amp-laser-polar-mix-3x_semantickitti.py
Metadata:
Training Data: SemanticKITTI
Training Memory (GB): 6.7
Training Resources: 8x A100 GPUs
Results:
- Task: 3D Semantic Segmentation
Dataset: SemanticKITTI
Metrics:
mIoU: 68.3
Weights: https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_spconv_8xb2-amp-laser-polar-mix-3x_semantickitti_20230512_233152-e0698a0f.pth
- Name: minkunet34_w32_spconv_8xb2-laser-polar-mix-3x_semantickitti
In Collection: MinkUNet
Config: configs/minkunet/minkunet34_w32_spconv_8xb2-laser-polar-mix-3x_semantickitti.py
Metadata:
Training Data: SemanticKITTI
Training Memory (GB): 10.5
Training Resources: 8x A100 GPUs
Results:
- Task: 3D Semantic Segmentation
Dataset: SemanticKITTI
Metrics:
mIoU: 69.3
Weights: https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_spconv_8xb2-laser-polar-mix-3x_semantickitti_20230512_233817-72b200d8.pth
- Name: minkunet34_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti
In Collection: MinkUNet
Config: configs/minkunet/minkunet34_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti.py
Metadata:
Training Data: SemanticKITTI
Training Memory (GB): 6.6
Training Resources: 8x A100 GPUs
Results:
- Task: 3D Semantic Segmentation
Dataset: SemanticKITTI
Metrics:
mIoU: 69.3
Weights: https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti_20230512_233511-bef6cad0.pth
- Name: minkunet34_w32_torchsparse_8xb2-laser-polar-mix-3x_semantickitti
In Collection: MinkUNet
Config: configs/minkunet/minkunet34_w32_torchsparse_8xb2-laser-polar-mix-3x_semantickitti.py
Metadata:
Training Data: SemanticKITTI
Training Memory (GB): 11.8
Training Resources: 8x A100 GPUs
Results:
- Task: 3D Semantic Segmentation
Dataset: SemanticKITTI
Metrics:
mIoU: 68.7
Weights: https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_torchsparse_8xb2-laser-polar-mix-3x_semantickitti_20230512_233601-2b61b0ab.pth
- Name: minkunet34v2_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti
In Collection: MinkUNet
Config: configs/minkunet/minkunet34v2_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti.py
Metadata:
Training Data: SemanticKITTI
Training Memory (GB): 8.9
Training Resources: 8x A100 GPUs
Results:
- Task: 3D Semantic Segmentation
Dataset: SemanticKITTI
Metrics:
mIoU: 70.3
Weights: https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34v2_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti_20230510_221853-b14a68b3.pth
_base_ = ['./minkunet18_w32_torchsparse_8xb2-amp-15e_semantickitti.py']
model = dict(
backbone=dict(
base_channels=16,
encoder_channels=[16, 32, 64, 128],
decoder_channels=[128, 64, 48, 48]),
decode_head=dict(channels=48))
# NOTE: Due to TorchSparse backend, the model performance is relatively
# dependent on random seeds, and if random seeds are not specified the
# model performance will be different (± 1.5 mIoU).
randomness = dict(seed=1588147245)
_base_ = ['./minkunet18_w32_torchsparse_8xb2-amp-15e_semantickitti.py']
model = dict(
backbone=dict(
base_channels=20,
encoder_channels=[20, 40, 81, 163],
decoder_channels=[163, 81, 61, 61]),
decode_head=dict(channels=61))
_base_ = [
'../_base_/datasets/semantickitti.py', '../_base_/models/minkunet.py',
'../_base_/default_runtime.py'
]
train_pipeline = [
dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
dict(
type='LoadAnnotations3D',
with_bbox_3d=False,
with_label_3d=False,
with_seg_3d=True,
seg_3d_dtype='np.int32',
seg_offset=2**16,
dataset_type='semantickitti'),
dict(type='PointSegClassMapping'),
dict(
type='GlobalRotScaleTrans',
rot_range=[0., 6.28318531],
scale_ratio_range=[0.95, 1.05],
translation_std=[0, 0, 0],
),
dict(type='Pack3DDetInputs', keys=['points', 'pts_semantic_mask'])
]
train_dataloader = dict(
sampler=dict(seed=0), dataset=dict(pipeline=train_pipeline))
lr = 0.24
optim_wrapper = dict(
type='AmpOptimWrapper',
loss_scale='dynamic',
optimizer=dict(
type='SGD', lr=lr, weight_decay=0.0001, momentum=0.9, nesterov=True))
param_scheduler = [
dict(
type='LinearLR', start_factor=0.008, by_epoch=False, begin=0, end=125),
dict(
type='CosineAnnealingLR',
begin=0,
T_max=15,
by_epoch=True,
eta_min=1e-5,
convert_to_iter_based=True)
]
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=15, val_interval=1)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
default_hooks = dict(checkpoint=dict(type='CheckpointHook', interval=1))
randomness = dict(seed=0, deterministic=False, diff_rank_seed=True)
env_cfg = dict(cudnn_benchmark=True)
_base_ = [
'./minkunet34_w32_torchsparse_8xb2-laser-polar-mix-3x_semantickitti.py'
]
model = dict(
data_preprocessor=dict(batch_first=True),
backbone=dict(sparseconv_backend='minkowski'))
_base_ = [
'./minkunet34_w32_torchsparse_8xb2-laser-polar-mix-3x_semantickitti.py'
]
model = dict(
data_preprocessor=dict(batch_first=True),
backbone=dict(sparseconv_backend='spconv'))
optim_wrapper = dict(type='AmpOptimWrapper', loss_scale='dynamic')
_base_ = [
'./minkunet34_w32_torchsparse_8xb2-laser-polar-mix-3x_semantickitti.py'
]
model = dict(
data_preprocessor=dict(batch_first=True),
backbone=dict(sparseconv_backend='spconv'))
_base_ = [
'./minkunet34_w32_torchsparse_8xb2-laser-polar-mix-3x_semantickitti.py'
]
optim_wrapper = dict(type='AmpOptimWrapper', loss_scale='dynamic')
_base_ = [
'../_base_/datasets/semantickitti.py', '../_base_/models/minkunet.py',
'../_base_/schedules/schedule-3x.py', '../_base_/default_runtime.py'
]
model = dict(
data_preprocessor=dict(max_voxels=None),
backbone=dict(encoder_blocks=[2, 3, 4, 6]))
train_pipeline = [
dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
dict(
type='LoadAnnotations3D',
with_bbox_3d=False,
with_label_3d=False,
with_seg_3d=True,
seg_3d_dtype='np.int32',
seg_offset=2**16,
dataset_type='semantickitti'),
dict(type='PointSegClassMapping'),
dict(
type='RandomChoice',
transforms=[
[
dict(
type='LaserMix',
num_areas=[3, 4, 5, 6],
pitch_angles=[-25, 3],
pre_transform=[
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=4,
use_dim=4),
dict(
type='LoadAnnotations3D',
with_bbox_3d=False,
with_label_3d=False,
with_seg_3d=True,
seg_3d_dtype='np.int32',
seg_offset=2**16,
dataset_type='semantickitti'),
dict(type='PointSegClassMapping')
],
prob=1)
],
[
dict(
type='PolarMix',
instance_classes=[0, 1, 2, 3, 4, 5, 6, 7],
swap_ratio=0.5,
rotate_paste_ratio=1.0,
pre_transform=[
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=4,
use_dim=4),
dict(
type='LoadAnnotations3D',
with_bbox_3d=False,
with_label_3d=False,
with_seg_3d=True,
seg_3d_dtype='np.int32',
seg_offset=2**16,
dataset_type='semantickitti'),
dict(type='PointSegClassMapping')
],
prob=1)
],
],
prob=[0.5, 0.5]),
dict(
type='GlobalRotScaleTrans',
rot_range=[0., 6.28318531],
scale_ratio_range=[0.95, 1.05],
translation_std=[0, 0, 0],
),
dict(type='Pack3DDetInputs', keys=['points', 'pts_semantic_mask'])
]
train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
default_hooks = dict(checkpoint=dict(type='CheckpointHook', interval=1))
_base_ = [
'./minkunet34_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti.py'
]
model = dict(
backbone=dict(type='MinkUNetBackboneV2'),
decode_head=dict(channels=256 + 128 + 96))
randomness = dict(seed=None, deterministic=False, diff_rank_seed=True)
env_cfg = dict(cudnn_benchmark=True)
# Objects are Different: Flexible Monocular 3D Object Detection
> [Objects are Different: Flexible Monocular 3D Object Detection](https://arxiv.org/abs/2104.02323)
<!-- [ALGORITHM] -->
## Abstract
The precise localization of 3D objects from a single image without depth information is a highly challenging problem. Most existing methods adopt the same approach for all objects regardless of their diverse distributions, leading to limited performance for truncated objects. In this paper, we propose a flexible framework for monocular 3D object detection which explicitly decouples the truncated objects and adaptively combines multiple approaches for object depth estimation. Specifically, we decouple the edge of the feature map for predicting long-tail truncated objects so that the optimization of normal objects is not influenced. Furthermore, we formulate the object depth estimation as an uncertainty-guided ensemble of directly regressed object depth and solved depths from different groups of keypoints. Experiments demonstrate that our method outperforms the state-of-the-art method by relatively 27% for the moderate level and 30% for the hard level in the test set of KITTI benchmark while maintaining real-time efficiency.
<div align=center>
<img src="https://user-images.githubusercontent.com/36950400/153138824-d54a7a47-773f-42f9-8a51-b0a71078593e.png" width="800"/>
</div>
## Introduction
We implement MonoFlex and provide the results and checkpoints on the KITTI dataset.
## Results and models
### KITTI
| Backbone | Lr schd | Mem (GB) | Inf time (fps) | mAP | Download |
| :---------------------------------------------------------------------: | :-----: | :------: | :------------: | :---: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| [DLA34](./monoflex_dla34_pytorch_dlaneck_gn-all_2x4_6x_kitti-mono3d.py) | 6x | 9.64 | | 21.86 | [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/monoflex/monoflex_dla34_pytorch_dlaneck_gn-all_2x4_6x_kitti-mono3d_20211228_027553-d46d9bb0.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/monoflex/monoflex_dla34_pytorch_dlaneck_gn-all_2x4_6x_kitti-mono3d_20211228_027553.log.json) |
Note: mAP represents Car moderate 3D strict AP11 results.
Detailed performance on KITTI 3D detection (3D/BEV) is as follows, evaluated by the AP11 and AP40 metrics:
| | Easy | Moderate | Hard |
| ---------- | :-----------: | :-----------: | :-----------: |
| Car (AP11) | 28.02 / 36.11 | 21.86 / 29.46 | 19.01 / 24.83 |
| Car (AP40) | 23.22 / 32.74 | 17.18 / 24.02 | 15.13 / 20.67 |
Note: mAP represents Car moderate 3D strict AP11 / AP40 results. Because of the limited data for pedestrians and cyclists, the detection performance for these two classes is usually unstable, so we only list car detection results here. In addition, the AP11 results may fluctuate in a larger range (~1 AP), so AP40 is the recommended reference metric due to its much better stability.
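For reference, both tables are produced by the standard KITTI evaluator, which reports AP11 and AP40 together; a minimal sketch of the evaluator configuration, following the KITTI configs elsewhere in this repo (`data_root` is the usual assumption):
```python
# Sketch: KITTI evaluation in MMDetection3D configs is driven by KittiMetric;
# the `bbox` metric covers the 3D / BEV AP11 and AP40 numbers above.
data_root = 'data/kitti/'
val_evaluator = dict(
    type='KittiMetric',
    ann_file=data_root + 'kitti_infos_val.pkl',
    metric='bbox')
test_evaluator = val_evaluator
```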
## Citation
```latex
@InProceedings{MonoFlex,
author = {Zhang, Yunpeng and Lu, Jiwen and Zhou, Jie},
title = {Objects Are Different: Flexible Monocular 3D Object Detection},
booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
month = {June},
year = {2021},
pages = {3289-3298}
}
```
Collections:
- Name: MonoFlex
Metadata:
Training Data: KITTI
Training Techniques:
- Adam
Training Resources: 2x V100 GPUs
Architecture:
- MonoFlexHead
- DLA
Paper:
URL: https://arxiv.org/abs/2104.02323
Title: 'Objects are Different: Flexible Monocular 3D Object Detection'
README: configs/monoflex/README.md
Code:
URL: https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0.dev0/mmdet3d/models/detectors/monoflex.py#L7
Version: v1.0.0
Models:
- Name: monoflex_dla34_pytorch_dlaneck_gn-all_2x4_6x_kitti-mono3d
In Collection: MonoFlex
Config: configs/monoflex/monoflex_dla34_pytorch_dlaneck_gn-all_2x4_6x_kitti-mono3d.py
Metadata:
Training Memory (GB): 9.64
Results:
- Task: 3D Object Detection
Dataset: KITTI
Metrics:
mAP: 21.86
Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/monoflex/monoflex_dla34_pytorch_dlaneck_gn-all_2x4_6x_kitti-mono3d_20211228_027553-d46d9bb0.pth
# MV-FCOS3D++: Multi-View Camera-Only 4D Object Detection with Pretrained Monocular Backbones
> [MV-FCOS3D++: Multi-View Camera-Only 4D Object Detection with Pretrained Monocular Backbones](https://arxiv.org/abs/2207.12716)
<!-- [ALGORITHM] -->
## Abstract
In this technical report, we present our solution, dubbed MV-FCOS3D++, for the Camera-Only 3D Detection track in Waymo Open Dataset Challenge 2022. For multi-view camera-only 3D detection, methods based on bird-eye-view or 3D geometric representations can leverage the stereo cues from overlapped regions between adjacent views and directly perform 3D detection without hand-crafted post-processing. However, it lacks direct semantic supervision for 2D backbones, which can be complemented by pretraining simple monocular-based detectors. Our solution is a multi-view framework for 4D detection following this paradigm. It is built upon a simple monocular detector FCOS3D++, pretrained only with object annotations of Waymo, and converts multi-view features to a 3D grid space to detect 3D objects thereon. A dual-path neck for single-frame understanding and temporal stereo matching is devised to incorporate multi-frame information. Our method finally achieves 49.75% mAPL with a single model and wins 2nd place in the WOD challenge, without any LiDAR-based depth supervision during training. The code will be released at [this https URL](https://github.com/Tai-Wang/Depth-from-Motion).
<div align=center>
<img src="https://github.com/open-mmlab/mmdetection3d/assets/72679458/9313eb3c-cc41-40be-9ead-549b3b5fef44" width="800"/>
</div>
## Introduction
We implement multi-view FCOS3D++ and provide the results on the Waymo dataset.
## Usage
### Training commands
1. You should train PGD first:
```bash
bash tools/dist_train.sh configs/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-mv-mono3d.py 8
```
2. Given the pre-trained PGD backbone, you can train multi-view FCOS3D++:
```bash
bash tools/dist_train.sh configs/mvfcos3d/multiview-fcos3d_r101-dcn_8xb2_waymoD5-3d-3class.py 8 --cfg-options load_from=${PRETRAINED_CHECKPOINT}
```
**Note**: the path of `load_from` needs to be changed to your own pre-trained PGD checkpoint.
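Equivalently, the checkpoint path can be baked into a derived config instead of being passed on the command line; a minimal sketch (the `.pth` path below is a placeholder for your own PGD checkpoint, not a released file):
```python
# Sketch: hard-code the pre-trained PGD checkpoint in a derived config
# instead of using --cfg-options. The .pth path is a placeholder.
_base_ = ['./multiview-fcos3d_r101-dcn_8xb2_waymoD5-3d-3class.py']
load_from = 'work_dirs/pgd_waymo_pretrain/epoch_24.pth'  # placeholder path
```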
## Results and models
### Waymo
| Backbone | Load Interval | mAPL | mAP | mAPH | Download |
| :--------------------------------------------------------------------: | :-----------: | :--: | :--: | :--: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| [ResNet101+DCN](./multiview-fcos3d_r101-dcn_8xb2_waymoD5-3d-3class.py) | 5x | 38.2 | 52.9 | 49.5 | [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/mvfcos3d/multiview-fcos3d_r101-dcn_8xb2_waymoD5-3d-3class/multiview-fcos3d_r101-dcn_8xb2_waymoD5-3d-3class_20231127_122815.log) |
| above @ Car | | 56.5 | 73.3 | 72.3 | |
| above @ Pedestrian | | 34.8 | 49.5 | 43.1 | |
| above @ Cyclist | | 23.2 | 35.9 | 33.3 | |
**Note**:
Regrettably, we are unable to provide the pre-trained model weights due to [Waymo Dataset License Agreement](https://waymo.com/open/terms/), so we only provide the training logs as shown above.
## Citation
```latex
@article{wang2022mvfcos3d++,
title={{MV-FCOS3D++: Multi-View} Camera-Only 4D Object Detection with Pretrained Monocular Backbones},
author={Wang, Tai and Lian, Qing and Zhu, Chenming and Zhu, Xinge and Zhang, Wenwei},
journal={arXiv preprint},
year={2022}
}
```
_base_ = [
'../_base_/datasets/waymoD5-mv3d-3class.py',
'../_base_/models/multiview_dfm.py'
]
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='AdamW', lr=0.0005, weight_decay=0.0001),
paramwise_cfg=dict(
custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)}),
clip_grad=dict(max_norm=35., norm_type=2))
param_scheduler = [
dict(
type='MultiStepLR',
begin=0,
end=24,
by_epoch=True,
milestones=[16, 22],
gamma=0.1)
]
# hooks
default_hooks = dict(
timer=dict(type='IterTimerHook'),
logger=dict(type='LoggerHook', interval=50),
param_scheduler=dict(type='ParamSchedulerHook'),
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=1),
sampler_seed=dict(type='DistSamplerSeedHook'),
)
# training schedule for 2x
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=24, val_interval=24)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
# runtime
default_scope = 'mmdet3d'
env_cfg = dict(
cudnn_benchmark=False,
mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
dist_cfg=dict(backend='nccl'),
)
log_level = 'INFO'
load_from = None
resume = False
find_unused_parameters = True # only 1 of 4 FPN outputs is used
_base_ = ['./multiview-dfm_r101-dcn_16xb2_waymoD5-3d-3class.py']
model = dict(
bbox_head=dict(
_delete_=True,
type='CenterHead',
in_channels=256,
tasks=[
dict(num_class=1, class_names=['Pedestrian']),
dict(num_class=1, class_names=['Cyclist']),
dict(num_class=1, class_names=['Car']),
],
common_heads=dict(reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2)),
share_conv_channel=64,
bbox_coder=dict(
type='CenterPointBBoxCoder',
post_center_range=[-35.0, -75.0, -2, 75.0, 75.0, 4],
pc_range=[-35.0, -75.0, -2, 75.0, 75.0, 4],
max_num=2000,
score_threshold=0,
out_size_factor=1,
voxel_size=(.50, .50),
code_size=7),
separate_head=dict(
type='SeparateHead', init_bias=-2.19, final_kernel=3),
loss_cls=dict(type='mmdet.GaussianFocalLoss', reduction='mean'),
loss_bbox=dict(
type='mmdet.L1Loss', reduction='mean', loss_weight=0.25),
norm_bbox=True),
train_cfg=dict(
_delete_=True,
grid_size=[220, 300, 1],
voxel_size=(0.5, 0.5, 6),
out_size_factor=1,
dense_reg=1,
gaussian_overlap=0.1,
max_objs=500,
min_radius=2,
code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
point_cloud_range=[-35.0, -75.0, -2, 75.0, 75.0, 4]),
test_cfg=dict(
_delete_=True,
post_center_limit_range=[-35.0, -75.0, -2, 75.0, 75.0, 4],
max_per_img=4096,
max_pool_nms=False,
min_radius=[0.5, 2, 6],
score_threshold=0,
out_size_factor=1,
voxel_size=(0.5, 0.5),
nms_type='circle',
pre_max_size=2000,
post_max_size=200,
nms_thr=0.2))
# MVX-Net: Multimodal VoxelNet for 3D Object Detection
> [MVX-Net: Multimodal VoxelNet for 3D Object Detection](https://arxiv.org/abs/1904.01649)
<!-- [ALGORITHM] -->
## Abstract
Many recent works on 3D object detection have focused on designing neural network architectures that can consume point cloud data. While these approaches demonstrate encouraging performance, they are typically based on a single modality and are unable to leverage information from other modalities, such as a camera. Although a few approaches fuse data from different modalities, these methods either use a complicated pipeline to process the modalities sequentially, or perform late-fusion and are unable to learn interaction between different modalities at early stages. In this work, we present PointFusion and VoxelFusion: two simple yet effective early-fusion approaches to combine the RGB and point cloud modalities, by leveraging the recently introduced VoxelNet architecture. Evaluation on the KITTI dataset demonstrates significant improvements in performance over approaches which only use point cloud data. Furthermore, the proposed method provides results competitive with the state-of-the-art multimodal algorithms, achieving top-2 ranking in five of the six bird's eye view and 3D detection categories on the KITTI benchmark, by using a simple single stage network.
<div align=center>
<img src="https://user-images.githubusercontent.com/79644370/143880819-560675ca-e7e3-4d77-8808-ea661ff8e6e6.png" width="800"/>
</div>
## Introduction
We implement MVX-Net and provide its results and models on the KITTI dataset.
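For a quick sanity check of the released model, the high-level API can build the detector from the config and checkpoint listed in the table below; a minimal sketch (the local checkpoint file name and device string are assumptions):
```python
# Sketch: build the MVX-Net detector from the released config and weights.
# Download the .pth from the table below first; the device is an assumption.
from mmdet3d.apis import init_model

config = 'configs/mvxnet/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class.py'
checkpoint = 'mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class-8963258a.pth'
model = init_model(config, checkpoint, device='cuda:0')
print(type(model).__name__)  # detector class registered for this config
```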
## Results and models
### KITTI
| Backbone | Class | Lr schd | Mem (GB) | Inf time (fps) | mAP | Download |
| :-----------------------------------------------------------------: | :-----: | :--------: | :------: | :------------: | :--: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| [SECFPN](./mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class.py) | 3 Class | cosine 80e | 6.7 | | 63.5 | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/mvxnet/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class-8963258a.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/mvxnet/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class-20230424_132228.log) |
## Citation
```latex
@inproceedings{sindagi2019mvx,
title={MVX-Net: Multimodal voxelnet for 3D object detection},
author={Sindagi, Vishwanath A and Zhou, Yin and Tuzel, Oncel},
booktitle={2019 International Conference on Robotics and Automation (ICRA)},
pages={7276--7282},
year={2019},
organization={IEEE}
}
```