Unverified Commit 32a4328b authored by Wenwei Zhang, committed by GitHub

Bump version to V1.0.0rc0

parents 86cc487c a8817998
model = dict(
type='PointRCNN',
backbone=dict(
type='PointNet2SAMSG',
in_channels=4,
num_points=(4096, 1024, 256, 64),
radii=((0.1, 0.5), (0.5, 1.0), (1.0, 2.0), (2.0, 4.0)),
num_samples=((16, 32), (16, 32), (16, 32), (16, 32)),
sa_channels=(((16, 16, 32), (32, 32, 64)), ((64, 64, 128), (64, 96,
128)),
((128, 196, 256), (128, 196, 256)), ((256, 256, 512),
(256, 384, 512))),
fps_mods=(('D-FPS'), ('D-FPS'), ('D-FPS'), ('D-FPS')),
fps_sample_range_lists=((-1), (-1), (-1), (-1)),
aggregation_channels=(None, None, None, None),
dilated_group=(False, False, False, False),
out_indices=(0, 1, 2, 3),
norm_cfg=dict(type='BN2d', eps=1e-3, momentum=0.1),
sa_cfg=dict(
type='PointSAModuleMSG',
pool_mod='max',
use_xyz=True,
normalize_xyz=False)),
neck=dict(
type='PointNetFPNeck',
fp_channels=((1536, 512, 512), (768, 512, 512), (608, 256, 256),
(257, 128, 128))),
rpn_head=dict(
type='PointRPNHead',
num_classes=3,
enlarge_width=0.1,
pred_layer_cfg=dict(
in_channels=128,
cls_linear_channels=(256, 256),
reg_linear_channels=(256, 256)),
cls_loss=dict(
type='FocalLoss',
use_sigmoid=True,
reduction='sum',
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
bbox_loss=dict(
type='SmoothL1Loss',
beta=1.0 / 9.0,
reduction='sum',
loss_weight=1.0),
bbox_coder=dict(
type='PointXYZWHLRBBoxCoder',
code_size=8,
# code_size: (center residual (3), size regression (3),
# torch.cos(yaw) (1), torch.sin(yaw) (1))
use_mean_size=True,
mean_size=[[3.9, 1.6, 1.56], [0.8, 0.6, 1.73], [1.76, 0.6,
1.73]])),
roi_head=dict(
type='PointRCNNRoIHead',
point_roi_extractor=dict(
type='Single3DRoIPointExtractor',
roi_layer=dict(type='RoIPointPool3d', num_sampled_points=512)),
bbox_head=dict(
type='PointRCNNBboxHead',
num_classes=1,
pred_layer_cfg=dict(
in_channels=512,
cls_conv_channels=(256, 256),
reg_conv_channels=(256, 256),
bias=True),
in_channels=5,
# 5 = 3 (xyz) + scores + depth
mlp_channels=[128, 128],
num_points=(128, 32, -1),
radius=(0.2, 0.4, 100),
num_samples=(16, 16, 16),
sa_channels=((128, 128, 128), (128, 128, 256), (256, 256, 512)),
with_corner_loss=True),
depth_normalizer=70.0),
# model training and testing settings
train_cfg=dict(
pos_distance_thr=10.0,
rpn=dict(
nms_cfg=dict(
use_rotate_nms=True, iou_thr=0.8, nms_pre=9000, nms_post=512),
score_thr=None),
rcnn=dict(
assigner=[
dict( # for Car
type='MaxIoUAssigner',
iou_calculator=dict(
type='BboxOverlaps3D', coordinate='lidar'),
pos_iou_thr=0.55,
neg_iou_thr=0.55,
min_pos_iou=0.55,
ignore_iof_thr=-1,
match_low_quality=False),
dict( # for Pedestrian
type='MaxIoUAssigner',
iou_calculator=dict(
type='BboxOverlaps3D', coordinate='lidar'),
pos_iou_thr=0.55,
neg_iou_thr=0.55,
min_pos_iou=0.55,
ignore_iof_thr=-1,
match_low_quality=False),
dict( # for Cyclist
type='MaxIoUAssigner',
iou_calculator=dict(
type='BboxOverlaps3D', coordinate='lidar'),
pos_iou_thr=0.55,
neg_iou_thr=0.55,
min_pos_iou=0.55,
ignore_iof_thr=-1,
match_low_quality=False)
],
sampler=dict(
type='IoUNegPiecewiseSampler',
num=128,
pos_fraction=0.5,
neg_piece_fractions=[0.8, 0.2],
neg_iou_piece_thrs=[0.55, 0.1],
neg_pos_ub=-1,
add_gt_as_proposals=False,
return_iou=True),
cls_pos_thr=0.7,
cls_neg_thr=0.25)),
test_cfg=dict(
rpn=dict(
nms_cfg=dict(
use_rotate_nms=True, iou_thr=0.85, nms_pre=9000, nms_post=512),
score_thr=None),
rcnn=dict(use_rotate_nms=True, nms_thr=0.1, score_thr=0.1)))
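For reference, below is a minimal sketch of loading a PointRCNN config like the one above and running inference through the high-level API. The config path, checkpoint path, and point-cloud file are placeholders rather than files guaranteed by this commit; `init_model` and `inference_detector` are the `mmdet3d.apis` helpers used by the repo's demo scripts.

```python
from mmdet3d.apis import inference_detector, init_model

# Placeholder paths -- substitute the config/checkpoint you actually use.
config_file = 'configs/point_rcnn/point_rcnn_2x8_kitti-3d-3classes.py'
checkpoint_file = 'checkpoints/point_rcnn_kitti-3d-3classes.pth'

# Build the detector declared by the config dict above and load its weights.
model = init_model(config_file, checkpoint_file, device='cuda:0')

# Run detection on a single KITTI-style point cloud (.bin file).
result, data = inference_detector(model, 'demo/data/kitti/kitti_000008.bin')
# For LiDAR detectors the per-sample result is typically a dict with
# 'boxes_3d', 'scores_3d' and 'labels_3d'.
print(result[0])
```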
model = dict(
type='SMOKEMono3D',
backbone=dict(
type='DLANet',
depth=34,
in_channels=3,
norm_cfg=dict(type='GN', num_groups=32),
init_cfg=dict(
type='Pretrained',
checkpoint='http://dl.yf.io/dla/models/imagenet/dla34-ba72cf86.pth'
)),
neck=dict(
type='DLANeck',
in_channels=[16, 32, 64, 128, 256, 512],
start_level=2,
end_level=5,
norm_cfg=dict(type='GN', num_groups=32)),
bbox_head=dict(
type='SMOKEMono3DHead',
num_classes=3,
in_channels=64,
dim_channel=[3, 4, 5],
ori_channel=[6, 7],
stacked_convs=0,
feat_channels=64,
use_direction_classifier=False,
diff_rad_by_sin=False,
pred_attrs=False,
pred_velo=False,
dir_offset=0,
strides=None,
group_reg_dims=(8, ),
cls_branch=(256, ),
reg_branch=((256, ), ),
num_attrs=0,
bbox_code_size=7,
dir_branch=(),
attr_branch=(),
bbox_coder=dict(
type='SMOKECoder',
base_depth=(28.01, 16.32),
base_dims=((0.88, 1.73, 0.67), (1.78, 1.70, 0.58), (3.88, 1.63,
1.53)),
code_size=7),
loss_cls=dict(type='GaussianFocalLoss', loss_weight=1.0),
loss_bbox=dict(type='L1Loss', reduction='sum', loss_weight=1 / 300),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
loss_attr=None,
conv_bias=True,
dcn_on_last_conv=False),
train_cfg=None,
test_cfg=dict(topK=100, local_maximum_kernel=3, max_per_img=100))
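As a sketch of how a config such as the SMOKE definition above is consumed programmatically, the model dict can be built directly with `mmcv.Config` and `mmdet3d.models.build_model`; the config filename here is an illustrative assumption.

```python
from mmcv import Config
from mmdet3d.models import build_model

# Hypothetical path; point this at wherever the SMOKE config above is stored.
cfg = Config.fromfile(
    'configs/smoke/smoke_dla34_pytorch_dlaneck_gn-all_8x4_6x_kitti-mono3d.py')

# build_model wires up the backbone (DLANet), neck (DLANeck) and
# SMOKEMono3DHead exactly as declared in the dict, including test_cfg.
model = build_model(cfg.model)
model.init_weights()
print(model)
```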
...@@ -5,7 +5,7 @@
# the learning rate will change from 0.0018 to 0.018, then go to 0.0018*1e-4
lr = 0.0018
# The optimizer follows the setting in SECOND.Pytorch, but here we use
-# the offcial AdamW optimizer implemented by PyTorch.
+# the official AdamW optimizer implemented by PyTorch.
optimizer = dict(type='AdamW', lr=lr, betas=(0.95, 0.99), weight_decay=0.01)
optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2))
# We use cyclic learning rate and momentum schedule following SECOND.Pytorch
......
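The elided remainder of this schedule defines the cyclic policies themselves. A sketch of what such a schedule typically contains is shown below; the target ratios, step fraction, and epoch count are assumptions modeled on the SECOND.Pytorch-style cyclic schedules, not values taken from this diff.

```python
# Cyclic LR: ramp from lr up to 10 * lr over the first 40% of training,
# then anneal down to lr * 1e-4 (matching the comment above).
lr_config = dict(
    policy='cyclic',
    target_ratio=(10, 1e-4),
    cyclic_times=1,
    step_ratio_up=0.4)
# Momentum is cycled in the opposite direction of the learning rate.
momentum_config = dict(
    policy='cyclic',
    target_ratio=(0.85 / 0.95, 1),
    cyclic_times=1,
    step_ratio_up=0.4)
runner = dict(type='EpochBasedRunner', max_epochs=40)  # assumed epoch count
```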
# optimizer
# This schedule is mainly used on S3DIS dataset in segmentation task
optimizer = dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=None)
lr_config = dict(policy='CosineAnnealing', warmup=None, min_lr=1e-5)
# runtime settings
runner = dict(type='EpochBasedRunner', max_epochs=100)
...@@ -38,7 +38,7 @@ model = dict(
ranges=[[0, -40.0, -0.6, 70.4, 40.0, -0.6],
[0, -40.0, -0.6, 70.4, 40.0, -0.6],
[0, -40.0, -1.78, 70.4, 40.0, -1.78]],
-sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]],
+sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]],
rotations=[0, 1.57],
reshape_out=False),
diff_rad_by_sin=True,
......
...@@ -37,7 +37,7 @@ model = dict(
anchor_generator=dict(
type='Anchor3DRangeGenerator',
ranges=[[0, -39.68, -1.78, 69.12, 39.68, -1.78]],
-sizes=[[1.6, 3.9, 1.56]],
+sizes=[[3.9, 1.6, 1.56]],
rotations=[0, 1.57],
reshape_out=True),
diff_rad_by_sin=True,
......
...@@ -48,7 +48,7 @@ model = dict(
[0, -40.0, -0.6, 70.4, 40.0, -0.6],
[0, -40.0, -1.78, 70.4, 40.0, -1.78],
],
-sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]],
+sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]],
rotations=[0, 1.57],
reshape_out=False),
diff_rad_by_sin=True,
......
...@@ -39,7 +39,7 @@ model = dict(
[0, -40.0, -0.6, 70.4, 40.0, -0.6],
[0, -40.0, -1.78, 70.4, 40.0, -1.78],
],
-sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]],
+sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]],
rotations=[0, 1.57],
reshape_out=False),
diff_rad_by_sin=True,
......
# Center-based 3D Object Detection and Tracking
> [Center-based 3D Object Detection and Tracking](https://arxiv.org/abs/2006.11275)
<!-- [ALGORITHM] -->
## Abstract
Three-dimensional objects are commonly represented as 3D boxes in a point-cloud. This representation mimics the well-studied image-based 2D bounding-box detection but comes with additional challenges. Objects in a 3D world do not follow any particular orientation, and box-based detectors have difficulties enumerating all orientations or fitting an axis-aligned bounding box to rotated objects. In this paper, we instead propose to represent, detect, and track 3D objects as points. Our framework, CenterPoint, first detects centers of objects using a keypoint detector and regresses to other attributes, including 3D size, 3D orientation, and velocity. In a second stage, it refines these estimates using additional point features on the object. In CenterPoint, 3D object tracking simplifies to greedy closest-point matching. The resulting detection and tracking algorithm is simple, efficient, and effective. CenterPoint achieved state-of-the-art performance on the nuScenes benchmark for both 3D detection and tracking, with 65.5 NDS and 63.8 AMOTA for a single model. On the Waymo Open Dataset, CenterPoint outperforms all previous single model method by a large margin and ranks first among all Lidar-only submissions.
<div align=center>
<img src="https://user-images.githubusercontent.com/30491025/143854976-11af75ae-e828-43ad-835d-ac1146f99925.png" width="800"/>
</div>
## Introduction
We implement CenterPoint and provide the results and checkpoints on the nuScenes dataset.
We follow the below style to name config files. Contributors are advised to follow the same style.
...@@ -27,15 +37,6 @@ We follow the below style to name config files. Contributors are advised to foll
`{dataset}`: dataset like nus-3d, kitti-3d, lyft-3d, scannet-3d, sunrgbd-3d. We also indicate the number of classes we are using if there exist multiple settings, e.g., kitti-3d-3class and kitti-3d-car mean training on the KITTI dataset with 3 classes and a single class, respectively.
## Usage
### Test time augmentation
...@@ -103,7 +104,7 @@ data = dict(
```
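For reference, the flip-based test-time augmentation is driven by the `MultiScaleFlipAug3D` wrapper in the test pipeline. Below is an abbreviated sketch, not the exact pipeline elided above: the multi-sweep loading and range-filter transforms are omitted, and values such as the image scale are assumptions.

```python
# Sketch of a nuScenes test pipeline with double-flip TTA switched on.
class_names = [
    'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
    'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
]
test_pipeline = [
    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1333, 800),
        pts_scale_ratio=1,
        # These three switches enable the flip-based test-time augmentation.
        flip=True,
        pcd_horizontal_flip=True,
        pcd_vertical_flip=True,
        transforms=[
            dict(
                type='GlobalRotScaleTrans',
                rot_range=[0, 0],
                scale_ratio_range=[1.0, 1.0]),
            dict(type='RandomFlip3D', sync_2d=False),
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False),
            dict(type='Collect3D', keys=['points'])
        ])
]
```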
## Results and models
### CenterPoint
...@@ -124,3 +125,14 @@ data = dict(
|above w/o circle nms|pillar (0.2)|✗|✗| | |49.12|59.66||
|[SECFPN](./centerpoint_02pillar_second_secfpn_dcn_4x8_cyclic_20e_nus.py)|pillar (0.2)|✓|✗| 4.6| |48.8 |59.67 |[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/centerpoint/centerpoint_02pillar_second_secfpn_dcn_4x8_cyclic_20e_nus/centerpoint_02pillar_second_secfpn_dcn_4x8_cyclic_20e_nus_20200930_103722-3bb135f2.pth) &#124; [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/centerpoint/centerpoint_02pillar_second_secfpn_dcn_4x8_cyclic_20e_nus/centerpoint_02pillar_second_secfpn_dcn_4x8_cyclic_20e_nus_20200930_103722.log.json)|
|above w/ circle nms|pillar (0.2)|✓|✓| | |48.79|59.65||
## Citation
```latex
@article{yin2021center,
title={Center-based 3D Object Detection and Tracking},
author={Yin, Tianwei and Zhou, Xingyi and Kr{\"a}henb{\"u}hl, Philipp},
journal={CVPR},
year={2021},
}
```
_base_ = './centerpoint_0075voxel_second_secfpn_dcn_4x8_cyclic_20e_nus.py'
model = dict(test_cfg=dict(pts=dict(use_rotate_nms=True, max_num=500)))
point_cloud_range = [-54, -54, -5.0, 54, 54, 3.0]
file_client_args = dict(backend='disk')
class_names = [
......
# Dynamic Graph CNN for Learning on Point Clouds
> [Dynamic Graph CNN for Learning on Point Clouds](https://arxiv.org/abs/1801.07829)
<!-- [ALGORITHM] -->
## Abstract
Point clouds provide a flexible geometric representation suitable for countless applications in computer graphics; they also comprise the raw output of most 3D data acquisition devices. While hand-designed features on point clouds have long been proposed in graphics and vision, however, the recent overwhelming success of convolutional neural networks (CNNs) for image analysis suggests the value of adapting insight from CNN to the point cloud world. Point clouds inherently lack topological information so designing a model to recover topology can enrich the representation power of point clouds. To this end, we propose a new neural network module dubbed EdgeConv suitable for CNN-based high-level tasks on point clouds including classification and segmentation. EdgeConv acts on graphs dynamically computed in each layer of the network. It is differentiable and can be plugged into existing architectures. Compared to existing modules operating in extrinsic space or treating each point independently, EdgeConv has several appealing properties: It incorporates local neighborhood information; it can be stacked applied to learn global shape properties; and in multi-layer systems affinity in feature space captures semantic characteristics over potentially long distances in the original embedding. We show the performance of our model on standard benchmarks including ModelNet40, ShapeNetPart, and S3DIS.
<div align=center>
<img src="https://user-images.githubusercontent.com/30491025/143855852-3d7888ed-2cfc-416c-9ec8-57621edeaa34.png" width="800"/>
</div>
## Introduction
We implement DGCNN and provide the results and checkpoints on S3DIS dataset.
**Notice**: We follow the implementations in the original DGCNN paper and a PyTorch implementation of DGCNN [code](https://github.com/AnTao97/dgcnn.pytorch).
## Results and models
### S3DIS
| Method | Split | Lr schd | Mem (GB) | Inf time (fps) | mIoU (Val set) | Download |
| :-------------------------------------------------------------------------: | :----: | :--------: | :------: | :------------: | :------------: | :----------------------: |
| [DGCNN](./dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class.py) | Area_1 | cosine 100e | 13.1 | | 68.33 | [model](https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area1/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210731_000734-39658f14.pth) &#124; [log](https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area1/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210731_000734.log.json) |
| [DGCNN](./dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class.py) | Area_2 | cosine 100e | 13.1 | | 40.68 | [model](https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area2/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210731_144648-aea9ecb6.pth) &#124; [log](https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area2/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210731_144648.log.json) |
| [DGCNN](./dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class.py) | Area_3 | cosine 100e | 13.1 | | 69.38 | [model](https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area3/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210801_154629-2ff50ee0.pth) &#124; [log](https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area3/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210801_154629.log.json) |
| [DGCNN](./dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class.py) | Area_4 | cosine 100e | 13.1 | | 50.07 | [model](https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area4/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210802_073551-dffab9cd.pth) &#124; [log](https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area4/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210802_073551.log.json) |
| [DGCNN](./dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class.py) | Area_5 | cosine 100e | 13.1 | | 50.59 | [model](https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area5/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210730_235824-f277e0c5.pth) &#124; [log](https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area5/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210730_235824.log.json) |
| [DGCNN](./dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class.py) | Area_6 | cosine 100e | 13.1 | | 77.94 | [model](https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area6/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210802_154317-e3511b32.pth) &#124; [log](https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area6/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210802_154317.log.json) |
| [DGCNN](./dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class.py) | 6-fold | | | | 59.43 | |
**Notes:**
- We use XYZ+Color+Normalized_XYZ as input in all the experiments on S3DIS datasets.
- `Area_5` Split means training the model on Area_1, 2, 3, 4, 6 and testing on Area_5.
- `6-fold` Split means the overall result of 6 different splits (Area_1, Area_2, Area_3, Area_4, Area_5 and Area_6 Splits).
- Users need to modify `train_area` and `test_area` in the S3DIS dataset's [config](./configs/_base_/datasets/s3dis_seg-3d-13class.py) to set the training and testing areas, respectively, as sketched right below this list.
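A minimal sketch of that modification is shown below; the variable names follow the S3DIS base dataset config, and the derived info-file names are illustrative.

```python
# Sketch: choose the split used by the S3DIS segmentation dataset config.
train_area = [1, 2, 3, 4, 6]  # areas used for training
test_area = 5                 # held-out area used for testing
# The dataset config then picks up the matching info files, e.g.
# s3dis_infos_Area_1.pkl ... s3dis_infos_Area_6.pkl for training and
# s3dis_infos_Area_5.pkl for testing.
```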
## Indeterminism
Since DGCNN testing adopts sliding-patch inference, which involves random point sampling, and the test script uses fixed random seeds while the random seeds during validation in training are not fixed, the test results may be slightly different from the results reported above.
## Citation
```latex
@article{dgcnn,
title={Dynamic Graph CNN for Learning on Point Clouds},
author={Wang, Yue and Sun, Yongbin and Liu, Ziwei and Sarma, Sanjay E. and Bronstein, Michael M. and Solomon, Justin M.},
journal={ACM Transactions on Graphics (TOG)},
year={2019}
}
```
_base_ = [
'../_base_/datasets/s3dis_seg-3d-13class.py', '../_base_/models/dgcnn.py',
'../_base_/schedules/seg_cosine_100e.py', '../_base_/default_runtime.py'
]
# data settings
data = dict(samples_per_gpu=32)
evaluation = dict(interval=2)
# model settings
model = dict(
backbone=dict(in_channels=9), # [xyz, rgb, normalized_xyz]
decode_head=dict(
num_classes=13, ignore_index=13,
loss_decode=dict(class_weight=None)), # S3DIS doesn't use class_weight
test_cfg=dict(
num_points=4096,
block_size=1.0,
sample_rate=0.5,
use_normalized_coord=True,
batch_size=24))
# runtime settings
checkpoint_config = dict(interval=2)
Collections:
- Name: DGCNN
Metadata:
Training Techniques:
- SGD
Training Resources: 4x Titan XP GPUs
Architecture:
- DGCNN
Paper: https://arxiv.org/abs/1801.07829
README: configs/dgcnn/README.md
Models:
- Name: dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class.py
In Collection: DGCNN
Config: configs/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class.py
Metadata:
Training Data: S3DIS
Training Memory (GB): 13.3
Results:
- Task: 3D Semantic Segmentation
Dataset: S3DIS
Metrics:
mIoU: 50.59
Weights: https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area5/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210730_235824-f277e0c5.pth
# Dynamic Voxelization
> [End-to-End Multi-View Fusion for 3D Object Detection in LiDAR Point Clouds](https://arxiv.org/abs/1910.06528)
<!-- [ALGORITHM] -->
## Abstract
Recent work on 3D object detection advocates point cloud voxelization in birds-eye view, where objects preserve their physical dimensions and are naturally separable. When represented in this view, however, point clouds are sparse and have highly variable point density, which may cause detectors difficulties in detecting distant or small objects (pedestrians, traffic signs, etc.). On the other hand, perspective view provides dense observations, which could allow more favorable feature encoding for such cases. In this paper, we aim to synergize the birds-eye view and the perspective view and propose a novel end-to-end multi-view fusion (MVF) algorithm, which can effectively learn to utilize the complementary information from both. Specifically, we introduce dynamic voxelization, which has four merits compared to existing voxelization methods, i) removing the need of pre-allocating a tensor with fixed size; ii) overcoming the information loss due to stochastic point/voxel dropout; iii) yielding deterministic voxel embeddings and more stable detection outcomes; iv) establishing the bi-directional relationship between points and voxels, which potentially lays a natural foundation for cross-view feature fusion. By employing dynamic voxelization, the proposed feature fusion architecture enables each point to learn to fuse context information from different views. MVF operates on points and can be naturally extended to other approaches using LiDAR point clouds. We evaluate our MVF model extensively on the newly released Waymo Open Dataset and on the KITTI dataset and demonstrate that it significantly improves detection accuracy over the comparable single-view PointPillars baseline.
<div align=center>
<img src="https://user-images.githubusercontent.com/30491025/143856017-98b77ecb-7c13-4164-9c1d-e3011a7645e6.png" width="600"/>
</div>
## Introduction
We implement Dynamic Voxelization, proposed in MVF, and provide its results and models on the KITTI dataset.
## Results and models
### KITTI
...@@ -27,3 +25,16 @@ We implement Dynamic Voxelization proposed in and provide its results and model
|[SECOND](./dv_second_secfpn_6x8_80e_kitti-3d-car.py)|Car |cyclic 80e|5.5||78.83|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/dynamic_voxelization/dv_second_secfpn_6x8_80e_kitti-3d-car/dv_second_secfpn_6x8_80e_kitti-3d-car_20200620_235228-ac2c1c0c.pth) &#124; [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/dynamic_voxelization/dv_second_secfpn_6x8_80e_kitti-3d-car/dv_second_secfpn_6x8_80e_kitti-3d-car_20200620_235228.log.json)|
|[SECOND](./dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class.py)| 3 Class|cosine 80e|5.5||65.10|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/dynamic_voxelization/dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class/dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class_20200620_231010-6aa607d3.pth) &#124; [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/dynamic_voxelization/dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class/dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class_20200620_231010.log.json)|
|[PointPillars](./dv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py)| Car|cyclic 80e|4.7||77.76|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/dynamic_voxelization/dv_pointpillars_secfpn_6x8_160e_kitti-3d-car/dv_pointpillars_secfpn_6x8_160e_kitti-3d-car_20200620_230844-ee7b75c9.pth) &#124; [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/dynamic_voxelization/dv_pointpillars_secfpn_6x8_160e_kitti-3d-car/dv_pointpillars_secfpn_6x8_160e_kitti-3d-car_20200620_230844.log.json)|
## Citation
```latex
@article{zhou2019endtoend,
title={End-to-End Multi-View Fusion for 3D Object Detection in LiDAR Point Clouds},
author={Yin Zhou and Pei Sun and Yu Zhang and Dragomir Anguelov and Jiyang Gao and Tom Ouyang and James Guo and Jiquan Ngiam and Vijay Vasudevan},
year={2019},
eprint={1910.06528},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
```
# FCOS3D: Fully Convolutional One-Stage Monocular 3D Object Detection
> [FCOS3D: Fully Convolutional One-Stage Monocular 3D Object Detection](https://arxiv.org/abs/2104.10956)
<!-- [ALGORITHM] -->
## Abstract
Monocular 3D object detection is an important task for autonomous driving considering its advantage of low cost. It is much more challenging than conventional 2D cases due to its inherent ill-posed property, which is mainly reflected in the lack of depth information. Recent progress on 2D detection offers opportunities to better solving this problem. However, it is non-trivial to make a general adapted 2D detector work in this 3D task. In this paper, we study this problem with a practice built on a fully convolutional single-stage detector and propose a general framework FCOS3D. Specifically, we first transform the commonly defined 7-DoF 3D targets to the image domain and decouple them as 2D and 3D attributes. Then the objects are distributed to different feature levels with consideration of their 2D scales and assigned only according to the projected 3D-center for the training procedure. Furthermore, the center-ness is redefined with a 2D Gaussian distribution based on the 3D-center to fit the 3D target formulation. All of these make this framework simple yet effective, getting rid of any 2D detection or 2D-3D correspondence priors. Our solution achieves 1st place out of all the vision-only methods in the nuScenes 3D detection challenge of NeurIPS 2020.
<div align=center>
<img src="https://user-images.githubusercontent.com/30491025/143856739-93b7c4ff-e116-4824-8cc3-8cf1a433a84c.png" width="800"/>
</div>
## Introduction
FCOS3D is a general anchor-free, one-stage monocular 3D object detector adapted from the original 2D version FCOS.
It serves as a baseline built on top of mmdetection and mmdetection3d for 3D detection based on monocular vision.
Currently we first support the benchmark on the large-scale nuScenes dataset, which achieved 1st place out of all the vision-only methods in the [nuScenes 3D detection challenge](https://www.nuscenes.org/object-detection?externalData=all&mapData=all&modalities=Camera) of NeurIPS 2020.
![demo image](../../resources/browse_dataset_mono.png)
## Usage
...@@ -52,7 +46,7 @@ Due to the scale and measurements of depth is different from those of other regr
We also provide visualization functions to show the monocular 3D detection results. Simply follow the [documentation](https://mmdetection3d.readthedocs.io/en/latest/1_exist_data_model.html#test-existing-models-on-standard-datasets) and use the `single-gpu testing` command. You only need to add the `--show` flag and specify `--show-dir` to store the visualization results.
## Results and models
### NuScenes
...@@ -61,3 +55,21 @@ We also provide visualization functions to show the monocular 3D detection resul
|[ResNet101 w/ DCN](./fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d.py)|1x|8.69||29.8|37.7|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fcos3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d_20210715_235813-4bed5239.pth) &#124; [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fcos3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d_20210715_235813.log.json)|
|[above w/ finetune](./fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d_finetune.py)|1x|8.69||32.1|39.5|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fcos3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d_finetune/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d_finetune_20210717_095645-8d806dc2.pth) &#124; [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fcos3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d_finetune/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d_finetune_20210717_095645.log.json)|
|above w/ tta|1x|8.69||33.1|40.3||
## Citation
```latex
@inproceedings{wang2021fcos3d,
title={{FCOS3D: Fully} Convolutional One-Stage Monocular 3D Object Detection},
author={Wang, Tai and Zhu, Xinge and Pang, Jiangmiao and Lin, Dahua},
booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
year={2021}
}
# For the original 2D version
@inproceedings{tian2019fcos,
title = {{FCOS: Fully} Convolutional One-Stage Object Detection},
author = {Tian, Zhi and Shen, Chunhua and Chen, Hao and He, Tong},
booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
year = {2019}
}
```
# Mixed Precision Training
## Introduction
<!-- [OTHERS] -->
We implement mixed precision training and apply it to VoxelNets (e.g., SECOND and PointPillars).
The results are in the following tables.
**Note**: For mixed precision training, we currently do not support PointNet-based methods (e.g., VoteNet).
Mixed precision training for PointNet-based methods will be supported in a future release.
## Results
### SECOND on KITTI dataset
| Backbone |Class| Lr schd | FP32 Mem (GB) | FP16 Mem (GB) | FP32 mAP | FP16 mAP |Download |
| :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: | :------: |
| [SECFPN](./hv_second_secfpn_fp16_6x8_80e_kitti-3d-car.py)| Car |cyclic 80e|5.4|2.9|79.07|78.72|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car_20200924_211301-1f5ad833.pth)&#124; [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car_20200924_211301.log.json)|
| [SECFPN](./hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py)| 3 Class |cyclic 80e|5.4|2.9|64.41|67.4|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class_20200925_110059-05f67bdf.pth) &#124; [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class_20200925_110059.log.json)|
### PointPillars on nuScenes dataset
| Backbone | Lr schd | FP32 Mem (GB) | FP16 Mem (GB) | FP32 mAP | FP32 NDS| FP16 mAP | FP16 NDS| Download |
| :---------: | :-----: | :------: | :------------: | :----: |:----: | :----: |:----: | :------: |
|[SECFPN](./hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d.py)|2x|16.4|8.37|35.17|49.7|35.19|50.27|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d_20201020_222626-c3f0483e.pth) &#124; [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d_20201020_222626.log.json)|
|[FPN](./hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d.py)|2x|16.4|8.40|40.0|53.3|39.26|53.26|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d_20201021_120719-269f9dd6.pth) &#124; [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d_20201021_120719.log.json)|
**Note**:
1. With mixed precision training, we can train PointPillars on the nuScenes dataset on 8 Titan XP GPUs with a batch size of 2.
This would cause an OOM error without mixed precision training.
2. The loss scale for PointPillars on the nuScenes dataset is specifically tuned to keep the loss from becoming NaN. We find 32 is more stable than 512, though a loss scale of 32 still causes NaN occasionally.
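For reference, mixed precision training in this codebase is switched on from the config. Below is a minimal sketch; the `_base_` filename and batch-size settings are assumptions, while the loss scale of 32 follows the note above. The `fp16` field is picked up by mmcv's `Fp16OptimizerHook` under the hood.

```python
# Sketch of an FP16 config for PointPillars on nuScenes.
_base_ = '../pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d.py'

# Smaller per-GPU batch plus a hand-tuned static loss scale (see note 2).
data = dict(samples_per_gpu=2, workers_per_gpu=2)
fp16 = dict(loss_scale=32.)
```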
Collections:
- Name: FP16
Metadata:
Training Techniques:
- AdamW
- Mixed Precision Training
Training Resources: 8x TITAN Xp
Architecture:
- Hard Voxelization
Paper:
URL: https://arxiv.org/abs/1710.03740
Title: 'Mixed Precision Training'
README: configs/fp16/README.md
Code:
Version: v0.7.0
Models:
- Name: hv_second_secfpn_fp16_6x8_80e_kitti-3d-car
In Collection: FP16
Config: configs/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car.py
Metadata:
Training Data: KITTI
Training Memory (GB): 2.9
Results:
- Task: 3D Object Detection
Dataset: KITTI
Metrics:
FP16 mAP: 78.72
Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car_20200924_211301-1f5ad833.pth
- Name: hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class
In Collection: FP16
Config: configs/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py
Metadata:
Training Data: KITTI
Training Memory (GB): 2.9
Results:
- Task: 3D Object Detection
Dataset: KITTI
Metrics:
FP16 mAP: 67.4
Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class_20200925_110059-05f67bdf.pth
- Name: hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d
In Collection: FP16
Config: configs/fp16/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d.py
Metadata:
Training Data: nuScenes
Training Memory (GB): 8.37
Results:
- Task: 3D Object Detection
Dataset: nuScenes
Metrics:
FP16 mAP: 35.19
FP16 NDS: 50.27
Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d_20201020_222626-c3f0483e.pth
- Name: hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d
In Collection: FP16
Config: configs/fp16/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d.py
Metadata:
Training Data: nuScenes
Training Memory (GB): 8.40
Results:
- Task: 3D Object Detection
Dataset: nuScenes
Metrics:
FP16 mAP: 39.26
FP16 NDS: 53.26
Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d_20201021_120719-269f9dd6.pth
# FreeAnchor for 3D Object Detection
> [FreeAnchor: Learning to Match Anchors for Visual Object Detection](https://arxiv.org/abs/1909.02466)
<!-- [ALGORITHM] -->
## Abstract
Modern CNN-based object detectors assign anchors for ground-truth objects under the restriction of object-anchor Intersection-over-Union (IoU). In this study, we propose a learning-to-match approach to break IoU restriction, allowing objects to match anchors in a flexible manner. Our approach, referred to as FreeAnchor, updates hand-crafted anchor assignment to "free" anchor matching by formulating detector training as a maximum likelihood estimation (MLE) procedure. FreeAnchor targets at learning features which best explain a class of objects in terms of both classification and localization. FreeAnchor is implemented by optimizing detection customized likelihood and can be fused with CNN-based detectors in a plug-and-play manner. Experiments on COCO demonstrate that FreeAnchor consistently outperforms the counterparts with significant margins.
<div align=center>
<img src="https://user-images.githubusercontent.com/36950400/143866685-e3ac08bb-cd0c-4ada-ba8a-18e03cccdd0f.png" width="600"/>
</div>
## Introduction
We implement FreeAnchor in 3D detection systems and provide their first results with PointPillars on nuScenes dataset.
With the implemented `FreeAnchor3DHead`, a PointPillar detector with a big backbone (e.g., RegNet-3.2GF) achieves top performance
on the nuScenes benchmark.
## Usage
### Modify config
...@@ -49,8 +50,8 @@ model = dict(
ranges=[[-50, -50, -1.8, 50, 50, -1.8]],
scales=[1, 2, 4],
sizes=[
-[0.8660, 2.5981, 1.],  # 1.5/sqrt(3)
-[0.5774, 1.7321, 1.],  # 1/sqrt(3)
+[2.5981, 0.8660, 1.],  # 1.5 / sqrt(3)
+[1.7321, 0.5774, 1.],  # 1 / sqrt(3)
[1., 1., 1.],
[0.4, 0.4, 1],
],
...@@ -59,8 +60,7 @@ model = dict(
reshape_out=True),
assigner_per_size=False,
diff_rad_by_sin=True,
-dir_offset=0.7854,  # pi/4
-dir_limit_offset=0,
+dir_offset=-0.7854,  # -pi / 4
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9),
loss_cls=dict(
type='FocalLoss',
...@@ -76,7 +76,7 @@ model = dict(
pts=dict(code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.25, 0.25])))
```
## Results and models
### PointPillars
...@@ -92,3 +92,14 @@ model = dict(
|[RegNetX-3.2GF-FPN](./hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d.py)*|✓|3x|29.5||55.09|63.5|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/free_anchor/hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d/hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d_20200629_181452-297fdc66.pth) &#124; [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/free_anchor/hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d/hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d_20200629_181452.log.json)|
**Note**: Models noted by `*` are trained using stronger augmentation with vertical flip under bird's-eye view, global translation, and a larger range of global rotation.
## Citation
```latex
@inproceedings{zhang2019freeanchor,
title = {{FreeAnchor}: Learning to Match Anchors for Visual Object Detection},
author = {Zhang, Xiaosong and Wan, Fang and Liu, Chang and Ji, Rongrong and Ye, Qixiang},
booktitle = {Neural Information Processing Systems},
year = {2019}
}
```
...@@ -21,8 +21,8 @@ model = dict(
ranges=[[-50, -50, -1.8, 50, 50, -1.8]],
scales=[1, 2, 4],
sizes=[
-[0.8660, 2.5981, 1.],  # 1.5/sqrt(3)
-[0.5774, 1.7321, 1.],  # 1/sqrt(3)
+[2.5981, 0.8660, 1.],  # 1.5 / sqrt(3)
+[1.7321, 0.5774, 1.],  # 1 / sqrt(3)
[1., 1., 1.],
[0.4, 0.4, 1],
],
...@@ -31,8 +31,7 @@ model = dict(
reshape_out=True),
assigner_per_size=False,
diff_rad_by_sin=True,
-dir_offset=0.7854,  # pi/4
-dir_limit_offset=0,
+dir_offset=-0.7854,  # -pi / 4
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9),
loss_cls=dict(
type='FocalLoss',
......
# Group-Free 3D Object Detection via Transformers
> [Group-Free 3D Object Detection via Transformers](https://arxiv.org/abs/2104.00678)
<!-- [ALGORITHM] -->
## Abstract
Recently, directly detecting 3D objects from 3D point clouds has received increasing attention. To extract object representation from an irregular point cloud, existing methods usually take a point grouping step to assign the points to an object candidate so that a PointNet-like network could be used to derive object features from the grouped points. However, the inaccurate point assignments caused by the hand-crafted grouping scheme decrease the performance of 3D object detection. In this paper, we present a simple yet effective method for directly detecting 3D objects from the 3D point cloud. Instead of grouping local points to each object candidate, our method computes the feature of an object from all the points in the point cloud with the help of an attention mechanism in the Transformers, where the contribution of each point is automatically learned in the network training. With an improved attention stacking scheme, our method fuses object features in different stages and generates more accurate object detection results. With few bells and whistles, the proposed method achieves state-of-the-art 3D object detection performance on two widely used benchmarks, ScanNet V2 and SUN RGB-D.

<div align=center>
<img src="https://user-images.githubusercontent.com/36950400/143868101-09787c2a-9e0b-4013-8800-b4e315d535f0.png" width="800"/>
</div>

## Introduction

We implement Group-Free-3D and provide the results and checkpoints on the ScanNet dataset.
## Results and models
### ScanNet
...@@ -30,3 +31,14 @@ We implement Group-Free-3D and provide the result and checkpoints on ScanNet dat
- We report the best results (AP@0.50) on validation set during each training. * means the evaluation method in the paper: we train each setting 5 times and test each training trial 5 times, then the average performance of these 25 trials is reported to account for algorithm randomness.
- We use 4 GPUs for training by default as in the original code.
## Citation
```latex
@article{liu2021,
title={Group-Free 3D Object Detection via Transformers},
author={Liu, Ze and Zhang, Zheng and Cao, Yue and Hu, Han and Tong, Xin},
journal={Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
year={2021}
}
```