Unverified Commit 318499ac authored by hjin2902, committed by GitHub

[Feature] Group-Free-3D head (#539)



* group-free-3d head

* GroupFree3DNet->VoteNet

* modify docstring

* bugfix: calculate pts_instance_label, decoder self/cross posembed init

* support GroupFree3DNet, modify scannet train config

* support point cloud input features dim = 0

* add groupfree3dnet test case

* bugfix: softmax in decode boxes

* support multi-stage predictions

* modify GroupFree3DMultiheadAttention input parameters

* refactor: support sunrgbd-based train

* refactor: support sunrgbd-based train

* fix parts of bug

* modify multi-stage prediction

* fixbug: conv_channels

* bugfix: permute

* bugfix: permute

* bugfix: expand

* fix MAX_NUM_OBJ=64

* 4 gpu training, score_thr = 0.0

* modify config, repeattime=1

* bugfix: expand

* modify: GroupFree3DMHA, build_positional_encoding

* modify: GroupFree3DMHA, build_positional_encoding

* bugfix: torch.nn

* bugfix: mean loss

* residual -> identity

* fix name: DropOut -> Dropout

* delete sunrgbd-based config

* Fix: trailing whitespace

* suffix -> prefix

* bugfix: groupfree3d config
Co-authored-by: jinhui <PJLAB\jinhui@shai14001019l.pjlab.org>
parent e63e0473
model = dict(
type='GroupFree3DNet',
backbone=dict(
type='PointNet2SASSG',
in_channels=3,
num_points=(2048, 1024, 512, 256),
radius=(0.2, 0.4, 0.8, 1.2),
num_samples=(64, 32, 16, 16),
sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256),
(128, 128, 256)),
fp_channels=((256, 256), (256, 288)),
norm_cfg=dict(type='BN2d'),
sa_cfg=dict(
type='PointSAModule',
pool_mod='max',
use_xyz=True,
normalize_xyz=True)),
bbox_head=dict(
type='GroupFree3DHead',
in_channels=288,
num_decoder_layers=6,
num_proposal=256,
transformerlayers=dict(
type='BaseTransformerLayer',
attn_cfgs=dict(
type='GroupFree3DMHA',
embed_dims=288,
num_heads=8,
attn_drop=0.1,
dropout_layer=dict(type='Dropout', drop_prob=0.1)),
ffn_cfgs=dict(
embed_dims=288,
feedforward_channels=2048,
ffn_drop=0.1,
act_cfg=dict(type='ReLU', inplace=True)),
operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn',
'norm')),
pred_layer_cfg=dict(
in_channels=288, shared_conv_channels=(288, 288), bias=True),
sampling_objectness_loss=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=8.0),
objectness_loss=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
center_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
dir_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
dir_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
size_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
size_res_loss=dict(
type='SmoothL1Loss', beta=1.0, reduction='sum', loss_weight=10.0),
semantic_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
# model training and testing settings
train_cfg=dict(sample_mod='kps'),
test_cfg=dict(
sample_mod='kps',
nms_thr=0.25,
score_thr=0.0,
per_class_proposal=True,
prediction_stages='last'))
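For reference, a minimal sketch of consuming the base model config above (illustrative, not part of this commit). It assumes mmcv and mmdet3d are installed; the local path is an assumption based on the `_base_` references in the configs below.

from mmcv import Config
from mmdet3d.models import build_detector

# Hypothetical local path to the base model config shown above.
cfg = Config.fromfile('configs/_base_/models/groupfree3d.py')
# In this config style, train_cfg/test_cfg are nested inside the model dict.
model = build_detector(cfg.model)
print(type(model).__name__)  # expected: GroupFree3DNet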
_base_ = [
'../_base_/datasets/scannet-3d-18class.py',
'../_base_/models/groupfree3d.py', '../_base_/schedules/schedule_3x.py',
'../_base_/default_runtime.py'
]
# model settings
model = dict(
bbox_head=dict(
num_classes=18,
num_decoder_layers=12,
size_cls_agnostic=False,
bbox_coder=dict(
type='GroupFree3DBBoxCoder',
num_sizes=18,
num_dir_bins=1,
with_rot=False,
size_cls_agnostic=False,
mean_sizes=[[0.76966727, 0.8116021, 0.92573744],
[1.876858, 1.8425595, 1.1931566],
[0.61328, 0.6148609, 0.7182701],
[1.3955007, 1.5121545, 0.83443564],
[0.97949594, 1.0675149, 0.6329687],
[0.531663, 0.5955577, 1.7500148],
[0.9624706, 0.72462326, 1.1481868],
[0.83221924, 1.0490936, 1.6875663],
[0.21132214, 0.4206159, 0.5372846],
[1.4440073, 1.8970833, 0.26985747],
[1.0294262, 1.4040797, 0.87554324],
[1.3766412, 0.65521795, 1.6813129],
[0.6650819, 0.71111923, 1.298853],
[0.41999173, 0.37906948, 1.7513971],
[0.59359556, 0.5912492, 0.73919016],
[0.50867593, 0.50656086, 0.30136237],
[1.1511526, 1.0546296, 0.49706793],
[0.47535285, 0.49249494, 0.5802117]]),
sampling_objectness_loss=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=8.0),
objectness_loss=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
center_loss=dict(
type='SmoothL1Loss', beta=0.04, reduction='sum', loss_weight=10.0),
dir_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
dir_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
size_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
size_res_loss=dict(
type='SmoothL1Loss',
beta=1.0 / 9.0,
reduction='sum',
loss_weight=10.0 / 9.0),
semantic_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
test_cfg=dict(
sample_mod='kps',
nms_thr=0.25,
score_thr=0.0,
per_class_proposal=True,
prediction_stages='last_three'))
# dataset settings
dataset_type = 'ScanNetDataset'
data_root = './data/scannet/'
class_names = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
'bookshelf', 'picture', 'counter', 'desk', 'curtain',
'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub',
'garbagebin')
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='DEPTH',
load_dim=6,
use_dim=[0, 1, 2]),
dict(
type='LoadAnnotations3D',
with_bbox_3d=True,
with_label_3d=True,
with_mask_3d=True,
with_seg_3d=True),
dict(type='GlobalAlignment', rotation_axis=2),
dict(
type='PointSegClassMapping',
valid_cat_ids=(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34,
36, 39)),
dict(type='IndoorPointSample', num_points=50000),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.087266, 0.087266],
scale_ratio_range=[1.0, 1.0]),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
keys=[
'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask',
'pts_instance_mask'
])
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='DEPTH',
load_dim=6,
use_dim=[0, 1, 2]),
dict(type='GlobalAlignment', rotation_axis=2),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
dict(type='IndoorPointSample', num_points=50000),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
data = dict(
samples_per_gpu=8,
workers_per_gpu=4,
train=dict(
type='RepeatDataset',
times=1,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'scannet_infos_train.pkl',
pipeline=train_pipeline,
filter_empty_gt=False,
classes=class_names,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='Depth')),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'scannet_infos_val.pkl',
pipeline=test_pipeline,
classes=class_names,
test_mode=True,
box_type_3d='Depth'),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'scannet_infos_val.pkl',
pipeline=test_pipeline,
classes=class_names,
test_mode=True,
box_type_3d='Depth'))
# optimizer
lr = 0.006
optimizer = dict(
lr=lr,
weight_decay=0.0005,
paramwise_cfg=dict(
custom_keys={
'bbox_head.decoder_layers': dict(lr_mult=0.1, decay_mult=1.0),
'bbox_head.decoder_self_posembeds': dict(
lr_mult=0.1, decay_mult=1.0),
'bbox_head.decoder_cross_posembeds': dict(
lr_mult=0.1, decay_mult=1.0),
'bbox_head.decoder_query_proj': dict(lr_mult=0.1, decay_mult=1.0),
'bbox_head.decoder_key_proj': dict(lr_mult=0.1, decay_mult=1.0)
}))
optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))
lr_config = dict(policy='step', warmup=None, step=[280, 340])
# runtime settings
runner = dict(type='EpochBasedRunner', max_epochs=400)
# yapf:disable
log_config = dict(
interval=30,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
# yapf:enable
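A short illustrative sketch (not part of this commit) of how the ScanNet config above overrides the base model through `_base_` inheritance: keys set in this file replace the matching keys in '../_base_/models/groupfree3d.py', and everything else is inherited. The filename below is hypothetical.

from mmcv import Config

# Hypothetical filename for the config above.
cfg = Config.fromfile('groupfree3d_scannet_L12.py')
assert cfg.model.bbox_head.num_decoder_layers == 12  # overridden in this file
assert cfg.model.bbox_head.num_proposal == 256       # inherited from the base model config
assert cfg.model.backbone.type == 'PointNet2SASSG'   # inherited from the base model config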
_base_ = [
'../_base_/datasets/scannet-3d-18class.py',
'../_base_/models/groupfree3d.py', '../_base_/schedules/schedule_3x.py',
'../_base_/default_runtime.py'
]
# model settings
model = dict(
bbox_head=dict(
num_classes=18,
size_cls_agnostic=False,
bbox_coder=dict(
type='GroupFree3DBBoxCoder',
num_sizes=18,
num_dir_bins=1,
with_rot=False,
size_cls_agnostic=False,
mean_sizes=[[0.76966727, 0.8116021, 0.92573744],
[1.876858, 1.8425595, 1.1931566],
[0.61328, 0.6148609, 0.7182701],
[1.3955007, 1.5121545, 0.83443564],
[0.97949594, 1.0675149, 0.6329687],
[0.531663, 0.5955577, 1.7500148],
[0.9624706, 0.72462326, 1.1481868],
[0.83221924, 1.0490936, 1.6875663],
[0.21132214, 0.4206159, 0.5372846],
[1.4440073, 1.8970833, 0.26985747],
[1.0294262, 1.4040797, 0.87554324],
[1.3766412, 0.65521795, 1.6813129],
[0.6650819, 0.71111923, 1.298853],
[0.41999173, 0.37906948, 1.7513971],
[0.59359556, 0.5912492, 0.73919016],
[0.50867593, 0.50656086, 0.30136237],
[1.1511526, 1.0546296, 0.49706793],
[0.47535285, 0.49249494, 0.5802117]]),
sampling_objectness_loss=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=8.0),
objectness_loss=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
center_loss=dict(
type='SmoothL1Loss', beta=0.04, reduction='sum', loss_weight=10.0),
dir_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
dir_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
size_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
size_res_loss=dict(
type='SmoothL1Loss',
beta=1.0 / 9.0,
reduction='sum',
loss_weight=10.0 / 9.0),
semantic_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
test_cfg=dict(
sample_mod='kps',
nms_thr=0.25,
score_thr=0.0,
per_class_proposal=True,
prediction_stages='last_three'))
# dataset settings
dataset_type = 'ScanNetDataset'
data_root = './data/scannet/'
class_names = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
'bookshelf', 'picture', 'counter', 'desk', 'curtain',
'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub',
'garbagebin')
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='DEPTH',
load_dim=6,
use_dim=[0, 1, 2]),
dict(
type='LoadAnnotations3D',
with_bbox_3d=True,
with_label_3d=True,
with_mask_3d=True,
with_seg_3d=True),
dict(type='GlobalAlignment', rotation_axis=2),
dict(
type='PointSegClassMapping',
valid_cat_ids=(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34,
36, 39)),
dict(type='IndoorPointSample', num_points=50000),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.087266, 0.087266],
scale_ratio_range=[1.0, 1.0]),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
keys=[
'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask',
'pts_instance_mask'
])
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='DEPTH',
load_dim=6,
use_dim=[0, 1, 2]),
dict(type='GlobalAlignment', rotation_axis=2),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
dict(type='IndoorPointSample', num_points=50000),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
data = dict(
samples_per_gpu=8,
workers_per_gpu=4,
train=dict(
type='RepeatDataset',
times=1,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'scannet_infos_train.pkl',
pipeline=train_pipeline,
filter_empty_gt=False,
classes=class_names,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='Depth')),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'scannet_infos_val.pkl',
pipeline=test_pipeline,
classes=class_names,
test_mode=True,
box_type_3d='Depth'),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'scannet_infos_val.pkl',
pipeline=test_pipeline,
classes=class_names,
test_mode=True,
box_type_3d='Depth'))
# optimizer
lr = 0.006
optimizer = dict(
lr=lr,
weight_decay=0.0005,
paramwise_cfg=dict(
custom_keys={
'bbox_head.decoder_layers': dict(lr_mult=0.1, decay_mult=1.0),
'bbox_head.decoder_self_posembeds': dict(
lr_mult=0.1, decay_mult=1.0),
'bbox_head.decoder_cross_posembeds': dict(
lr_mult=0.1, decay_mult=1.0),
'bbox_head.decoder_query_proj': dict(lr_mult=0.1, decay_mult=1.0),
'bbox_head.decoder_key_proj': dict(lr_mult=0.1, decay_mult=1.0)
}))
optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))
lr_config = dict(policy='step', warmup=None, step=[280, 340])
# runtime settings
runner = dict(type='EpochBasedRunner', max_epochs=400)
# yapf:disable
log_config = dict(
interval=30,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
# yapf:enable
_base_ = [
'../_base_/datasets/scannet-3d-18class.py',
'../_base_/models/groupfree3d.py', '../_base_/schedules/schedule_3x.py',
'../_base_/default_runtime.py'
]
# model settings
model = dict(
backbone=dict(
type='PointNet2SASSG',
in_channels=3,
num_points=(2048, 1024, 512, 256),
radius=(0.2, 0.4, 0.8, 1.2),
num_samples=(64, 32, 16, 16),
sa_channels=((128, 128, 256), (256, 256, 512), (256, 256, 512),
(256, 256, 512)),
fp_channels=((512, 512), (512, 288)),
norm_cfg=dict(type='BN2d'),
sa_cfg=dict(
type='PointSAModule',
pool_mod='max',
use_xyz=True,
normalize_xyz=True)),
bbox_head=dict(
num_classes=18,
num_decoder_layers=12,
size_cls_agnostic=False,
bbox_coder=dict(
type='GroupFree3DBBoxCoder',
num_sizes=18,
num_dir_bins=1,
with_rot=False,
size_cls_agnostic=False,
mean_sizes=[[0.76966727, 0.8116021, 0.92573744],
[1.876858, 1.8425595, 1.1931566],
[0.61328, 0.6148609, 0.7182701],
[1.3955007, 1.5121545, 0.83443564],
[0.97949594, 1.0675149, 0.6329687],
[0.531663, 0.5955577, 1.7500148],
[0.9624706, 0.72462326, 1.1481868],
[0.83221924, 1.0490936, 1.6875663],
[0.21132214, 0.4206159, 0.5372846],
[1.4440073, 1.8970833, 0.26985747],
[1.0294262, 1.4040797, 0.87554324],
[1.3766412, 0.65521795, 1.6813129],
[0.6650819, 0.71111923, 1.298853],
[0.41999173, 0.37906948, 1.7513971],
[0.59359556, 0.5912492, 0.73919016],
[0.50867593, 0.50656086, 0.30136237],
[1.1511526, 1.0546296, 0.49706793],
[0.47535285, 0.49249494, 0.5802117]]),
sampling_objectness_loss=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=8.0),
objectness_loss=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
center_loss=dict(
type='SmoothL1Loss', beta=0.04, reduction='sum', loss_weight=10.0),
dir_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
dir_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
size_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
size_res_loss=dict(
type='SmoothL1Loss',
beta=1.0 / 9.0,
reduction='sum',
loss_weight=10.0 / 9.0),
semantic_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
test_cfg=dict(
sample_mod='kps',
nms_thr=0.25,
score_thr=0.0,
per_class_proposal=True,
prediction_stages='last_three'))
# dataset settings
dataset_type = 'ScanNetDataset'
data_root = './data/scannet/'
class_names = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
'bookshelf', 'picture', 'counter', 'desk', 'curtain',
'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub',
'garbagebin')
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='DEPTH',
load_dim=6,
use_dim=[0, 1, 2]),
dict(
type='LoadAnnotations3D',
with_bbox_3d=True,
with_label_3d=True,
with_mask_3d=True,
with_seg_3d=True),
dict(type='GlobalAlignment', rotation_axis=2),
dict(
type='PointSegClassMapping',
valid_cat_ids=(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34,
36, 39)),
dict(type='IndoorPointSample', num_points=50000),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.087266, 0.087266],
scale_ratio_range=[1.0, 1.0]),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
keys=[
'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask',
'pts_instance_mask'
])
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='DEPTH',
load_dim=6,
use_dim=[0, 1, 2]),
dict(type='GlobalAlignment', rotation_axis=2),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
dict(type='IndoorPointSample', num_points=50000),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
data = dict(
samples_per_gpu=8,
workers_per_gpu=4,
train=dict(
type='RepeatDataset',
times=1,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'scannet_infos_train.pkl',
pipeline=train_pipeline,
filter_empty_gt=False,
classes=class_names,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='Depth')),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'scannet_infos_val.pkl',
pipeline=test_pipeline,
classes=class_names,
test_mode=True,
box_type_3d='Depth'),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'scannet_infos_val.pkl',
pipeline=test_pipeline,
classes=class_names,
test_mode=True,
box_type_3d='Depth'))
# optimizer
lr = 0.006
optimizer = dict(
lr=lr,
weight_decay=0.0005,
paramwise_cfg=dict(
custom_keys={
'bbox_head.decoder_layers': dict(lr_mult=0.1, decay_mult=1.0),
'bbox_head.decoder_self_posembeds': dict(
lr_mult=0.1, decay_mult=1.0),
'bbox_head.decoder_cross_posembeds': dict(
lr_mult=0.1, decay_mult=1.0),
'bbox_head.decoder_query_proj': dict(lr_mult=0.1, decay_mult=1.0),
'bbox_head.decoder_key_proj': dict(lr_mult=0.1, decay_mult=1.0)
}))
optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))
lr_config = dict(policy='step', warmup=None, step=[280, 340])
# runtime settings
runner = dict(type='EpochBasedRunner', max_epochs=400)
# yapf:disable
log_config = dict(
interval=30,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
# yapf:enable
_base_ = [
'../_base_/datasets/scannet-3d-18class.py',
'../_base_/models/groupfree3d.py', '../_base_/schedules/schedule_3x.py',
'../_base_/default_runtime.py'
]
# model settings
model = dict(
backbone=dict(
type='PointNet2SASSG',
in_channels=3,
num_points=(2048, 1024, 512, 256),
radius=(0.2, 0.4, 0.8, 1.2),
num_samples=(64, 32, 16, 16),
sa_channels=((128, 128, 256), (256, 256, 512), (256, 256, 512),
(256, 256, 512)),
fp_channels=((512, 512), (512, 288)),
norm_cfg=dict(type='BN2d'),
sa_cfg=dict(
type='PointSAModule',
pool_mod='max',
use_xyz=True,
normalize_xyz=True)),
bbox_head=dict(
num_classes=18,
num_decoder_layers=12,
num_proposal=512,
size_cls_agnostic=False,
bbox_coder=dict(
type='GroupFree3DBBoxCoder',
num_sizes=18,
num_dir_bins=1,
with_rot=False,
size_cls_agnostic=False,
mean_sizes=[[0.76966727, 0.8116021, 0.92573744],
[1.876858, 1.8425595, 1.1931566],
[0.61328, 0.6148609, 0.7182701],
[1.3955007, 1.5121545, 0.83443564],
[0.97949594, 1.0675149, 0.6329687],
[0.531663, 0.5955577, 1.7500148],
[0.9624706, 0.72462326, 1.1481868],
[0.83221924, 1.0490936, 1.6875663],
[0.21132214, 0.4206159, 0.5372846],
[1.4440073, 1.8970833, 0.26985747],
[1.0294262, 1.4040797, 0.87554324],
[1.3766412, 0.65521795, 1.6813129],
[0.6650819, 0.71111923, 1.298853],
[0.41999173, 0.37906948, 1.7513971],
[0.59359556, 0.5912492, 0.73919016],
[0.50867593, 0.50656086, 0.30136237],
[1.1511526, 1.0546296, 0.49706793],
[0.47535285, 0.49249494, 0.5802117]]),
sampling_objectness_loss=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=8.0),
objectness_loss=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
center_loss=dict(
type='SmoothL1Loss', beta=0.04, reduction='sum', loss_weight=10.0),
dir_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
dir_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
size_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
size_res_loss=dict(
type='SmoothL1Loss',
beta=1.0 / 9.0,
reduction='sum',
loss_weight=10.0 / 9.0),
semantic_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
test_cfg=dict(
sample_mod='kps',
nms_thr=0.25,
score_thr=0.0,
per_class_proposal=True,
prediction_stages='last_three'))
# dataset settings
dataset_type = 'ScanNetDataset'
data_root = './data/scannet/'
class_names = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
'bookshelf', 'picture', 'counter', 'desk', 'curtain',
'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub',
'garbagebin')
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='DEPTH',
load_dim=6,
use_dim=[0, 1, 2]),
dict(
type='LoadAnnotations3D',
with_bbox_3d=True,
with_label_3d=True,
with_mask_3d=True,
with_seg_3d=True),
dict(type='GlobalAlignment', rotation_axis=2),
dict(
type='PointSegClassMapping',
valid_cat_ids=(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34,
36, 39)),
dict(type='IndoorPointSample', num_points=50000),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.087266, 0.087266],
scale_ratio_range=[1.0, 1.0]),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
keys=[
'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask',
'pts_instance_mask'
])
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='DEPTH',
load_dim=6,
use_dim=[0, 1, 2]),
dict(type='GlobalAlignment', rotation_axis=2),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
dict(type='IndoorPointSample', num_points=50000),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
data = dict(
samples_per_gpu=8,
workers_per_gpu=4,
train=dict(
type='RepeatDataset',
times=1,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'scannet_infos_train.pkl',
pipeline=train_pipeline,
filter_empty_gt=False,
classes=class_names,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='Depth')),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'scannet_infos_val.pkl',
pipeline=test_pipeline,
classes=class_names,
test_mode=True,
box_type_3d='Depth'),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'scannet_infos_val.pkl',
pipeline=test_pipeline,
classes=class_names,
test_mode=True,
box_type_3d='Depth'))
# optimizer
lr = 0.006
optimizer = dict(
lr=lr,
weight_decay=0.0005,
paramwise_cfg=dict(
custom_keys={
'bbox_head.decoder_layers': dict(lr_mult=0.1, decay_mult=1.0),
'bbox_head.decoder_self_posembeds': dict(
lr_mult=0.1, decay_mult=1.0),
'bbox_head.decoder_cross_posembeds': dict(
lr_mult=0.1, decay_mult=1.0),
'bbox_head.decoder_query_proj': dict(lr_mult=0.1, decay_mult=1.0),
'bbox_head.decoder_key_proj': dict(lr_mult=0.1, decay_mult=1.0)
}))
optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))
lr_config = dict(policy='step', warmup=None, step=[280, 340])
# runtime settings
runner = dict(type='EpochBasedRunner', max_epochs=400)
# yapf:disable
log_config = dict(
interval=30,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
# yapf:enable
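A small sketch (illustrative, not part of this commit) of the learning-rate schedule implied by the lr_config above, assuming mmcv's step policy with its default decay factor of 0.1: the base lr of 0.006 is kept until epoch 280, decayed once until epoch 340, and decayed again for the remaining epochs up to 400.

def step_lr(epoch, base_lr=0.006, steps=(280, 340), gamma=0.1):
    # Number of decay steps already passed at this epoch.
    passed = sum(epoch >= s for s in steps)
    return base_lr * gamma ** passed

print(step_lr(100))  # 0.006
print(step_lr(300))  # ~0.0006
print(step_lr(360))  # ~6e-05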
@@ -2,9 +2,10 @@ from mmdet.core.bbox import build_bbox_coder
 from .anchor_free_bbox_coder import AnchorFreeBBoxCoder
 from .centerpoint_bbox_coders import CenterPointBBoxCoder
 from .delta_xyzwhlr_bbox_coder import DeltaXYZWLHRBBoxCoder
+from .groupfree3d_bbox_coder import GroupFree3DBBoxCoder
 from .partial_bin_based_bbox_coder import PartialBinBasedBBoxCoder

 __all__ = [
     'build_bbox_coder', 'DeltaXYZWLHRBBoxCoder', 'PartialBinBasedBBoxCoder',
-    'CenterPointBBoxCoder', 'AnchorFreeBBoxCoder'
+    'CenterPointBBoxCoder', 'AnchorFreeBBoxCoder', 'GroupFree3DBBoxCoder'
 ]
import numpy as np
import torch
from mmdet.core.bbox.builder import BBOX_CODERS
from .partial_bin_based_bbox_coder import PartialBinBasedBBoxCoder
@BBOX_CODERS.register_module()
class GroupFree3DBBoxCoder(PartialBinBasedBBoxCoder):
"""Modified partial bin based bbox coder for GroupFree3D.
Args:
num_dir_bins (int): Number of bins to encode direction angle.
num_sizes (int): Number of size clusters.
mean_sizes (list[list[int]]): Mean size of bboxes in each class.
with_rot (bool): Whether the bbox is with rotation. Defaults to True.
size_cls_agnostic (bool): Whether the predicted size is class-agnostic.
Defaults to True.
"""
def __init__(self,
num_dir_bins,
num_sizes,
mean_sizes,
with_rot=True,
size_cls_agnostic=True):
super(GroupFree3DBBoxCoder, self).__init__(
num_dir_bins=num_dir_bins,
num_sizes=num_sizes,
mean_sizes=mean_sizes,
with_rot=with_rot)
self.size_cls_agnostic = size_cls_agnostic
def encode(self, gt_bboxes_3d, gt_labels_3d):
"""Encode ground truth to prediction targets.
Args:
gt_bboxes_3d (BaseInstance3DBoxes): Ground truth bboxes \
with shape (n, 7).
gt_labels_3d (torch.Tensor): Ground truth classes.
Returns:
tuple: Targets of center, size and direction.
"""
# generate center target
center_target = gt_bboxes_3d.gravity_center
# generate bbox size target
size_target = gt_bboxes_3d.dims
size_class_target = gt_labels_3d
size_res_target = gt_bboxes_3d.dims - gt_bboxes_3d.tensor.new_tensor(
self.mean_sizes)[size_class_target]
# generate dir target
box_num = gt_labels_3d.shape[0]
if self.with_rot:
(dir_class_target,
dir_res_target) = self.angle2class(gt_bboxes_3d.yaw)
else:
dir_class_target = gt_labels_3d.new_zeros(box_num)
dir_res_target = gt_bboxes_3d.tensor.new_zeros(box_num)
return (center_target, size_target, size_class_target, size_res_target,
dir_class_target, dir_res_target)
def decode(self, bbox_out, prefix=''):
"""Decode predicted parts to bbox3d.
Args:
bbox_out (dict): Predictions from model, should contain keys below.
- center: predicted bottom center of bboxes.
- dir_class: predicted bbox direction class.
- dir_res: predicted bbox direction residual.
- size_class: predicted bbox size class.
- size_res: predicted bbox size residual.
- size: predicted class-agnostic bbox size
prefix (str): Decode predictions with specific prefix.
Defaults to ''.
Returns:
torch.Tensor: Decoded bbox3d with shape (batch, n, 7).
"""
center = bbox_out[f'{prefix}center']
batch_size, num_proposal = center.shape[:2]
# decode heading angle
if self.with_rot:
dir_class = torch.argmax(bbox_out[f'{prefix}dir_class'], -1)
dir_res = torch.gather(bbox_out[f'{prefix}dir_res'], 2,
dir_class.unsqueeze(-1))
dir_res.squeeze_(2)
dir_angle = self.class2angle(dir_class, dir_res).reshape(
batch_size, num_proposal, 1)
else:
dir_angle = center.new_zeros(batch_size, num_proposal, 1)
# decode bbox size
if self.size_cls_agnostic:
bbox_size = bbox_out[f'{prefix}size'].reshape(
batch_size, num_proposal, 3)
else:
size_class = torch.argmax(
bbox_out[f'{prefix}size_class'], -1, keepdim=True)
size_res = torch.gather(
bbox_out[f'{prefix}size_res'], 2,
size_class.unsqueeze(-1).repeat(1, 1, 1, 3))
mean_sizes = center.new_tensor(self.mean_sizes)
size_base = torch.index_select(mean_sizes, 0,
size_class.reshape(-1))
bbox_size = size_base.reshape(batch_size, num_proposal,
-1) + size_res.squeeze(2)
bbox3d = torch.cat([center, bbox_size, dir_angle], dim=-1)
return bbox3d
def split_pred(self, cls_preds, reg_preds, base_xyz, prefix=''):
"""Split predicted features to specific parts.
Args:
cls_preds (torch.Tensor): Class predicted features to split.
reg_preds (torch.Tensor): Regression predicted features to split.
base_xyz (torch.Tensor): Coordinates of points.
prefix (str): Decode predictions with specific prefix.
Defaults to ''.
Returns:
dict[str, torch.Tensor]: Split results.
"""
results = {}
start, end = 0, 0
cls_preds_trans = cls_preds.transpose(2, 1)
reg_preds_trans = reg_preds.transpose(2, 1)
# decode center
end += 3
# (batch_size, num_proposal, 3)
results[f'{prefix}center_residual'] = \
reg_preds_trans[..., start:end].contiguous()
results[f'{prefix}center'] = base_xyz + \
reg_preds_trans[..., start:end].contiguous()
start = end
# decode direction
end += self.num_dir_bins
results[f'{prefix}dir_class'] = \
reg_preds_trans[..., start:end].contiguous()
start = end
end += self.num_dir_bins
dir_res_norm = reg_preds_trans[..., start:end].contiguous()
start = end
results[f'{prefix}dir_res_norm'] = dir_res_norm
results[f'{prefix}dir_res'] = dir_res_norm * (
np.pi / self.num_dir_bins)
# decode size
if self.size_cls_agnostic:
end += 3
results[f'{prefix}size'] = \
reg_preds_trans[..., start:end].contiguous()
else:
end += self.num_sizes
results[f'{prefix}size_class'] = reg_preds_trans[
..., start:end].contiguous()
start = end
end += self.num_sizes * 3
size_res_norm = reg_preds_trans[..., start:end]
batch_size, num_proposal = reg_preds_trans.shape[:2]
size_res_norm = size_res_norm.view(
[batch_size, num_proposal, self.num_sizes, 3])
start = end
results[f'{prefix}size_res_norm'] = size_res_norm.contiguous()
mean_sizes = reg_preds.new_tensor(self.mean_sizes)
results[f'{prefix}size_res'] = (
size_res_norm * mean_sizes.unsqueeze(0).unsqueeze(0))
# decode objectness score
# Group-Free-3D objectness output shape (batch, proposal, 1)
results[f'{prefix}obj_scores'] = cls_preds_trans[..., :1].contiguous()
# decode semantic score
results[f'{prefix}sem_scores'] = cls_preds_trans[..., 1:].contiguous()
return results
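Below is a minimal usage sketch for the coder above (illustrative, not part of this commit). It assumes mmdet3d is installed and that the class is importable from mmdet3d.core.bbox.coders; the mean sizes are placeholders, not the ScanNet statistics.

import torch
from mmdet3d.core.bbox.coders.groupfree3d_bbox_coder import GroupFree3DBBoxCoder

coder = GroupFree3DBBoxCoder(
    num_dir_bins=1,
    num_sizes=18,
    mean_sizes=[[0.5, 0.5, 0.5]] * 18,  # placeholder mean sizes
    with_rot=False,
    size_cls_agnostic=True)

bbox_out = {
    'center': torch.rand(2, 256, 3),  # (batch, proposal, 3) predicted centers
    'size': torch.rand(2, 256, 3),    # (batch, proposal, 3) class-agnostic sizes
}
bbox3d = coder.decode(bbox_out, prefix='')
print(bbox3d.shape)  # torch.Size([2, 256, 7]): (x, y, z, dx, dy, dz, yaw=0)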
@@ -5,6 +5,7 @@ from .base_mono3d_dense_head import BaseMono3DDenseHead
 from .centerpoint_head import CenterHead
 from .fcos_mono3d_head import FCOSMono3DHead
 from .free_anchor3d_head import FreeAnchor3DHead
+from .groupfree3d_head import GroupFree3DHead
 from .parta2_rpn_head import PartA2RPNHead
 from .shape_aware_head import ShapeAwareHead
 from .ssd_3d_head import SSD3DHead
@@ -13,5 +14,6 @@ from .vote_head import VoteHead
 __all__ = [
     'Anchor3DHead', 'FreeAnchor3DHead', 'PartA2RPNHead', 'VoteHead',
     'SSD3DHead', 'BaseConvBboxHead', 'CenterHead', 'ShapeAwareHead',
-    'BaseMono3DDenseHead', 'AnchorFreeMono3DHead', 'FCOSMono3DHead'
+    'BaseMono3DDenseHead', 'AnchorFreeMono3DHead', 'FCOSMono3DHead',
+    'GroupFree3DHead'
 ]
import copy
import numpy as np
import torch
from mmcv import ConfigDict
from mmcv.cnn import ConvModule
from mmcv.cnn.bricks.transformer import (build_positional_encoding,
build_transformer_layer)
from mmcv.runner import force_fp32
from torch import nn as nn
from torch.nn import functional as F
from mmdet3d.core.post_processing import aligned_3d_nms
from mmdet3d.models.builder import build_loss
from mmdet3d.ops import Points_Sampler, gather_points
from mmdet.core import build_bbox_coder, multi_apply
from mmdet.models import HEADS
from .base_conv_bbox_head import BaseConvBboxHead
EPS = 1e-6
class PointsObjClsModule(nn.Module):
"""object candidate point prediction from seed point features.
Args:
in_channel (int): number of channels of seed point features.
num_convs (int): number of conv layers.
Default: 3.
conv_cfg (dict): Config of convolution.
Default: dict(type='Conv1d').
norm_cfg (dict): Config of normalization.
Default: dict(type='BN1d').
act_cfg (dict): Config of activation.
Default: dict(type='ReLU').
"""
def __init__(self,
in_channel,
num_convs=3,
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
act_cfg=dict(type='ReLU')):
super().__init__()
conv_channels = [in_channel for _ in range(num_convs - 1)]
conv_channels.append(1)
self.mlp = nn.Sequential()
prev_channels = in_channel
for i in range(num_convs):
self.mlp.add_module(
f'layer{i}',
ConvModule(
prev_channels,
conv_channels[i],
1,
padding=0,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg if i < num_convs - 1 else None,
act_cfg=act_cfg if i < num_convs - 1 else None,
bias=True,
inplace=True))
prev_channels = conv_channels[i]
def forward(self, seed_features):
"""Forward pass.
Args:
seed_features (torch.Tensor): seed features, dims:
(batch_size, feature_dim, num_seed)
Returns:
torch.Tensor: objectness logits, dim:
(batch_size, 1, num_seed)
"""
return self.mlp(seed_features)
class GeneralSamplingModule(nn.Module):
"""Sampling Points.
Sampling points with given index.
"""
def forward(self, xyz, features, sample_inds):
"""Forward pass.
Args:
xyz: (B, N, 3) the coordinates of the features.
features (Tensor): (B, C, N) features to sample.
sample_inds (Tensor): (B, M) the given index,
where M is the number of points.
Returns:
Tensor: (B, M, 3) coordinates of sampled features
Tensor: (B, C, M) the sampled features.
Tensor: (B, M) the given index.
"""
xyz_t = xyz.transpose(1, 2).contiguous()
new_xyz = gather_points(xyz_t, sample_inds).transpose(1,
2).contiguous()
new_features = gather_points(features, sample_inds).contiguous()
return new_xyz, new_features, sample_inds
@HEADS.register_module()
class GroupFree3DHead(nn.Module):
r"""Bbox head of `Group-Free 3D <https://arxiv.org/abs/2104.00678>`_.
Args:
num_classes (int): The number of classes.
in_channels (int): The dims of input features from backbone.
bbox_coder (:obj:`BaseBBoxCoder`): Bbox coder for encoding and
decoding boxes.
num_decoder_layers (int): The number of transformer decoder layers.
transformerlayers (dict): Config for transformer decoder.
train_cfg (dict): Config for training.
test_cfg (dict): Config for testing.
num_proposal (int): The number of initial sampling candidates.
pred_layer_cfg (dict): Config of classification and regression
prediction layers.
size_cls_agnostic (bool): Whether the predicted size is class-agnostic.
gt_per_seed (int): The number of candidate instances each point
belongs to.
sampling_objectness_loss (dict): Config of initial sampling
objectness loss.
objectness_loss (dict): Config of objectness loss.
center_loss (dict): Config of center loss.
dir_class_loss (dict): Config of direction classification loss.
dir_res_loss (dict): Config of direction residual regression loss.
size_class_loss (dict): Config of size classification loss.
size_res_loss (dict): Config of size residual regression loss.
size_reg_loss (dict): Config of class-agnostic size regression loss.
semantic_loss (dict): Config of point-wise semantic segmentation loss.
"""
def __init__(self,
num_classes,
in_channels,
bbox_coder,
num_decoder_layers,
transformerlayers,
decoder_self_posembeds=dict(
type='ConvBNPositionalEncoding',
input_channel=6,
num_pos_feats=288),
decoder_cross_posembeds=dict(
type='ConvBNPositionalEncoding',
input_channel=3,
num_pos_feats=288),
train_cfg=None,
test_cfg=None,
num_proposal=128,
pred_layer_cfg=None,
size_cls_agnostic=True,
gt_per_seed=3,
sampling_objectness_loss=None,
objectness_loss=None,
center_loss=None,
dir_class_loss=None,
dir_res_loss=None,
size_class_loss=None,
size_res_loss=None,
size_reg_loss=None,
semantic_loss=None):
super(GroupFree3DHead, self).__init__()
self.num_classes = num_classes
self.train_cfg = train_cfg
self.test_cfg = test_cfg
self.num_proposal = num_proposal
self.in_channels = in_channels
self.num_decoder_layers = num_decoder_layers
self.size_cls_agnostic = size_cls_agnostic
self.gt_per_seed = gt_per_seed
# Transformer decoder layers
if isinstance(transformerlayers, ConfigDict):
transformerlayers = [
copy.deepcopy(transformerlayers)
for _ in range(num_decoder_layers)
]
else:
assert isinstance(transformerlayers, list) and \
len(transformerlayers) == num_decoder_layers
self.decoder_layers = nn.ModuleList()
for i in range(self.num_decoder_layers):
self.decoder_layers.append(
build_transformer_layer(transformerlayers[i]))
self.embed_dims = self.decoder_layers[0].embed_dims
assert self.embed_dims == decoder_self_posembeds['num_pos_feats']
assert self.embed_dims == decoder_cross_posembeds['num_pos_feats']
# bbox_coder
self.bbox_coder = build_bbox_coder(bbox_coder)
self.num_sizes = self.bbox_coder.num_sizes
self.num_dir_bins = self.bbox_coder.num_dir_bins
# Initial object candidate sampling
self.gsample_module = GeneralSamplingModule()
self.fps_module = Points_Sampler([self.num_proposal])
self.points_obj_cls = PointsObjClsModule(self.in_channels)
self.fp16_enabled = False
# initial candidate prediction
self.conv_pred = BaseConvBboxHead(
**pred_layer_cfg,
num_cls_out_channels=self._get_cls_out_channels(),
num_reg_out_channels=self._get_reg_out_channels())
# query proj and key proj
self.decoder_query_proj = nn.Conv1d(
self.embed_dims, self.embed_dims, kernel_size=1)
self.decoder_key_proj = nn.Conv1d(
self.embed_dims, self.embed_dims, kernel_size=1)
# query position embed
self.decoder_self_posembeds = nn.ModuleList()
for _ in range(self.num_decoder_layers):
self.decoder_self_posembeds.append(
build_positional_encoding(decoder_self_posembeds))
# key position embed
self.decoder_cross_posembeds = nn.ModuleList()
for _ in range(self.num_decoder_layers):
self.decoder_cross_posembeds.append(
build_positional_encoding(decoder_cross_posembeds))
# Prediction Head
self.prediction_heads = nn.ModuleList()
for i in range(self.num_decoder_layers):
self.prediction_heads.append(
BaseConvBboxHead(
**pred_layer_cfg,
num_cls_out_channels=self._get_cls_out_channels(),
num_reg_out_channels=self._get_reg_out_channels()))
self.sampling_objectness_loss = build_loss(sampling_objectness_loss)
self.objectness_loss = build_loss(objectness_loss)
self.center_loss = build_loss(center_loss)
self.dir_res_loss = build_loss(dir_res_loss)
self.dir_class_loss = build_loss(dir_class_loss)
self.semantic_loss = build_loss(semantic_loss)
if self.size_cls_agnostic:
self.size_reg_loss = build_loss(size_reg_loss)
else:
self.size_res_loss = build_loss(size_res_loss)
self.size_class_loss = build_loss(size_class_loss)
def init_weights(self):
"""Initialize weights of transformer decoder in GroupFree3DHead."""
# initialize transformer
for m in self.decoder_layers.parameters():
if m.dim() > 1:
nn.init.xavier_uniform_(m)
for m in self.decoder_self_posembeds.parameters():
if m.dim() > 1:
nn.init.xavier_uniform_(m)
for m in self.decoder_cross_posembeds.parameters():
if m.dim() > 1:
nn.init.xavier_uniform_(m)
def _get_cls_out_channels(self):
"""Return the channel number of classification outputs."""
# Class numbers (k) + objectness (1)
return self.num_classes + 1
def _get_reg_out_channels(self):
"""Return the channel number of regression outputs."""
# center residual (3),
# heading class+residual (num_dir_bins*2),
# size class+residual(num_sizes*4 or 3)
if self.size_cls_agnostic:
return 6 + self.num_dir_bins * 2
else:
return 3 + self.num_dir_bins * 2 + self.num_sizes * 4
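# Worked example for the ScanNet configs above (num_classes=18, num_dir_bins=1,
# num_sizes=18, size_cls_agnostic=False):
#   _get_cls_out_channels() -> 18 + 1 = 19 (semantic scores + objectness)
#   _get_reg_out_channels() -> 3 + 1 * 2 + 18 * 4 = 77
#   (center residual 3, dir class/res 1 + 1, size class 18, size residual 18 * 3)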
def _extract_input(self, feat_dict):
"""Extract inputs from features dictionary.
Args:
feat_dict (dict): Feature dict from backbone.
Returns:
torch.Tensor: Coordinates of input points.
torch.Tensor: Features of input points.
torch.Tensor: Indices of input points.
"""
seed_points = feat_dict['fp_xyz'][-1]
seed_features = feat_dict['fp_features'][-1]
seed_indices = feat_dict['fp_indices'][-1]
return seed_points, seed_features, seed_indices
def forward(self, feat_dict, sample_mod):
"""Forward pass.
Note:
The forward of GroupFree3DHead is divided into 2 steps:
1. Initial object candidates sampling.
2. Iterative object box prediction by transformer decoder.
Args:
feat_dict (dict): Feature dict from backbone.
sample_mod (str): sample mode for initial candidates sampling.
Returns:
results (dict): Predictions of GroupFree3D head.
"""
assert sample_mod in ['fps', 'kps']
seed_xyz, seed_features, seed_indices = self._extract_input(feat_dict)
results = dict(
seed_points=seed_xyz,
seed_features=seed_features,
seed_indices=seed_indices)
# 1. Initial object candidates sampling.
if sample_mod == 'fps':
sample_inds = self.fps_module(seed_xyz, seed_features)
elif sample_mod == 'kps':
points_obj_cls_logits = self.points_obj_cls(
seed_features) # (batch_size, 1, num_seed)
points_obj_cls_scores = points_obj_cls_logits.sigmoid().squeeze(1)
sample_inds = torch.topk(points_obj_cls_scores,
self.num_proposal)[1].int()
results['seeds_obj_cls_logits'] = points_obj_cls_logits
else:
raise NotImplementedError(
f'Sample mode {sample_mod} is not supported!')
candidate_xyz, candidate_features, sample_inds = self.gsample_module(
seed_xyz, seed_features, sample_inds)
results['query_points_xyz'] = candidate_xyz # (B, M, 3)
results['query_points_feature'] = candidate_features # (B, C, M)
results['query_points_sample_inds'] = sample_inds.long() # (B, M)
prefix = 'proposal.'
cls_predictions, reg_predictions = self.conv_pred(candidate_features)
decode_res = self.bbox_coder.split_pred(cls_predictions,
reg_predictions, candidate_xyz,
prefix)
results.update(decode_res)
bbox3d = self.bbox_coder.decode(results, prefix)
# 2. Iterative object box prediction by transformer decoder.
base_bbox3d = bbox3d[:, :, :6].detach().clone()
query = self.decoder_query_proj(candidate_features).permute(2, 0, 1)
key = self.decoder_key_proj(seed_features).permute(2, 0, 1)
value = key
# transformer decoder
results['num_decoder_layers'] = 0
for i in range(self.num_decoder_layers):
prefix = f's{i}.'
query_pos = self.decoder_self_posembeds[i](base_bbox3d).permute(
2, 0, 1)
key_pos = self.decoder_cross_posembeds[i](seed_xyz).permute(
2, 0, 1)
query = self.decoder_layers[i](
query, key, value, query_pos=query_pos,
key_pos=key_pos).permute(1, 2, 0)
results[f'{prefix}query'] = query
cls_predictions, reg_predictions = self.prediction_heads[i](query)
decode_res = self.bbox_coder.split_pred(cls_predictions,
reg_predictions,
candidate_xyz, prefix)
# TODO: should save bbox3d instead of decode_res?
results.update(decode_res)
bbox3d = self.bbox_coder.decode(results, prefix)
results[f'{prefix}bbox3d'] = bbox3d
base_bbox3d = bbox3d[:, :, :6].detach().clone()
query = query.permute(2, 0, 1)
results['num_decoder_layers'] += 1
return results
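# Note on the prediction dict above: each stage writes its outputs with a
# distinct prefix ('proposal.' for the initial candidates, 's0.', 's1.', ...
# for successive decoder layers). loss() and get_bboxes() below use the same
# prefixes to select which stages contribute to the loss or to the final boxes.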
@force_fp32(apply_to=('bbox_preds', ))
def loss(self,
bbox_preds,
points,
gt_bboxes_3d,
gt_labels_3d,
pts_semantic_mask=None,
pts_instance_mask=None,
img_metas=None,
gt_bboxes_ignore=None,
ret_target=False):
"""Compute loss.
Args:
bbox_preds (dict): Predictions from forward of GroupFree3D head.
points (list[torch.Tensor]): Input points.
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth \
bboxes of each sample.
gt_labels_3d (list[torch.Tensor]): Labels of each sample.
pts_semantic_mask (None | list[torch.Tensor]): Point-wise
semantic mask.
pts_instance_mask (None | list[torch.Tensor]): Point-wise
instance mask.
img_metas (list[dict]): Contain pcd and img's meta info.
gt_bboxes_ignore (None | list[torch.Tensor]): Specify
which bounding boxes to ignore.
ret_target (Bool): Return targets or not.
Returns:
dict: Losses of GroupFree3D.
"""
targets = self.get_targets(points, gt_bboxes_3d, gt_labels_3d,
pts_semantic_mask, pts_instance_mask,
bbox_preds)
(sampling_targets, sampling_weights, assigned_size_targets,
size_class_targets, size_res_targets, dir_class_targets,
dir_res_targets, center_targets, assigned_center_targets,
mask_targets, valid_gt_masks, objectness_targets, objectness_weights,
box_loss_weights, valid_gt_weights) = targets
batch_size, proposal_num = size_class_targets.shape[:2]
losses = dict()
# calculate objectness classification loss
sampling_obj_score = bbox_preds['seeds_obj_cls_logits'].reshape(-1, 1)
sampling_objectness_loss = self.sampling_objectness_loss(
sampling_obj_score,
1 - sampling_targets.reshape(-1),
sampling_weights.reshape(-1),
avg_factor=batch_size)
losses['sampling_objectness_loss'] = sampling_objectness_loss
prefixes = ['proposal.'] + [
f's{i}.' for i in range(bbox_preds['num_decoder_layers'])
]
num_stages = len(prefixes)
for prefix in prefixes:
# calculate objectness loss
obj_score = bbox_preds[f'{prefix}obj_scores'].transpose(2, 1)
objectness_loss = self.objectness_loss(
obj_score.reshape(-1, 1),
1 - objectness_targets.reshape(-1),
objectness_weights.reshape(-1),
avg_factor=batch_size)
losses[f'{prefix}objectness_loss'] = objectness_loss / num_stages
# calculate center loss
box_loss_weights_expand = box_loss_weights.unsqueeze(-1).expand(
-1, -1, 3)
center_loss = self.center_loss(
bbox_preds[f'{prefix}center'],
assigned_center_targets,
weight=box_loss_weights_expand)
losses[f'{prefix}center_loss'] = center_loss / num_stages
# calculate direction class loss
dir_class_loss = self.dir_class_loss(
bbox_preds[f'{prefix}dir_class'].transpose(2, 1),
dir_class_targets,
weight=box_loss_weights)
losses[f'{prefix}dir_class_loss'] = dir_class_loss / num_stages
# calculate direction residual loss
heading_label_one_hot = size_class_targets.new_zeros(
(batch_size, proposal_num, self.num_dir_bins))
heading_label_one_hot.scatter_(2, dir_class_targets.unsqueeze(-1),
1)
dir_res_norm = torch.sum(
bbox_preds[f'{prefix}dir_res_norm'] * heading_label_one_hot,
-1)
dir_res_loss = self.dir_res_loss(
dir_res_norm, dir_res_targets, weight=box_loss_weights)
losses[f'{prefix}dir_res_loss'] = dir_res_loss / num_stages
if self.size_cls_agnostic:
# calculate class-agnostic size loss
size_reg_loss = self.size_reg_loss(
bbox_preds[f'{prefix}size'],
assigned_size_targets,
weight=box_loss_weights_expand)
losses[f'{prefix}size_reg_loss'] = size_reg_loss / num_stages
else:
# calculate size class loss
size_class_loss = self.size_class_loss(
bbox_preds[f'{prefix}size_class'].transpose(2, 1),
size_class_targets,
weight=box_loss_weights)
losses[
f'{prefix}size_class_loss'] = size_class_loss / num_stages
# calculate size residual loss
one_hot_size_targets = size_class_targets.new_zeros(
(batch_size, proposal_num, self.num_sizes))
one_hot_size_targets.scatter_(2,
size_class_targets.unsqueeze(-1),
1)
one_hot_size_targets_expand = one_hot_size_targets.unsqueeze(
-1).expand(-1, -1, -1, 3).contiguous()
size_residual_norm = torch.sum(
bbox_preds[f'{prefix}size_res_norm'] *
one_hot_size_targets_expand, 2)
box_loss_weights_expand = box_loss_weights.unsqueeze(
-1).expand(-1, -1, 3)
size_res_loss = self.size_res_loss(
size_residual_norm,
size_res_targets,
weight=box_loss_weights_expand)
losses[f'{prefix}size_res_loss'] = size_res_loss / num_stages
# calculate semantic loss
semantic_loss = self.semantic_loss(
bbox_preds[f'{prefix}sem_scores'].transpose(2, 1),
mask_targets,
weight=box_loss_weights)
losses[f'{prefix}semantic_loss'] = semantic_loss / num_stages
if ret_target:
losses['targets'] = targets
return losses
def get_targets(self,
points,
gt_bboxes_3d,
gt_labels_3d,
pts_semantic_mask=None,
pts_instance_mask=None,
bbox_preds=None,
max_gt_num=64):
"""Generate targets of GroupFree3D head.
Args:
points (list[torch.Tensor]): Points of each batch.
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth \
bboxes of each batch.
gt_labels_3d (list[torch.Tensor]): Labels of each batch.
pts_semantic_mask (None | list[torch.Tensor]): Point-wise semantic
label of each batch.
pts_instance_mask (None | list[torch.Tensor]): Point-wise instance
label of each batch.
bbox_preds (dict): Bounding box predictions of GroupFree3D head.
max_gt_num (int): Max number of GTs for single batch.
Returns:
tuple[torch.Tensor]: Targets of GroupFree3D head.
"""
# find empty example
valid_gt_masks = list()
gt_num = list()
for index in range(len(gt_labels_3d)):
if len(gt_labels_3d[index]) == 0:
fake_box = gt_bboxes_3d[index].tensor.new_zeros(
1, gt_bboxes_3d[index].tensor.shape[-1])
gt_bboxes_3d[index] = gt_bboxes_3d[index].new_box(fake_box)
gt_labels_3d[index] = gt_labels_3d[index].new_zeros(1)
valid_gt_masks.append(gt_labels_3d[index].new_zeros(1))
gt_num.append(1)
else:
valid_gt_masks.append(gt_labels_3d[index].new_ones(
gt_labels_3d[index].shape))
gt_num.append(gt_labels_3d[index].shape[0])
# max_gt_num = max(gt_num)
max_gt_nums = [max_gt_num for _ in range(len(gt_labels_3d))]
if pts_semantic_mask is None:
pts_semantic_mask = [None for i in range(len(gt_labels_3d))]
pts_instance_mask = [None for i in range(len(gt_labels_3d))]
seed_points = [
bbox_preds['seed_points'][i] for i in range(len(gt_labels_3d))
]
seed_indices = [
bbox_preds['seed_indices'][i] for i in range(len(gt_labels_3d))
]
candidate_indices = [
bbox_preds['query_points_sample_inds'][i]
for i in range(len(gt_labels_3d))
]
(sampling_targets, assigned_size_targets, size_class_targets,
size_res_targets, dir_class_targets, dir_res_targets, center_targets,
assigned_center_targets, mask_targets, objectness_targets,
objectness_masks) = multi_apply(self.get_targets_single, points,
gt_bboxes_3d, gt_labels_3d,
pts_semantic_mask, pts_instance_mask,
max_gt_nums, seed_points,
seed_indices, candidate_indices)
# pad targets as original code of GroupFree3D.
for index in range(len(gt_labels_3d)):
pad_num = max_gt_num - gt_labels_3d[index].shape[0]
valid_gt_masks[index] = F.pad(valid_gt_masks[index], (0, pad_num))
sampling_targets = torch.stack(sampling_targets)
sampling_weights = (sampling_targets >= 0).float()
sampling_normalizer = sampling_weights.sum(dim=1, keepdim=True).float()
sampling_weights /= sampling_normalizer.clamp(min=1.0)
assigned_size_targets = torch.stack(assigned_size_targets)
center_targets = torch.stack(center_targets)
valid_gt_masks = torch.stack(valid_gt_masks)
assigned_center_targets = torch.stack(assigned_center_targets)
objectness_targets = torch.stack(objectness_targets)
objectness_weights = torch.stack(objectness_masks)
cls_normalizer = objectness_weights.sum(dim=1, keepdim=True).float()
objectness_weights /= cls_normalizer.clamp(min=1.0)
box_loss_weights = objectness_targets.float() / (
objectness_targets.sum().float() + EPS)
valid_gt_weights = valid_gt_masks.float() / (
valid_gt_masks.sum().float() + EPS)
dir_class_targets = torch.stack(dir_class_targets)
dir_res_targets = torch.stack(dir_res_targets)
size_class_targets = torch.stack(size_class_targets)
size_res_targets = torch.stack(size_res_targets)
mask_targets = torch.stack(mask_targets)
return (sampling_targets, sampling_weights, assigned_size_targets,
size_class_targets, size_res_targets, dir_class_targets,
dir_res_targets, center_targets, assigned_center_targets,
mask_targets, valid_gt_masks, objectness_targets,
objectness_weights, box_loss_weights, valid_gt_weights)
def get_targets_single(self,
points,
gt_bboxes_3d,
gt_labels_3d,
pts_semantic_mask=None,
pts_instance_mask=None,
max_gt_nums=None,
seed_points=None,
seed_indices=None,
candidate_indices=None,
seed_points_obj_topk=4):
"""Generate targets of GroupFree3D head for single batch.
Args:
points (torch.Tensor): Points of each batch.
gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth \
boxes of each batch.
gt_labels_3d (torch.Tensor): Labels of each batch.
pts_semantic_mask (None | torch.Tensor): Point-wise semantic
label of each batch.
pts_instance_mask (None | torch.Tensor): Point-wise instance
label of each batch.
max_gt_nums (int): Max number of GTs for single batch.
seed_points (torch.Tensor): Coordinates of seed points.
seed_indices (torch.Tensor): Indices of seed points.
candidate_indices (torch.Tensor): Indices of object candidates.
seed_points_obj_topk (int): k value of k-Closest Points Sampling.
Returns:
tuple[torch.Tensor]: Targets of GroupFree3D head.
"""
assert self.bbox_coder.with_rot or pts_semantic_mask is not None
gt_bboxes_3d = gt_bboxes_3d.to(points.device)
# generate center, dir, size target
(center_targets, size_targets, size_class_targets, size_res_targets,
dir_class_targets,
dir_res_targets) = self.bbox_coder.encode(gt_bboxes_3d, gt_labels_3d)
# pad targets as original code of GroupFree3D
pad_num = max_gt_nums - gt_labels_3d.shape[0]
box_label_mask = points.new_zeros([max_gt_nums])
box_label_mask[:gt_labels_3d.shape[0]] = 1
gt_bboxes_pad = F.pad(gt_bboxes_3d.tensor, (0, 0, 0, pad_num))
gt_bboxes_pad[gt_labels_3d.shape[0]:, 0:3] += 1000
gt_bboxes_3d = gt_bboxes_3d.new_box(gt_bboxes_pad)
gt_labels_3d = F.pad(gt_labels_3d, (0, pad_num))
center_targets = F.pad(center_targets, (0, 0, 0, pad_num), value=1000)
size_targets = F.pad(size_targets, (0, 0, 0, pad_num))
size_class_targets = F.pad(size_class_targets, (0, pad_num))
size_res_targets = F.pad(size_res_targets, (0, 0, 0, pad_num))
dir_class_targets = F.pad(dir_class_targets, (0, pad_num))
dir_res_targets = F.pad(dir_res_targets, (0, pad_num))
# 0. generate pts_instance_label and pts_obj_mask
num_points = points.shape[0]
pts_obj_mask = points.new_zeros([num_points], dtype=torch.long)
pts_instance_label = points.new_zeros([num_points],
dtype=torch.long) - 1
if self.bbox_coder.with_rot:
vote_targets = points.new_zeros([num_points, 4 * self.gt_per_seed])
vote_target_idx = points.new_zeros([num_points], dtype=torch.long)
box_indices_all = gt_bboxes_3d.points_in_boxes(points)
for i in range(gt_labels_3d.shape[0]):
box_indices = box_indices_all[:, i]
indices = torch.nonzero(
box_indices, as_tuple=False).squeeze(-1)
selected_points = points[indices]
pts_obj_mask[indices] = 1
vote_targets_tmp = vote_targets[indices]
votes = gt_bboxes_3d.gravity_center[i].unsqueeze(
0) - selected_points[:, :3]
for j in range(self.gt_per_seed):
column_indices = torch.nonzero(
vote_target_idx[indices] == j,
as_tuple=False).squeeze(-1)
vote_targets_tmp[column_indices,
int(j * 3):int(j * 3 +
3)] = votes[column_indices]
vote_targets_tmp[column_indices,
j + 3 * self.gt_per_seed] = i
if j == 0:
vote_targets_tmp[
column_indices, :3 *
self.gt_per_seed] = votes[column_indices].repeat(
1, self.gt_per_seed)
vote_targets_tmp[column_indices,
3 * self.gt_per_seed:] = i
vote_targets[indices] = vote_targets_tmp
vote_target_idx[indices] = torch.clamp(
vote_target_idx[indices] + 1, max=2)
dist = points.new_zeros([num_points, self.gt_per_seed]) + 1000
for j in range(self.gt_per_seed):
dist[:, j] = (vote_targets[:, 3 * j:3 * j + 3]**2).sum(-1)
instance_indices = torch.argmin(
dist, dim=-1).unsqueeze(-1) + 3 * self.gt_per_seed
instance_label = torch.gather(vote_targets, 1,
instance_indices).squeeze(-1)
pts_instance_label = instance_label.long()
pts_instance_label[pts_obj_mask == 0] = -1
elif pts_semantic_mask is not None:
for i in torch.unique(pts_instance_mask):
indices = torch.nonzero(
pts_instance_mask == i, as_tuple=False).squeeze(-1)
if pts_semantic_mask[indices[0]] < self.num_classes:
selected_points = points[indices, :3]
center = 0.5 * (
selected_points.min(0)[0] + selected_points.max(0)[0])
delta_xyz = center - center_targets
instance_label = torch.argmin((delta_xyz**2).sum(-1))
pts_instance_label[indices] = instance_label
pts_obj_mask[indices] = 1
else:
raise NotImplementedError
# 1. generate objectness targets in sampling head
gt_num = gt_labels_3d.shape[0]
num_seed = seed_points.shape[0]
num_candidate = candidate_indices.shape[0]
object_assignment = torch.gather(pts_instance_label, 0, seed_indices)
# set background points to the last gt bbox as original code
object_assignment[object_assignment < 0] = gt_num - 1
object_assignment_one_hot = gt_bboxes_3d.tensor.new_zeros(
(num_seed, gt_num))
object_assignment_one_hot.scatter_(1, object_assignment.unsqueeze(-1),
1) # (num_seed, gt_num)
delta_xyz = seed_points.unsqueeze(
1) - gt_bboxes_3d.gravity_center.unsqueeze(
0) # (num_seed, gt_num, 3)
delta_xyz = delta_xyz / (gt_bboxes_3d.dims.unsqueeze(0) + EPS)
new_dist = torch.sum(delta_xyz**2, dim=-1)
euclidean_dist1 = torch.sqrt(new_dist + EPS)
euclidean_dist1 = euclidean_dist1 * object_assignment_one_hot + 100 * (
1 - object_assignment_one_hot)
# (gt_num, num_seed)
euclidean_dist1 = euclidean_dist1.permute(1, 0)
# gt_num x topk
topk_inds = torch.topk(
euclidean_dist1,
seed_points_obj_topk,
largest=False)[1] * box_label_mask[:, None] + \
(box_label_mask[:, None] - 1)
topk_inds = topk_inds.long()
topk_inds = topk_inds.view(-1).contiguous()
sampling_targets = torch.zeros(
num_seed + 1, dtype=torch.long).to(points.device)
sampling_targets[topk_inds] = 1
sampling_targets = sampling_targets[:num_seed]
# pts_instance_label
objectness_label_mask = torch.gather(pts_instance_label, 0,
seed_indices) # num_seed
sampling_targets[objectness_label_mask < 0] = 0
# 2. objectness target
seed_obj_gt = torch.gather(pts_obj_mask, 0, seed_indices) # num_seed
objectness_targets = torch.gather(seed_obj_gt, 0,
candidate_indices) # num_candidate
# 3. box target
seed_instance_label = torch.gather(pts_instance_label, 0,
seed_indices) # num_seed
query_points_instance_label = torch.gather(
seed_instance_label, 0, candidate_indices) # num_candidate
# Set assignment
# (num_candidate, ) with values in 0,1,...,gt_num-1
assignment = query_points_instance_label
# assign background points to the last gt bbox, as in the original code
assignment[assignment < 0] = gt_num - 1
assignment_expand = assignment.unsqueeze(1).expand(-1, 3)
assigned_center_targets = center_targets[assignment]
assigned_size_targets = size_targets[assignment]
dir_class_targets = dir_class_targets[assignment]
dir_res_targets = dir_res_targets[assignment]
dir_res_targets /= (np.pi / self.num_dir_bins)
size_class_targets = size_class_targets[assignment]
size_res_targets = \
torch.gather(size_res_targets, 0, assignment_expand)
one_hot_size_targets = gt_bboxes_3d.tensor.new_zeros(
(num_candidate, self.num_sizes))
one_hot_size_targets.scatter_(1, size_class_targets.unsqueeze(-1), 1)
one_hot_size_targets = one_hot_size_targets.unsqueeze(-1).expand(
-1, -1, 3) # (num_candidate,num_size_cluster,3)
mean_sizes = size_res_targets.new_tensor(
self.bbox_coder.mean_sizes).unsqueeze(0)
pos_mean_sizes = torch.sum(one_hot_size_targets * mean_sizes, 1)
size_res_targets /= pos_mean_sizes
mask_targets = gt_labels_3d[assignment].long()
objectness_masks = points.new_ones((num_candidate))
return (sampling_targets, assigned_size_targets, size_class_targets,
size_res_targets, dir_class_targets, dir_res_targets,
center_targets, assigned_center_targets, mask_targets,
objectness_targets, objectness_masks)
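The sampling targets above mark, for every ground-truth box, the seeds whose size-normalized distance to the box center is among the `seed_points_obj_topk` smallest. Below is a minimal, self-contained sketch of that rule with toy shapes and illustrative names only; the padded-box masking and instance-assignment filtering of the real code are omitted.

import torch

EPS = 1e-6
num_seed, gt_num, k = 8, 2, 3
seed_xyz = torch.rand(num_seed, 3)            # seed point coordinates
gt_center = torch.rand(gt_num, 3)             # gt box gravity centers
gt_dims = torch.rand(gt_num, 3) + 0.5         # gt box sizes

delta = seed_xyz.unsqueeze(1) - gt_center.unsqueeze(0)             # (num_seed, gt_num, 3)
dist = torch.sqrt(((delta / (gt_dims.unsqueeze(0) + EPS))**2).sum(-1) + EPS)
topk_inds = torch.topk(dist.permute(1, 0), k, largest=False)[1]    # (gt_num, k)

sampling_targets = torch.zeros(num_seed, dtype=torch.long)
sampling_targets[topk_inds.reshape(-1)] = 1   # positives; background seeds stay 0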
def get_bboxes(self,
points,
bbox_preds,
input_metas,
rescale=False,
use_nms=True):
"""Generate bboxes from GroupFree3D head predictions.
Args:
points (torch.Tensor): Input points.
bbox_preds (dict): Predictions from GroupFree3D head.
input_metas (list[dict]): Point cloud and image's meta info.
rescale (bool): Whether to rescale bboxes.
use_nms (bool): Whether to apply NMS. Set to False to skip NMS
postprocessing when the GroupFree3D head is used in an RPN stage.
Returns:
list[tuple[torch.Tensor]]: Bounding boxes, scores and labels.
"""
# support multi-stage predictions
assert self.test_cfg['prediction_stages'] in \
['last', 'all', 'last_three']
prefixes = list()
if self.test_cfg['prediction_stages'] == 'last':
prefixes = [f's{self.num_decoder_layers - 1}.']
elif self.test_cfg['prediction_stages'] == 'all':
prefixes = ['proposal.'] + \
[f's{i}.' for i in range(self.num_decoder_layers)]
elif self.test_cfg['prediction_stages'] == 'last_three':
prefixes = [
f's{i}.' for i in range(self.num_decoder_layers -
3, self.num_decoder_layers)
]
else:
raise NotImplementedError
obj_scores = list()
sem_scores = list()
bbox3d = list()
for prefix in prefixes:
# decode boxes
obj_score = bbox_preds[f'{prefix}obj_scores'][..., -1].sigmoid()
sem_score = bbox_preds[f'{prefix}sem_scores'].softmax(-1)
bbox = self.bbox_coder.decode(bbox_preds, prefix)
obj_scores.append(obj_score)
sem_scores.append(sem_score)
bbox3d.append(bbox)
obj_scores = torch.cat(obj_scores, dim=1)
sem_scores = torch.cat(sem_scores, dim=1)
bbox3d = torch.cat(bbox3d, dim=1)
if use_nms:
batch_size = bbox3d.shape[0]
results = list()
for b in range(batch_size):
bbox_selected, score_selected, labels = \
self.multiclass_nms_single(obj_scores[b], sem_scores[b],
bbox3d[b], points[b, ..., :3],
input_metas[b])
bbox = input_metas[b]['box_type_3d'](
bbox_selected,
box_dim=bbox_selected.shape[-1],
with_yaw=self.bbox_coder.with_rot)
results.append((bbox, score_selected, labels))
return results
else:
return bbox3d
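`prediction_stages` only decides which per-stage key prefixes of `bbox_preds` are decoded and concatenated. A small illustration of that mapping, assuming `num_decoder_layers = 6` as in the L6-O256 ScanNet config:

num_decoder_layers = 6
prefixes = {
    'last': [f's{num_decoder_layers - 1}.'],
    'all': ['proposal.'] + [f's{i}.' for i in range(num_decoder_layers)],
    'last_three': [f's{i}.' for i in range(num_decoder_layers - 3,
                                            num_decoder_layers)],
}
assert prefixes['last'] == ['s5.']
assert prefixes['last_three'] == ['s3.', 's4.', 's5.']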
def multiclass_nms_single(self, obj_scores, sem_scores, bbox, points,
input_meta):
"""Multi-class nms in single batch.
Args:
obj_scores (torch.Tensor): Objectness score of bounding boxes.
sem_scores (torch.Tensor): semantic class score of bounding boxes.
bbox (torch.Tensor): Predicted bounding boxes.
points (torch.Tensor): Input points.
input_meta (dict): Point cloud and image's meta info.
Returns:
tuple[torch.Tensor]: Bounding boxes, scores and labels.
"""
bbox = input_meta['box_type_3d'](
bbox,
box_dim=bbox.shape[-1],
with_yaw=self.bbox_coder.with_rot,
origin=(0.5, 0.5, 0.5))
box_indices = bbox.points_in_boxes(points)
corner3d = bbox.corners
minmax_box3d = corner3d.new(torch.Size((corner3d.shape[0], 6)))
minmax_box3d[:, :3] = torch.min(corner3d, dim=1)[0]
minmax_box3d[:, 3:] = torch.max(corner3d, dim=1)[0]
nonempty_box_mask = box_indices.T.sum(1) > 5
bbox_classes = torch.argmax(sem_scores, -1)
nms_selected = aligned_3d_nms(minmax_box3d[nonempty_box_mask],
obj_scores[nonempty_box_mask],
bbox_classes[nonempty_box_mask],
self.test_cfg.nms_thr)
# filter empty boxes and boxes with low score
scores_mask = (obj_scores > self.test_cfg.score_thr)
nonempty_box_inds = torch.nonzero(
nonempty_box_mask, as_tuple=False).flatten()
nonempty_mask = torch.zeros_like(bbox_classes).scatter(
0, nonempty_box_inds[nms_selected], 1)
selected = (nonempty_mask.bool() & scores_mask.bool())
if self.test_cfg.per_class_proposal:
bbox_selected, score_selected, labels = [], [], []
for k in range(sem_scores.shape[-1]):
bbox_selected.append(bbox[selected].tensor)
score_selected.append(obj_scores[selected] *
sem_scores[selected][:, k])
labels.append(
torch.zeros_like(bbox_classes[selected]).fill_(k))
bbox_selected = torch.cat(bbox_selected, 0)
score_selected = torch.cat(score_selected, 0)
labels = torch.cat(labels, 0)
else:
bbox_selected = bbox[selected].tensor
score_selected = obj_scores[selected]
labels = bbox_classes[selected]
return bbox_selected, score_selected, labels
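With `per_class_proposal=True`, every box kept by NMS is duplicated once per class and scored by its objectness times the per-class semantic score. A toy illustration of that scoring step (the numbers are made up):

import torch

obj_scores = torch.tensor([0.9, 0.4])                  # 2 kept boxes
sem_scores = torch.tensor([[0.7, 0.3],                 # 2 classes
                           [0.2, 0.8]])
scores, labels = [], []
for k in range(sem_scores.shape[-1]):
    scores.append(obj_scores * sem_scores[:, k])
    labels.append(torch.full((2,), k, dtype=torch.long))
scores = torch.cat(scores)   # tensor([0.6300, 0.0800, 0.2700, 0.3200])
labels = torch.cat(labels)   # tensor([0, 0, 1, 1])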
...@@ -2,6 +2,7 @@ from .base import Base3DDetector
from .centerpoint import CenterPoint
from .dynamic_voxelnet import DynamicVoxelNet
from .fcos_mono3d import FCOSMono3D
from .groupfree3dnet import GroupFree3DNet
from .h3dnet import H3DNet
from .imvotenet import ImVoteNet
from .imvoxelnet import ImVoxelNet
...@@ -17,5 +18,5 @@ __all__ = [
'Base3DDetector', 'VoxelNet', 'DynamicVoxelNet', 'MVXTwoStageDetector',
'DynamicMVXFasterRCNN', 'MVXFasterRCNN', 'PartA2', 'VoteNet', 'H3DNet',
'CenterPoint', 'SSD3DNet', 'ImVoteNet', 'SingleStageMono3DDetector',
'FCOSMono3D', 'ImVoxelNet', 'GroupFree3DNet'
]
import torch
from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d
from mmdet.models import DETECTORS
from .single_stage import SingleStage3DDetector
@DETECTORS.register_module()
class GroupFree3DNet(SingleStage3DDetector):
"""`Group-Free 3D <https://arxiv.org/abs/2104.00678>`_."""
def __init__(self,
backbone,
bbox_head=None,
train_cfg=None,
test_cfg=None,
pretrained=None):
super(GroupFree3DNet, self).__init__(
backbone=backbone,
bbox_head=bbox_head,
train_cfg=train_cfg,
test_cfg=test_cfg,
pretrained=pretrained)
def forward_train(self,
points,
img_metas,
gt_bboxes_3d,
gt_labels_3d,
pts_semantic_mask=None,
pts_instance_mask=None,
gt_bboxes_ignore=None):
"""Forward of training.
Args:
points (list[torch.Tensor]): Points of each batch.
img_metas (list): Image metas.
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): gt bboxes of each batch.
gt_labels_3d (list[torch.Tensor]): gt class labels of each batch.
pts_semantic_mask (None | list[torch.Tensor]): point-wise semantic
label of each batch.
pts_instance_mask (None | list[torch.Tensor]): point-wise instance
label of each batch.
gt_bboxes_ignore (None | list[torch.Tensor]): Specify
which bounding boxes to ignore.
Returns:
dict[str: torch.Tensor]: Losses.
"""
# TODO: refactor votenet series to reduce redundant codes.
points_cat = torch.stack(points)
x = self.extract_feat(points_cat)
bbox_preds = self.bbox_head(x, self.train_cfg.sample_mod)
loss_inputs = (points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask,
pts_instance_mask, img_metas)
losses = self.bbox_head.loss(
bbox_preds, *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
return losses
def simple_test(self, points, img_metas, imgs=None, rescale=False):
"""Forward of testing.
Args:
points (list[torch.Tensor]): Points of each sample.
img_metas (list): Image metas.
rescale (bool): Whether to rescale results.
Returns:
list: Predicted 3d boxes.
"""
points_cat = torch.stack(points)
x = self.extract_feat(points_cat)
bbox_preds = self.bbox_head(x, self.test_cfg.sample_mod)
bbox_list = self.bbox_head.get_bboxes(
points_cat, bbox_preds, img_metas, rescale=rescale)
bbox_results = [
bbox3d2result(bboxes, scores, labels)
for bboxes, scores, labels in bbox_list
]
return bbox_results
def aug_test(self, points, img_metas, imgs=None, rescale=False):
"""Test with augmentation."""
points_cat = [torch.stack(pts) for pts in points]
feats = self.extract_feats(points_cat, img_metas)
# only support aug_test for one sample
aug_bboxes = []
for x, pts_cat, img_meta in zip(feats, points_cat, img_metas):
bbox_preds = self.bbox_head(x, self.test_cfg.sample_mod)
bbox_list = self.bbox_head.get_bboxes(
pts_cat, bbox_preds, img_meta, rescale=rescale)
bbox_list = [
dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels)
for bboxes, scores, labels in bbox_list
]
aug_bboxes.append(bbox_list[0])
# after merging, bboxes will be rescaled to the original image size
merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, img_metas,
self.bbox_head.test_cfg)
return [merged_bboxes]
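A hedged usage sketch for the detector above: it only builds `GroupFree3DNet` from the ScanNet config added in this PR (the config path is assumed to be relative to the repo root). Actually running `simple_test` additionally needs compiled CUDA ops and a GPU, as the unit tests further below also require.

from mmcv import Config
from mmdet3d.models import build_detector

cfg = Config.fromfile(
    'configs/groupfree3d/groupfree3d_8x8_scannet-3d-18class-L6-O256.py')
model = build_detector(cfg.model)
print(type(model).__name__)  # GroupFree3DNet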
from .transformer import GroupFree3DMHA
from .vote_module import VoteModule
__all__ = ['VoteModule', 'GroupFree3DMHA']
from mmcv.cnn.bricks.registry import ATTENTION
from mmcv.cnn.bricks.transformer import POSITIONAL_ENCODING, MultiheadAttention
from torch import nn as nn
@ATTENTION.register_module()
class GroupFree3DMHA(MultiheadAttention):
"""A warpper for torch.nn.MultiheadAttention for GroupFree3D.
This module implements MultiheadAttention with identity connection,
and positional encoding used in DETR is also passed as input.
Args:
embed_dims (int): The embedding dimension.
num_heads (int): Parallel attention heads. Same as
`nn.MultiheadAttention`.
attn_drop (float): A Dropout layer on attn_output_weights. Default 0.0.
proj_drop (float): A Dropout layer. Default 0.0.
dropout_layer (obj:`ConfigDict`): The dropout_layer used
when adding the shortcut.
init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
Default: None.
batch_first (bool): If True, key, query and value have shape
(batch, n, embed_dim); otherwise (n, batch, embed_dim).
Defaults to False.
"""
def __init__(self,
embed_dims,
num_heads,
attn_drop=0.,
proj_drop=0.,
dropout_layer=dict(type='Dropout', drop_prob=0.),
init_cfg=None,
batch_first=False,
**kwargs):
super().__init__(embed_dims, num_heads, attn_drop, proj_drop,
dropout_layer, init_cfg, batch_first, **kwargs)
def forward(self,
query,
key,
value,
identity,
query_pos=None,
key_pos=None,
attn_mask=None,
key_padding_mask=None,
**kwargs):
"""Forward function for `GroupFree3DMHA`.
**kwargs allow passing a more general data flow when combining
with other operations in `transformerlayer`.
Args:
query (Tensor): The input query with shape [num_queries, bs,
embed_dims]. Same in `nn.MultiheadAttention.forward`.
key (Tensor): The key tensor with shape [num_keys, bs,
embed_dims]. Same in `nn.MultiheadAttention.forward`.
If None, the ``query`` will be used. Defaults to None.
value (Tensor): The value tensor with same shape as `key`.
Same in `nn.MultiheadAttention.forward`. Defaults to None.
If None, the `key` will be used.
identity (Tensor): This tensor, with the same shape as `query`,
will be used for the identity link.
If None, `query` will be used. Defaults to None.
query_pos (Tensor): The positional encoding for `query`, with
the same shape as `query`. If not None, it will
be added to `query` before the forward function. Defaults to None.
key_pos (Tensor): The positional encoding for `key`, with the
same shape as `key`. Defaults to None. If not None, it will
be added to `key` before forward function. If None, and
`query_pos` has the same shape as `key`, then `query_pos`
will be used for `key_pos`. Defaults to None.
attn_mask (Tensor): ByteTensor mask with shape [num_queries,
num_keys]. Same in `nn.MultiheadAttention.forward`.
Defaults to None.
key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys].
Same in `nn.MultiheadAttention.forward`. Defaults to None.
Returns:
Tensor: forwarded results with shape [num_queries, bs, embed_dims].
"""
if hasattr(self, 'operation_name'):
if self.operation_name == 'self_attn':
value = value + query_pos
elif self.operation_name == 'cross_attn':
value = value + key_pos
else:
raise NotImplementedError(
f'{self.__class__.__name__} '
f"can't be used as {self.operation_name}")
else:
value = value + query_pos
return super(GroupFree3DMHA, self).forward(
query=query,
key=key,
value=value,
identity=identity,
query_pos=query_pos,
key_pos=key_pos,
attn_mask=attn_mask,
key_padding_mask=key_padding_mask,
**kwargs)
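A toy forward pass through the wrapper, using the default `(n, batch, embed_dim)` layout. The import path is an assumption based on the `__init__` shown above; since no `operation_name` is set outside a transformer layer, `query_pos` is folded into `value`, and `identity` supplies the residual shortcut.

import torch
from mmdet3d.models.model_utils import GroupFree3DMHA  # assumed export path

attn = GroupFree3DMHA(embed_dims=288, num_heads=8)
query = torch.rand(16, 2, 288)      # (num_queries, bs, embed_dims)
key = torch.rand(16, 2, 288)
value = torch.rand(16, 2, 288)
query_pos = torch.zeros(16, 2, 288)

out = attn(query, key, value, identity=query, query_pos=query_pos)
assert out.shape == (16, 2, 288)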
@POSITIONAL_ENCODING.register_module()
class ConvBNPositionalEncoding(nn.Module):
"""Absolute position embedding with Conv learning.
Args:
input_channel (int): Input feature dim.
num_pos_feats (int): Output position feature dim.
Defaults to 288 to be consistent with the seed feature dim.
"""
def __init__(self, input_channel, num_pos_feats=288):
super().__init__()
self.position_embedding_head = nn.Sequential(
nn.Conv1d(input_channel, num_pos_feats, kernel_size=1),
nn.BatchNorm1d(num_pos_feats), nn.ReLU(inplace=True),
nn.Conv1d(num_pos_feats, num_pos_feats, kernel_size=1))
def forward(self, xyz):
"""Forward pass.
Args:
xyz (Tensor): (B, N, 3) the coordinates to embed.
Returns:
Tensor: (B, num_pos_feats, N) the embedded position features.
"""
xyz = xyz.permute(0, 2, 1)
position_embedding = self.position_embedding_head(xyz)
return position_embedding
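A standalone shape check for the encoding above, re-creating the same Conv1d/BatchNorm1d head so it runs without importing this file: a `(B, N, 3)` coordinate tensor comes out as `(B, num_pos_feats, N)` position features, matching the seed feature layout.

import torch
from torch import nn

input_channel, num_pos_feats = 3, 288
pos_head = nn.Sequential(
    nn.Conv1d(input_channel, num_pos_feats, kernel_size=1),
    nn.BatchNorm1d(num_pos_feats), nn.ReLU(inplace=True),
    nn.Conv1d(num_pos_feats, num_pos_feats, kernel_size=1))

xyz = torch.rand(2, 128, 3)               # (B, N, 3)
pos = pos_head(xyz.permute(0, 2, 1))      # (B, num_pos_feats, N)
assert pos.shape == (2, 288, 128)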
...@@ -379,6 +379,59 @@ def test_fcos3d():
assert attrs_3d.shape[0] >= 0
def test_groupfree3dnet():
if not torch.cuda.is_available():
pytest.skip('test requires GPU and torch+cuda')
_setup_seed(0)
groupfree3d_cfg = _get_detector_cfg(
'groupfree3d/groupfree3d_8x8_scannet-3d-18class-L6-O256.py')
self = build_detector(groupfree3d_cfg).cuda()
points_0 = torch.rand([50000, 3], device='cuda')
points_1 = torch.rand([50000, 3], device='cuda')
points = [points_0, points_1]
img_meta_0 = dict(box_type_3d=DepthInstance3DBoxes)
img_meta_1 = dict(box_type_3d=DepthInstance3DBoxes)
img_metas = [img_meta_0, img_meta_1]
gt_bbox_0 = DepthInstance3DBoxes(torch.rand([10, 7], device='cuda'))
gt_bbox_1 = DepthInstance3DBoxes(torch.rand([10, 7], device='cuda'))
gt_bboxes = [gt_bbox_0, gt_bbox_1]
gt_labels_0 = torch.randint(0, 18, [10], device='cuda')
gt_labels_1 = torch.randint(0, 18, [10], device='cuda')
gt_labels = [gt_labels_0, gt_labels_1]
pts_instance_mask_1 = torch.randint(0, 10, [50000], device='cuda')
pts_instance_mask_2 = torch.randint(0, 10, [50000], device='cuda')
pts_instance_mask = [pts_instance_mask_1, pts_instance_mask_2]
pts_semantic_mask_1 = torch.randint(0, 19, [50000], device='cuda')
pts_semantic_mask_2 = torch.randint(0, 19, [50000], device='cuda')
pts_semantic_mask = [pts_semantic_mask_1, pts_semantic_mask_2]
# test forward_train
losses = self.forward_train(points, img_metas, gt_bboxes, gt_labels,
pts_semantic_mask, pts_instance_mask)
assert losses['sampling_objectness_loss'] >= 0
assert losses['s5.objectness_loss'] >= 0
assert losses['s5.semantic_loss'] >= 0
assert losses['s5.center_loss'] >= 0
assert losses['s5.dir_class_loss'] >= 0
assert losses['s5.dir_res_loss'] >= 0
assert losses['s5.size_class_loss'] >= 0
assert losses['s5.size_res_loss'] >= 0
# test simple_test
with torch.no_grad():
results = self.simple_test(points, img_metas)
boxes_3d = results[0]['boxes_3d']
scores_3d = results[0]['scores_3d']
labels_3d = results[0]['labels_3d']
assert boxes_3d.tensor.shape[0] >= 0
assert boxes_3d.tensor.shape[1] == 7
assert scores_3d.shape[0] >= 0
assert labels_3d.shape[0] >= 0
def test_imvoxelnet():
if not torch.cuda.is_available():
pytest.skip('test requires GPU and torch+cuda')
......
...@@ -1114,3 +1114,110 @@ def test_fcos_mono3d_head():
assert results[0][1].shape == torch.Size([200])
assert results[0][2].shape == torch.Size([200])
assert results[0][3].shape == torch.Size([200])
def test_groupfree3d_head():
if not torch.cuda.is_available():
pytest.skip('test requires GPU and torch+cuda')
_setup_seed(0)
vote_head_cfg = _get_vote_head_cfg(
'groupfree3d/groupfree3d_8x8_scannet-3d-18class-L6-O256.py')
self = build_head(vote_head_cfg).cuda()
fp_xyz = [torch.rand([2, 256, 3], dtype=torch.float32).cuda()]
fp_features = [torch.rand([2, 288, 256], dtype=torch.float32).cuda()]
fp_indices = [torch.randint(0, 128, [2, 256]).cuda()]
input_dict = dict(
fp_xyz=fp_xyz, fp_features=fp_features, fp_indices=fp_indices)
# test forward
ret_dict = self(input_dict, 'kps')
assert ret_dict['seeds_obj_cls_logits'].shape == torch.Size([2, 1, 256])
assert ret_dict['s5.center'].shape == torch.Size([2, 128, 3])
assert ret_dict['s5.dir_class'].shape == torch.Size([2, 128, 1])
assert ret_dict['s5.dir_res'].shape == torch.Size([2, 128, 1])
assert ret_dict['s5.size_class'].shape == torch.Size([2, 128, 18])
assert ret_dict['s5.size_res'].shape == torch.Size([2, 128, 18, 3])
assert ret_dict['s5.obj_scores'].shape == torch.Size([2, 128, 1])
assert ret_dict['s5.sem_scores'].shape == torch.Size([2, 128, 18])
# test losses
points = [torch.rand([50000, 4], device='cuda') for i in range(2)]
gt_bbox1 = torch.rand([10, 7], dtype=torch.float32).cuda()
gt_bbox2 = torch.rand([10, 7], dtype=torch.float32).cuda()
gt_bbox1 = DepthInstance3DBoxes(gt_bbox1)
gt_bbox2 = DepthInstance3DBoxes(gt_bbox2)
gt_bboxes = [gt_bbox1, gt_bbox2]
pts_instance_mask_1 = torch.randint(0, 10, [50000], device='cuda')
pts_instance_mask_2 = torch.randint(0, 10, [50000], device='cuda')
pts_instance_mask = [pts_instance_mask_1, pts_instance_mask_2]
pts_semantic_mask_1 = torch.randint(0, 19, [50000], device='cuda')
pts_semantic_mask_2 = torch.randint(0, 19, [50000], device='cuda')
pts_semantic_mask = [pts_semantic_mask_1, pts_semantic_mask_2]
labels_1 = torch.randint(0, 18, [10], device='cuda')
labels_2 = torch.randint(0, 18, [10], device='cuda')
gt_labels = [labels_1, labels_2]
losses = self.loss(ret_dict, points, gt_bboxes, gt_labels,
pts_semantic_mask, pts_instance_mask)
assert losses['s5.objectness_loss'] >= 0
assert losses['s5.semantic_loss'] >= 0
assert losses['s5.center_loss'] >= 0
assert losses['s5.dir_class_loss'] >= 0
assert losses['s5.dir_res_loss'] >= 0
assert losses['s5.size_class_loss'] >= 0
assert losses['s5.size_res_loss'] >= 0
# test multiclass_nms_single
obj_scores = torch.rand([256], device='cuda')
sem_scores = torch.rand([256, 18], device='cuda')
points = torch.rand([50000, 3], device='cuda')
bbox = torch.rand([256, 7], device='cuda')
input_meta = dict(box_type_3d=DepthInstance3DBoxes)
bbox_selected, score_selected, labels = \
self.multiclass_nms_single(obj_scores,
sem_scores,
bbox,
points,
input_meta)
assert bbox_selected.shape[0] >= 0
assert bbox_selected.shape[1] == 7
assert score_selected.shape[0] >= 0
assert labels.shape[0] >= 0
# test get_bboxes
points = torch.rand([1, 50000, 3], device='cuda')
seed_points = torch.rand([1, 1024, 3], device='cuda')
seed_indices = torch.randint(0, 50000, [1, 1024], device='cuda')
obj_scores = torch.rand([1, 256, 1], device='cuda')
center = torch.rand([1, 256, 3], device='cuda')
dir_class = torch.rand([1, 256, 1], device='cuda')
dir_res_norm = torch.rand([1, 256, 1], device='cuda')
dir_res = torch.rand([1, 256, 1], device='cuda')
size_class = torch.rand([1, 256, 18], device='cuda')
size_res = torch.rand([1, 256, 18, 3], device='cuda')
sem_scores = torch.rand([1, 256, 18], device='cuda')
bbox_preds = dict()
bbox_preds['seed_points'] = seed_points
bbox_preds['seed_indices'] = seed_indices
bbox_preds['s5.obj_scores'] = obj_scores
bbox_preds['s5.center'] = center
bbox_preds['s5.dir_class'] = dir_class
bbox_preds['s5.dir_res_norm'] = dir_res_norm
bbox_preds['s5.dir_res'] = dir_res
bbox_preds['s5.size_class'] = size_class
bbox_preds['s5.size_res'] = size_res
bbox_preds['s5.sem_scores'] = sem_scores
self.test_cfg['prediction_stages'] = 'last'
results = self.get_bboxes(points, bbox_preds, [input_meta])
assert results[0][0].tensor.shape[0] >= 0
assert results[0][0].tensor.shape[1] == 7
assert results[0][1].shape[0] >= 0
assert results[0][2].shape[0] >= 0