Unverified Commit 318499ac authored by hjin2902, committed by GitHub

[Feature] Group-Free-3D head (#539)



* group-free-3d head

* GroupFree3DNet->VoteNet

* modify docstring

* bugfix: calculate pts_instance_label, decoder self/cross posembed init

* support GroupFree3DNet, modify scannet train config

* support point cloud input features dim = 0

* add groupfree3dnet test case

* bugfix: softmax in decode boxes

* support multi-stage predictions

* modify GroupFree3DMultiheadAttention input parameters

* refactor: support sunrgbd-based train

* refactor: support sunrgbd-based train

* fix parts of bug

* modify multi-stage prediction

* bugfix: conv_channels

* bugfix: permute

* bugfix: permute

* bugfix: expand

* fix MAX_NUM_OBJ=64

* 4 gpu training, score_thr = 0.0

* modify config, repeattime=1

* bugfix: expand

* modify: GroupFree3DMHA, build_positional_encoding

* modify: GroupFree3DMHA, build_positional_encoding

* bugfix: torch.nn

* bugfix: mean loss

* residual -> identity

* fix name: DropOut -> Dropout

* delete sunrgbd-based config

* Fix: trailing whitespace

* suffix -> prefix

* bugfix: groupfree3d config
Co-authored-by: jinhui <PJLAB\jinhui@shai14001019l.pjlab.org>
parent e63e0473
model = dict(
type='GroupFree3DNet',
backbone=dict(
type='PointNet2SASSG',
in_channels=3,
num_points=(2048, 1024, 512, 256),
radius=(0.2, 0.4, 0.8, 1.2),
num_samples=(64, 32, 16, 16),
sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256),
(128, 128, 256)),
fp_channels=((256, 256), (256, 288)),
norm_cfg=dict(type='BN2d'),
sa_cfg=dict(
type='PointSAModule',
pool_mod='max',
use_xyz=True,
normalize_xyz=True)),
bbox_head=dict(
type='GroupFree3DHead',
in_channels=288,
num_decoder_layers=6,
num_proposal=256,
transformerlayers=dict(
type='BaseTransformerLayer',
attn_cfgs=dict(
type='GroupFree3DMHA',
embed_dims=288,
num_heads=8,
attn_drop=0.1,
dropout_layer=dict(type='Dropout', drop_prob=0.1)),
ffn_cfgs=dict(
embed_dims=288,
feedforward_channels=2048,
ffn_drop=0.1,
act_cfg=dict(type='ReLU', inplace=True)),
operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn',
'norm')),
pred_layer_cfg=dict(
in_channels=288, shared_conv_channels=(288, 288), bias=True),
sampling_objectness_loss=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=8.0),
objectness_loss=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
center_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
dir_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
dir_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
size_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
size_res_loss=dict(
type='SmoothL1Loss', beta=1.0, reduction='sum', loss_weight=10.0),
semantic_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
# model training and testing settings
train_cfg=dict(sample_mod='kps'),
test_cfg=dict(
sample_mod='kps',
nms_thr=0.25,
score_thr=0.0,
per_class_proposal=True,
prediction_stages='last'))
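For quick experimentation, a model dict like the one above can be instantiated with the standard mmdet3d builder. The snippet below is an illustrative sketch only (not part of this commit); the config path is the one referenced by the unit tests further down, and import paths may differ slightly between versions.

# Illustrative sketch, not part of this commit: build the detector from a
# config that inherits the base model dict above.
from mmcv import Config
from mmdet3d.models import build_detector

cfg = Config.fromfile(
    'configs/groupfree3d/groupfree3d_8x8_scannet-3d-18class-L6-O256.py')
model = build_detector(cfg.model)  # train_cfg/test_cfg are nested in cfg.model
print(type(model).__name__)  # expected: GroupFree3DNet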
_base_ = [
'../_base_/datasets/scannet-3d-18class.py',
'../_base_/models/groupfree3d.py', '../_base_/schedules/schedule_3x.py',
'../_base_/default_runtime.py'
]
# model settings
model = dict(
bbox_head=dict(
num_classes=18,
num_decoder_layers=12,
size_cls_agnostic=False,
bbox_coder=dict(
type='GroupFree3DBBoxCoder',
num_sizes=18,
num_dir_bins=1,
with_rot=False,
size_cls_agnostic=False,
mean_sizes=[[0.76966727, 0.8116021, 0.92573744],
[1.876858, 1.8425595, 1.1931566],
[0.61328, 0.6148609, 0.7182701],
[1.3955007, 1.5121545, 0.83443564],
[0.97949594, 1.0675149, 0.6329687],
[0.531663, 0.5955577, 1.7500148],
[0.9624706, 0.72462326, 1.1481868],
[0.83221924, 1.0490936, 1.6875663],
[0.21132214, 0.4206159, 0.5372846],
[1.4440073, 1.8970833, 0.26985747],
[1.0294262, 1.4040797, 0.87554324],
[1.3766412, 0.65521795, 1.6813129],
[0.6650819, 0.71111923, 1.298853],
[0.41999173, 0.37906948, 1.7513971],
[0.59359556, 0.5912492, 0.73919016],
[0.50867593, 0.50656086, 0.30136237],
[1.1511526, 1.0546296, 0.49706793],
[0.47535285, 0.49249494, 0.5802117]]),
sampling_objectness_loss=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=8.0),
objectness_loss=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
center_loss=dict(
type='SmoothL1Loss', beta=0.04, reduction='sum', loss_weight=10.0),
dir_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
dir_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
size_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
size_res_loss=dict(
type='SmoothL1Loss',
beta=1.0 / 9.0,
reduction='sum',
loss_weight=10.0 / 9.0),
semantic_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
test_cfg=dict(
sample_mod='kps',
nms_thr=0.25,
score_thr=0.0,
per_class_proposal=True,
prediction_stages='last_three'))
# dataset settings
dataset_type = 'ScanNetDataset'
data_root = './data/scannet/'
class_names = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
'bookshelf', 'picture', 'counter', 'desk', 'curtain',
'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub',
'garbagebin')
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='DEPTH',
load_dim=6,
use_dim=[0, 1, 2]),
dict(
type='LoadAnnotations3D',
with_bbox_3d=True,
with_label_3d=True,
with_mask_3d=True,
with_seg_3d=True),
dict(type='GlobalAlignment', rotation_axis=2),
dict(
type='PointSegClassMapping',
valid_cat_ids=(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34,
36, 39)),
dict(type='IndoorPointSample', num_points=50000),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.087266, 0.087266],
scale_ratio_range=[1.0, 1.0]),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
keys=[
'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask',
'pts_instance_mask'
])
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='DEPTH',
load_dim=6,
use_dim=[0, 1, 2]),
dict(type='GlobalAlignment', rotation_axis=2),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
dict(type='IndoorPointSample', num_points=50000),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
data = dict(
samples_per_gpu=8,
workers_per_gpu=4,
train=dict(
type='RepeatDataset',
times=1,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'scannet_infos_train.pkl',
pipeline=train_pipeline,
filter_empty_gt=False,
classes=class_names,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='Depth')),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'scannet_infos_val.pkl',
pipeline=test_pipeline,
classes=class_names,
test_mode=True,
box_type_3d='Depth'),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'scannet_infos_val.pkl',
pipeline=test_pipeline,
classes=class_names,
test_mode=True,
box_type_3d='Depth'))
# optimizer
lr = 0.006
optimizer = dict(
lr=lr,
weight_decay=0.0005,
paramwise_cfg=dict(
custom_keys={
'bbox_head.decoder_layers': dict(lr_mult=0.1, decay_mult=1.0),
'bbox_head.decoder_self_posembeds': dict(
lr_mult=0.1, decay_mult=1.0),
'bbox_head.decoder_cross_posembeds': dict(
lr_mult=0.1, decay_mult=1.0),
'bbox_head.decoder_query_proj': dict(lr_mult=0.1, decay_mult=1.0),
'bbox_head.decoder_key_proj': dict(lr_mult=0.1, decay_mult=1.0)
}))
optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))
lr_config = dict(policy='step', warmup=None, step=[280, 340])
# runtime settings
runner = dict(type='EpochBasedRunner', max_epochs=400)
# yapf:disable
log_config = dict(
interval=30,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
# yapf:enable
_base_ = [
'../_base_/datasets/scannet-3d-18class.py',
'../_base_/models/groupfree3d.py', '../_base_/schedules/schedule_3x.py',
'../_base_/default_runtime.py'
]
# model settings
model = dict(
bbox_head=dict(
num_classes=18,
size_cls_agnostic=False,
bbox_coder=dict(
type='GroupFree3DBBoxCoder',
num_sizes=18,
num_dir_bins=1,
with_rot=False,
size_cls_agnostic=False,
mean_sizes=[[0.76966727, 0.8116021, 0.92573744],
[1.876858, 1.8425595, 1.1931566],
[0.61328, 0.6148609, 0.7182701],
[1.3955007, 1.5121545, 0.83443564],
[0.97949594, 1.0675149, 0.6329687],
[0.531663, 0.5955577, 1.7500148],
[0.9624706, 0.72462326, 1.1481868],
[0.83221924, 1.0490936, 1.6875663],
[0.21132214, 0.4206159, 0.5372846],
[1.4440073, 1.8970833, 0.26985747],
[1.0294262, 1.4040797, 0.87554324],
[1.3766412, 0.65521795, 1.6813129],
[0.6650819, 0.71111923, 1.298853],
[0.41999173, 0.37906948, 1.7513971],
[0.59359556, 0.5912492, 0.73919016],
[0.50867593, 0.50656086, 0.30136237],
[1.1511526, 1.0546296, 0.49706793],
[0.47535285, 0.49249494, 0.5802117]]),
sampling_objectness_loss=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=8.0),
objectness_loss=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
center_loss=dict(
type='SmoothL1Loss', beta=0.04, reduction='sum', loss_weight=10.0),
dir_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
dir_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
size_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
size_res_loss=dict(
type='SmoothL1Loss',
beta=1.0 / 9.0,
reduction='sum',
loss_weight=10.0 / 9.0),
semantic_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
test_cfg=dict(
sample_mod='kps',
nms_thr=0.25,
score_thr=0.0,
per_class_proposal=True,
prediction_stages='last_three'))
# dataset settings
dataset_type = 'ScanNetDataset'
data_root = './data/scannet/'
class_names = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
'bookshelf', 'picture', 'counter', 'desk', 'curtain',
'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub',
'garbagebin')
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='DEPTH',
load_dim=6,
use_dim=[0, 1, 2]),
dict(
type='LoadAnnotations3D',
with_bbox_3d=True,
with_label_3d=True,
with_mask_3d=True,
with_seg_3d=True),
dict(type='GlobalAlignment', rotation_axis=2),
dict(
type='PointSegClassMapping',
valid_cat_ids=(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34,
36, 39)),
dict(type='IndoorPointSample', num_points=50000),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.087266, 0.087266],
scale_ratio_range=[1.0, 1.0]),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
keys=[
'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask',
'pts_instance_mask'
])
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='DEPTH',
load_dim=6,
use_dim=[0, 1, 2]),
dict(type='GlobalAlignment', rotation_axis=2),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
dict(type='IndoorPointSample', num_points=50000),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
data = dict(
samples_per_gpu=8,
workers_per_gpu=4,
train=dict(
type='RepeatDataset',
times=1,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'scannet_infos_train.pkl',
pipeline=train_pipeline,
filter_empty_gt=False,
classes=class_names,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='Depth')),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'scannet_infos_val.pkl',
pipeline=test_pipeline,
classes=class_names,
test_mode=True,
box_type_3d='Depth'),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'scannet_infos_val.pkl',
pipeline=test_pipeline,
classes=class_names,
test_mode=True,
box_type_3d='Depth'))
# optimizer
lr = 0.006
optimizer = dict(
lr=lr,
weight_decay=0.0005,
paramwise_cfg=dict(
custom_keys={
'bbox_head.decoder_layers': dict(lr_mult=0.1, decay_mult=1.0),
'bbox_head.decoder_self_posembeds': dict(
lr_mult=0.1, decay_mult=1.0),
'bbox_head.decoder_cross_posembeds': dict(
lr_mult=0.1, decay_mult=1.0),
'bbox_head.decoder_query_proj': dict(lr_mult=0.1, decay_mult=1.0),
'bbox_head.decoder_key_proj': dict(lr_mult=0.1, decay_mult=1.0)
}))
optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))
lr_config = dict(policy='step', warmup=None, step=[280, 340])
# runtime settings
runner = dict(type='EpochBasedRunner', max_epochs=400)
# yapf:disable
log_config = dict(
interval=30,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
# yapf:enable
_base_ = [
'../_base_/datasets/scannet-3d-18class.py',
'../_base_/models/groupfree3d.py', '../_base_/schedules/schedule_3x.py',
'../_base_/default_runtime.py'
]
# model settings
model = dict(
backbone=dict(
type='PointNet2SASSG',
in_channels=3,
num_points=(2048, 1024, 512, 256),
radius=(0.2, 0.4, 0.8, 1.2),
num_samples=(64, 32, 16, 16),
sa_channels=((128, 128, 256), (256, 256, 512), (256, 256, 512),
(256, 256, 512)),
fp_channels=((512, 512), (512, 288)),
norm_cfg=dict(type='BN2d'),
sa_cfg=dict(
type='PointSAModule',
pool_mod='max',
use_xyz=True,
normalize_xyz=True)),
bbox_head=dict(
num_classes=18,
num_decoder_layers=12,
size_cls_agnostic=False,
bbox_coder=dict(
type='GroupFree3DBBoxCoder',
num_sizes=18,
num_dir_bins=1,
with_rot=False,
size_cls_agnostic=False,
mean_sizes=[[0.76966727, 0.8116021, 0.92573744],
[1.876858, 1.8425595, 1.1931566],
[0.61328, 0.6148609, 0.7182701],
[1.3955007, 1.5121545, 0.83443564],
[0.97949594, 1.0675149, 0.6329687],
[0.531663, 0.5955577, 1.7500148],
[0.9624706, 0.72462326, 1.1481868],
[0.83221924, 1.0490936, 1.6875663],
[0.21132214, 0.4206159, 0.5372846],
[1.4440073, 1.8970833, 0.26985747],
[1.0294262, 1.4040797, 0.87554324],
[1.3766412, 0.65521795, 1.6813129],
[0.6650819, 0.71111923, 1.298853],
[0.41999173, 0.37906948, 1.7513971],
[0.59359556, 0.5912492, 0.73919016],
[0.50867593, 0.50656086, 0.30136237],
[1.1511526, 1.0546296, 0.49706793],
[0.47535285, 0.49249494, 0.5802117]]),
sampling_objectness_loss=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=8.0),
objectness_loss=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
center_loss=dict(
type='SmoothL1Loss', beta=0.04, reduction='sum', loss_weight=10.0),
dir_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
dir_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
size_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
size_res_loss=dict(
type='SmoothL1Loss',
beta=1.0 / 9.0,
reduction='sum',
loss_weight=10.0 / 9.0),
semantic_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
test_cfg=dict(
sample_mod='kps',
nms_thr=0.25,
score_thr=0.0,
per_class_proposal=True,
prediction_stages='last_three'))
# dataset settings
dataset_type = 'ScanNetDataset'
data_root = './data/scannet/'
class_names = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
'bookshelf', 'picture', 'counter', 'desk', 'curtain',
'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub',
'garbagebin')
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='DEPTH',
load_dim=6,
use_dim=[0, 1, 2]),
dict(
type='LoadAnnotations3D',
with_bbox_3d=True,
with_label_3d=True,
with_mask_3d=True,
with_seg_3d=True),
dict(type='GlobalAlignment', rotation_axis=2),
dict(
type='PointSegClassMapping',
valid_cat_ids=(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34,
36, 39)),
dict(type='IndoorPointSample', num_points=50000),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.087266, 0.087266],
scale_ratio_range=[1.0, 1.0]),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
keys=[
'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask',
'pts_instance_mask'
])
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='DEPTH',
load_dim=6,
use_dim=[0, 1, 2]),
dict(type='GlobalAlignment', rotation_axis=2),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
dict(type='IndoorPointSample', num_points=50000),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
data = dict(
samples_per_gpu=8,
workers_per_gpu=4,
train=dict(
type='RepeatDataset',
times=1,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'scannet_infos_train.pkl',
pipeline=train_pipeline,
filter_empty_gt=False,
classes=class_names,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='Depth')),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'scannet_infos_val.pkl',
pipeline=test_pipeline,
classes=class_names,
test_mode=True,
box_type_3d='Depth'),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'scannet_infos_val.pkl',
pipeline=test_pipeline,
classes=class_names,
test_mode=True,
box_type_3d='Depth'))
# optimizer
lr = 0.006
optimizer = dict(
lr=lr,
weight_decay=0.0005,
paramwise_cfg=dict(
custom_keys={
'bbox_head.decoder_layers': dict(lr_mult=0.1, decay_mult=1.0),
'bbox_head.decoder_self_posembeds': dict(
lr_mult=0.1, decay_mult=1.0),
'bbox_head.decoder_cross_posembeds': dict(
lr_mult=0.1, decay_mult=1.0),
'bbox_head.decoder_query_proj': dict(lr_mult=0.1, decay_mult=1.0),
'bbox_head.decoder_key_proj': dict(lr_mult=0.1, decay_mult=1.0)
}))
optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))
lr_config = dict(policy='step', warmup=None, step=[280, 340])
# runtime settings
runner = dict(type='EpochBasedRunner', max_epochs=400)
# yapf:disable
log_config = dict(
interval=30,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
# yapf:enable
_base_ = [
'../_base_/datasets/scannet-3d-18class.py',
'../_base_/models/groupfree3d.py', '../_base_/schedules/schedule_3x.py',
'../_base_/default_runtime.py'
]
# model settings
model = dict(
backbone=dict(
type='PointNet2SASSG',
in_channels=3,
num_points=(2048, 1024, 512, 256),
radius=(0.2, 0.4, 0.8, 1.2),
num_samples=(64, 32, 16, 16),
sa_channels=((128, 128, 256), (256, 256, 512), (256, 256, 512),
(256, 256, 512)),
fp_channels=((512, 512), (512, 288)),
norm_cfg=dict(type='BN2d'),
sa_cfg=dict(
type='PointSAModule',
pool_mod='max',
use_xyz=True,
normalize_xyz=True)),
bbox_head=dict(
num_classes=18,
num_decoder_layers=12,
num_proposal=512,
size_cls_agnostic=False,
bbox_coder=dict(
type='GroupFree3DBBoxCoder',
num_sizes=18,
num_dir_bins=1,
with_rot=False,
size_cls_agnostic=False,
mean_sizes=[[0.76966727, 0.8116021, 0.92573744],
[1.876858, 1.8425595, 1.1931566],
[0.61328, 0.6148609, 0.7182701],
[1.3955007, 1.5121545, 0.83443564],
[0.97949594, 1.0675149, 0.6329687],
[0.531663, 0.5955577, 1.7500148],
[0.9624706, 0.72462326, 1.1481868],
[0.83221924, 1.0490936, 1.6875663],
[0.21132214, 0.4206159, 0.5372846],
[1.4440073, 1.8970833, 0.26985747],
[1.0294262, 1.4040797, 0.87554324],
[1.3766412, 0.65521795, 1.6813129],
[0.6650819, 0.71111923, 1.298853],
[0.41999173, 0.37906948, 1.7513971],
[0.59359556, 0.5912492, 0.73919016],
[0.50867593, 0.50656086, 0.30136237],
[1.1511526, 1.0546296, 0.49706793],
[0.47535285, 0.49249494, 0.5802117]]),
sampling_objectness_loss=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=8.0),
objectness_loss=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
center_loss=dict(
type='SmoothL1Loss', beta=0.04, reduction='sum', loss_weight=10.0),
dir_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
dir_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
size_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
size_res_loss=dict(
type='SmoothL1Loss',
beta=1.0 / 9.0,
reduction='sum',
loss_weight=10.0 / 9.0),
semantic_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
test_cfg=dict(
sample_mod='kps',
nms_thr=0.25,
score_thr=0.0,
per_class_proposal=True,
prediction_stages='last_three'))
# dataset settings
dataset_type = 'ScanNetDataset'
data_root = './data/scannet/'
class_names = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
'bookshelf', 'picture', 'counter', 'desk', 'curtain',
'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub',
'garbagebin')
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='DEPTH',
load_dim=6,
use_dim=[0, 1, 2]),
dict(
type='LoadAnnotations3D',
with_bbox_3d=True,
with_label_3d=True,
with_mask_3d=True,
with_seg_3d=True),
dict(type='GlobalAlignment', rotation_axis=2),
dict(
type='PointSegClassMapping',
valid_cat_ids=(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34,
36, 39)),
dict(type='IndoorPointSample', num_points=50000),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.087266, 0.087266],
scale_ratio_range=[1.0, 1.0]),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
keys=[
'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask',
'pts_instance_mask'
])
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='DEPTH',
load_dim=6,
use_dim=[0, 1, 2]),
dict(type='GlobalAlignment', rotation_axis=2),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
dict(type='IndoorPointSample', num_points=50000),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
data = dict(
samples_per_gpu=8,
workers_per_gpu=4,
train=dict(
type='RepeatDataset',
times=1,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'scannet_infos_train.pkl',
pipeline=train_pipeline,
filter_empty_gt=False,
classes=class_names,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='Depth')),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'scannet_infos_val.pkl',
pipeline=test_pipeline,
classes=class_names,
test_mode=True,
box_type_3d='Depth'),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'scannet_infos_val.pkl',
pipeline=test_pipeline,
classes=class_names,
test_mode=True,
box_type_3d='Depth'))
# optimizer
lr = 0.006
optimizer = dict(
lr=lr,
weight_decay=0.0005,
paramwise_cfg=dict(
custom_keys={
'bbox_head.decoder_layers': dict(lr_mult=0.1, decay_mult=1.0),
'bbox_head.decoder_self_posembeds': dict(
lr_mult=0.1, decay_mult=1.0),
'bbox_head.decoder_cross_posembeds': dict(
lr_mult=0.1, decay_mult=1.0),
'bbox_head.decoder_query_proj': dict(lr_mult=0.1, decay_mult=1.0),
'bbox_head.decoder_key_proj': dict(lr_mult=0.1, decay_mult=1.0)
}))
optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))
lr_config = dict(policy='step', warmup=None, step=[280, 340])
# runtime settings
runner = dict(type='EpochBasedRunner', max_epochs=400)
# yapf:disable
log_config = dict(
interval=30,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
# yapf:enable
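The four ScanNet configs above share the same _base_ files and differ only in decoder depth, proposal count and backbone width. A compact summary of the effective settings, in the order the configs appear (a reading aid, not part of the commit):

# Reading aid only: effective settings per config, relative to the shared base.
scannet_groupfree3d_variants = [
    dict(num_decoder_layers=12, num_proposal=256, backbone='base width'),
    dict(num_decoder_layers=6, num_proposal=256, backbone='base width'),
    dict(num_decoder_layers=12, num_proposal=256, backbone='2x width (sa_channels doubled)'),
    dict(num_decoder_layers=12, num_proposal=512, backbone='2x width (sa_channels doubled)'),
]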
@@ -2,9 +2,10 @@ from mmdet.core.bbox import build_bbox_coder
from .anchor_free_bbox_coder import AnchorFreeBBoxCoder
from .centerpoint_bbox_coders import CenterPointBBoxCoder
from .delta_xyzwhlr_bbox_coder import DeltaXYZWLHRBBoxCoder
from .groupfree3d_bbox_coder import GroupFree3DBBoxCoder
from .partial_bin_based_bbox_coder import PartialBinBasedBBoxCoder
__all__ = [
    'build_bbox_coder', 'DeltaXYZWLHRBBoxCoder', 'PartialBinBasedBBoxCoder',
    'CenterPointBBoxCoder', 'AnchorFreeBBoxCoder', 'GroupFree3DBBoxCoder'
]
import numpy as np
import torch
from mmdet.core.bbox.builder import BBOX_CODERS
from .partial_bin_based_bbox_coder import PartialBinBasedBBoxCoder
@BBOX_CODERS.register_module()
class GroupFree3DBBoxCoder(PartialBinBasedBBoxCoder):
"""Modified partial bin based bbox coder for GroupFree3D.
Args:
num_dir_bins (int): Number of bins to encode direction angle.
num_sizes (int): Number of size clusters.
mean_sizes (list[list[float]]): Mean size of bboxes in each class.
with_rot (bool): Whether the bbox is with rotation. Defaults to True.
size_cls_agnostic (bool): Whether the predicted size is class-agnostic.
Defaults to True.
"""
def __init__(self,
num_dir_bins,
num_sizes,
mean_sizes,
with_rot=True,
size_cls_agnostic=True):
super(GroupFree3DBBoxCoder, self).__init__(
num_dir_bins=num_dir_bins,
num_sizes=num_sizes,
mean_sizes=mean_sizes,
with_rot=with_rot)
self.size_cls_agnostic = size_cls_agnostic
def encode(self, gt_bboxes_3d, gt_labels_3d):
"""Encode ground truth to prediction targets.
Args:
gt_bboxes_3d (BaseInstance3DBoxes): Ground truth bboxes \
with shape (n, 7).
gt_labels_3d (torch.Tensor): Ground truth classes.
Returns:
tuple: Targets of center, size and direction.
"""
# generate center target
center_target = gt_bboxes_3d.gravity_center
# generate bbox size target
size_target = gt_bboxes_3d.dims
size_class_target = gt_labels_3d
size_res_target = gt_bboxes_3d.dims - gt_bboxes_3d.tensor.new_tensor(
self.mean_sizes)[size_class_target]
# generate dir target
box_num = gt_labels_3d.shape[0]
if self.with_rot:
(dir_class_target,
dir_res_target) = self.angle2class(gt_bboxes_3d.yaw)
else:
dir_class_target = gt_labels_3d.new_zeros(box_num)
dir_res_target = gt_bboxes_3d.tensor.new_zeros(box_num)
return (center_target, size_target, size_class_target, size_res_target,
dir_class_target, dir_res_target)
def decode(self, bbox_out, prefix=''):
"""Decode predicted parts to bbox3d.
Args:
bbox_out (dict): Predictions from model, should contain keys below.
- center: predicted bottom center of bboxes.
- dir_class: predicted bbox direction class.
- dir_res: predicted bbox direction residual.
- size_class: predicted bbox size class.
- size_res: predicted bbox size residual.
- size: predicted class-agnostic bbox size
prefix (str): Decode predictions with specific prefix.
Defaults to ''.
Returns:
torch.Tensor: Decoded bbox3d with shape (batch, n, 7).
"""
center = bbox_out[f'{prefix}center']
batch_size, num_proposal = center.shape[:2]
# decode heading angle
if self.with_rot:
dir_class = torch.argmax(bbox_out[f'{prefix}dir_class'], -1)
dir_res = torch.gather(bbox_out[f'{prefix}dir_res'], 2,
dir_class.unsqueeze(-1))
dir_res.squeeze_(2)
dir_angle = self.class2angle(dir_class, dir_res).reshape(
batch_size, num_proposal, 1)
else:
dir_angle = center.new_zeros(batch_size, num_proposal, 1)
# decode bbox size
if self.size_cls_agnostic:
bbox_size = bbox_out[f'{prefix}size'].reshape(
batch_size, num_proposal, 3)
else:
size_class = torch.argmax(
bbox_out[f'{prefix}size_class'], -1, keepdim=True)
size_res = torch.gather(
bbox_out[f'{prefix}size_res'], 2,
size_class.unsqueeze(-1).repeat(1, 1, 1, 3))
mean_sizes = center.new_tensor(self.mean_sizes)
size_base = torch.index_select(mean_sizes, 0,
size_class.reshape(-1))
bbox_size = size_base.reshape(batch_size, num_proposal,
-1) + size_res.squeeze(2)
bbox3d = torch.cat([center, bbox_size, dir_angle], dim=-1)
return bbox3d
def split_pred(self, cls_preds, reg_preds, base_xyz, prefix=''):
"""Split predicted features to specific parts.
Args:
cls_preds (torch.Tensor): Class predicted features to split.
reg_preds (torch.Tensor): Regression predicted features to split.
base_xyz (torch.Tensor): Coordinates of points.
prefix (str): Decode predictions with specific prefix.
Defaults to ''.
Returns:
dict[str, torch.Tensor]: Split results.
"""
results = {}
start, end = 0, 0
cls_preds_trans = cls_preds.transpose(2, 1)
reg_preds_trans = reg_preds.transpose(2, 1)
# decode center
end += 3
# (batch_size, num_proposal, 3)
results[f'{prefix}center_residual'] = \
reg_preds_trans[..., start:end].contiguous()
results[f'{prefix}center'] = base_xyz + \
reg_preds_trans[..., start:end].contiguous()
start = end
# decode direction
end += self.num_dir_bins
results[f'{prefix}dir_class'] = \
reg_preds_trans[..., start:end].contiguous()
start = end
end += self.num_dir_bins
dir_res_norm = reg_preds_trans[..., start:end].contiguous()
start = end
results[f'{prefix}dir_res_norm'] = dir_res_norm
results[f'{prefix}dir_res'] = dir_res_norm * (
np.pi / self.num_dir_bins)
# decode size
if self.size_cls_agnostic:
end += 3
results[f'{prefix}size'] = \
reg_preds_trans[..., start:end].contiguous()
else:
end += self.num_sizes
results[f'{prefix}size_class'] = reg_preds_trans[
..., start:end].contiguous()
start = end
end += self.num_sizes * 3
size_res_norm = reg_preds_trans[..., start:end]
batch_size, num_proposal = reg_preds_trans.shape[:2]
size_res_norm = size_res_norm.view(
[batch_size, num_proposal, self.num_sizes, 3])
start = end
results[f'{prefix}size_res_norm'] = size_res_norm.contiguous()
mean_sizes = reg_preds.new_tensor(self.mean_sizes)
results[f'{prefix}size_res'] = (
size_res_norm * mean_sizes.unsqueeze(0).unsqueeze(0))
# decode objectness score
# Group-Free-3D objectness output shape (batch, proposal, 1)
results[f'{prefix}obj_scores'] = cls_preds_trans[..., :1].contiguous()
# decode semantic score
results[f'{prefix}sem_scores'] = cls_preds_trans[..., 1:].contiguous()
return results
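A minimal usage sketch for the coder above (illustrative only, not part of the commit): it instantiates the coder directly with the same hyper-parameters as the ScanNet configs, feeds random class/regression features through split_pred and decodes them into (batch, num_proposal, 7) boxes.

# Illustrative sketch: exercise split_pred + decode with random inputs.
# Channel layout of reg_preds follows split_pred above:
#   3 (center) + 1 (dir class) + 1 (dir res) + 18 (size class) + 18 * 3 (size res) = 77
import torch
from mmdet3d.core.bbox.coders import GroupFree3DBBoxCoder

coder = GroupFree3DBBoxCoder(
    num_dir_bins=1,
    num_sizes=18,
    mean_sizes=[[1.0, 1.0, 1.0]] * 18,  # placeholder sizes; see configs above
    with_rot=False,
    size_cls_agnostic=False)

batch, num_proposal = 2, 256
cls_preds = torch.rand(batch, 1 + 18, num_proposal)  # objectness + semantics
reg_preds = torch.rand(batch, 77, num_proposal)
base_xyz = torch.rand(batch, num_proposal, 3)

preds = coder.split_pred(cls_preds, reg_preds, base_xyz, prefix='s5.')
boxes = coder.decode(preds, prefix='s5.')
print(boxes.shape)  # torch.Size([2, 256, 7])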
@@ -5,6 +5,7 @@ from .base_mono3d_dense_head import BaseMono3DDenseHead
from .centerpoint_head import CenterHead
from .fcos_mono3d_head import FCOSMono3DHead
from .free_anchor3d_head import FreeAnchor3DHead
from .groupfree3d_head import GroupFree3DHead
from .parta2_rpn_head import PartA2RPNHead
from .shape_aware_head import ShapeAwareHead
from .ssd_3d_head import SSD3DHead
@@ -13,5 +14,6 @@ from .vote_head import VoteHead
__all__ = [
    'Anchor3DHead', 'FreeAnchor3DHead', 'PartA2RPNHead', 'VoteHead',
    'SSD3DHead', 'BaseConvBboxHead', 'CenterHead', 'ShapeAwareHead',
    'BaseMono3DDenseHead', 'AnchorFreeMono3DHead', 'FCOSMono3DHead',
    'GroupFree3DHead'
]
@@ -2,6 +2,7 @@ from .base import Base3DDetector
from .centerpoint import CenterPoint
from .dynamic_voxelnet import DynamicVoxelNet
from .fcos_mono3d import FCOSMono3D
from .groupfree3dnet import GroupFree3DNet
from .h3dnet import H3DNet
from .imvotenet import ImVoteNet
from .imvoxelnet import ImVoxelNet
@@ -17,5 +18,5 @@ __all__ = [
    'Base3DDetector', 'VoxelNet', 'DynamicVoxelNet', 'MVXTwoStageDetector',
    'DynamicMVXFasterRCNN', 'MVXFasterRCNN', 'PartA2', 'VoteNet', 'H3DNet',
    'CenterPoint', 'SSD3DNet', 'ImVoteNet', 'SingleStageMono3DDetector',
    'FCOSMono3D', 'ImVoxelNet', 'GroupFree3DNet'
]
import torch
from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d
from mmdet.models import DETECTORS
from .single_stage import SingleStage3DDetector
@DETECTORS.register_module()
class GroupFree3DNet(SingleStage3DDetector):
"""`Group-Free 3D <https://arxiv.org/abs/2104.00678>`_."""
def __init__(self,
backbone,
bbox_head=None,
train_cfg=None,
test_cfg=None,
pretrained=None):
super(GroupFree3DNet, self).__init__(
backbone=backbone,
bbox_head=bbox_head,
train_cfg=train_cfg,
test_cfg=test_cfg,
pretrained=pretrained)
def forward_train(self,
points,
img_metas,
gt_bboxes_3d,
gt_labels_3d,
pts_semantic_mask=None,
pts_instance_mask=None,
gt_bboxes_ignore=None):
"""Forward of training.
Args:
points (list[torch.Tensor]): Points of each batch.
img_metas (list): Image metas.
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): gt bboxes of each batch.
gt_labels_3d (list[torch.Tensor]): gt class labels of each batch.
pts_semantic_mask (None | list[torch.Tensor]): point-wise semantic
label of each batch.
pts_instance_mask (None | list[torch.Tensor]): point-wise instance
label of each batch.
gt_bboxes_ignore (None | list[torch.Tensor]): Specify
    which bounding boxes to ignore.
Returns:
dict[str: torch.Tensor]: Losses.
"""
# TODO: refactor votenet series to reduce redundant codes.
points_cat = torch.stack(points)
x = self.extract_feat(points_cat)
bbox_preds = self.bbox_head(x, self.train_cfg.sample_mod)
loss_inputs = (points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask,
pts_instance_mask, img_metas)
losses = self.bbox_head.loss(
bbox_preds, *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
return losses
def simple_test(self, points, img_metas, imgs=None, rescale=False):
"""Forward of testing.
Args:
points (list[torch.Tensor]): Points of each sample.
img_metas (list): Image metas.
rescale (bool): Whether to rescale results.
Returns:
list: Predicted 3d boxes.
"""
points_cat = torch.stack(points)
x = self.extract_feat(points_cat)
bbox_preds = self.bbox_head(x, self.test_cfg.sample_mod)
bbox_list = self.bbox_head.get_bboxes(
points_cat, bbox_preds, img_metas, rescale=rescale)
bbox_results = [
bbox3d2result(bboxes, scores, labels)
for bboxes, scores, labels in bbox_list
]
return bbox_results
def aug_test(self, points, img_metas, imgs=None, rescale=False):
"""Test with augmentation."""
points_cat = [torch.stack(pts) for pts in points]
feats = self.extract_feats(points_cat, img_metas)
# only support aug_test for one sample
aug_bboxes = []
for x, pts_cat, img_meta in zip(feats, points_cat, img_metas):
bbox_preds = self.bbox_head(x, self.test_cfg.sample_mod)
bbox_list = self.bbox_head.get_bboxes(
pts_cat, bbox_preds, img_meta, rescale=rescale)
bbox_list = [
dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels)
for bboxes, scores, labels in bbox_list
]
aug_bboxes.append(bbox_list[0])
# after merging, bboxes will be rescaled to the original image size
merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, img_metas,
self.bbox_head.test_cfg)
return [merged_bboxes]
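A minimal inference sketch for the detector above (illustrative only, not part of the commit). It assumes a model built from one of the configs, moved to GPU (the PointNet++ ops require CUDA), and mirrors the simple_test path.

# Illustrative sketch: single-sample inference through simple_test.
import torch
from mmdet3d.core.bbox import DepthInstance3DBoxes

model.cuda().eval()  # `model` built as in the earlier build_detector sketch
points = [torch.rand(50000, 3, device='cuda')]
img_metas = [dict(box_type_3d=DepthInstance3DBoxes)]
with torch.no_grad():
    results = model.simple_test(points, img_metas)
# results[0]['boxes_3d']: DepthInstance3DBoxes with (N, 7) box tensors,
# results[0]['scores_3d'] / results[0]['labels_3d']: per-box scores / labels.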
from .transformer import GroupFree3DMHA
from .vote_module import VoteModule
__all__ = ['VoteModule', 'GroupFree3DMHA']
from mmcv.cnn.bricks.registry import ATTENTION
from mmcv.cnn.bricks.transformer import POSITIONAL_ENCODING, MultiheadAttention
from torch import nn as nn
@ATTENTION.register_module()
class GroupFree3DMHA(MultiheadAttention):
"""A warpper for torch.nn.MultiheadAttention for GroupFree3D.
This module implements MultiheadAttention with identity connection,
and positional encoding used in DETR is also passed as input.
Args:
embed_dims (int): The embedding dimension.
num_heads (int): Parallel attention heads. Same as
`nn.MultiheadAttention`.
attn_drop (float): A Dropout layer on attn_output_weights. Default 0.0.
proj_drop (float): A Dropout layer. Default 0.0.
dropout_layer (obj:`ConfigDict`): The dropout_layer used
when adding the shortcut.
init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
Default: None.
batch_first (bool): If True, Key, Query and Value are of shape
    (batch, n, embed_dim); otherwise (n, batch, embed_dim).
    Defaults to False.
"""
def __init__(self,
embed_dims,
num_heads,
attn_drop=0.,
proj_drop=0.,
dropout_layer=dict(type='Dropout', drop_prob=0.),
init_cfg=None,
batch_first=False,
**kwargs):
super().__init__(embed_dims, num_heads, attn_drop, proj_drop,
dropout_layer, init_cfg, batch_first, **kwargs)
def forward(self,
query,
key,
value,
identity,
query_pos=None,
key_pos=None,
attn_mask=None,
key_padding_mask=None,
**kwargs):
"""Forward function for `GroupFree3DMHA`.
**kwargs allow passing a more general data flow when combining
with other operations in `transformerlayer`.
Args:
query (Tensor): The input query with shape [num_queries, bs,
embed_dims]. Same in `nn.MultiheadAttention.forward`.
key (Tensor): The key tensor with shape [num_keys, bs,
embed_dims]. Same in `nn.MultiheadAttention.forward`.
If None, the ``query`` will be used. Defaults to None.
value (Tensor): The value tensor with same shape as `key`.
Same in `nn.MultiheadAttention.forward`. Defaults to None.
If None, the `key` will be used.
identity (Tensor): The tensor used for the identity link, with the
    same shape as `query`. If None, `query` will be used.
    Defaults to None.
query_pos (Tensor): The positional encoding for `query`, with the
    same shape as `query`. If not None, it will be added to
    `query` before the forward function. Defaults to None.
key_pos (Tensor): The positional encoding for `key`, with the
same shape as `key`. Defaults to None. If not None, it will
be added to `key` before forward function. If None, and
`query_pos` has the same shape as `key`, then `query_pos`
will be used for `key_pos`. Defaults to None.
attn_mask (Tensor): ByteTensor mask with shape [num_queries,
num_keys]. Same in `nn.MultiheadAttention.forward`.
Defaults to None.
key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys].
Same in `nn.MultiheadAttention.forward`. Defaults to None.
Returns:
Tensor: forwarded results with shape [num_queries, bs, embed_dims].
"""
if hasattr(self, 'operation_name'):
if self.operation_name == 'self_attn':
value = value + query_pos
elif self.operation_name == 'cross_attn':
value = value + key_pos
else:
raise NotImplementedError(
f'{self.__class__.__name__} '
f"can't be used as {self.operation_name}")
else:
value = value + query_pos
return super(GroupFree3DMHA, self).forward(
query=query,
key=key,
value=value,
identity=identity,
query_pos=query_pos,
key_pos=key_pos,
attn_mask=attn_mask,
key_padding_mask=key_padding_mask,
**kwargs)
@POSITIONAL_ENCODING.register_module()
class ConvBNPositionalEncoding(nn.Module):
"""Absolute position embedding with Conv learning.
Args:
input_channel (int): input features dim.
num_pos_feats (int): output position features dim.
Defaults to 288 to be consistent with seed features dim.
"""
def __init__(self, input_channel, num_pos_feats=288):
super().__init__()
self.position_embedding_head = nn.Sequential(
nn.Conv1d(input_channel, num_pos_feats, kernel_size=1),
nn.BatchNorm1d(num_pos_feats), nn.ReLU(inplace=True),
nn.Conv1d(num_pos_feats, num_pos_feats, kernel_size=1))
def forward(self, xyz):
"""Forward pass.
Args:
xyz (Tensor): (B, N, 3) the coordinates to embed.
Returns:
Tensor: (B, num_pos_feats, N) the embedded position features.
"""
xyz = xyz.permute(0, 2, 1)
position_embedding = self.position_embedding_head(xyz)
return position_embedding
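A quick shape check for ConvBNPositionalEncoding (illustrative only, not part of the commit; the import path assumes the module lives in mmdet3d/models/model_utils/transformer.py, as the model_utils __init__ change above suggests).

# Illustrative sketch: embed 256 proposal coordinates into 288-d features.
import torch
from mmdet3d.models.model_utils.transformer import ConvBNPositionalEncoding

posembed = ConvBNPositionalEncoding(input_channel=3, num_pos_feats=288)
xyz = torch.rand(2, 256, 3)  # (B, N, 3) coordinates
feats = posembed(xyz)
print(feats.shape)  # torch.Size([2, 288, 256])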
@@ -379,6 +379,59 @@ def test_fcos3d():
    assert attrs_3d.shape[0] >= 0
def test_groupfree3dnet():
if not torch.cuda.is_available():
pytest.skip('test requires GPU and torch+cuda')
_setup_seed(0)
groupfree3d_cfg = _get_detector_cfg(
'groupfree3d/groupfree3d_8x8_scannet-3d-18class-L6-O256.py')
self = build_detector(groupfree3d_cfg).cuda()
points_0 = torch.rand([50000, 3], device='cuda')
points_1 = torch.rand([50000, 3], device='cuda')
points = [points_0, points_1]
img_meta_0 = dict(box_type_3d=DepthInstance3DBoxes)
img_meta_1 = dict(box_type_3d=DepthInstance3DBoxes)
img_metas = [img_meta_0, img_meta_1]
gt_bbox_0 = DepthInstance3DBoxes(torch.rand([10, 7], device='cuda'))
gt_bbox_1 = DepthInstance3DBoxes(torch.rand([10, 7], device='cuda'))
gt_bboxes = [gt_bbox_0, gt_bbox_1]
gt_labels_0 = torch.randint(0, 18, [10], device='cuda')
gt_labels_1 = torch.randint(0, 18, [10], device='cuda')
gt_labels = [gt_labels_0, gt_labels_1]
pts_instance_mask_1 = torch.randint(0, 10, [50000], device='cuda')
pts_instance_mask_2 = torch.randint(0, 10, [50000], device='cuda')
pts_instance_mask = [pts_instance_mask_1, pts_instance_mask_2]
pts_semantic_mask_1 = torch.randint(0, 19, [50000], device='cuda')
pts_semantic_mask_2 = torch.randint(0, 19, [50000], device='cuda')
pts_semantic_mask = [pts_semantic_mask_1, pts_semantic_mask_2]
# test forward_train
losses = self.forward_train(points, img_metas, gt_bboxes, gt_labels,
pts_semantic_mask, pts_instance_mask)
assert losses['sampling_objectness_loss'] >= 0
assert losses['s5.objectness_loss'] >= 0
assert losses['s5.semantic_loss'] >= 0
assert losses['s5.center_loss'] >= 0
assert losses['s5.dir_class_loss'] >= 0
assert losses['s5.dir_res_loss'] >= 0
assert losses['s5.size_class_loss'] >= 0
assert losses['s5.size_res_loss'] >= 0
# test simple_test
with torch.no_grad():
results = self.simple_test(points, img_metas)
boxes_3d = results[0]['boxes_3d']
scores_3d = results[0]['scores_3d']
labels_3d = results[0]['labels_3d']
assert boxes_3d.tensor.shape[0] >= 0
assert boxes_3d.tensor.shape[1] == 7
assert scores_3d.shape[0] >= 0
assert labels_3d.shape[0] >= 0
def test_imvoxelnet():
    if not torch.cuda.is_available():
        pytest.skip('test requires GPU and torch+cuda')
...
@@ -1114,3 +1114,110 @@ def test_fcos_mono3d_head():
    assert results[0][1].shape == torch.Size([200])
    assert results[0][2].shape == torch.Size([200])
    assert results[0][3].shape == torch.Size([200])
def test_groupfree3d_head():
if not torch.cuda.is_available():
pytest.skip('test requires GPU and torch+cuda')
_setup_seed(0)
vote_head_cfg = _get_vote_head_cfg(
'groupfree3d/groupfree3d_8x8_scannet-3d-18class-L6-O256.py')
self = build_head(vote_head_cfg).cuda()
fp_xyz = [torch.rand([2, 256, 3], dtype=torch.float32).cuda()]
fp_features = [torch.rand([2, 288, 256], dtype=torch.float32).cuda()]
fp_indices = [torch.randint(0, 128, [2, 256]).cuda()]
input_dict = dict(
fp_xyz=fp_xyz, fp_features=fp_features, fp_indices=fp_indices)
# test forward
ret_dict = self(input_dict, 'kps')
assert ret_dict['seeds_obj_cls_logits'].shape == torch.Size([2, 1, 256])
assert ret_dict['s5.center'].shape == torch.Size([2, 128, 3])
assert ret_dict['s5.dir_class'].shape == torch.Size([2, 128, 1])
assert ret_dict['s5.dir_res'].shape == torch.Size([2, 128, 1])
assert ret_dict['s5.size_class'].shape == torch.Size([2, 128, 18])
assert ret_dict['s5.size_res'].shape == torch.Size([2, 128, 18, 3])
assert ret_dict['s5.obj_scores'].shape == torch.Size([2, 128, 1])
assert ret_dict['s5.sem_scores'].shape == torch.Size([2, 128, 18])
# test losses
points = [torch.rand([50000, 4], device='cuda') for i in range(2)]
gt_bbox1 = torch.rand([10, 7], dtype=torch.float32).cuda()
gt_bbox2 = torch.rand([10, 7], dtype=torch.float32).cuda()
gt_bbox1 = DepthInstance3DBoxes(gt_bbox1)
gt_bbox2 = DepthInstance3DBoxes(gt_bbox2)
gt_bboxes = [gt_bbox1, gt_bbox2]
pts_instance_mask_1 = torch.randint(0, 10, [50000], device='cuda')
pts_instance_mask_2 = torch.randint(0, 10, [50000], device='cuda')
pts_instance_mask = [pts_instance_mask_1, pts_instance_mask_2]
pts_semantic_mask_1 = torch.randint(0, 19, [50000], device='cuda')
pts_semantic_mask_2 = torch.randint(0, 19, [50000], device='cuda')
pts_semantic_mask = [pts_semantic_mask_1, pts_semantic_mask_2]
labels_1 = torch.randint(0, 18, [10], device='cuda')
labels_2 = torch.randint(0, 18, [10], device='cuda')
gt_labels = [labels_1, labels_2]
losses = self.loss(ret_dict, points, gt_bboxes, gt_labels,
pts_semantic_mask, pts_instance_mask)
assert losses['s5.objectness_loss'] >= 0
assert losses['s5.semantic_loss'] >= 0
assert losses['s5.center_loss'] >= 0
assert losses['s5.dir_class_loss'] >= 0
assert losses['s5.dir_res_loss'] >= 0
assert losses['s5.size_class_loss'] >= 0
assert losses['s5.size_res_loss'] >= 0
# test multiclass_nms_single
obj_scores = torch.rand([256], device='cuda')
sem_scores = torch.rand([256, 18], device='cuda')
points = torch.rand([50000, 3], device='cuda')
bbox = torch.rand([256, 7], device='cuda')
input_meta = dict(box_type_3d=DepthInstance3DBoxes)
bbox_selected, score_selected, labels = \
self.multiclass_nms_single(obj_scores,
sem_scores,
bbox,
points,
input_meta)
assert bbox_selected.shape[0] >= 0
assert bbox_selected.shape[1] == 7
assert score_selected.shape[0] >= 0
assert labels.shape[0] >= 0
# test get_boxes
points = torch.rand([1, 50000, 3], device='cuda')
seed_points = torch.rand([1, 1024, 3], device='cuda')
seed_indices = torch.randint(0, 50000, [1, 1024], device='cuda')
obj_scores = torch.rand([1, 256, 1], device='cuda')
center = torch.rand([1, 256, 3], device='cuda')
dir_class = torch.rand([1, 256, 1], device='cuda')
dir_res_norm = torch.rand([1, 256, 1], device='cuda')
dir_res = torch.rand([1, 256, 1], device='cuda')
size_class = torch.rand([1, 256, 18], device='cuda')
size_res = torch.rand([1, 256, 18, 3], device='cuda')
sem_scores = torch.rand([1, 256, 18], device='cuda')
bbox_preds = dict()
bbox_preds['seed_points'] = seed_points
bbox_preds['seed_indices'] = seed_indices
bbox_preds['s5.obj_scores'] = obj_scores
bbox_preds['s5.center'] = center
bbox_preds['s5.dir_class'] = dir_class
bbox_preds['s5.dir_res_norm'] = dir_res_norm
bbox_preds['s5.dir_res'] = dir_res
bbox_preds['s5.size_class'] = size_class
bbox_preds['s5.size_res'] = size_res
bbox_preds['s5.sem_scores'] = sem_scores
self.test_cfg['prediction_stages'] = 'last'
results = self.get_bboxes(points, bbox_preds, [input_meta])
assert results[0][0].tensor.shape[0] >= 0
assert results[0][0].tensor.shape[1] == 7
assert results[0][1].shape[0] >= 0
assert results[0][2].shape[0] >= 0