Commit f63a62b8 authored by zhangshilong, committed by ChaimZhu

[Refactor] ImVoteNet

parent edb6b369
...@@ -2,6 +2,9 @@ dataset_type = 'SUNRGBDDataset' ...@@ -2,6 +2,9 @@ dataset_type = 'SUNRGBDDataset'
data_root = 'data/sunrgbd/' data_root = 'data/sunrgbd/'
class_names = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser', class_names = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser',
'night_stand', 'bookshelf', 'bathtub') 'night_stand', 'bookshelf', 'bathtub')
metainfo = dict(CLASSES=class_names)
train_pipeline = [ train_pipeline = [
dict( dict(
type='LoadPointsFromFile', type='LoadPointsFromFile',
...@@ -21,8 +24,9 @@ train_pipeline = [ ...@@ -21,8 +24,9 @@ train_pipeline = [
scale_ratio_range=[0.85, 1.15], scale_ratio_range=[0.85, 1.15],
shift_height=True), shift_height=True),
dict(type='PointSample', num_points=20000), dict(type='PointSample', num_points=20000),
dict(type='DefaultFormatBundle3D', class_names=class_names), dict(
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) type='Pack3DDetInputs',
keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
] ]
test_pipeline = [ test_pipeline = [
dict( dict(
...@@ -47,61 +51,52 @@ test_pipeline = [ ...@@ -47,61 +51,52 @@ test_pipeline = [
sync_2d=False, sync_2d=False,
flip_ratio_bev_horizontal=0.5, flip_ratio_bev_horizontal=0.5,
), ),
dict(type='PointSample', num_points=20000), dict(type='PointSample', num_points=20000)
dict( ]),
type='DefaultFormatBundle3D', dict(type='Pack3DDetInputs', keys=['points'])
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
# construct a pipeline for data and gt loading in show function
# please keep its loading function consistent with test_pipeline (e.g. client)
eval_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='DEPTH',
shift_height=False,
load_dim=6,
use_dim=[0, 1, 2]),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
] ]
data = dict( train_dataloader = dict(
samples_per_gpu=16, batch_size=16,
workers_per_gpu=4, num_workers=4,
train=dict( sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='RepeatDataset', type='RepeatDataset',
times=5, times=5,
dataset=dict( dataset=dict(
type=dataset_type, type=dataset_type,
data_root=data_root, data_root=data_root,
ann_file=data_root + 'sunrgbd_infos_train.pkl', ann_file='sunrgbd_infos_train.pkl',
pipeline=train_pipeline, pipeline=train_pipeline,
classes=class_names,
filter_empty_gt=False, filter_empty_gt=False,
metainfo=metainfo,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset. # and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='Depth')), box_type_3d='Depth')))
val=dict(
val_dataloader = dict(
batch_size=1,
num_workers=1,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type, type=dataset_type,
data_root=data_root, data_root=data_root,
ann_file=data_root + 'sunrgbd_infos_val.pkl', ann_file='sunrgbd_infos_val.pkl',
pipeline=test_pipeline, pipeline=test_pipeline,
classes=class_names, metainfo=metainfo,
test_mode=True, test_mode=True,
box_type_3d='Depth'), box_type_3d='Depth'))
test=dict( test_dataloader = dict(
batch_size=1,
num_workers=1,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type, type=dataset_type,
data_root=data_root, data_root=data_root,
ann_file=data_root + 'sunrgbd_infos_val.pkl', ann_file='sunrgbd_infos_val.pkl',
pipeline=test_pipeline, pipeline=test_pipeline,
classes=class_names, metainfo=metainfo,
test_mode=True, test_mode=True,
box_type_3d='Depth')) box_type_3d='Depth'))
val_evaluator = dict(type='IndoorMetric')
evaluation = dict(pipeline=eval_pipeline) test_evaluator = val_evaluator
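For orientation, the dataset part of this refactor replaces the single `data` dict with per-split dataloaders and moves evaluation into standalone evaluators, following the MMEngine convention. A minimal sketch of the mapping (illustrative, values copied from the config above):

# data.samples_per_gpu -> train_dataloader.batch_size
# data.workers_per_gpu -> train_dataloader.num_workers
# data.train/val/test  -> train_dataloader / val_dataloader / test_dataloader
# evaluation=dict(...) -> val_evaluator / test_evaluator
train_dataloader = dict(
    batch_size=16,
    num_workers=4,
    sampler=dict(type='DefaultSampler', shuffle=True),
    dataset=dict(
        type='SUNRGBDDataset',
        data_root='data/sunrgbd/',
        ann_file='sunrgbd_infos_train.pkl',  # now relative to data_root
        pipeline=train_pipeline,
        metainfo=dict(CLASSES=class_names)))
val_evaluator = dict(type='IndoorMetric')
test_evaluator = val_evaluator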
model = dict( model = dict(
type='ImVoteNet', type='ImVoteNet',
data_preprocessor=dict(
type='Det3DDataPreprocessor',
# use caffe img_norm
mean=[103.530, 116.280, 123.675],
std=[1.0, 1.0, 1.0],
bgr_to_rgb=False,
pad_size_divisor=32),
img_backbone=dict( img_backbone=dict(
type='ResNet', type='mmdet.ResNet',
depth=50, depth=50,
num_stages=4, num_stages=4,
out_indices=(0, 1, 2, 3), out_indices=(0, 1, 2, 3),
...@@ -10,11 +17,12 @@ model = dict( ...@@ -10,11 +17,12 @@ model = dict(
norm_eval=True, norm_eval=True,
style='caffe'), style='caffe'),
img_neck=dict( img_neck=dict(
type='FPN', type='mmdet.FPN',
in_channels=[256, 512, 1024, 2048], in_channels=[256, 512, 1024, 2048],
out_channels=256, out_channels=256,
num_outs=5), num_outs=5),
img_rpn_head=dict( img_rpn_head=dict(
_scope_='mmdet',
type='RPNHead', type='RPNHead',
in_channels=256, in_channels=256,
feat_channels=256, feat_channels=256,
...@@ -31,6 +39,7 @@ model = dict( ...@@ -31,6 +39,7 @@ model = dict(
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
loss_bbox=dict(type='L1Loss', loss_weight=1.0)), loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
img_roi_head=dict( img_roi_head=dict(
_scope_='mmdet',
type='StandardRoIHead', type='StandardRoIHead',
bbox_roi_extractor=dict( bbox_roi_extractor=dict(
type='SingleRoIExtractor', type='SingleRoIExtractor',
...@@ -56,6 +65,7 @@ model = dict( ...@@ -56,6 +65,7 @@ model = dict(
train_cfg=dict( train_cfg=dict(
img_rpn=dict( img_rpn=dict(
assigner=dict( assigner=dict(
_scope_='mmdet',
type='MaxIoUAssigner', type='MaxIoUAssigner',
pos_iou_thr=0.7, pos_iou_thr=0.7,
neg_iou_thr=0.3, neg_iou_thr=0.3,
...@@ -63,7 +73,7 @@ model = dict( ...@@ -63,7 +73,7 @@ model = dict(
match_low_quality=True, match_low_quality=True,
ignore_iof_thr=-1), ignore_iof_thr=-1),
sampler=dict( sampler=dict(
type='RandomSampler', type='mmdet.RandomSampler',
num=256, num=256,
pos_fraction=0.5, pos_fraction=0.5,
neg_pos_ub=-1, neg_pos_ub=-1,
...@@ -80,6 +90,7 @@ model = dict( ...@@ -80,6 +90,7 @@ model = dict(
min_bbox_size=0), min_bbox_size=0),
img_rcnn=dict( img_rcnn=dict(
assigner=dict( assigner=dict(
_scope_='mmdet',
type='MaxIoUAssigner', type='MaxIoUAssigner',
pos_iou_thr=0.5, pos_iou_thr=0.5,
neg_iou_thr=0.5, neg_iou_thr=0.5,
...@@ -87,7 +98,7 @@ model = dict( ...@@ -87,7 +98,7 @@ model = dict(
match_low_quality=False, match_low_quality=False,
ignore_iof_thr=-1), ignore_iof_thr=-1),
sampler=dict( sampler=dict(
type='RandomSampler', type='mmdet.RandomSampler',
num=512, num=512,
pos_fraction=0.25, pos_fraction=0.25,
neg_pos_ub=-1, neg_pos_ub=-1,
......
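One pattern worth noting in the model config above: modules that live in MMDetection are now resolved across registries, either by prefixing the type name or by setting an explicit `_scope_`. A minimal sketch of the two forms used in this commit (illustrative):

# dotted prefix on the type name
img_neck = dict(type='mmdet.FPN', in_channels=[256, 512, 1024, 2048],
                out_channels=256, num_outs=5)
# or an explicit scope key on the whole dict
img_rpn_head = dict(_scope_='mmdet', type='RPNHead', in_channels=256,
                    feat_channels=256)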
...@@ -3,56 +3,71 @@ _base_ = [ ...@@ -3,56 +3,71 @@ _base_ = [
'../_base_/models/imvotenet_image.py' '../_base_/models/imvotenet_image.py'
] ]
# use caffe img_norm
img_norm_cfg = dict(
mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
train_pipeline = [ train_pipeline = [
dict(type='LoadImageFromFile'), dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', with_bbox=True),
dict( dict(
type='Resize', type='LoadAnnotations3D',
img_scale=[(1333, 480), (1333, 504), (1333, 528), (1333, 552), with_bbox=True,
with_label=True,
with_bbox_3d=False,
with_label_3d=False),
dict(
type='RandomChoiceResize',
scales=[(1333, 480), (1333, 504), (1333, 528), (1333, 552),
(1333, 576), (1333, 600)], (1333, 576), (1333, 600)],
multiscale_mode='value',
keep_ratio=True), keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.5), dict(type='RandomFlip', prob=0.5),
dict(type='Normalize', **img_norm_cfg), dict(
dict(type='Pad', size_divisor=32), type='Pack3DDetInputs', keys=['img', 'gt_bboxes', 'gt_bboxes_labels']),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
] ]
test_pipeline = [ test_pipeline = [
dict(type='LoadImageFromFile'), dict(type='LoadImageFromFile'),
# online evaluation
dict(
type='LoadAnnotations3D',
with_bbox=True,
with_label=True,
with_bbox_3d=False,
with_label_3d=False),
dict(type='Resize', scale=(1333, 600), keep_ratio=True),
dict( dict(
type='MultiScaleFlipAug', type='Pack3DDetInputs',
img_scale=(1333, 600), keys=(['img']),
flip=False, meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
transforms=[ 'scale_factor'))
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img']),
])
] ]
train_dataloader = dict(
batch_size=2,
num_workers=2,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='RepeatDataset', times=1, dataset=dict(pipeline=train_pipeline)))
data = dict( val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
samples_per_gpu=2, test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
workers_per_gpu=2,
train=dict(times=1, dataset=dict(pipeline=train_pipeline)), train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=8, val_interval=1)
val=dict(pipeline=test_pipeline), val_cfg = dict(type='ValLoop')
test=dict(pipeline=test_pipeline)) test_cfg = dict(type='TestLoop')
# learning rate
param_scheduler = [
dict(
type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
dict(
type='MultiStepLR',
begin=0,
end=8,
by_epoch=True,
milestones=[6],
gamma=0.1)
]
val_evaluator = dict(type='Indoor2DMetric')
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) # optimizer
optimizer_config = dict(grad_clip=None) optim_wrapper = dict(
lr_config = dict( type='OptimWrapper',
policy='step', optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[6])
runner = dict(type='EpochBasedRunner', max_epochs=8)
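The schedule and optimizer blocks follow the same migration; a sketch of the correspondence, with values copied from this config (the mapping comments are illustrative, not part of the commit):

train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=8, val_interval=1)  # was runner
param_scheduler = [
    # was lr_config: warmup='linear', warmup_iters=500, warmup_ratio=0.001
    dict(type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
    # was lr_config: step=[6]
    dict(type='MultiStepLR', begin=0, end=8, by_epoch=True, milestones=[6], gamma=0.1)
]
optim_wrapper = dict(  # was optimizer + optimizer_config
    type='OptimWrapper',
    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))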
load_from = 'http://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_bbox_mAP-0.408__segm_mAP-0.37_20200504_163245-42aa3d00.pth' # noqa load_from = 'http://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_bbox_mAP-0.408__segm_mAP-0.37_20200504_163245-42aa3d00.pth' # noqa
...@@ -7,10 +7,6 @@ _base_ = [ ...@@ -7,10 +7,6 @@ _base_ = [
class_names = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser', class_names = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser',
'night_stand', 'bookshelf', 'bathtub') 'night_stand', 'bookshelf', 'bathtub')
# use caffe img_norm
img_norm_cfg = dict(
mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
model = dict( model = dict(
pts_backbone=dict( pts_backbone=dict(
type='PointNet2SASSG', type='PointNet2SASSG',
...@@ -48,10 +44,8 @@ model = dict( ...@@ -48,10 +44,8 @@ model = dict(
[0.76584, 1.398258, 0.472728]]), [0.76584, 1.398258, 0.472728]]),
pred_layer_cfg=dict( pred_layer_cfg=dict(
in_channels=128, shared_conv_channels=(128, 128), bias=True), in_channels=128, shared_conv_channels=(128, 128), bias=True),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
objectness_loss=dict( objectness_loss=dict(
type='CrossEntropyLoss', type='mmdet.CrossEntropyLoss',
class_weight=[0.2, 0.8], class_weight=[0.2, 0.8],
reduction='sum', reduction='sum',
loss_weight=5.0), loss_weight=5.0),
...@@ -62,15 +56,23 @@ model = dict( ...@@ -62,15 +56,23 @@ model = dict(
loss_src_weight=10.0, loss_src_weight=10.0,
loss_dst_weight=10.0), loss_dst_weight=10.0),
dir_class_loss=dict( dir_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), type='mmdet.CrossEntropyLoss',
reduction='sum',
loss_weight=1.0),
dir_res_loss=dict( dir_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0), type='mmdet.SmoothL1Loss', reduction='sum', loss_weight=10.0),
size_class_loss=dict( size_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), type='mmdet.CrossEntropyLoss',
reduction='sum',
loss_weight=1.0),
size_res_loss=dict( size_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0 / 3.0), type='mmdet.SmoothL1Loss',
reduction='sum',
loss_weight=10.0 / 3.0),
semantic_loss=dict( semantic_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)), type='mmdet.CrossEntropyLoss',
reduction='sum',
loss_weight=1.0)),
joint=dict( joint=dict(
vote_module_cfg=dict( vote_module_cfg=dict(
in_channels=512, in_channels=512,
...@@ -154,11 +156,11 @@ model = dict( ...@@ -154,11 +156,11 @@ model = dict(
# model training and testing settings # model training and testing settings
train_cfg=dict( train_cfg=dict(
pts=dict( pts=dict(
pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote')), pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mode='vote')),
test_cfg=dict( test_cfg=dict(
img_rcnn=dict(score_thr=0.1), img_rcnn=dict(score_thr=0.1),
pts=dict( pts=dict(
sample_mod='seed', sample_mode='seed',
nms_thr=0.25, nms_thr=0.25,
score_thr=0.05, score_thr=0.05,
per_class_proposal=True))) per_class_proposal=True)))
...@@ -171,12 +173,13 @@ train_pipeline = [ ...@@ -171,12 +173,13 @@ train_pipeline = [
load_dim=6, load_dim=6,
use_dim=[0, 1, 2]), use_dim=[0, 1, 2]),
dict(type='LoadImageFromFile'), dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations3D'), dict(
dict(type='LoadAnnotations', with_bbox=True), type='LoadAnnotations3D',
dict(type='Resize', img_scale=(1333, 600), keep_ratio=True), with_bbox=True,
dict(type='RandomFlip', flip_ratio=0.0), with_label=True,
dict(type='Normalize', **img_norm_cfg), with_bbox_3d=True,
dict(type='Pad', size_divisor=32), with_label_3d=True),
dict(type='Resize', scale=(1333, 600), keep_ratio=True),
dict( dict(
type='RandomFlip3D', type='RandomFlip3D',
sync_2d=False, sync_2d=False,
...@@ -188,15 +191,13 @@ train_pipeline = [ ...@@ -188,15 +191,13 @@ train_pipeline = [
scale_ratio_range=[0.85, 1.15], scale_ratio_range=[0.85, 1.15],
shift_height=True), shift_height=True),
dict(type='PointSample', num_points=20000), dict(type='PointSample', num_points=20000),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict( dict(
type='Collect3D', type='Pack3DDetInputs',
keys=[ keys=([
'img', 'gt_bboxes', 'gt_labels', 'points', 'gt_bboxes_3d', 'img', 'gt_bboxes', 'gt_bboxes_labels', 'points', 'gt_bboxes_3d',
'gt_labels_3d' 'gt_labels_3d'
]) ]))
] ]
test_pipeline = [ test_pipeline = [
dict(type='LoadImageFromFile'), dict(type='LoadImageFromFile'),
dict( dict(
...@@ -205,56 +206,15 @@ test_pipeline = [ ...@@ -205,56 +206,15 @@ test_pipeline = [
shift_height=True, shift_height=True,
load_dim=6, load_dim=6,
use_dim=[0, 1, 2]), use_dim=[0, 1, 2]),
dict( dict(type='Resize', scale=(1333, 600), keep_ratio=True),
type='MultiScaleFlipAug3D',
img_scale=(1333, 600),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.0),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
),
dict(type='PointSample', num_points=20000), dict(type='PointSample', num_points=20000),
dict( dict(type='Pack3DDetInputs', keys=['img', 'points'])
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['img', 'points'])
]),
]
# construct a pipeline for data and gt loading in show function
# please keep its loading function consistent with test_pipeline (e.g. client)
eval_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='LoadPointsFromFile',
coord_type='DEPTH',
shift_height=False,
load_dim=6,
use_dim=[0, 1, 2]),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['img', 'points'])
] ]
data = dict( train_dataloader = dict(dataset=dict(dataset=dict(pipeline=train_pipeline)))
train=dict(dataset=dict(pipeline=train_pipeline)),
val=dict(pipeline=test_pipeline), val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
test=dict(pipeline=test_pipeline)) test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
evaluation = dict(pipeline=eval_pipeline)
# may also use your own pre-trained image branch # may also use your own pre-trained image branch
load_from = 'https://download.openmmlab.com/mmdetection3d/v0.1.0_models/imvotenet/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class_20210323_173222-cad62aeb.pth' # noqa load_from = 'https://download.openmmlab.com/mmdetection3d/v0.1.0_models/imvotenet/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class_20210323_173222-cad62aeb.pth' # noqa
...@@ -178,10 +178,11 @@ class Det3DDataset(BaseDataset): ...@@ -178,10 +178,11 @@ class Det3DDataset(BaseDataset):
dict | None: Processed `ann_info` dict | None: Processed `ann_info`
""" """
# add s or gt prefix for most keys after concat # add s or gt prefix for most keys after concat
# we only process 3D annotations here; the corresponding
# 2D annotations are handled by `LoadAnnotations3D`
# in the pipelines
name_mapping = { name_mapping = {
'bbox_label': 'gt_labels',
'bbox_label_3d': 'gt_labels_3d', 'bbox_label_3d': 'gt_labels_3d',
'bbox': 'gt_bboxes',
'bbox_3d': 'gt_bboxes_3d', 'bbox_3d': 'gt_bboxes_3d',
'depth': 'depths', 'depth': 'depths',
'center_2d': 'centers_2d', 'center_2d': 'centers_2d',
...@@ -196,6 +197,7 @@ class Det3DDataset(BaseDataset): ...@@ -196,6 +197,7 @@ class Det3DDataset(BaseDataset):
keys = list(instances[0].keys()) keys = list(instances[0].keys())
ann_info = dict() ann_info = dict()
for ann_name in keys: for ann_name in keys:
if ann_name in name_mapping:
temp_anns = [item[ann_name] for item in instances] temp_anns = [item[ann_name] for item in instances]
# map the original dataset label to training label # map the original dataset label to training label
if 'label' in ann_name: if 'label' in ann_name:
...@@ -203,9 +205,10 @@ class Det3DDataset(BaseDataset): ...@@ -203,9 +205,10 @@ class Det3DDataset(BaseDataset):
self.label_mapping[item] for item in temp_anns self.label_mapping[item] for item in temp_anns
] ]
temp_anns = np.array(temp_anns) temp_anns = np.array(temp_anns)
if ann_name in name_mapping:
ann_name = name_mapping[ann_name] ann_name = name_mapping[ann_name]
ann_info[ann_name] = temp_anns ann_info[ann_name] = temp_anns
ann_info['instances'] = info['instances']
return ann_info return ann_info
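For concreteness, a toy example of what the filtered loop above produces, assuming an identity `label_mapping` (illustrative only):

instances = [
    dict(bbox_3d=[0., 0., 0., 1., 1., 1., 0.], bbox_label_3d=3),
    dict(bbox_3d=[1., 2., 0., 1., 1., 1., 0.], bbox_label_3d=7),
]
# keys absent from `name_mapping` are now skipped; the rest are stacked,
# passed through `label_mapping`, renamed, and the raw instances kept:
# ann_info == dict(
#     gt_labels_3d=np.array([3, 7]),
#     gt_bboxes_3d=np.array([[0., 0., 0., 1., 1., 1., 0.],
#                            [1., 2., 0., 1., 1., 1., 0.]]),
#     instances=instances)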
def parse_data_info(self, info: dict) -> dict: def parse_data_info(self, info: dict) -> dict:
......
...@@ -87,7 +87,8 @@ class KittiDataset(Det3DDataset): ...@@ -87,7 +87,8 @@ class KittiDataset(Det3DDataset):
if 'plane' in info: if 'plane' in info:
# convert ground plane to velodyne coordinates # convert ground plane to velodyne coordinates
plane = np.array(info['plane']) plane = np.array(info['plane'])
lidar2cam = np.array(info['images']['CAM2']['lidar2cam']) lidar2cam = np.array(
info['images']['CAM2']['lidar2cam'], dtype=np.float32)
reverse = np.linalg.inv(lidar2cam) reverse = np.linalg.inv(lidar2cam)
(plane_norm_cam, plane_off_cam) = (plane[:3], (plane_norm_cam, plane_off_cam) = (plane[:3],
......
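The explicit `dtype=np.float32` keeps the plane conversion in single precision: `np.array` over the nested Python lists in the info dict would otherwise produce float64, and `np.linalg.inv` preserves whatever dtype it is given. A minimal illustration (assumed matrix values):

import numpy as np
lidar2cam = np.array([[0., -1., 0., 0.],
                      [0., 0., -1., 0.],
                      [1., 0., 0., 0.],
                      [0., 0., 0., 1.]], dtype=np.float32)
reverse = np.linalg.inv(lidar2cam)  # float32 in, float32 out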
# Copyright (c) OpenMMLab. All rights reserved. # Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Union from typing import List, Sequence, Union
import mmcv
import numpy as np import numpy as np
import torch
from mmcv import BaseTransform from mmcv import BaseTransform
from mmcv.transforms import to_tensor
from mmengine import InstanceData from mmengine import InstanceData
from numpy import dtype
from mmdet3d.core import Det3DDataSample, PointData from mmdet3d.core import Det3DDataSample, PointData
from mmdet3d.core.bbox import BaseInstance3DBoxes from mmdet3d.core.bbox import BaseInstance3DBoxes
...@@ -12,6 +14,38 @@ from mmdet3d.core.points import BasePoints ...@@ -12,6 +14,38 @@ from mmdet3d.core.points import BasePoints
from mmdet3d.registry import TRANSFORMS from mmdet3d.registry import TRANSFORMS
def to_tensor(
data: Union[torch.Tensor, np.ndarray, Sequence, int,
float]) -> torch.Tensor:
"""Convert objects of various python types to :obj:`torch.Tensor`.
Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`,
:class:`Sequence`, :class:`int` and :class:`float`.
Args:
data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to
be converted.
Returns:
torch.Tensor: the converted data.
"""
if isinstance(data, torch.Tensor):
return data
elif isinstance(data, np.ndarray):
if data.dtype is dtype('float64'):
data = data.astype(np.float32)
return torch.from_numpy(data)
elif isinstance(data, Sequence) and not mmcv.is_str(data):
return torch.tensor(data)
elif isinstance(data, int):
return torch.LongTensor([data])
elif isinstance(data, float):
return torch.FloatTensor([data])
else:
raise TypeError(f'type {type(data)} cannot be converted to tensor.')
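A quick illustration of the helper's behaviour (assuming numpy and torch are imported as above):

to_tensor(np.zeros((2, 3), dtype=np.float64)).dtype  # torch.float32: float64 arrays are downcast
to_tensor(np.zeros((2, 3), dtype=np.int32)).dtype    # torch.int32: other dtypes pass through
to_tensor([1, 2, 3])   # tensor([1, 2, 3])
to_tensor(5)           # tensor([5]), a LongTensor
to_tensor(0.5)         # tensor([0.5000]), a FloatTensor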
@TRANSFORMS.register_module() @TRANSFORMS.register_module()
class Pack3DDetInputs(BaseTransform): class Pack3DDetInputs(BaseTransform):
INPUTS_KEYS = ['points', 'img'] INPUTS_KEYS = ['points', 'img']
...@@ -20,7 +54,7 @@ class Pack3DDetInputs(BaseTransform): ...@@ -20,7 +54,7 @@ class Pack3DDetInputs(BaseTransform):
] ]
INSTANCEDATA_2D_KEYS = [ INSTANCEDATA_2D_KEYS = [
'gt_bboxes', 'gt_bboxes',
'gt_labels', 'gt_bboxes_labels',
] ]
SEG_KEYS = [ SEG_KEYS = [
...@@ -121,8 +155,8 @@ class Pack3DDetInputs(BaseTransform): ...@@ -121,8 +155,8 @@ class Pack3DDetInputs(BaseTransform):
for key in [ for key in [
'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels', 'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels',
'gt_labels_3d', 'attr_labels', 'pts_instance_mask', 'gt_bboxes_labels', 'attr_labels', 'pts_instance_mask',
'pts_semantic_mask', 'centers_2d', 'depths' 'pts_semantic_mask', 'centers_2d', 'depths', 'gt_labels_3d'
]: ]:
if key not in results: if key not in results:
continue continue
...@@ -159,6 +193,9 @@ class Pack3DDetInputs(BaseTransform): ...@@ -159,6 +193,9 @@ class Pack3DDetInputs(BaseTransform):
elif key in self.INSTANCEDATA_3D_KEYS: elif key in self.INSTANCEDATA_3D_KEYS:
gt_instances_3d[self._remove_prefix(key)] = results[key] gt_instances_3d[self._remove_prefix(key)] = results[key]
elif key in self.INSTANCEDATA_2D_KEYS: elif key in self.INSTANCEDATA_2D_KEYS:
if key == 'gt_bboxes_labels':
gt_instances['labels'] = results[key]
else:
gt_instances[self._remove_prefix(key)] = results[key] gt_instances[self._remove_prefix(key)] = results[key]
elif key in self.SEG_KEYS: elif key in self.SEG_KEYS:
gt_pts_seg[self._remove_prefix(key)] = results[key] gt_pts_seg[self._remove_prefix(key)] = results[key]
......
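In short, the renamed 2D label key is routed back to the conventional field name on the packed sample. A rough sketch of where the 2D/3D keys end up (an assumption based on the branches above, shown as comments only):

# results['gt_bboxes']         -> data_sample.gt_instances.bboxes
# results['gt_bboxes_labels']  -> data_sample.gt_instances.labels      (special-cased above)
# results['gt_bboxes_3d']      -> data_sample.gt_instances_3d.bboxes_3d
# results['gt_labels_3d']      -> data_sample.gt_instances_3d.labels_3d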
...@@ -632,34 +632,6 @@ class LoadAnnotations3D(LoadAnnotations): ...@@ -632,34 +632,6 @@ class LoadAnnotations3D(LoadAnnotations):
self.with_seg_3d = with_seg_3d self.with_seg_3d = with_seg_3d
self.seg_3d_dtype = seg_3d_dtype self.seg_3d_dtype = seg_3d_dtype
def _load_bboxes(self, results: dict) -> None:
"""Private function to load bounding box annotations.
Rewrite '_load_bboxes` since mmdet3d uses 'parse_anno_info' in
datasets.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict contains loaded bounding box annotations.
"""
results['gt_bboxes'] = results['ann_info']['gt_bboxes']
def _load_labels(self, results: dict) -> None:
"""Private function to load label annotations.
Rewrite '_load_bboxes` since mmdet3d uses 'parse_anno_info' in
datasets.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict contains loaded label annotations.
"""
results['gt_labels'] = results['ann_info']['gt_labels']
def _load_bboxes_3d(self, results: dict) -> dict: def _load_bboxes_3d(self, results: dict) -> dict:
"""Private function to move the 3D bounding box annotation from """Private function to move the 3D bounding box annotation from
`ann_info` field to the root of `results`. `ann_info` field to the root of `results`.
...@@ -769,6 +741,56 @@ class LoadAnnotations3D(LoadAnnotations): ...@@ -769,6 +741,56 @@ class LoadAnnotations3D(LoadAnnotations):
results['eval_ann_info']['pts_semantic_mask'] = pts_semantic_mask results['eval_ann_info']['pts_semantic_mask'] = pts_semantic_mask
return results return results
def _load_bboxes(self, results: dict) -> None:
"""Private function to load bounding box annotations.
The only difference is that it removes the processing of the
`ignore_flag` field.
Args:
results (dict): Result dict from :obj:`mmcv.BaseDataset`.
Returns:
dict: The dict contains loaded bounding box annotations.
"""
gt_bboxes = []
for instance in results['instances']:
gt_bboxes.append(instance['bbox'])
if len(gt_bboxes) == 0:
results['gt_bboxes'] = np.zeros((0, 4), dtype=np.float32)
else:
results['gt_bboxes'] = np.array(
gt_bboxes, dtype=np.float32).reshape((-1, 4))
if self.denorm_bbox:
bbox_num = results['gt_bboxes'].shape[0]
if bbox_num != 0:
h, w = results['img_shape']
results['gt_bboxes'][:, 0::2] *= w
results['gt_bboxes'][:, 1::2] *= h
if 'eval_ann_info' in results:
results['eval_ann_info']['gt_bboxes'] = results['gt_bboxes']
def _load_labels(self, results: dict) -> None:
"""Private function to load label annotations.
Args:
results (dict): Result dict from :obj:`mmcv.BaseDataset`.
Returns:
dict: The dict contains loaded label annotations.
"""
gt_bboxes_labels = []
for instance in results['instances']:
gt_bboxes_labels.append(instance['bbox_label'])
if len(gt_bboxes_labels) == 0:
results['gt_bboxes_labels'] = np.zeros((0, ), dtype=np.int64)
else:
results['gt_bboxes_labels'] = np.array(
gt_bboxes_labels, dtype=np.int64)
if 'eval_ann_info' in results:
results['eval_ann_info']['gt_bboxes_labels'] = results[
'gt_bboxes_labels']
def transform(self, results: dict) -> dict: def transform(self, results: dict) -> dict:
"""Function to load multiple types annotations. """Function to load multiple types annotations.
......
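For reference, the `results['instances']` layout these two loaders consume and what they emit (toy values, illustrative only):

results = dict(instances=[
    dict(bbox=[10., 20., 100., 150.], bbox_label=2),
    dict(bbox=[5., 5., 50., 60.], bbox_label=0),
])
# after _load_bboxes / _load_labels:
# results['gt_bboxes']        -> float32 array of shape (2, 4)
# results['gt_bboxes_labels'] -> int64 array([2, 0])
# both are mirrored into results['eval_ann_info'] when that key is present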
# Copyright (c) OpenMMLab. All rights reserved. # Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
from typing import Callable, List, Optional, Union from typing import Callable, List, Optional, Union
import numpy as np import numpy as np
...@@ -22,13 +24,15 @@ class SUNRGBDDataset(Det3DDataset): ...@@ -22,13 +24,15 @@ class SUNRGBDDataset(Det3DDataset):
ann_file (str): Path of annotation file. ann_file (str): Path of annotation file.
metainfo (dict, optional): Meta information for dataset, such as class metainfo (dict, optional): Meta information for dataset, such as class
information. Defaults to None. information. Defaults to None.
data_prefix (dict, optional): Prefix for data. Defaults to data_prefix (dict): Prefix for data. Defaults to
`dict(pts='points',img='sunrgbd_trainval')`. `dict(pts='points',img='sunrgbd_trainval')`.
pipeline (list[dict], optional): Pipeline used for data processing. pipeline (list[dict], optional): Pipeline used for data processing.
Defaults to None. Defaults to None.
modality (dict, optional): Modality to specify the sensor data used modality (dict, optional): Modality to specify the sensor data used
as input. Defaults to `dict(use_camera=True, use_lidar=True)`. as input. Defaults to `dict(use_camera=True, use_lidar=True)`.
box_type_3d (str, optional): Type of 3D box of this dataset. default_cam_key (str): The default camera name adopted.
Defaults to "CAM0".
box_type_3d (str): Type of 3D box of this dataset.
Based on the `box_type_3d`, the dataset will encapsulate the box Based on the `box_type_3d`, the dataset will encapsulate the box
to its original format then converted them to `box_type_3d`. to its original format then converted them to `box_type_3d`.
Defaults to 'Depth' in this dataset. Available options includes Defaults to 'Depth' in this dataset. Available options includes
...@@ -36,9 +40,9 @@ class SUNRGBDDataset(Det3DDataset): ...@@ -36,9 +40,9 @@ class SUNRGBDDataset(Det3DDataset):
- 'LiDAR': Box in LiDAR coordinates. - 'LiDAR': Box in LiDAR coordinates.
- 'Depth': Box in depth coordinates, usually for indoor dataset. - 'Depth': Box in depth coordinates, usually for indoor dataset.
- 'Camera': Box in camera coordinates. - 'Camera': Box in camera coordinates.
filter_empty_gt (bool, optional): Whether to filter empty GT. filter_empty_gt (bool): Whether to filter empty GT.
Defaults to True. Defaults to True.
test_mode (bool, optional): Whether the dataset is in test mode. test_mode (bool): Whether the dataset is in test mode.
Defaults to False. Defaults to False.
""" """
METAINFO = { METAINFO = {
...@@ -51,8 +55,9 @@ class SUNRGBDDataset(Det3DDataset): ...@@ -51,8 +55,9 @@ class SUNRGBDDataset(Det3DDataset):
ann_file: str, ann_file: str,
metainfo: Optional[dict] = None, metainfo: Optional[dict] = None,
data_prefix: dict = dict( data_prefix: dict = dict(
pts='points', img='sunrgbd_trainval'), pts='points', img='sunrgbd_trainval/image'),
pipeline: List[Union[dict, Callable]] = [], pipeline: List[Union[dict, Callable]] = [],
default_cam_key: str = 'CAM0',
modality=dict(use_camera=True, use_lidar=True), modality=dict(use_camera=True, use_lidar=True),
box_type_3d: str = 'Depth', box_type_3d: str = 'Depth',
filter_empty_gt: bool = True, filter_empty_gt: bool = True,
...@@ -64,6 +69,7 @@ class SUNRGBDDataset(Det3DDataset): ...@@ -64,6 +69,7 @@ class SUNRGBDDataset(Det3DDataset):
metainfo=metainfo, metainfo=metainfo,
data_prefix=data_prefix, data_prefix=data_prefix,
pipeline=pipeline, pipeline=pipeline,
default_cam_key=default_cam_key,
modality=modality, modality=modality,
box_type_3d=box_type_3d, box_type_3d=box_type_3d,
filter_empty_gt=filter_empty_gt, filter_empty_gt=filter_empty_gt,
...@@ -73,6 +79,47 @@ class SUNRGBDDataset(Det3DDataset): ...@@ -73,6 +79,47 @@ class SUNRGBDDataset(Det3DDataset):
'use_lidar' in self.modality 'use_lidar' in self.modality
assert self.modality['use_camera'] or self.modality['use_lidar'] assert self.modality['use_camera'] or self.modality['use_lidar']
def parse_data_info(self, info: dict) -> dict:
"""Process the raw data info.
Convert all relative paths of the required modality data files to
absolute paths, and process the `instances` field into `ann_info`
in the training stage.
Args:
info (dict): Raw info dict.
Returns:
dict: Data info with `ann_info` added in the training stage, and
with all paths converted to absolute paths.
"""
if self.modality['use_lidar']:
info['lidar_points']['lidar_path'] = \
osp.join(
self.data_prefix.get('pts', ''),
info['lidar_points']['lidar_path'])
if self.modality['use_camera']:
for cam_id, img_info in info['images'].items():
if 'img_path' in img_info:
img_info['img_path'] = osp.join(
self.data_prefix.get('img', ''), img_info['img_path'])
if self.default_cam_key is not None:
info['img_path'] = info['images'][
self.default_cam_key]['img_path']
info['depth2img'] = np.array(
info['images'][self.default_cam_key]['depth2img'],
dtype=np.float32)
if not self.test_mode:
# used in training
info['ann_info'] = self.parse_ann_info(info)
if self.test_mode and self.load_eval_anns:
info['eval_ann_info'] = self.parse_ann_info(info)
return info
def parse_ann_info(self, info: dict) -> dict: def parse_ann_info(self, info: dict) -> dict:
"""Process the `instances` in data info to `ann_info` """Process the `instances` in data info to `ann_info`
...@@ -83,12 +130,11 @@ class SUNRGBDDataset(Det3DDataset): ...@@ -83,12 +130,11 @@ class SUNRGBDDataset(Det3DDataset):
dict: Processed `ann_info` dict: Processed `ann_info`
""" """
ann_info = super().parse_ann_info(info) ann_info = super().parse_ann_info(info)
# empty gt # process data without any annotations
if ann_info is None: if ann_info is None:
ann_info = dict() ann_info = dict()
ann_info['gt_bboxes_3d'] = np.zeros((0, 6), dtype=np.float32) ann_info['gt_bboxes_3d'] = np.zeros((0, 6), dtype=np.float32)
ann_info['gt_labels_3d'] = np.zeros((0, ), dtype=np.int64) ann_info['gt_labels_3d'] = np.zeros((0, ), dtype=np.int64)
# to target box structure # to target box structure
ann_info['gt_bboxes_3d'] = DepthInstance3DBoxes( ann_info['gt_bboxes_3d'] = DepthInstance3DBoxes(
ann_info['gt_bboxes_3d'], ann_info['gt_bboxes_3d'],
......
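Under the new signature, a dataset instance for this config is built roughly as follows (illustrative, mirroring the defaults above):

dataset = SUNRGBDDataset(
    data_root='data/sunrgbd/',
    ann_file='sunrgbd_infos_train.pkl',
    data_prefix=dict(pts='points', img='sunrgbd_trainval/image'),
    default_cam_key='CAM0',
    modality=dict(use_camera=True, use_lidar=True),
    box_type_3d='Depth',
    filter_empty_gt=False)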
# Copyright (c) OpenMMLab. All rights reserved. # Copyright (c) OpenMMLab. All rights reserved.
from collections import OrderedDict
from typing import Dict, List, Optional, Sequence from typing import Dict, List, Optional, Sequence
import numpy as np
from mmengine.evaluator import BaseMetric from mmengine.evaluator import BaseMetric
from mmengine.logging import MMLogger from mmengine.logging import MMLogger
from mmdet3d.core import get_box_type, indoor_eval from mmdet3d.core import get_box_type, indoor_eval
from mmdet3d.registry import METRICS from mmdet3d.registry import METRICS
from mmdet.core import eval_map
@METRICS.register_module() @METRICS.register_module()
class IndoorMetric(BaseMetric): class IndoorMetric(BaseMetric):
"""Kitti evaluation metric. """Indoor scene evaluation metric.
Args: Args:
iou_thr (list[float]): List of iou threshold when calculate the iou_thr (list[float]): List of iou threshold when calculate the
...@@ -90,3 +93,91 @@ class IndoorMetric(BaseMetric): ...@@ -90,3 +93,91 @@ class IndoorMetric(BaseMetric):
box_mode_3d=box_mode_3d) box_mode_3d=box_mode_3d)
return ret_dict return ret_dict
@METRICS.register_module()
class Indoor2DMetric(BaseMetric):
"""indoor 2d predictions evaluation metric.
Args:
iou_thr (list[float]): List of iou threshold when calculate the
metric. Defaults to [0.5].
collect_device (str, optional): Device name used for collecting
results from different ranks during distributed training.
Must be 'cpu' or 'gpu'. Defaults to 'cpu'.
prefix (str): The prefix that will be added in the metric
names to disambiguate homonymous metrics of different evaluators.
If prefix is not provided in the argument, self.default_prefix
will be used instead. Default: None
"""
def __init__(self,
iou_thr: List[float] = [0.5],
collect_device: str = 'cpu',
prefix: Optional[str] = None,
**kwargs):
super(Indoor2DMetric, self).__init__(
prefix=prefix, collect_device=collect_device)
self.iou_thr = iou_thr
def process(self, data_batch: Sequence[dict],
predictions: Sequence[dict]) -> None:
"""Process one batch of data samples and predictions.
The processed results should be stored in ``self.results``,
which will be used to compute the metrics when all batches
have been processed.
Args:
data_batch (Sequence[dict]): A batch of data
from the dataloader.
predictions (Sequence[dict]): A batch of outputs from
the model.
"""
batch_eval_anns = [
item['data_sample']['eval_ann_info'] for item in data_batch
]
for eval_ann, pred_dict in zip(batch_eval_anns, predictions):
pred = pred_dict['pred_instances']
ann = dict(
labels=eval_ann['gt_bboxes_labels'],
bboxes=eval_ann['gt_bboxes'])
pred_bboxes = pred['bboxes'].cpu().numpy()
pred_scores = pred['scores'].cpu().numpy()
pred_labels = pred['labels'].cpu().numpy()
dets = []
for label in range(len(self.dataset_meta['CLASSES'])):
index = np.where(pred_labels == label)[0]
pred_bbox_scores = np.hstack(
[pred_bboxes[index], pred_scores[index].reshape((-1, 1))])
dets.append(pred_bbox_scores)
self.results.append((ann, dets))
def compute_metrics(self, results: list) -> Dict[str, float]:
"""Compute the metrics from processed results.
Args:
results (list): The processed results of each batch.
Returns:
Dict[str, float]: The computed metrics. The keys are the names of
the metrics, and the values are corresponding results.
"""
logger: MMLogger = MMLogger.get_current_instance()
annotations, preds = zip(*results)
eval_results = OrderedDict()
iou_thr_2d = (self.iou_thr) if isinstance(self.iou_thr,
float) else self.iou_thr
for iou_thr_2d_single in iou_thr_2d:
mean_ap, _ = eval_map(
preds,
annotations,
scale_ranges=None,
iou_thr=iou_thr_2d_single,
dataset=self.dataset_meta['CLASSES'],
logger=logger)
eval_results['mAP_' + str(iou_thr_2d_single)] = mean_ap
return eval_results
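The per-class grouping in `process` matches what `eval_map` expects: for each sample, one (n_i, 5) array of [x1, y1, x2, y2, score] per class. A toy illustration with two classes (assumed values):

import numpy as np
dets_for_one_sample = [
    np.zeros((0, 5), dtype=np.float32),                   # class 0: no predictions
    np.array([[10., 20., 100., 150., 0.9]], np.float32),  # class 1: one box, score 0.9
]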
...@@ -754,15 +754,16 @@ class VoteHead(BaseModule): ...@@ -754,15 +754,16 @@ class VoteHead(BaseModule):
batch_size = bbox3d.shape[0] batch_size = bbox3d.shape[0]
results_list = list() results_list = list()
if use_nms: if use_nms:
for b in range(batch_size): for batch_index in range(batch_size):
temp_results = InstanceData() temp_results = InstanceData()
bbox_selected, score_selected, labels = \ bbox_selected, score_selected, labels = \
self.multiclass_nms_single(obj_scores[b], self.multiclass_nms_single(
sem_scores[b], obj_scores[batch_index],
bbox3d[b], sem_scores[batch_index],
stack_points[b, ..., :3], bbox3d[batch_index],
batch_input_metas[b]) stack_points[batch_index, ..., :3],
bbox = batch_input_metas[b]['box_type_3d']( batch_input_metas[batch_index])
bbox = batch_input_metas[batch_index]['box_type_3d'](
bbox_selected, bbox_selected,
box_dim=bbox_selected.shape[-1], box_dim=bbox_selected.shape[-1],
with_yaw=self.bbox_coder.with_rot) with_yaw=self.bbox_coder.with_rot)
......
# Copyright (c) OpenMMLab. All rights reserved. # Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Union from typing import List, Optional, Union
from mmengine import InstanceData from mmengine import InstanceData
...@@ -91,8 +91,8 @@ class Base3DDetector(BaseDetector): ...@@ -91,8 +91,8 @@ class Base3DDetector(BaseDetector):
def convert_to_datasample( def convert_to_datasample(
self, self,
results_list_3d: InstanceList, results_list_3d: Optional[InstanceList] = None,
results_list_2d: InstanceList = None, results_list_2d: Optional[InstanceList] = None,
) -> SampleList: ) -> SampleList:
"""Convert results list to `Det3DDataSample`. """Convert results list to `Det3DDataSample`.
...@@ -128,10 +128,18 @@ class Base3DDetector(BaseDetector): ...@@ -128,10 +128,18 @@ class Base3DDetector(BaseDetector):
""" """
data_sample_list = [] data_sample_list = []
assert (results_list_2d is not None) or \
(results_list_3d is not None),\
'please pass at least one type of results_list'
if results_list_2d is None: if results_list_2d is None:
results_list_2d = [ results_list_2d = [
InstanceData() for _ in range(len(results_list_3d)) InstanceData() for _ in range(len(results_list_3d))
] ]
if results_list_3d is None:
results_list_3d = [
InstanceData() for _ in range(len(results_list_2d))
]
for i in range(len(results_list_3d)): for i in range(len(results_list_3d)):
result = Det3DDataSample() result = Det3DDataSample()
result.pred_instances_3d = results_list_3d[i] result.pred_instances_3d = results_list_3d[i]
......
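With the relaxed signature above, either results list may now be omitted (but not both); the missing side is padded with empty `InstanceData`. A hedged usage sketch (`detector` and the results lists are placeholders):

data_samples = detector.convert_to_datasample(results_list_2d=results_2d)  # image-only branch
data_samples = detector.convert_to_datasample(results_list_3d=results_3d)  # point-cloud branch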
# Copyright (c) OpenMMLab. All rights reserved. # Copyright (c) OpenMMLab. All rights reserved.
import warnings import copy
from typing import Dict, List, Optional, Sequence, Tuple, Union
import numpy as np
import torch import torch
from mmengine import InstanceData
from torch import Tensor
from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d from mmdet3d.core import Det3DDataSample
from mmdet3d.models.utils import MLP from mmdet3d.models.utils import MLP
from mmdet3d.registry import MODELS from mmdet3d.registry import MODELS
from .base import Base3DDetector from .base import Base3DDetector
def sample_valid_seeds(mask, num_sampled_seed=1024): def sample_valid_seeds(mask: Tensor, num_sampled_seed: int = 1024) -> Tensor:
r"""Randomly sample seeds from all imvotes. r"""Randomly sample seeds from all imvotes.
Modified from `<https://github.com/facebookresearch/imvotenet/blob/a8856345146bacf29a57266a2f0b874406fd8823/models/imvotenet.py#L26>`_ Modified from `<https://github.com/facebookresearch/imvotenet/blob/a8856345146bacf29a57266a2f0b874406fd8823/models/imvotenet.py#L26>`_
...@@ -54,26 +56,34 @@ def sample_valid_seeds(mask, num_sampled_seed=1024): ...@@ -54,26 +56,34 @@ def sample_valid_seeds(mask, num_sampled_seed=1024):
@MODELS.register_module() @MODELS.register_module()
class ImVoteNet(Base3DDetector): class ImVoteNet(Base3DDetector):
r"""`ImVoteNet <https://arxiv.org/abs/2001.10692>`_ for 3D detection.""" r"""`ImVoteNet <https://arxiv.org/abs/2001.10692>`_ for 3D detection.
ImVoteNet is based on fusing 2D votes in images and 3D votes in point
clouds, and explicitly extracts both geometric and semantic features
from the 2D images. It leverages camera parameters to lift these
features to 3D. A multi-tower training scheme further improves the
synergy of 2D-3D feature fusion.
"""
def __init__(self, def __init__(self,
pts_backbone=None, pts_backbone: Optional[dict] = None,
pts_bbox_heads=None, pts_bbox_heads: Optional[dict] = None,
pts_neck=None, pts_neck: Optional[dict] = None,
img_backbone=None, img_backbone: Optional[dict] = None,
img_neck=None, img_neck: Optional[dict] = None,
img_roi_head=None, img_roi_head: Optional[dict] = None,
img_rpn_head=None, img_rpn_head: Optional[dict] = None,
img_mlp=None, img_mlp: Optional[dict] = None,
freeze_img_branch=False, freeze_img_branch: bool = False,
fusion_layer=None, fusion_layer: Optional[dict] = None,
num_sampled_seed=None, num_sampled_seed: Optional[dict] = None,
train_cfg=None, train_cfg: Optional[dict] = None,
test_cfg=None, test_cfg: Optional[dict] = None,
pretrained=None, init_cfg: Optional[dict] = None,
init_cfg=None): **kwargs) -> None:
super(ImVoteNet, self).__init__(init_cfg=init_cfg) super(ImVoteNet, self).__init__(init_cfg=init_cfg, **kwargs)
# point branch # point branch
if pts_backbone is not None: if pts_backbone is not None:
...@@ -137,35 +147,8 @@ class ImVoteNet(Base3DDetector): ...@@ -137,35 +147,8 @@ class ImVoteNet(Base3DDetector):
self.train_cfg = train_cfg self.train_cfg = train_cfg
self.test_cfg = test_cfg self.test_cfg = test_cfg
if pretrained is None: def _forward(self):
img_pretrained = None raise NotImplementedError
pts_pretrained = None
elif isinstance(pretrained, dict):
img_pretrained = pretrained.get('img', None)
pts_pretrained = pretrained.get('pts', None)
else:
raise ValueError(
f'pretrained should be a dict, got {type(pretrained)}')
if self.with_img_backbone:
if img_pretrained is not None:
warnings.warn('DeprecationWarning: pretrained is a deprecated '
'key, please consider using init_cfg.')
self.img_backbone.init_cfg = dict(
type='Pretrained', checkpoint=img_pretrained)
if self.with_img_roi_head:
if img_pretrained is not None:
warnings.warn('DeprecationWarning: pretrained is a deprecated '
'key, please consider using init_cfg.')
self.img_roi_head.init_cfg = dict(
type='Pretrained', checkpoint=img_pretrained)
if self.with_pts_backbone:
if img_pretrained is not None:
warnings.warn('DeprecationWarning: pretrained is a deprecated '
'key, please consider using init_cfg.')
self.pts_backbone.init_cfg = dict(
type='Pretrained', checkpoint=pts_pretrained)
def freeze_img_branch_params(self): def freeze_img_branch_params(self):
"""Freeze all image branch parameters.""" """Freeze all image branch parameters."""
...@@ -267,28 +250,14 @@ class ImVoteNet(Base3DDetector): ...@@ -267,28 +250,14 @@ class ImVoteNet(Base3DDetector):
"""Just to inherit from abstract method.""" """Just to inherit from abstract method."""
pass pass
def extract_img_feat(self, img): def extract_img_feat(self, img: Tensor) -> Sequence[Tensor]:
"""Directly extract features from the img backbone+neck.""" """Directly extract features from the img backbone+neck."""
x = self.img_backbone(img) x = self.img_backbone(img)
if self.with_img_neck: if self.with_img_neck:
x = self.img_neck(x) x = self.img_neck(x)
return x return x
def extract_img_feats(self, imgs): def extract_pts_feat(self, pts: Tensor) -> Tuple[Tensor]:
"""Extract features from multiple images.
Args:
imgs (list[torch.Tensor]): A list of images. The images are
augmented from the same image but in different ways.
Returns:
list[torch.Tensor]: Features of different images
"""
assert isinstance(imgs, list)
return [self.extract_img_feat(img) for img in imgs]
def extract_pts_feat(self, pts):
"""Extract features of points.""" """Extract features of points."""
x = self.pts_backbone(pts) x = self.pts_backbone(pts)
if self.with_pts_neck: if self.with_pts_neck:
...@@ -300,151 +269,93 @@ class ImVoteNet(Base3DDetector): ...@@ -300,151 +269,93 @@ class ImVoteNet(Base3DDetector):
return (seed_points, seed_features, seed_indices) return (seed_points, seed_features, seed_indices)
def extract_pts_feats(self, pts): def loss(self, batch_inputs_dict: Dict[str, Union[List, Tensor]],
"""Extract features of points from multiple samples.""" batch_data_samples: List[Det3DDataSample],
assert isinstance(pts, list) **kwargs) -> List[Det3DDataSample]:
return [self.extract_pts_feat(pt) for pt in pts]
@torch.no_grad()
def extract_bboxes_2d(self,
img,
img_metas,
train=True,
bboxes_2d=None,
**kwargs):
"""Extract bounding boxes from 2d detector.
Args:
img (torch.Tensor): of shape (N, C, H, W) encoding input images.
Typically these should be mean centered and std scaled.
img_metas (list[dict]): Image meta info.
train (bool): train-time or not.
bboxes_2d (list[torch.Tensor]): provided 2d bboxes,
not supported yet.
Return:
list[torch.Tensor]: a list of processed 2d bounding boxes.
""" """
if bboxes_2d is None:
x = self.extract_img_feat(img)
proposal_list = self.img_rpn_head.simple_test_rpn(x, img_metas)
rets = self.img_roi_head.simple_test(
x, proposal_list, img_metas, rescale=False)
rets_processed = []
for ret in rets:
tmp = np.concatenate(ret, axis=0)
sem_class = img.new_zeros((len(tmp)))
start = 0
for i, bboxes in enumerate(ret):
sem_class[start:start + len(bboxes)] = i
start += len(bboxes)
ret = img.new_tensor(tmp)
# append class index
ret = torch.cat([ret, sem_class[:, None]], dim=-1)
inds = torch.argsort(ret[:, 4], descending=True)
ret = ret.index_select(0, inds)
# drop half bboxes during training for better generalization
if train:
rand_drop = torch.randperm(len(ret))[:(len(ret) + 1) // 2]
rand_drop = torch.sort(rand_drop)[0]
ret = ret[rand_drop]
rets_processed.append(ret.float())
return rets_processed
else:
rets_processed = []
for ret in bboxes_2d:
if len(ret) > 0 and train:
rand_drop = torch.randperm(len(ret))[:(len(ret) + 1) // 2]
rand_drop = torch.sort(rand_drop)[0]
ret = ret[rand_drop]
rets_processed.append(ret.float())
return rets_processed
def forward_train(self,
points=None,
img=None,
img_metas=None,
gt_bboxes=None,
gt_labels=None,
gt_bboxes_ignore=None,
gt_masks=None,
proposals=None,
bboxes_2d=None,
gt_bboxes_3d=None,
gt_labels_3d=None,
pts_semantic_mask=None,
pts_instance_mask=None,
**kwargs):
"""Forwarding of train for image branch pretrain or stage 2 train.
Args: Args:
points (list[torch.Tensor]): Points of each batch. batch_inputs_dict (dict): The model input dict which includes
img (torch.Tensor): of shape (N, C, H, W) encoding input images. 'points' and 'imgs' keys.
Typically these should be mean centered and std scaled.
img_metas (list[dict]): list of image and point cloud meta info - points (list[torch.Tensor]): Point cloud of each sample.
dict. For example, keys include 'ori_shape', 'img_norm_cfg', - imgs (list[torch.Tensor]): Image tensor with shape
and 'transformation_3d_flow'. For details on the values of (N, C, H ,W).
the keys see `mmdet/datasets/pipelines/formatting.py:Collect`. batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
gt_bboxes (list[torch.Tensor]): Ground truth bboxes for each image Samples. It usually includes information such as
with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. `gt_instance_3d`.
gt_labels (list[torch.Tensor]): class indices for each
2d bounding box.
gt_bboxes_ignore (list[torch.Tensor]): specify which
2d bounding boxes can be ignored when computing the loss.
gt_masks (torch.Tensor): true segmentation masks for each
2d bbox, used if the architecture supports a segmentation task.
proposals: override rpn proposals (2d) with custom proposals.
Use when `with_rpn` is False.
bboxes_2d (list[torch.Tensor]): provided 2d bboxes,
not supported yet.
gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): 3d gt bboxes.
gt_labels_3d (list[torch.Tensor]): gt class labels for 3d bboxes.
pts_semantic_mask (list[torch.Tensor]): point-wise semantic
label of each batch.
pts_instance_mask (list[torch.Tensor]): point-wise instance
label of each batch.
Returns: Returns:
dict[str, torch.Tensor]: a dictionary of loss components. dict[str, Tensor]: A dictionary of loss components.
""" """
imgs = batch_inputs_dict.get('imgs', None)
points = batch_inputs_dict.get('points', None)
if points is None: if points is None:
x = self.extract_img_feat(img) x = self.extract_img_feat(imgs)
losses = dict() losses = dict()
# RPN forward and loss # RPN forward and loss
if self.with_img_rpn: if self.with_img_rpn:
proposal_cfg = self.train_cfg.get('img_rpn_proposal', proposal_cfg = self.train_cfg.get('img_rpn_proposal',
self.test_cfg.img_rpn) self.test_cfg.img_rpn)
rpn_losses, proposal_list = self.img_rpn_head.forward_train( rpn_data_samples = copy.deepcopy(batch_data_samples)
x, # set cat_id of gt_labels to 0 in RPN
img_metas, for data_sample in rpn_data_samples:
gt_bboxes, data_sample.gt_instances.labels = \
gt_labels=None, torch.zeros_like(data_sample.gt_instances.labels)
gt_bboxes_ignore=gt_bboxes_ignore,
proposal_cfg=proposal_cfg) rpn_losses, rpn_results_list = \
self.img_rpn_head.loss_and_predict(
x, rpn_data_samples,
proposal_cfg=proposal_cfg, **kwargs)
# avoid get same name with roi_head loss
keys = rpn_losses.keys()
for key in keys:
if 'loss' in key and 'rpn' not in key:
rpn_losses[f'rpn_{key}'] = rpn_losses.pop(key)
losses.update(rpn_losses) losses.update(rpn_losses)
else: else:
proposal_list = proposals assert batch_data_samples[0].get('proposals', None) is not None
# use pre-defined proposals in InstanceData for
# the second stage
# to extract ROI features.
rpn_results_list = [
data_sample.proposals for data_sample in batch_data_samples
]
roi_losses = self.img_roi_head.forward_train( roi_losses = self.img_roi_head.loss(x, rpn_results_list,
x, img_metas, proposal_list, gt_bboxes, gt_labels, batch_data_samples, **kwargs)
gt_bboxes_ignore, gt_masks, **kwargs)
losses.update(roi_losses) losses.update(roi_losses)
return losses return losses
else: else:
bboxes_2d = self.extract_bboxes_2d( with torch.no_grad():
img, img_metas, bboxes_2d=bboxes_2d, **kwargs) results_2d = self.predict_img_only(
batch_inputs_dict['imgs'],
batch_data_samples,
rescale=False)
# tensor with shape (n, 6), the 6 values arranged
# as [x1, y1, x2, y2, score, label]
pred_bboxes_with_label_list = []
for single_results in results_2d:
cat_preds = torch.cat(
(single_results.bboxes, single_results.scores[:, None],
single_results.labels[:, None]),
dim=-1)
cat_preds = cat_preds[torch.argsort(
cat_preds[:, 4], descending=True)]
# drop half bboxes during training for better generalization
if self.training:
rand_drop = torch.randperm(
len(cat_preds))[:(len(cat_preds) + 1) // 2]
rand_drop = torch.sort(rand_drop)[0]
cat_preds = cat_preds[rand_drop]
points = torch.stack(points) pred_bboxes_with_label_list.append(cat_preds)
seeds_3d, seed_3d_features, seed_indices = \
self.extract_pts_feat(points)
img_features, masks = self.fusion_layer(img, bboxes_2d, seeds_3d, stack_points = torch.stack(points)
img_metas) seeds_3d, seed_3d_features, seed_indices = \
self.extract_pts_feat(stack_points)
img_metas = [item.metainfo for item in batch_data_samples]
img_features, masks = self.fusion_layer(
imgs, pred_bboxes_with_label_list, seeds_3d, img_metas)
inds = sample_valid_seeds(masks, self.num_sampled_seed) inds = sample_valid_seeds(masks, self.num_sampled_seed)
batch_size, img_feat_size = img_features.shape[:2] batch_size, img_feat_size = img_features.shape[:2]
...@@ -476,27 +387,13 @@ class ImVoteNet(Base3DDetector): ...@@ -476,27 +387,13 @@ class ImVoteNet(Base3DDetector):
seed_features=img_features, seed_features=img_features,
seed_indices=seed_indices) seed_indices=seed_indices)
loss_inputs = (points, gt_bboxes_3d, gt_labels_3d,
pts_semantic_mask, pts_instance_mask, img_metas)
bbox_preds_joints = self.pts_bbox_head_joint(
feat_dict_joint, self.train_cfg.pts.sample_mod)
bbox_preds_pts = self.pts_bbox_head_pts(
feat_dict_pts, self.train_cfg.pts.sample_mod)
bbox_preds_img = self.pts_bbox_head_img(
feat_dict_img, self.train_cfg.pts.sample_mod)
losses_towers = [] losses_towers = []
losses_joint = self.pts_bbox_head_joint.loss( losses_joint = self.pts_bbox_head_joint.loss(
bbox_preds_joints, points, feat_dict_joint, batch_data_samples)
*loss_inputs, losses_pts = self.pts_bbox_head_pts.loss(points, feat_dict_pts,
gt_bboxes_ignore=gt_bboxes_ignore) batch_data_samples)
losses_pts = self.pts_bbox_head_pts.loss( losses_img = self.pts_bbox_head_img.loss(points, feat_dict_img,
bbox_preds_pts, batch_data_samples)
*loss_inputs,
gt_bboxes_ignore=gt_bboxes_ignore)
losses_img = self.pts_bbox_head_img.loss(
bbox_preds_img,
*loss_inputs,
gt_bboxes_ignore=gt_bboxes_ignore)
losses_towers.append(losses_joint) losses_towers.append(losses_joint)
losses_towers.append(losses_pts) losses_towers.append(losses_pts)
losses_towers.append(losses_img) losses_towers.append(losses_img)
...@@ -516,162 +413,56 @@ class ImVoteNet(Base3DDetector): ...@@ -516,162 +413,56 @@ class ImVoteNet(Base3DDetector):
return combined_losses return combined_losses
def forward_test(self, def predict(self, batch_inputs_dict: Dict[str, Optional[Tensor]],
points=None, batch_data_samples: List[Det3DDataSample],
img_metas=None, **kwargs) -> List[Det3DDataSample]:
img=None, """Forward of testing.
bboxes_2d=None,
**kwargs):
"""Forwarding of test for image branch pretrain or stage 2 train.
Args: Args:
points (list[list[torch.Tensor]], optional): the outer batch_inputs_dict (dict): The model input dict which includes
list indicates test-time augmentations and the inner 'points' and 'imgs' keys.
list contains all points in the batch, where each Tensor
should have a shape NxC. Defaults to None. - points (list[torch.Tensor]): Point cloud of each sample.
img_metas (list[list[dict]], optional): the outer list - imgs (list[torch.Tensor]): Tensor of Images.
indicates test-time augs (multiscale, flip, etc.) batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
and the inner list indicates images in a batch. Samples. It usually includes information such as
Defaults to None. `gt_instance_3d`.
img (list[list[torch.Tensor]], optional): the outer
list indicates test-time augmentations and inner Tensor
should have a shape NxCxHxW, which contains all images
in the batch. Defaults to None. Defaults to None.
bboxes_2d (list[list[torch.Tensor]], optional):
Provided 2d bboxes, not supported yet. Defaults to None.
Returns:
list[list[torch.Tensor]]|list[dict]: Predicted 2d or 3d boxes.
""" """
points = batch_inputs_dict.get('points', None)
imgs = batch_inputs_dict.get('imgs', None)
if points is None: if points is None:
for var, name in [(img, 'img'), (img_metas, 'img_metas')]: assert imgs is not None
if not isinstance(var, list): results_2d = self.predict_img_only(imgs, batch_data_samples)
raise TypeError( return self.convert_to_datasample(results_list_2d=results_2d)
f'{name} must be a list, but got {type(var)}')
num_augs = len(img)
if num_augs != len(img_metas):
raise ValueError(f'num of augmentations ({len(img)}) '
f'!= num of image meta ({len(img_metas)})')
if num_augs == 1:
# proposals (List[List[Tensor]]): the outer list indicates
# test-time augs (multiscale, flip, etc.) and the inner list
# indicates images in a batch.
# The Tensor should have a shape Px4, where P is the number of
# proposals.
if 'proposals' in kwargs:
kwargs['proposals'] = kwargs['proposals'][0]
return self.simple_test_img_only(
img=img[0], img_metas=img_metas[0], **kwargs)
else:
assert img[0].size(0) == 1, 'aug test does not support ' \
'inference with batch size ' \
f'{img[0].size(0)}'
# TODO: support test augmentation for predefined proposals
assert 'proposals' not in kwargs
return self.aug_test_img_only(
img=img, img_metas=img_metas, **kwargs)
else: else:
for var, name in [(points, 'points'), (img_metas, 'img_metas')]: results_2d = self.predict_img_only(
if not isinstance(var, list): batch_inputs_dict['imgs'], batch_data_samples, rescale=False)
raise TypeError('{} must be a list, but got {}'.format( # tensor with shape (n, 6), the 6 values arranged
name, type(var))) # as [x1, y1, x2, y2, score, label]
pred_bboxes_with_label_list = []
num_augs = len(points) for single_results in results_2d:
if num_augs != len(img_metas): cat_preds = torch.cat(
raise ValueError( (single_results.bboxes, single_results.scores[:, None],
'num of augmentations ({}) != num of image meta ({})'. single_results.labels[:, None]),
format(len(points), len(img_metas))) dim=-1)
cat_preds = cat_preds[torch.argsort(
if num_augs == 1: cat_preds[:, 4], descending=True)]
return self.simple_test( pred_bboxes_with_label_list.append(cat_preds)
points[0],
img_metas[0], stack_points = torch.stack(points)
img[0],
bboxes_2d=bboxes_2d[0] if bboxes_2d is not None else None,
**kwargs)
else:
return self.aug_test(points, img_metas, img, bboxes_2d,
**kwargs)
def simple_test_img_only(self,
img,
img_metas,
proposals=None,
rescale=False):
r"""Test without augmentation, image network pretrain. May refer to
`<https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/two_stage.py>`_.
Args:
img (torch.Tensor): Should have a shape NxCxHxW, which contains
all images in the batch.
img_metas (list[dict]):
proposals (list[Tensor], optional): override rpn proposals
with custom proposals. Defaults to None.
rescale (bool, optional): Whether or not rescale bboxes to the
original shape of input image. Defaults to False.
Returns:
list[list[torch.Tensor]]: Predicted 2d boxes.
""" # noqa: E501
assert self.with_img_bbox, 'Img bbox head must be implemented.'
assert self.with_img_backbone, 'Img backbone must be implemented.'
assert self.with_img_rpn, 'Img rpn must be implemented.'
assert self.with_img_roi_head, 'Img roi head must be implemented.'
x = self.extract_img_feat(img)
if proposals is None:
proposal_list = self.img_rpn_head.simple_test_rpn(x, img_metas)
else:
proposal_list = proposals
ret = self.img_roi_head.simple_test(
x, proposal_list, img_metas, rescale=rescale)
return ret
def simple_test(self,
points=None,
img_metas=None,
img=None,
bboxes_2d=None,
rescale=False,
**kwargs):
"""Test without augmentation, stage 2.
Args:
points (list[torch.Tensor], optional): Elements in the list
should have a shape NxC, the list indicates all point-clouds
in the batch. Defaults to None.
img_metas (list[dict], optional): List indicates
images in a batch. Defaults to None.
img (torch.Tensor, optional): Should have a shape NxCxHxW,
which contains all images in the batch. Defaults to None.
bboxes_2d (list[torch.Tensor], optional):
Provided 2d bboxes, not supported yet. Defaults to None.
rescale (bool, optional): Whether or not rescale bboxes.
Defaults to False.
Returns:
list[dict]: Predicted 3d boxes.
"""
bboxes_2d = self.extract_bboxes_2d(
img, img_metas, train=False, bboxes_2d=bboxes_2d, **kwargs)
points = torch.stack(points)
seeds_3d, seed_3d_features, seed_indices = \ seeds_3d, seed_3d_features, seed_indices = \
self.extract_pts_feat(points) self.extract_pts_feat(stack_points)
img_features, masks = self.fusion_layer(img, bboxes_2d, seeds_3d, img_features, masks = self.fusion_layer(
img_metas) imgs, pred_bboxes_with_label_list, seeds_3d,
[item.metainfo for item in batch_data_samples])
inds = sample_valid_seeds(masks, self.num_sampled_seed) inds = sample_valid_seeds(masks, self.num_sampled_seed)
batch_size, img_feat_size = img_features.shape[:2] batch_size, img_feat_size = img_features.shape[:2]
pts_feat_size = seed_3d_features.shape[1] pts_feat_size = seed_3d_features.shape[1]
inds_img = inds.view(batch_size, 1, -1).expand(-1, img_feat_size, -1) inds_img = inds.view(batch_size, 1,
-1).expand(-1, img_feat_size, -1)
img_features = img_features.gather(-1, inds_img) img_features = img_features.gather(-1, inds_img)
inds = inds % inds.shape[1] inds = inds % inds.shape[1]
inds_seed_xyz = inds.view(batch_size, -1, 1).expand(-1, -1, 3) inds_seed_xyz = inds.view(batch_size, -1, 1).expand(-1, -1, 3)
...@@ -689,130 +480,57 @@ class ImVoteNet(Base3DDetector): ...@@ -689,130 +480,57 @@ class ImVoteNet(Base3DDetector):
seed_points=seeds_3d, seed_points=seeds_3d,
seed_features=fused_features, seed_features=fused_features,
seed_indices=seed_indices) seed_indices=seed_indices)
bbox_preds = self.pts_bbox_head_joint(feat_dict,
self.test_cfg.pts.sample_mod)
bbox_list = self.pts_bbox_head_joint.get_bboxes(
points, bbox_preds, img_metas, rescale=rescale)
bbox_results = [
bbox3d2result(bboxes, scores, labels)
for bboxes, scores, labels in bbox_list
]
return bbox_results
def aug_test_img_only(self, img, img_metas, rescale=False): results_3d = self.pts_bbox_head_joint.predict(
r"""Test function with augmentation, image network pretrain. May refer batch_inputs_dict['points'],
to `<https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/two_stage.py>`_. feat_dict,
batch_data_samples,
rescale=True)
Args: return self.convert_to_datasample(results_3d)
img (list[list[torch.Tensor]], optional): the outer
list indicates test-time augmentations and inner Tensor
should have a shape NxCxHxW, which contains all images
in the batch. Defaults to None. Defaults to None.
img_metas (list[list[dict]], optional): the outer list
indicates test-time augs (multiscale, flip, etc.)
and the inner list indicates images in a batch.
Defaults to None.
rescale (bool, optional): Whether or not rescale bboxes to the
original shape of input image. If rescale is False, then
returned bboxes and masks will fit the scale of imgs[0].
Defaults to None.
Returns: def predict_img_only(self,
list[list[torch.Tensor]]: Predicted 2d boxes. imgs: Tensor,
""" # noqa: E501 batch_data_samples: List[Det3DDataSample],
assert self.with_img_bbox, 'Img bbox head must be implemented.' rescale: bool = True) -> List[InstanceData]:
assert self.with_img_backbone, 'Img backbone must be implemented.' """Predict results from a batch of imgs with post- processing.
assert self.with_img_rpn, 'Img rpn must be implemented.'
assert self.with_img_roi_head, 'Img roi head must be implemented.'
x = self.extract_img_feats(img)
proposal_list = self.img_rpn_head.aug_test_rpn(x, img_metas)
return self.img_roi_head.aug_test(
x, proposal_list, img_metas, rescale=rescale)
def aug_test(self,
points=None,
img_metas=None,
imgs=None,
bboxes_2d=None,
rescale=False,
**kwargs):
"""Test function with augmentation, stage 2.
Args: Args:
points (list[list[torch.Tensor]], optional): the outer imgs (Tensor): Inputs images with shape (N, C, H, W).
list indicates test-time augmentations and the inner batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
list contains all points in the batch, where each Tensor Samples. It usually includes information such as
should have a shape NxC. Defaults to None. `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
img_metas (list[list[dict]], optional): the outer list rescale (bool): Whether to rescale the results.
indicates test-time augs (multiscale, flip, etc.) Defaults to True.
and the inner list indicates images in a batch.
Defaults to None.
imgs (list[list[torch.Tensor]], optional): the outer
list indicates test-time augmentations and inner Tensor
should have a shape NxCxHxW, which contains all images
in the batch. Defaults to None. Defaults to None.
bboxes_2d (list[list[torch.Tensor]], optional):
Provided 2d bboxes, not supported yet. Defaults to None.
rescale (bool, optional): Whether or not rescale bboxes.
Defaults to False.
Returns: Returns:
list[dict]: Predicted 3d boxes. list[:obj:`InstanceData`]: Return the list of detection
results of the input images, usually contains following keys.
- scores (Tensor): Classification scores, has a shape
(num_instance, )
- labels (Tensor): Labels of bboxes, has a shape
(num_instances, ).
- bboxes (Tensor): Has a shape (num_instances, 4),
the last dimension 4 arrange as (x1, y1, x2, y2).
""" """
points_cat = [torch.stack(pts) for pts in points]
feats = self.extract_pts_feats(points_cat, img_metas)
# only support aug_test for one sample
aug_bboxes = []
for x, pts_cat, img_meta, bbox_2d, img in zip(feats, points_cat,
img_metas, bboxes_2d,
imgs):
bbox_2d = self.extract_bboxes_2d(
img, img_metas, train=False, bboxes_2d=bbox_2d, **kwargs)
seeds_3d, seed_3d_features, seed_indices = x
img_features, masks = self.fusion_layer(img, bbox_2d, seeds_3d,
img_metas)
inds = sample_valid_seeds(masks, self.num_sampled_seed)
batch_size, img_feat_size = img_features.shape[:2]
pts_feat_size = seed_3d_features.shape[1]
inds_img = inds.view(batch_size, 1,
-1).expand(-1, img_feat_size, -1)
img_features = img_features.gather(-1, inds_img)
inds = inds % inds.shape[1]
inds_seed_xyz = inds.view(batch_size, -1, 1).expand(-1, -1, 3)
seeds_3d = seeds_3d.gather(1, inds_seed_xyz)
inds_seed_feats = inds.view(batch_size, 1,
-1).expand(-1, pts_feat_size, -1)
seed_3d_features = seed_3d_features.gather(-1, inds_seed_feats)
seed_indices = seed_indices.gather(1, inds)
img_features = self.img_mlp(img_features) assert self.with_img_bbox, 'Img bbox head must be implemented.'
assert self.with_img_backbone, 'Img backbone must be implemented.'
fused_features = torch.cat([seed_3d_features, img_features], dim=1) assert self.with_img_rpn, 'Img rpn must be implemented.'
assert self.with_img_roi_head, 'Img roi head must be implemented.'
x = self.extract_img_feat(imgs)
feat_dict = dict( # If there are no pre-defined proposals, use RPN to get proposals
seed_points=seeds_3d, if batch_data_samples[0].get('proposals', None) is None:
seed_features=fused_features, rpn_results_list = self.img_rpn_head.predict(
seed_indices=seed_indices) x, batch_data_samples, rescale=False)
bbox_preds = self.pts_bbox_head_joint(feat_dict, else:
self.test_cfg.pts.sample_mod) rpn_results_list = [
bbox_list = self.pts_bbox_head_joint.get_bboxes( data_sample.proposals for data_sample in batch_data_samples
pts_cat, bbox_preds, img_metas, rescale=rescale)
bbox_list = [
dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels)
for bboxes, scores, labels in bbox_list
] ]
aug_bboxes.append(bbox_list[0])
# after merging, bboxes will be rescaled to the original image size results_list = self.img_roi_head.predict(
merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, img_metas, x, rpn_results_list, batch_data_samples, rescale=rescale)
self.bbox_head.test_cfg)
return [merged_bboxes] return results_list
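For reference, a minimal, self-contained sketch (dummy boxes and scores, not taken from the commit) of the (n, 6) packing that `predict` applies to each 2D detection result before handing it to the fusion layer: boxes, scores and labels are concatenated column-wise and the rows are sorted by score in descending order.

import torch
from mmengine.structures import InstanceData

# Dummy 2D detections standing in for one element of `results_2d`.
single_results = InstanceData()
single_results.bboxes = torch.tensor([[0., 0., 10., 10.], [5., 5., 20., 20.]])
single_results.scores = torch.tensor([0.3, 0.9])
single_results.labels = torch.tensor([2., 7.])

# Concatenate to [x1, y1, x2, y2, score, label] and sort by score (column 4).
cat_preds = torch.cat(
    (single_results.bboxes, single_results.scores[:, None],
     single_results.labels[:, None]), dim=-1)
cat_preds = cat_preds[torch.argsort(cat_preds[:, 4], descending=True)]
print(cat_preds[0])  # the 0.9-score box comes first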
...@@ -105,8 +105,8 @@ def extract_2d_info(img_meta, tensor):
    """
    img_shape = img_meta['img_shape']
    ori_shape = img_meta['ori_shape']
    img_h, img_w = img_shape
    ori_h, ori_w = ori_shape
    img_scale_factor = (
        tensor.new_tensor(img_meta['scale_factor'][:2])
...
...@@ -45,8 +45,6 @@ class VoteFusion(nn.Module):
            seed_num = seed_3d_depth.shape[0]
            img_shape = img_meta['img_shape']

            # first reverse the data transformations
            xyz_depth = apply_3d_transformation(
                seed_3d_depth, 'DEPTH', img_meta, reverse=True)
...
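Both hunks above reflect that `img_shape` in the refactored metainfo is an `(h, w)` tuple rather than `(h, w, c)`, so the channel entry is no longer unpacked. A tiny, hypothetical helper (not part of the commit) for code that must tolerate either convention:

def get_hw(img_shape):
    """Return (height, width) from either an (h, w) or an (h, w, c) shape."""
    return img_shape[0], img_shape[1]

assert get_hw((480, 640)) == (480, 640)
assert get_hw((480, 640, 3)) == (480, 640)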
import unittest

import torch
from mmengine import DefaultScope

from mmdet3d.registry import MODELS
from tests.utils.model_utils import (_create_detector_inputs,
                                     _get_detector_cfg, _setup_seed)


class TestH3D(unittest.TestCase):

    def test_h3dnet(self):
        import mmdet3d.models
        assert hasattr(mmdet3d.models, 'H3DNet')
        DefaultScope.get_instance('test_H3DNet', scope_name='mmdet3d')
        _setup_seed(0)
        voxel_net_cfg = _get_detector_cfg(
            'h3dnet/h3dnet_3x8_scannet-3d-18class.py')
        model = MODELS.build(voxel_net_cfg)
        num_gt_instance = 5
        data = [
            _create_detector_inputs(
                num_gt_instance=num_gt_instance,
                points_feat_dim=4,
                bboxes_3d_type='depth',
                with_pts_semantic_mask=True,
                with_pts_instance_mask=True)
        ]

        if torch.cuda.is_available():
            model = model.cuda()
            # test simple_test
            with torch.no_grad():
                batch_inputs, data_samples = model.data_preprocessor(
                    data, True)
                results = model.forward(
                    batch_inputs, data_samples, mode='predict')
            self.assertEqual(len(results), len(data))
            self.assertIn('bboxes_3d', results[0].pred_instances_3d)
            self.assertIn('scores_3d', results[0].pred_instances_3d)
            self.assertIn('labels_3d', results[0].pred_instances_3d)

            # save the memory
            with torch.no_grad():
                losses = model.forward(batch_inputs, data_samples, mode='loss')
                self.assertGreater(losses['vote_loss'], 0)
                self.assertGreater(losses['objectness_loss'], 0)
                self.assertGreater(losses['center_loss'], 0)
import unittest

import torch
from mmengine import DefaultScope

from mmdet3d.registry import MODELS
from tests.utils.model_utils import (_create_detector_inputs,
                                     _get_detector_cfg, _setup_seed)


class TestImvoteNet(unittest.TestCase):

    def test_imvotenet_only_img(self):
        import mmdet3d.models
        assert hasattr(mmdet3d.models, 'ImVoteNet')
        DefaultScope.get_instance('test_imvotenet_img', scope_name='mmdet3d')
        _setup_seed(0)
        votenet_net_cfg = _get_detector_cfg(
            'imvotenet/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class.py'
        )
        model = MODELS.build(votenet_net_cfg)
        data = [
            _create_detector_inputs(
                with_points=False, with_img=True, img_size=128)
        ]

        if torch.cuda.is_available():
            model = model.cuda()
            # test simple_test
            with torch.no_grad():
                batch_inputs, data_samples = model.data_preprocessor(
                    data, True)
                results = model.forward(
                    batch_inputs, data_samples, mode='predict')
            self.assertEqual(len(results), len(data))
            self.assertIn('bboxes', results[0].pred_instances)
            self.assertIn('scores', results[0].pred_instances)
            self.assertIn('labels', results[0].pred_instances)

            # save the memory
            with torch.no_grad():
                losses = model.forward(batch_inputs, data_samples, mode='loss')
                self.assertGreater(sum(losses['loss_rpn_cls']), 0)
                self.assertGreater(losses['loss_cls'], 0)
                self.assertGreater(losses['loss_bbox'], 0)

    def test_imvotenet(self):
        import mmdet3d.models
        assert hasattr(mmdet3d.models, 'ImVoteNet')
        DefaultScope.get_instance('test_imvotenet', scope_name='mmdet3d')
        _setup_seed(0)
        votenet_net_cfg = _get_detector_cfg(
            'imvotenet/imvotenet_stage2_16x8_sunrgbd-3d-10class.py')
        model = MODELS.build(votenet_net_cfg)
        data = [
            _create_detector_inputs(
                with_points=True,
                with_img=True,
                img_size=128,
                bboxes_3d_type='depth')
        ]

        if torch.cuda.is_available():
            model = model.cuda()
            # test simple_test
            with torch.no_grad():
                batch_inputs, data_samples = model.data_preprocessor(
                    data, True)
                results = model.forward(
                    batch_inputs, data_samples, mode='predict')
            self.assertEqual(len(results), len(data))
            self.assertIn('bboxes_3d', results[0].pred_instances_3d)
            self.assertIn('scores_3d', results[0].pred_instances_3d)
            self.assertIn('labels_3d', results[0].pred_instances_3d)

            # save the memory
            with torch.no_grad():
                losses = model.forward(batch_inputs, data_samples, mode='loss')
                self.assertGreater(losses['vote_loss'], 0)
                self.assertGreater(losses['objectness_loss'], 0)
                self.assertGreater(losses['semantic_loss'], 0)
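The assertions above only check that the prediction fields exist. As a small standalone sketch (dummy tensors, hypothetical score threshold, not part of the tests) of how such an instance container is typically filtered afterwards, mmengine's InstanceData supports boolean indexing:

import torch
from mmengine.structures import InstanceData

pred = InstanceData()
pred.scores_3d = torch.tensor([0.9, 0.2, 0.6])
pred.labels_3d = torch.tensor([0, 3, 7])
pred.bboxes_3d = torch.rand(3, 7)  # plain tensor stand-in for real 3D boxes

keep = pred.scores_3d > 0.5  # hypothetical threshold
filtered = pred[keep]
print(filtered.labels_3d)  # tensor([0, 7])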
...@@ -75,6 +75,7 @@ def _get_detector_cfg(fname):
def _create_detector_inputs(seed=0,
                            with_points=True,
                            with_img=False,
                            img_size=10,
                            num_gt_instance=20,
                            num_points=10,
                            points_feat_dim=4,
...@@ -90,23 +91,46 @@ def _create_detector_inputs(seed=0,
        'depth': DepthInstance3DBoxes,
        'cam': CameraInstance3DBoxes
    }
    meta_info = dict()
    meta_info['depth2img'] = np.array(
        [[5.23289349e+02, 3.68831943e+02, 6.10469439e+01],
         [1.09560138e+02, 1.97404735e+02, -5.47377738e+02],
         [1.25930002e-02, 9.92229998e-01, -1.23769999e-01]])
    meta_info['lidar2img'] = np.array(
        [[5.23289349e+02, 3.68831943e+02, 6.10469439e+01],
         [1.09560138e+02, 1.97404735e+02, -5.47377738e+02],
         [1.25930002e-02, 9.92229998e-01, -1.23769999e-01]])
    if with_points:
        points = torch.rand([num_points, points_feat_dim])
    else:
        points = None
    if with_img:
        img = torch.rand(3, img_size, img_size)
        meta_info['img_shape'] = (img_size, img_size)
        meta_info['ori_shape'] = (img_size, img_size)
        meta_info['scale_factor'] = np.array([1., 1.])
    else:
        img = None
    inputs_dict = dict(img=img, points=points)
    gt_instance_3d = InstanceData()
    gt_instance_3d.bboxes_3d = bbox_3d_class[bboxes_3d_type](
        torch.rand([num_gt_instance, gt_bboxes_dim]), box_dim=gt_bboxes_dim)
    gt_instance_3d.labels_3d = torch.randint(0, num_classes, [num_gt_instance])
    data_sample = Det3DDataSample(
        metainfo=dict(box_type_3d=bbox_3d_class[bboxes_3d_type]))
    data_sample.set_metainfo(meta_info)
    data_sample.gt_instances_3d = gt_instance_3d
    gt_instance = InstanceData()
    gt_instance.labels = torch.randint(0, num_classes, [num_gt_instance])
    gt_instance.bboxes = torch.rand(num_gt_instance, 4)
    gt_instance.bboxes[:, 2:] = gt_instance.bboxes[:, :2] + \
        gt_instance.bboxes[:, 2:]
    data_sample.gt_instances = gt_instance
    data_sample.gt_pts_seg = PointData()
    if with_pts_instance_mask:
        pts_instance_mask = torch.randint(0, num_gt_instance, [num_points])
...
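The added 2D ground truth uses a small trick to guarantee well-formed boxes: the last two random coordinates are treated as offsets and added to the first two, so x2 >= x1 and y2 >= y1 always hold. A standalone sketch of the same idea (dummy sizes, not from the test utilities):

import torch

num_boxes = 4
bboxes = torch.rand(num_boxes, 4)              # columns: x1, y1, then offsets
bboxes[:, 2:] = bboxes[:, :2] + bboxes[:, 2:]  # columns now: x1, y1, x2, y2
assert (bboxes[:, 2:] >= bboxes[:, :2]).all()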
...@@ -652,6 +652,9 @@ def update_sunrgbd_infos(pkl_path, out_dir):
        temp_data_info['images']['CAM0']['width'] = w
        anns = ori_info_dict['annos']
        if anns['gt_num'] == 0:
            instance_list = []
        else:
            num_instances = len(anns['name'])
            ignore_class_name = set()
            instance_list = []
...@@ -659,12 +662,16 @@ def update_sunrgbd_infos(pkl_path, out_dir):
                empty_instance = get_empty_instance()
                empty_instance['bbox_3d'] = anns['gt_boxes_upright_depth'][
                    instance_id].tolist()
                empty_instance['bbox'] = anns['bbox'][instance_id].tolist()
                if anns['name'][instance_id] in METAINFO['CLASSES']:
                    empty_instance['bbox_label_3d'] = METAINFO[
                        'CLASSES'].index(anns['name'][instance_id])
                    empty_instance['bbox_label'] = empty_instance[
                        'bbox_label_3d']
                else:
                    ignore_class_name.add(anns['name'][instance_id])
                    empty_instance['bbox_label_3d'] = -1
                    empty_instance['bbox_label'] = -1
                empty_instance = clear_instance_unused_keys(empty_instance)
                instance_list.append(empty_instance)
        temp_data_info['instances'] = instance_list
...
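update_sunrgbd_infos now assigns a matching 2D label alongside the 3D one and marks categories outside the class list with -1. A minimal sketch of that mapping with a dummy class tuple (the real METAINFO comes from the converter module):

CLASSES = ('bed', 'table', 'chair')  # dummy subset for illustration

def map_label(name):
    # Names outside CLASSES are ignored and get label -1, as in the converter.
    return CLASSES.index(name) if name in CLASSES else -1

assert map_label('table') == 1
assert map_label('piano') == -1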