"vscode:/vscode.git/clone" did not exist on "407df37dfb109b8e2410e9afb0b72615aa4d3b07"
Commit f63a62b8 authored by zhangshilong, committed by ChaimZhu

[Refactor] ImVoteNet

parent edb6b369
......@@ -2,6 +2,9 @@ dataset_type = 'SUNRGBDDataset'
data_root = 'data/sunrgbd/'
class_names = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser',
'night_stand', 'bookshelf', 'bathtub')
metainfo = dict(CLASSES=class_names)
train_pipeline = [
dict(
type='LoadPointsFromFile',
......@@ -21,8 +24,9 @@ train_pipeline = [
scale_ratio_range=[0.85, 1.15],
shift_height=True),
dict(type='PointSample', num_points=20000),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
dict(
type='Pack3DDetInputs',
keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(
......@@ -47,61 +51,52 @@ test_pipeline = [
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
),
dict(type='PointSample', num_points=20000),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
# construct a pipeline for data and gt loading in show function
# please keep its loading function consistent with test_pipeline (e.g. client)
eval_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='DEPTH',
shift_height=False,
load_dim=6,
use_dim=[0, 1, 2]),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
dict(type='PointSample', num_points=20000)
]),
dict(type='Pack3DDetInputs', keys=['points'])
]
data = dict(
samples_per_gpu=16,
workers_per_gpu=4,
train=dict(
train_dataloader = dict(
batch_size=16,
num_workers=4,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='RepeatDataset',
times=5,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'sunrgbd_infos_train.pkl',
ann_file='sunrgbd_infos_train.pkl',
pipeline=train_pipeline,
classes=class_names,
filter_empty_gt=False,
metainfo=metainfo,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='Depth')),
val=dict(
box_type_3d='Depth')))
val_dataloader = dict(
batch_size=1,
num_workers=1,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'sunrgbd_infos_val.pkl',
ann_file='sunrgbd_infos_val.pkl',
pipeline=test_pipeline,
classes=class_names,
metainfo=metainfo,
test_mode=True,
box_type_3d='Depth'),
test=dict(
box_type_3d='Depth'))
test_dataloader = dict(
batch_size=1,
num_workers=1,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'sunrgbd_infos_val.pkl',
ann_file='sunrgbd_infos_val.pkl',
pipeline=test_pipeline,
classes=class_names,
metainfo=metainfo,
test_mode=True,
box_type_3d='Depth'))
evaluation = dict(pipeline=eval_pipeline)
val_evaluator = dict(type='IndoorMetric')
test_evaluator = val_evaluator
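For orientation, a minimal sketch of loading the refactored dataset settings with MMEngine (the config path below is an assumption; the printed values are the ones configured above):
from mmengine.config import Config
cfg = Config.fromfile('configs/_base_/datasets/sunrgbd-3d-10class.py')  # assumed path
print(cfg.train_dataloader.batch_size)  # 16, as configured above
print(cfg.val_evaluator)                # {'type': 'IndoorMetric'}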
model = dict(
type='ImVoteNet',
data_preprocessor=dict(
type='Det3DDataPreprocessor',
# use caffe img_norm
mean=[103.530, 116.280, 123.675],
std=[1.0, 1.0, 1.0],
bgr_to_rgb=False,
pad_size_divisor=32),
img_backbone=dict(
type='ResNet',
type='mmdet.ResNet',
depth=50,
num_stages=4,
out_indices=(0, 1, 2, 3),
......@@ -10,11 +17,12 @@ model = dict(
norm_eval=True,
style='caffe'),
img_neck=dict(
type='FPN',
type='mmdet.FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=5),
img_rpn_head=dict(
_scope_='mmdet',
type='RPNHead',
in_channels=256,
feat_channels=256,
......@@ -31,6 +39,7 @@ model = dict(
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
img_roi_head=dict(
_scope_='mmdet',
type='StandardRoIHead',
bbox_roi_extractor=dict(
type='SingleRoIExtractor',
......@@ -56,6 +65,7 @@ model = dict(
train_cfg=dict(
img_rpn=dict(
assigner=dict(
_scope_='mmdet',
type='MaxIoUAssigner',
pos_iou_thr=0.7,
neg_iou_thr=0.3,
......@@ -63,7 +73,7 @@ model = dict(
match_low_quality=True,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
type='mmdet.RandomSampler',
num=256,
pos_fraction=0.5,
neg_pos_ub=-1,
......@@ -80,6 +90,7 @@ model = dict(
min_bbox_size=0),
img_rcnn=dict(
assigner=dict(
_scope_='mmdet',
type='MaxIoUAssigner',
pos_iou_thr=0.5,
neg_iou_thr=0.5,
......@@ -87,7 +98,7 @@ model = dict(
match_low_quality=False,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
type='mmdet.RandomSampler',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
......
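The model settings above now reference mmdet components through scoped type names ('mmdet.ResNet', 'mmdet.FPN') and `_scope_='mmdet'` keys. A minimal sketch of instantiating such a config, following the pattern of the unit tests further below (the config path is assumed):
from mmengine import DefaultScope
from mmengine.config import Config
from mmdet3d.registry import MODELS
DefaultScope.get_instance('demo', scope_name='mmdet3d')  # default registry scope, as in the tests below
cfg = Config.fromfile('configs/_base_/models/imvotenet_image.py')  # assumed path
model = MODELS.build(cfg.model)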
......@@ -3,56 +3,71 @@ _base_ = [
'../_base_/models/imvotenet_image.py'
]
# use caffe img_norm
img_norm_cfg = dict(
mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', with_bbox=True),
dict(
type='Resize',
img_scale=[(1333, 480), (1333, 504), (1333, 528), (1333, 552),
(1333, 576), (1333, 600)],
multiscale_mode='value',
type='LoadAnnotations3D',
with_bbox=True,
with_label=True,
with_bbox_3d=False,
with_label_3d=False),
dict(
type='RandomChoiceResize',
scales=[(1333, 480), (1333, 504), (1333, 528), (1333, 552),
(1333, 576), (1333, 600)],
keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
dict(type='RandomFlip', prob=0.5),
dict(
type='Pack3DDetInputs', keys=['img', 'gt_bboxes', 'gt_bboxes_labels']),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
# online evaluation
dict(
type='LoadAnnotations3D',
with_bbox=True,
with_label=True,
with_bbox_3d=False,
with_label_3d=False),
dict(type='Resize', scale=(1333, 600), keep_ratio=True),
dict(
type='MultiScaleFlipAug',
img_scale=(1333, 600),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img']),
])
type='Pack3DDetInputs',
keys=(['img']),
meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
'scale_factor'))
]
train_dataloader = dict(
batch_size=2,
num_workers=2,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='RepeatDataset', times=1, dataset=dict(pipeline=train_pipeline)))
data = dict(
samples_per_gpu=2,
workers_per_gpu=2,
train=dict(times=1, dataset=dict(pipeline=train_pipeline)),
val=dict(pipeline=test_pipeline),
test=dict(pipeline=test_pipeline))
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=8, val_interval=1)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
# learning rate
param_scheduler = [
dict(
type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
dict(
type='MultiStepLR',
begin=0,
end=8,
by_epoch=True,
milestones=[6],
gamma=0.1)
]
val_evaluator = dict(type='Indoor2DMetric')
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=None)
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[6])
runner = dict(type='EpochBasedRunner', max_epochs=8)
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
load_from = 'http://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_bbox_mAP-0.408__segm_mAP-0.37_20200504_163245-42aa3d00.pth' # noqa
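Read together, the training settings above mean: with the base lr of 0.01, the LinearLR stage warms the learning rate up from 0.01 * 0.001 = 1e-5 to 0.01 over the first 500 iterations, and MultiStepLR then scales it by gamma=0.1 at epoch 6, so the last two of the 8 epochs run at lr 0.001.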
......@@ -7,10 +7,6 @@ _base_ = [
class_names = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser',
'night_stand', 'bookshelf', 'bathtub')
# use caffe img_norm
img_norm_cfg = dict(
mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
model = dict(
pts_backbone=dict(
type='PointNet2SASSG',
......@@ -48,10 +44,8 @@ model = dict(
[0.76584, 1.398258, 0.472728]]),
pred_layer_cfg=dict(
in_channels=128, shared_conv_channels=(128, 128), bias=True),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
objectness_loss=dict(
type='CrossEntropyLoss',
type='mmdet.CrossEntropyLoss',
class_weight=[0.2, 0.8],
reduction='sum',
loss_weight=5.0),
......@@ -62,15 +56,23 @@ model = dict(
loss_src_weight=10.0,
loss_dst_weight=10.0),
dir_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
type='mmdet.CrossEntropyLoss',
reduction='sum',
loss_weight=1.0),
dir_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
type='mmdet.SmoothL1Loss', reduction='sum', loss_weight=10.0),
size_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
type='mmdet.CrossEntropyLoss',
reduction='sum',
loss_weight=1.0),
size_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0 / 3.0),
type='mmdet.SmoothL1Loss',
reduction='sum',
loss_weight=10.0 / 3.0),
semantic_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
type='mmdet.CrossEntropyLoss',
reduction='sum',
loss_weight=1.0)),
joint=dict(
vote_module_cfg=dict(
in_channels=512,
......@@ -154,11 +156,11 @@ model = dict(
# model training and testing settings
train_cfg=dict(
pts=dict(
pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote')),
pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mode='vote')),
test_cfg=dict(
img_rcnn=dict(score_thr=0.1),
pts=dict(
sample_mod='seed',
sample_mode='seed',
nms_thr=0.25,
score_thr=0.05,
per_class_proposal=True)))
......@@ -171,12 +173,13 @@ train_pipeline = [
load_dim=6,
use_dim=[0, 1, 2]),
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations3D'),
dict(type='LoadAnnotations', with_bbox=True),
dict(type='Resize', img_scale=(1333, 600), keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.0),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(
type='LoadAnnotations3D',
with_bbox=True,
with_label=True,
with_bbox_3d=True,
with_label_3d=True),
dict(type='Resize', scale=(1333, 600), keep_ratio=True),
dict(
type='RandomFlip3D',
sync_2d=False,
......@@ -188,15 +191,13 @@ train_pipeline = [
scale_ratio_range=[0.85, 1.15],
shift_height=True),
dict(type='PointSample', num_points=20000),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
keys=[
'img', 'gt_bboxes', 'gt_labels', 'points', 'gt_bboxes_3d',
type='Pack3DDetInputs',
keys=([
'img', 'gt_bboxes', 'gt_bboxes_labels', 'points', 'gt_bboxes_3d',
'gt_labels_3d'
])
]))
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
......@@ -205,56 +206,15 @@ test_pipeline = [
shift_height=True,
load_dim=6,
use_dim=[0, 1, 2]),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 600),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.0),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
),
dict(type='PointSample', num_points=20000),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['img', 'points'])
]),
]
# construct a pipeline for data and gt loading in show function
# please keep its loading function consistent with test_pipeline (e.g. client)
eval_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='LoadPointsFromFile',
coord_type='DEPTH',
shift_height=False,
load_dim=6,
use_dim=[0, 1, 2]),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['img', 'points'])
dict(type='Resize', scale=(1333, 600), keep_ratio=True),
dict(type='PointSample', num_points=20000),
dict(type='Pack3DDetInputs', keys=['img', 'points'])
]
data = dict(
train=dict(dataset=dict(pipeline=train_pipeline)),
val=dict(pipeline=test_pipeline),
test=dict(pipeline=test_pipeline))
evaluation = dict(pipeline=eval_pipeline)
train_dataloader = dict(dataset=dict(dataset=dict(pipeline=train_pipeline)))
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
# may also use your own pre-trained image branch
load_from = 'https://download.openmmlab.com/mmdetection3d/v0.1.0_models/imvotenet/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class_20210323_173222-cad62aeb.pth' # noqa
......@@ -178,10 +178,11 @@ class Det3DDataset(BaseDataset):
dict | None: Processed `ann_info`
"""
# add s or gt prefix for most keys after concat
# we only process 3d annotations here; the corresponding
# 2d annotation processing is done in `LoadAnnotations3D`
# in the pipelines
name_mapping = {
'bbox_label': 'gt_labels',
'bbox_label_3d': 'gt_labels_3d',
'bbox': 'gt_bboxes',
'bbox_3d': 'gt_bboxes_3d',
'depth': 'depths',
'center_2d': 'centers_2d',
......@@ -196,16 +197,18 @@ class Det3DDataset(BaseDataset):
keys = list(instances[0].keys())
ann_info = dict()
for ann_name in keys:
temp_anns = [item[ann_name] for item in instances]
# map the original dataset label to training label
if 'label' in ann_name:
temp_anns = [
self.label_mapping[item] for item in temp_anns
]
temp_anns = np.array(temp_anns)
if ann_name in name_mapping:
temp_anns = [item[ann_name] for item in instances]
# map the original dataset label to training label
if 'label' in ann_name:
temp_anns = [
self.label_mapping[item] for item in temp_anns
]
temp_anns = np.array(temp_anns)
ann_name = name_mapping[ann_name]
ann_info[ann_name] = temp_anns
ann_info[ann_name] = temp_anns
ann_info['instances'] = info['instances']
return ann_info
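# Illustrative example (not part of this commit): with the mapping above, an
# instance dict like dict(bbox_label_3d=2, bbox_3d=[...]) contributes to
# ann_info['gt_labels_3d'] and ann_info['gt_bboxes_3d'], with labels remapped
# through self.label_mapping and the per-instance values stacked into numpy arrays.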
def parse_data_info(self, info: dict) -> dict:
......
......@@ -87,7 +87,8 @@ class KittiDataset(Det3DDataset):
if 'plane' in info:
# convert ground plane to velodyne coordinates
plane = np.array(info['plane'])
lidar2cam = np.array(info['images']['CAM2']['lidar2cam'])
lidar2cam = np.array(
info['images']['CAM2']['lidar2cam'], dtype=np.float32)
reverse = np.linalg.inv(lidar2cam)
(plane_norm_cam, plane_off_cam) = (plane[:3],
......
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Union
from typing import List, Sequence, Union
import mmcv
import numpy as np
import torch
from mmcv import BaseTransform
from mmcv.transforms import to_tensor
from mmengine import InstanceData
from numpy import dtype
from mmdet3d.core import Det3DDataSample, PointData
from mmdet3d.core.bbox import BaseInstance3DBoxes
......@@ -12,6 +14,38 @@ from mmdet3d.core.points import BasePoints
from mmdet3d.registry import TRANSFORMS
def to_tensor(
data: Union[torch.Tensor, np.ndarray, Sequence, int,
float]) -> torch.Tensor:
"""Convert objects of various python types to :obj:`torch.Tensor`.
Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`,
:class:`Sequence`, :class:`int` and :class:`float`.
Args:
data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to
be converted.
Returns:
torch.Tensor: the converted data.
"""
if isinstance(data, torch.Tensor):
return data
elif isinstance(data, np.ndarray):
if data.dtype is dtype('float64'):
data = data.astype(np.float32)
return torch.from_numpy(data)
elif isinstance(data, Sequence) and not mmcv.is_str(data):
return torch.tensor(data)
elif isinstance(data, int):
return torch.LongTensor([data])
elif isinstance(data, float):
return torch.FloatTensor([data])
else:
raise TypeError(f'type {type(data)} cannot be converted to tensor.')
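# Usage sketch for the helper above (illustrative only, not added by this commit):
#   to_tensor(np.zeros((2, 3), dtype=np.float64))  # float64 arrays are cast to float32
#   to_tensor([1, 2, 3])                           # non-string sequences go through torch.tensor
#   to_tensor(7)                                   # ints become torch.LongTensor([7])
#   to_tensor(0.5)                                 # floats become torch.FloatTensor([0.5])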
@TRANSFORMS.register_module()
class Pack3DDetInputs(BaseTransform):
INPUTS_KEYS = ['points', 'img']
......@@ -20,7 +54,7 @@ class Pack3DDetInputs(BaseTransform):
]
INSTANCEDATA_2D_KEYS = [
'gt_bboxes',
'gt_labels',
'gt_bboxes_labels',
]
SEG_KEYS = [
......@@ -121,8 +155,8 @@ class Pack3DDetInputs(BaseTransform):
for key in [
'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels',
'gt_labels_3d', 'attr_labels', 'pts_instance_mask',
'pts_semantic_mask', 'centers_2d', 'depths'
'gt_bboxes_labels', 'attr_labels', 'pts_instance_mask',
'pts_semantic_mask', 'centers_2d', 'depths', 'gt_labels_3d'
]:
if key not in results:
continue
......@@ -159,7 +193,10 @@ class Pack3DDetInputs(BaseTransform):
elif key in self.INSTANCEDATA_3D_KEYS:
gt_instances_3d[self._remove_prefix(key)] = results[key]
elif key in self.INSTANCEDATA_2D_KEYS:
gt_instances[self._remove_prefix(key)] = results[key]
if key == 'gt_bboxes_labels':
gt_instances['labels'] = results[key]
else:
gt_instances[self._remove_prefix(key)] = results[key]
elif key in self.SEG_KEYS:
gt_pts_seg[self._remove_prefix(key)] = results[key]
else:
......
......@@ -632,34 +632,6 @@ class LoadAnnotations3D(LoadAnnotations):
self.with_seg_3d = with_seg_3d
self.seg_3d_dtype = seg_3d_dtype
def _load_bboxes(self, results: dict) -> None:
"""Private function to load bounding box annotations.
Rewrite '_load_bboxes` since mmdet3d uses 'parse_anno_info' in
datasets.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict contains loaded bounding box annotations.
"""
results['gt_bboxes'] = results['ann_info']['gt_bboxes']
def _load_labels(self, results: dict) -> None:
"""Private function to load label annotations.
Rewrite '_load_bboxes` since mmdet3d uses 'parse_anno_info' in
datasets.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict contains loaded label annotations.
"""
results['gt_labels'] = results['ann_info']['gt_labels']
def _load_bboxes_3d(self, results: dict) -> dict:
"""Private function to move the 3D bounding box annotation from
`ann_info` field to the root of `results`.
......@@ -769,6 +741,56 @@ class LoadAnnotations3D(LoadAnnotations):
results['eval_ann_info']['pts_semantic_mask'] = pts_semantic_mask
return results
def _load_bboxes(self, results: dict) -> None:
"""Private function to load bounding box annotations.
The only difference is that it removes the processing of
`ignore_flag`.
Args:
results (dict): Result dict from :obj:``mmcv.BaseDataset``.
Returns:
dict: The dict contains loaded bounding box annotations.
"""
gt_bboxes = []
for instance in results['instances']:
gt_bboxes.append(instance['bbox'])
if len(gt_bboxes) == 0:
results['gt_bboxes'] = np.zeros((0, 4), dtype=np.float32)
else:
results['gt_bboxes'] = np.array(
gt_bboxes, dtype=np.float32).reshape((-1, 4))
if self.denorm_bbox:
bbox_num = results['gt_bboxes'].shape[0]
if bbox_num != 0:
h, w = results['img_shape']
results['gt_bboxes'][:, 0::2] *= w
results['gt_bboxes'][:, 1::2] *= h
if 'eval_ann_info' in results:
results['eval_ann_info']['gt_bboxes'] = results['gt_bboxes']
def _load_labels(self, results: dict) -> None:
"""Private function to load label annotations.
Args:
results (dict): Result dict from :obj:``mmcv.BaseDataset``.
Returns:
dict: The dict contains loaded label annotations.
"""
gt_bboxes_labels = []
for instance in results['instances']:
gt_bboxes_labels.append(instance['bbox_label'])
if len(gt_bboxes_labels) == 0:
results['gt_bboxes_labels'] = np.zeros((0, ), dtype=np.int64)
else:
results['gt_bboxes_labels'] = np.array(
gt_bboxes_labels, dtype=np.int64)
if 'eval_ann_info' in results:
results['eval_ann_info']['gt_bboxes_labels'] = results[
'gt_bboxes_labels']
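# For example (illustrative): two instances with bbox_label 0 and 3 yield
# results['gt_bboxes_labels'] = np.array([0, 3], dtype=np.int64); when
# 'eval_ann_info' is present, the same array is mirrored there for evaluation.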
def transform(self, results: dict) -> dict:
"""Function to load multiple types annotations.
......
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
from typing import Callable, List, Optional, Union
import numpy as np
......@@ -22,13 +24,15 @@ class SUNRGBDDataset(Det3DDataset):
ann_file (str): Path of annotation file.
metainfo (dict, optional): Meta information for dataset, such as class
information. Defaults to None.
data_prefix (dict, optional): Prefix for data. Defaults to
data_prefix (dict): Prefix for data. Defaults to
`dict(pts='points',img='sunrgbd_trainval')`.
pipeline (list[dict], optional): Pipeline used for data processing.
Defaults to None.
modality (dict, optional): Modality to specify the sensor data used
as input. Defaults to `dict(use_camera=True, use_lidar=True)`.
box_type_3d (str, optional): Type of 3D box of this dataset.
default_cam_key (str): The default camera name adopted.
Defaults to "CAM0".
box_type_3d (str): Type of 3D box of this dataset.
Based on the `box_type_3d`, the dataset will encapsulate the box
to its original format then converted them to `box_type_3d`.
Defaults to 'Depth' in this dataset. Available options includes
......@@ -36,9 +40,9 @@ class SUNRGBDDataset(Det3DDataset):
- 'LiDAR': Box in LiDAR coordinates.
- 'Depth': Box in depth coordinates, usually for indoor dataset.
- 'Camera': Box in camera coordinates.
filter_empty_gt (bool, optional): Whether to filter empty GT.
filter_empty_gt (bool): Whether to filter empty GT.
Defaults to True.
test_mode (bool, optional): Whether the dataset is in test mode.
test_mode (bool): Whether the dataset is in test mode.
Defaults to False.
"""
METAINFO = {
......@@ -51,8 +55,9 @@ class SUNRGBDDataset(Det3DDataset):
ann_file: str,
metainfo: Optional[dict] = None,
data_prefix: dict = dict(
pts='points', img='sunrgbd_trainval'),
pts='points', img='sunrgbd_trainval/image'),
pipeline: List[Union[dict, Callable]] = [],
default_cam_key: str = 'CAM0',
modality=dict(use_camera=True, use_lidar=True),
box_type_3d: str = 'Depth',
filter_empty_gt: bool = True,
......@@ -64,6 +69,7 @@ class SUNRGBDDataset(Det3DDataset):
metainfo=metainfo,
data_prefix=data_prefix,
pipeline=pipeline,
default_cam_key=default_cam_key,
modality=modality,
box_type_3d=box_type_3d,
filter_empty_gt=filter_empty_gt,
......@@ -73,6 +79,47 @@ class SUNRGBDDataset(Det3DDataset):
'use_lidar' in self.modality
assert self.modality['use_camera'] or self.modality['use_lidar']
def parse_data_info(self, info: dict) -> dict:
"""Process the raw data info.
Convert all relative paths of the needed modality data files to
absolute paths, and process the `instances` field into
`ann_info` in the training stage.
Args:
info (dict): Raw info dict.
Returns:
dict: Data info with `ann_info` in the training stage, and with
all paths converted to absolute paths.
"""
if self.modality['use_lidar']:
info['lidar_points']['lidar_path'] = \
osp.join(
self.data_prefix.get('pts', ''),
info['lidar_points']['lidar_path'])
if self.modality['use_camera']:
for cam_id, img_info in info['images'].items():
if 'img_path' in img_info:
img_info['img_path'] = osp.join(
self.data_prefix.get('img', ''), img_info['img_path'])
if self.default_cam_key is not None:
info['img_path'] = info['images'][
self.default_cam_key]['img_path']
info['depth2img'] = np.array(
info['images'][self.default_cam_key]['depth2img'],
dtype=np.float32)
if not self.test_mode:
# used in training
info['ann_info'] = self.parse_ann_info(info)
if self.test_mode and self.load_eval_anns:
info['eval_ann_info'] = self.parse_ann_info(info)
return info
def parse_ann_info(self, info: dict) -> dict:
"""Process the `instances` in data info to `ann_info`
......@@ -83,12 +130,11 @@ class SUNRGBDDataset(Det3DDataset):
dict: Processed `ann_info`
"""
ann_info = super().parse_ann_info(info)
# empty gt
# process data without any annotations
if ann_info is None:
ann_info = dict()
ann_info['gt_bboxes_3d'] = np.zeros((0, 6), dtype=np.float32)
ann_info['gt_labels_3d'] = np.zeros((0, ), dtype=np.int64)
# to target box structure
ann_info['gt_bboxes_3d'] = DepthInstance3DBoxes(
ann_info['gt_bboxes_3d'],
......
# Copyright (c) OpenMMLab. All rights reserved.
from collections import OrderedDict
from typing import Dict, List, Optional, Sequence
import numpy as np
from mmengine.evaluator import BaseMetric
from mmengine.logging import MMLogger
from mmdet3d.core import get_box_type, indoor_eval
from mmdet3d.registry import METRICS
from mmdet.core import eval_map
@METRICS.register_module()
class IndoorMetric(BaseMetric):
"""Kitti evaluation metric.
"""Indoor scene evaluation metric.
Args:
iou_thr (list[float]): List of iou threshold when calculate the
......@@ -90,3 +93,91 @@ class IndoorMetric(BaseMetric):
box_mode_3d=box_mode_3d)
return ret_dict
@METRICS.register_module()
class Indoor2DMetric(BaseMetric):
"""indoor 2d predictions evaluation metric.
Args:
iou_thr (list[float]): List of iou threshold when calculate the
metric. Defaults to [0.5].
collect_device (str, optional): Device name used for collecting
results from different ranks during distributed training.
Must be 'cpu' or 'gpu'. Defaults to 'cpu'.
prefix (str): The prefix that will be added in the metric
names to disambiguate homonymous metrics of different evaluators.
If prefix is not provided in the argument, self.default_prefix
will be used instead. Default: None
"""
def __init__(self,
iou_thr: List[float] = [0.5],
collect_device: str = 'cpu',
prefix: Optional[str] = None,
**kwargs):
super(Indoor2DMetric, self).__init__(
prefix=prefix, collect_device=collect_device)
self.iou_thr = iou_thr
def process(self, data_batch: Sequence[dict],
predictions: Sequence[dict]) -> None:
"""Process one batch of data samples and predictions.
The processed results should be stored in ``self.results``,
which will be used to compute the metrics when all batches
have been processed.
Args:
data_batch (Sequence[dict]): A batch of data
from the dataloader.
predictions (Sequence[dict]): A batch of outputs from
the model.
"""
batch_eval_anns = [
item['data_sample']['eval_ann_info'] for item in data_batch
]
for eval_ann, pred_dict in zip(batch_eval_anns, predictions):
pred = pred_dict['pred_instances']
ann = dict(
labels=eval_ann['gt_bboxes_labels'],
bboxes=eval_ann['gt_bboxes'])
pred_bboxes = pred['bboxes'].cpu().numpy()
pred_scores = pred['scores'].cpu().numpy()
pred_labels = pred['labels'].cpu().numpy()
dets = []
for label in range(len(self.dataset_meta['CLASSES'])):
index = np.where(pred_labels == label)[0]
pred_bbox_scores = np.hstack(
[pred_bboxes[index], pred_scores[index].reshape((-1, 1))])
dets.append(pred_bbox_scores)
self.results.append((ann, dets))
def compute_metrics(self, results: list) -> Dict[str, float]:
"""Compute the metrics from processed results.
Args:
results (list): The processed results of each batch.
Returns:
Dict[str, float]: The computed metrics. The keys are the names of
the metrics, and the values are corresponding results.
"""
logger: MMLogger = MMLogger.get_current_instance()
annotations, preds = zip(*results)
eval_results = OrderedDict()
iou_thr_2d = (self.iou_thr) if isinstance(self.iou_thr,
float) else self.iou_thr
for iou_thr_2d_single in iou_thr_2d:
mean_ap, _ = eval_map(
preds,
annotations,
scale_ranges=None,
iou_thr=iou_thr_2d_single,
dataset=self.dataset_meta['CLASSES'],
logger=logger)
eval_results['mAP_' + str(iou_thr_2d_single)] = mean_ap
return eval_results
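A minimal sketch of wiring this metric into a config, matching the image-branch config earlier in this commit (the explicit iou_thr simply restates the default):
val_evaluator = dict(type='Indoor2DMetric', iou_thr=[0.5])
test_evaluator = val_evaluator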
......@@ -754,15 +754,16 @@ class VoteHead(BaseModule):
batch_size = bbox3d.shape[0]
results_list = list()
if use_nms:
for b in range(batch_size):
for batch_index in range(batch_size):
temp_results = InstanceData()
bbox_selected, score_selected, labels = \
self.multiclass_nms_single(obj_scores[b],
sem_scores[b],
bbox3d[b],
stack_points[b, ..., :3],
batch_input_metas[b])
bbox = batch_input_metas[b]['box_type_3d'](
self.multiclass_nms_single(
obj_scores[batch_index],
sem_scores[batch_index],
bbox3d[batch_index],
stack_points[batch_index, ..., :3],
batch_input_metas[batch_index])
bbox = batch_input_metas[batch_index]['box_type_3d'](
bbox_selected,
box_dim=bbox_selected.shape[-1],
with_yaw=self.bbox_coder.with_rot)
......
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Union
from typing import List, Optional, Union
from mmengine import InstanceData
......@@ -91,8 +91,8 @@ class Base3DDetector(BaseDetector):
def convert_to_datasample(
self,
results_list_3d: InstanceList,
results_list_2d: InstanceList = None,
results_list_3d: Optional[InstanceList] = None,
results_list_2d: Optional[InstanceList] = None,
) -> SampleList:
"""Convert results list to `Det3DDataSample`.
......@@ -128,10 +128,18 @@ class Base3DDetector(BaseDetector):
"""
data_sample_list = []
assert (results_list_2d is not None) or \
(results_list_3d is not None),\
'please pass at least one type of results_list'
if results_list_2d is None:
results_list_2d = [
InstanceData() for _ in range(len(results_list_3d))
]
if results_list_3d is None:
results_list_3d = [
InstanceData() for _ in range(len(results_list_2d))
]
for i in range(len(results_list_3d)):
result = Det3DDataSample()
result.pred_instances_3d = results_list_3d[i]
......
# Copyright (c) OpenMMLab. All rights reserved.
import warnings
import copy
from typing import Dict, List, Optional, Sequence, Tuple, Union
import numpy as np
import torch
from mmengine import InstanceData
from torch import Tensor
from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d
from mmdet3d.core import Det3DDataSample
from mmdet3d.models.utils import MLP
from mmdet3d.registry import MODELS
from .base import Base3DDetector
def sample_valid_seeds(mask, num_sampled_seed=1024):
def sample_valid_seeds(mask: Tensor, num_sampled_seed: int = 1024) -> Tensor:
r"""Randomly sample seeds from all imvotes.
Modified from `<https://github.com/facebookresearch/imvotenet/blob/a8856345146bacf29a57266a2f0b874406fd8823/models/imvotenet.py#L26>`_
......@@ -54,26 +56,34 @@ def sample_valid_seeds(mask, num_sampled_seed=1024):
@MODELS.register_module()
class ImVoteNet(Base3DDetector):
r"""`ImVoteNet <https://arxiv.org/abs/2001.10692>`_ for 3D detection."""
r"""`ImVoteNet <https://arxiv.org/abs/2001.10692>`_ for 3D detection.
ImVoteNet is based on fusing 2D votes in images and 3D votes in point
clouds, explicitly extracting both geometric and semantic features
from the 2D images. It leverages camera parameters to lift these
features to 3D. A multi-tower training scheme further improves the
synergy of 2D-3D feature fusion.
"""
def __init__(self,
pts_backbone=None,
pts_bbox_heads=None,
pts_neck=None,
img_backbone=None,
img_neck=None,
img_roi_head=None,
img_rpn_head=None,
img_mlp=None,
freeze_img_branch=False,
fusion_layer=None,
num_sampled_seed=None,
train_cfg=None,
test_cfg=None,
pretrained=None,
init_cfg=None):
super(ImVoteNet, self).__init__(init_cfg=init_cfg)
pts_backbone: Optional[dict] = None,
pts_bbox_heads: Optional[dict] = None,
pts_neck: Optional[dict] = None,
img_backbone: Optional[dict] = None,
img_neck: Optional[dict] = None,
img_roi_head: Optional[dict] = None,
img_rpn_head: Optional[dict] = None,
img_mlp: Optional[dict] = None,
freeze_img_branch: bool = False,
fusion_layer: Optional[dict] = None,
num_sampled_seed: Optional[int] = None,
train_cfg: Optional[dict] = None,
test_cfg: Optional[dict] = None,
init_cfg: Optional[dict] = None,
**kwargs) -> None:
super(ImVoteNet, self).__init__(init_cfg=init_cfg, **kwargs)
# point branch
if pts_backbone is not None:
......@@ -137,35 +147,8 @@ class ImVoteNet(Base3DDetector):
self.train_cfg = train_cfg
self.test_cfg = test_cfg
if pretrained is None:
img_pretrained = None
pts_pretrained = None
elif isinstance(pretrained, dict):
img_pretrained = pretrained.get('img', None)
pts_pretrained = pretrained.get('pts', None)
else:
raise ValueError(
f'pretrained should be a dict, got {type(pretrained)}')
if self.with_img_backbone:
if img_pretrained is not None:
warnings.warn('DeprecationWarning: pretrained is a deprecated '
'key, please consider using init_cfg.')
self.img_backbone.init_cfg = dict(
type='Pretrained', checkpoint=img_pretrained)
if self.with_img_roi_head:
if img_pretrained is not None:
warnings.warn('DeprecationWarning: pretrained is a deprecated '
'key, please consider using init_cfg.')
self.img_roi_head.init_cfg = dict(
type='Pretrained', checkpoint=img_pretrained)
if self.with_pts_backbone:
if img_pretrained is not None:
warnings.warn('DeprecationWarning: pretrained is a deprecated '
'key, please consider using init_cfg.')
self.pts_backbone.init_cfg = dict(
type='Pretrained', checkpoint=pts_pretrained)
def _forward(self):
raise NotImplementedError
def freeze_img_branch_params(self):
"""Freeze all image branch parameters."""
......@@ -267,28 +250,14 @@ class ImVoteNet(Base3DDetector):
"""Just to inherit from abstract method."""
pass
def extract_img_feat(self, img):
def extract_img_feat(self, img: Tensor) -> Sequence[Tensor]:
"""Directly extract features from the img backbone+neck."""
x = self.img_backbone(img)
if self.with_img_neck:
x = self.img_neck(x)
return x
def extract_img_feats(self, imgs):
"""Extract features from multiple images.
Args:
imgs (list[torch.Tensor]): A list of images. The images are
augmented from the same image but in different ways.
Returns:
list[torch.Tensor]: Features of different images
"""
assert isinstance(imgs, list)
return [self.extract_img_feat(img) for img in imgs]
def extract_pts_feat(self, pts):
def extract_pts_feat(self, pts: Tensor) -> Tuple[Tensor]:
"""Extract features of points."""
x = self.pts_backbone(pts)
if self.with_pts_neck:
......@@ -300,151 +269,93 @@ class ImVoteNet(Base3DDetector):
return (seed_points, seed_features, seed_indices)
def extract_pts_feats(self, pts):
"""Extract features of points from multiple samples."""
assert isinstance(pts, list)
return [self.extract_pts_feat(pt) for pt in pts]
@torch.no_grad()
def extract_bboxes_2d(self,
img,
img_metas,
train=True,
bboxes_2d=None,
**kwargs):
"""Extract bounding boxes from 2d detector.
Args:
img (torch.Tensor): of shape (N, C, H, W) encoding input images.
Typically these should be mean centered and std scaled.
img_metas (list[dict]): Image meta info.
train (bool): train-time or not.
bboxes_2d (list[torch.Tensor]): provided 2d bboxes,
not supported yet.
Return:
list[torch.Tensor]: a list of processed 2d bounding boxes.
def loss(self, batch_inputs_dict: Dict[str, Union[List, Tensor]],
batch_data_samples: List[Det3DDataSample],
**kwargs) -> List[Det3DDataSample]:
"""
if bboxes_2d is None:
x = self.extract_img_feat(img)
proposal_list = self.img_rpn_head.simple_test_rpn(x, img_metas)
rets = self.img_roi_head.simple_test(
x, proposal_list, img_metas, rescale=False)
rets_processed = []
for ret in rets:
tmp = np.concatenate(ret, axis=0)
sem_class = img.new_zeros((len(tmp)))
start = 0
for i, bboxes in enumerate(ret):
sem_class[start:start + len(bboxes)] = i
start += len(bboxes)
ret = img.new_tensor(tmp)
# append class index
ret = torch.cat([ret, sem_class[:, None]], dim=-1)
inds = torch.argsort(ret[:, 4], descending=True)
ret = ret.index_select(0, inds)
# drop half bboxes during training for better generalization
if train:
rand_drop = torch.randperm(len(ret))[:(len(ret) + 1) // 2]
rand_drop = torch.sort(rand_drop)[0]
ret = ret[rand_drop]
rets_processed.append(ret.float())
return rets_processed
else:
rets_processed = []
for ret in bboxes_2d:
if len(ret) > 0 and train:
rand_drop = torch.randperm(len(ret))[:(len(ret) + 1) // 2]
rand_drop = torch.sort(rand_drop)[0]
ret = ret[rand_drop]
rets_processed.append(ret.float())
return rets_processed
def forward_train(self,
points=None,
img=None,
img_metas=None,
gt_bboxes=None,
gt_labels=None,
gt_bboxes_ignore=None,
gt_masks=None,
proposals=None,
bboxes_2d=None,
gt_bboxes_3d=None,
gt_labels_3d=None,
pts_semantic_mask=None,
pts_instance_mask=None,
**kwargs):
"""Forwarding of train for image branch pretrain or stage 2 train.
Args:
points (list[torch.Tensor]): Points of each batch.
img (torch.Tensor): of shape (N, C, H, W) encoding input images.
Typically these should be mean centered and std scaled.
img_metas (list[dict]): list of image and point cloud meta info
dict. For example, keys include 'ori_shape', 'img_norm_cfg',
and 'transformation_3d_flow'. For details on the values of
the keys see `mmdet/datasets/pipelines/formatting.py:Collect`.
gt_bboxes (list[torch.Tensor]): Ground truth bboxes for each image
with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels (list[torch.Tensor]): class indices for each
2d bounding box.
gt_bboxes_ignore (list[torch.Tensor]): specify which
2d bounding boxes can be ignored when computing the loss.
gt_masks (torch.Tensor): true segmentation masks for each
2d bbox, used if the architecture supports a segmentation task.
proposals: override rpn proposals (2d) with custom proposals.
Use when `with_rpn` is False.
bboxes_2d (list[torch.Tensor]): provided 2d bboxes,
not supported yet.
gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): 3d gt bboxes.
gt_labels_3d (list[torch.Tensor]): gt class labels for 3d bboxes.
pts_semantic_mask (list[torch.Tensor]): point-wise semantic
label of each batch.
pts_instance_mask (list[torch.Tensor]): point-wise instance
label of each batch.
batch_inputs_dict (dict): The model input dict, which includes
'points' and 'imgs' keys.
- points (list[torch.Tensor]): Point cloud of each sample.
- imgs (list[torch.Tensor]): Image tensor with shape
(N, C, H ,W).
batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
Samples. It usually includes information such as
`gt_instance_3d`.
Returns:
dict[str, torch.Tensor]: a dictionary of loss components.
dict[str, Tensor]: A dictionary of loss components.
"""
imgs = batch_inputs_dict.get('imgs', None)
points = batch_inputs_dict.get('points', None)
if points is None:
x = self.extract_img_feat(img)
x = self.extract_img_feat(imgs)
losses = dict()
# RPN forward and loss
if self.with_img_rpn:
proposal_cfg = self.train_cfg.get('img_rpn_proposal',
self.test_cfg.img_rpn)
rpn_losses, proposal_list = self.img_rpn_head.forward_train(
x,
img_metas,
gt_bboxes,
gt_labels=None,
gt_bboxes_ignore=gt_bboxes_ignore,
proposal_cfg=proposal_cfg)
rpn_data_samples = copy.deepcopy(batch_data_samples)
# set cat_id of gt_labels to 0 in RPN
for data_sample in rpn_data_samples:
data_sample.gt_instances.labels = \
torch.zeros_like(data_sample.gt_instances.labels)
rpn_losses, rpn_results_list = \
self.img_rpn_head.loss_and_predict(
x, rpn_data_samples,
proposal_cfg=proposal_cfg, **kwargs)
# avoid get same name with roi_head loss
keys = rpn_losses.keys()
for key in keys:
if 'loss' in key and 'rpn' not in key:
rpn_losses[f'rpn_{key}'] = rpn_losses.pop(key)
losses.update(rpn_losses)
else:
proposal_list = proposals
roi_losses = self.img_roi_head.forward_train(
x, img_metas, proposal_list, gt_bboxes, gt_labels,
gt_bboxes_ignore, gt_masks, **kwargs)
assert batch_data_samples[0].get('proposals', None) is not None
# use pre-defined proposals in InstanceData for
# the second stage
# to extract ROI features.
rpn_results_list = [
data_sample.proposals for data_sample in batch_data_samples
]
roi_losses = self.img_roi_head.loss(x, rpn_results_list,
batch_data_samples, **kwargs)
losses.update(roi_losses)
return losses
else:
bboxes_2d = self.extract_bboxes_2d(
img, img_metas, bboxes_2d=bboxes_2d, **kwargs)
with torch.no_grad():
results_2d = self.predict_img_only(
batch_inputs_dict['imgs'],
batch_data_samples,
rescale=False)
# tensor with shape (n, 6); the 6 values are arranged
# as [x1, y1, x2, y2, score, label]
pred_bboxes_with_label_list = []
for single_results in results_2d:
cat_preds = torch.cat(
(single_results.bboxes, single_results.scores[:, None],
single_results.labels[:, None]),
dim=-1)
cat_preds = cat_preds[torch.argsort(
cat_preds[:, 4], descending=True)]
# drop half bboxes during training for better generalization
if self.training:
rand_drop = torch.randperm(
len(cat_preds))[:(len(cat_preds) + 1) // 2]
rand_drop = torch.sort(rand_drop)[0]
cat_preds = cat_preds[rand_drop]
points = torch.stack(points)
seeds_3d, seed_3d_features, seed_indices = \
self.extract_pts_feat(points)
pred_bboxes_with_label_list.append(cat_preds)
img_features, masks = self.fusion_layer(img, bboxes_2d, seeds_3d,
img_metas)
stack_points = torch.stack(points)
seeds_3d, seed_3d_features, seed_indices = \
self.extract_pts_feat(stack_points)
img_metas = [item.metainfo for item in batch_data_samples]
img_features, masks = self.fusion_layer(
imgs, pred_bboxes_with_label_list, seeds_3d, img_metas)
inds = sample_valid_seeds(masks, self.num_sampled_seed)
batch_size, img_feat_size = img_features.shape[:2]
......@@ -476,27 +387,13 @@ class ImVoteNet(Base3DDetector):
seed_features=img_features,
seed_indices=seed_indices)
loss_inputs = (points, gt_bboxes_3d, gt_labels_3d,
pts_semantic_mask, pts_instance_mask, img_metas)
bbox_preds_joints = self.pts_bbox_head_joint(
feat_dict_joint, self.train_cfg.pts.sample_mod)
bbox_preds_pts = self.pts_bbox_head_pts(
feat_dict_pts, self.train_cfg.pts.sample_mod)
bbox_preds_img = self.pts_bbox_head_img(
feat_dict_img, self.train_cfg.pts.sample_mod)
losses_towers = []
losses_joint = self.pts_bbox_head_joint.loss(
bbox_preds_joints,
*loss_inputs,
gt_bboxes_ignore=gt_bboxes_ignore)
losses_pts = self.pts_bbox_head_pts.loss(
bbox_preds_pts,
*loss_inputs,
gt_bboxes_ignore=gt_bboxes_ignore)
losses_img = self.pts_bbox_head_img.loss(
bbox_preds_img,
*loss_inputs,
gt_bboxes_ignore=gt_bboxes_ignore)
points, feat_dict_joint, batch_data_samples)
losses_pts = self.pts_bbox_head_pts.loss(points, feat_dict_pts,
batch_data_samples)
losses_img = self.pts_bbox_head_img.loss(points, feat_dict_img,
batch_data_samples)
losses_towers.append(losses_joint)
losses_towers.append(losses_pts)
losses_towers.append(losses_img)
......@@ -516,267 +413,50 @@ class ImVoteNet(Base3DDetector):
return combined_losses
def forward_test(self,
points=None,
img_metas=None,
img=None,
bboxes_2d=None,
**kwargs):
"""Forwarding of test for image branch pretrain or stage 2 train.
def predict(self, batch_inputs_dict: Dict[str, Optional[Tensor]],
batch_data_samples: List[Det3DDataSample],
**kwargs) -> List[Det3DDataSample]:
"""Forward of testing.
Args:
points (list[list[torch.Tensor]], optional): the outer
list indicates test-time augmentations and the inner
list contains all points in the batch, where each Tensor
should have a shape NxC. Defaults to None.
img_metas (list[list[dict]], optional): the outer list
indicates test-time augs (multiscale, flip, etc.)
and the inner list indicates images in a batch.
Defaults to None.
img (list[list[torch.Tensor]], optional): the outer
list indicates test-time augmentations and inner Tensor
should have a shape NxCxHxW, which contains all images
in the batch. Defaults to None. Defaults to None.
bboxes_2d (list[list[torch.Tensor]], optional):
Provided 2d bboxes, not supported yet. Defaults to None.
Returns:
list[list[torch.Tensor]]|list[dict]: Predicted 2d or 3d boxes.
batch_inputs_dict (dict): The model input dict, which includes
'points' and 'imgs' keys.
- points (list[torch.Tensor]): Point cloud of each sample.
- imgs (list[torch.Tensor]): Tensor of Images.
batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
Samples. It usually includes information such as
`gt_instance_3d`.
"""
points = batch_inputs_dict.get('points', None)
imgs = batch_inputs_dict.get('imgs', None)
if points is None:
for var, name in [(img, 'img'), (img_metas, 'img_metas')]:
if not isinstance(var, list):
raise TypeError(
f'{name} must be a list, but got {type(var)}')
num_augs = len(img)
if num_augs != len(img_metas):
raise ValueError(f'num of augmentations ({len(img)}) '
f'!= num of image meta ({len(img_metas)})')
if num_augs == 1:
# proposals (List[List[Tensor]]): the outer list indicates
# test-time augs (multiscale, flip, etc.) and the inner list
# indicates images in a batch.
# The Tensor should have a shape Px4, where P is the number of
# proposals.
if 'proposals' in kwargs:
kwargs['proposals'] = kwargs['proposals'][0]
return self.simple_test_img_only(
img=img[0], img_metas=img_metas[0], **kwargs)
else:
assert img[0].size(0) == 1, 'aug test does not support ' \
'inference with batch size ' \
f'{img[0].size(0)}'
# TODO: support test augmentation for predefined proposals
assert 'proposals' not in kwargs
return self.aug_test_img_only(
img=img, img_metas=img_metas, **kwargs)
assert imgs is not None
results_2d = self.predict_img_only(imgs, batch_data_samples)
return self.convert_to_datasample(results_list_2d=results_2d)
else:
for var, name in [(points, 'points'), (img_metas, 'img_metas')]:
if not isinstance(var, list):
raise TypeError('{} must be a list, but got {}'.format(
name, type(var)))
num_augs = len(points)
if num_augs != len(img_metas):
raise ValueError(
'num of augmentations ({}) != num of image meta ({})'.
format(len(points), len(img_metas)))
if num_augs == 1:
return self.simple_test(
points[0],
img_metas[0],
img[0],
bboxes_2d=bboxes_2d[0] if bboxes_2d is not None else None,
**kwargs)
else:
return self.aug_test(points, img_metas, img, bboxes_2d,
**kwargs)
def simple_test_img_only(self,
img,
img_metas,
proposals=None,
rescale=False):
r"""Test without augmentation, image network pretrain. May refer to
`<https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/two_stage.py>`_.
Args:
img (torch.Tensor): Should have a shape NxCxHxW, which contains
all images in the batch.
img_metas (list[dict]):
proposals (list[Tensor], optional): override rpn proposals
with custom proposals. Defaults to None.
rescale (bool, optional): Whether or not rescale bboxes to the
original shape of input image. Defaults to False.
Returns:
list[list[torch.Tensor]]: Predicted 2d boxes.
""" # noqa: E501
assert self.with_img_bbox, 'Img bbox head must be implemented.'
assert self.with_img_backbone, 'Img backbone must be implemented.'
assert self.with_img_rpn, 'Img rpn must be implemented.'
assert self.with_img_roi_head, 'Img roi head must be implemented.'
x = self.extract_img_feat(img)
if proposals is None:
proposal_list = self.img_rpn_head.simple_test_rpn(x, img_metas)
else:
proposal_list = proposals
ret = self.img_roi_head.simple_test(
x, proposal_list, img_metas, rescale=rescale)
return ret
def simple_test(self,
points=None,
img_metas=None,
img=None,
bboxes_2d=None,
rescale=False,
**kwargs):
"""Test without augmentation, stage 2.
Args:
points (list[torch.Tensor], optional): Elements in the list
should have a shape NxC, the list indicates all point-clouds
in the batch. Defaults to None.
img_metas (list[dict], optional): List indicates
images in a batch. Defaults to None.
img (torch.Tensor, optional): Should have a shape NxCxHxW,
which contains all images in the batch. Defaults to None.
bboxes_2d (list[torch.Tensor], optional):
Provided 2d bboxes, not supported yet. Defaults to None.
rescale (bool, optional): Whether or not rescale bboxes.
Defaults to False.
Returns:
list[dict]: Predicted 3d boxes.
"""
bboxes_2d = self.extract_bboxes_2d(
img, img_metas, train=False, bboxes_2d=bboxes_2d, **kwargs)
points = torch.stack(points)
seeds_3d, seed_3d_features, seed_indices = \
self.extract_pts_feat(points)
img_features, masks = self.fusion_layer(img, bboxes_2d, seeds_3d,
img_metas)
inds = sample_valid_seeds(masks, self.num_sampled_seed)
batch_size, img_feat_size = img_features.shape[:2]
pts_feat_size = seed_3d_features.shape[1]
inds_img = inds.view(batch_size, 1, -1).expand(-1, img_feat_size, -1)
img_features = img_features.gather(-1, inds_img)
inds = inds % inds.shape[1]
inds_seed_xyz = inds.view(batch_size, -1, 1).expand(-1, -1, 3)
seeds_3d = seeds_3d.gather(1, inds_seed_xyz)
inds_seed_feats = inds.view(batch_size, 1,
-1).expand(-1, pts_feat_size, -1)
seed_3d_features = seed_3d_features.gather(-1, inds_seed_feats)
seed_indices = seed_indices.gather(1, inds)
img_features = self.img_mlp(img_features)
fused_features = torch.cat([seed_3d_features, img_features], dim=1)
feat_dict = dict(
seed_points=seeds_3d,
seed_features=fused_features,
seed_indices=seed_indices)
bbox_preds = self.pts_bbox_head_joint(feat_dict,
self.test_cfg.pts.sample_mod)
bbox_list = self.pts_bbox_head_joint.get_bboxes(
points, bbox_preds, img_metas, rescale=rescale)
bbox_results = [
bbox3d2result(bboxes, scores, labels)
for bboxes, scores, labels in bbox_list
]
return bbox_results
def aug_test_img_only(self, img, img_metas, rescale=False):
r"""Test function with augmentation, image network pretrain. May refer
to `<https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/two_stage.py>`_.
Args:
img (list[list[torch.Tensor]], optional): the outer
list indicates test-time augmentations and inner Tensor
should have a shape NxCxHxW, which contains all images
in the batch. Defaults to None. Defaults to None.
img_metas (list[list[dict]], optional): the outer list
indicates test-time augs (multiscale, flip, etc.)
and the inner list indicates images in a batch.
Defaults to None.
rescale (bool, optional): Whether or not rescale bboxes to the
original shape of input image. If rescale is False, then
returned bboxes and masks will fit the scale of imgs[0].
Defaults to None.
Returns:
list[list[torch.Tensor]]: Predicted 2d boxes.
""" # noqa: E501
assert self.with_img_bbox, 'Img bbox head must be implemented.'
assert self.with_img_backbone, 'Img backbone must be implemented.'
assert self.with_img_rpn, 'Img rpn must be implemented.'
assert self.with_img_roi_head, 'Img roi head must be implemented.'
x = self.extract_img_feats(img)
proposal_list = self.img_rpn_head.aug_test_rpn(x, img_metas)
return self.img_roi_head.aug_test(
x, proposal_list, img_metas, rescale=rescale)
def aug_test(self,
points=None,
img_metas=None,
imgs=None,
bboxes_2d=None,
rescale=False,
**kwargs):
"""Test function with augmentation, stage 2.
Args:
points (list[list[torch.Tensor]], optional): the outer
list indicates test-time augmentations and the inner
list contains all points in the batch, where each Tensor
should have a shape NxC. Defaults to None.
img_metas (list[list[dict]], optional): the outer list
indicates test-time augs (multiscale, flip, etc.)
and the inner list indicates images in a batch.
Defaults to None.
imgs (list[list[torch.Tensor]], optional): the outer
list indicates test-time augmentations and inner Tensor
should have a shape NxCxHxW, which contains all images
in the batch. Defaults to None. Defaults to None.
bboxes_2d (list[list[torch.Tensor]], optional):
Provided 2d bboxes, not supported yet. Defaults to None.
rescale (bool, optional): Whether or not rescale bboxes.
Defaults to False.
Returns:
list[dict]: Predicted 3d boxes.
"""
points_cat = [torch.stack(pts) for pts in points]
feats = self.extract_pts_feats(points_cat, img_metas)
# only support aug_test for one sample
aug_bboxes = []
for x, pts_cat, img_meta, bbox_2d, img in zip(feats, points_cat,
img_metas, bboxes_2d,
imgs):
bbox_2d = self.extract_bboxes_2d(
img, img_metas, train=False, bboxes_2d=bbox_2d, **kwargs)
seeds_3d, seed_3d_features, seed_indices = x
results_2d = self.predict_img_only(
batch_inputs_dict['imgs'], batch_data_samples, rescale=False)
# tensor with shape (n, 6); the 6 values are arranged
# as [x1, y1, x2, y2, score, label]
pred_bboxes_with_label_list = []
for single_results in results_2d:
cat_preds = torch.cat(
(single_results.bboxes, single_results.scores[:, None],
single_results.labels[:, None]),
dim=-1)
cat_preds = cat_preds[torch.argsort(
cat_preds[:, 4], descending=True)]
pred_bboxes_with_label_list.append(cat_preds)
stack_points = torch.stack(points)
seeds_3d, seed_3d_features, seed_indices = \
self.extract_pts_feat(stack_points)
img_features, masks = self.fusion_layer(img, bbox_2d, seeds_3d,
img_metas)
img_features, masks = self.fusion_layer(
imgs, pred_bboxes_with_label_list, seeds_3d,
[item.metainfo for item in batch_data_samples])
inds = sample_valid_seeds(masks, self.num_sampled_seed)
batch_size, img_feat_size = img_features.shape[:2]
......@@ -800,19 +480,57 @@ class ImVoteNet(Base3DDetector):
seed_points=seeds_3d,
seed_features=fused_features,
seed_indices=seed_indices)
bbox_preds = self.pts_bbox_head_joint(feat_dict,
self.test_cfg.pts.sample_mod)
bbox_list = self.pts_bbox_head_joint.get_bboxes(
pts_cat, bbox_preds, img_metas, rescale=rescale)
bbox_list = [
dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels)
for bboxes, scores, labels in bbox_list
results_3d = self.pts_bbox_head_joint.predict(
batch_inputs_dict['points'],
feat_dict,
batch_data_samples,
rescale=True)
return self.convert_to_datasample(results_3d)
def predict_img_only(self,
imgs: Tensor,
batch_data_samples: List[Det3DDataSample],
rescale: bool = True) -> List[InstanceData]:
"""Predict results from a batch of imgs with post- processing.
Args:
imgs (Tensor): Inputs images with shape (N, C, H, W).
batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
Samples. It usually includes information such as
`gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
rescale (bool): Whether to rescale the results.
Defaults to True.
Returns:
list[:obj:`InstanceData`]: Return the list of detection
results of the input images, usually contains following keys.
- scores (Tensor): Classification scores, has a shape
(num_instance, )
- labels (Tensor): Labels of bboxes, has a shape
(num_instances, ).
- bboxes (Tensor): Has a shape (num_instances, 4),
the last dimension 4 arrange as (x1, y1, x2, y2).
"""
assert self.with_img_bbox, 'Img bbox head must be implemented.'
assert self.with_img_backbone, 'Img backbone must be implemented.'
assert self.with_img_rpn, 'Img rpn must be implemented.'
assert self.with_img_roi_head, 'Img roi head must be implemented.'
x = self.extract_img_feat(imgs)
# If there are no pre-defined proposals, use RPN to get proposals
if batch_data_samples[0].get('proposals', None) is None:
rpn_results_list = self.img_rpn_head.predict(
x, batch_data_samples, rescale=False)
else:
rpn_results_list = [
data_sample.proposals for data_sample in batch_data_samples
]
aug_bboxes.append(bbox_list[0])
# after merging, bboxes will be rescaled to the original image size
merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, img_metas,
self.bbox_head.test_cfg)
results_list = self.img_roi_head.predict(
x, rpn_results_list, batch_data_samples, rescale=rescale)
return [merged_bboxes]
return results_list
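A minimal sketch of exercising the refactored predict path, following the pattern used by the unit tests below (`data` is assumed to be a list of samples in the format the data preprocessor expects):
batch_inputs, data_samples = model.data_preprocessor(data, True)
results = model.forward(batch_inputs, data_samples, mode='predict')  # list of Det3DDataSample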
......@@ -105,8 +105,8 @@ def extract_2d_info(img_meta, tensor):
"""
img_shape = img_meta['img_shape']
ori_shape = img_meta['ori_shape']
img_h, img_w, _ = img_shape
ori_h, ori_w, _ = ori_shape
img_h, img_w = img_shape
ori_h, ori_w = ori_shape
img_scale_factor = (
tensor.new_tensor(img_meta['scale_factor'][:2])
......
......@@ -45,8 +45,6 @@ class VoteFusion(nn.Module):
seed_num = seed_3d_depth.shape[0]
img_shape = img_meta['img_shape']
img_h, img_w, _ = img_shape
# first reverse the data transformations
xyz_depth = apply_3d_transformation(
seed_3d_depth, 'DEPTH', img_meta, reverse=True)
......
import unittest
import torch
from mmengine import DefaultScope
from mmdet3d.registry import MODELS
from tests.utils.model_utils import (_create_detector_inputs,
_get_detector_cfg, _setup_seed)
class TestH3D(unittest.TestCase):
def test_h3dnet(self):
import mmdet3d.models
assert hasattr(mmdet3d.models, 'H3DNet')
DefaultScope.get_instance('test_H3DNet', scope_name='mmdet3d')
_setup_seed(0)
voxel_net_cfg = _get_detector_cfg(
'h3dnet/h3dnet_3x8_scannet-3d-18class.py')
model = MODELS.build(voxel_net_cfg)
num_gt_instance = 5
data = [
_create_detector_inputs(
num_gt_instance=num_gt_instance,
points_feat_dim=4,
bboxes_3d_type='depth',
with_pts_semantic_mask=True,
with_pts_instance_mask=True)
]
if torch.cuda.is_available():
model = model.cuda()
# test simple_test
with torch.no_grad():
batch_inputs, data_samples = model.data_preprocessor(
data, True)
results = model.forward(
batch_inputs, data_samples, mode='predict')
self.assertEqual(len(results), len(data))
self.assertIn('bboxes_3d', results[0].pred_instances_3d)
self.assertIn('scores_3d', results[0].pred_instances_3d)
self.assertIn('labels_3d', results[0].pred_instances_3d)
# to save memory
with torch.no_grad():
losses = model.forward(batch_inputs, data_samples, mode='loss')
self.assertGreater(losses['vote_loss'], 0)
self.assertGreater(losses['objectness_loss'], 0)
self.assertGreater(losses['center_loss'], 0)
import unittest
import torch
from mmengine import DefaultScope
from mmdet3d.registry import MODELS
from tests.utils.model_utils import (_create_detector_inputs,
_get_detector_cfg, _setup_seed)
class TestImvoteNet(unittest.TestCase):
def test_imvotenet_only_img(self):
import mmdet3d.models
assert hasattr(mmdet3d.models, 'ImVoteNet')
DefaultScope.get_instance('test_imvotenet_img', scope_name='mmdet3d')
_setup_seed(0)
votenet_net_cfg = _get_detector_cfg(
'imvotenet/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class.py'
)
model = MODELS.build(votenet_net_cfg)
data = [
_create_detector_inputs(
with_points=False, with_img=True, img_size=128)
]
if torch.cuda.is_available():
model = model.cuda()
# test simple_test
with torch.no_grad():
batch_inputs, data_samples = model.data_preprocessor(
data, True)
results = model.forward(
batch_inputs, data_samples, mode='predict')
self.assertEqual(len(results), len(data))
self.assertIn('bboxes', results[0].pred_instances)
self.assertIn('scores', results[0].pred_instances)
self.assertIn('labels', results[0].pred_instances)
# to save memory
with torch.no_grad():
losses = model.forward(batch_inputs, data_samples, mode='loss')
self.assertGreater(sum(losses['loss_rpn_cls']), 0)
self.assertGreater(losses['loss_cls'], 0)
self.assertGreater(losses['loss_bbox'], 0)
def test_imvotenet(self):
import mmdet3d.models
assert hasattr(mmdet3d.models, 'ImVoteNet')
DefaultScope.get_instance('test_imvotenet', scope_name='mmdet3d')
_setup_seed(0)
votenet_net_cfg = _get_detector_cfg(
'imvotenet/imvotenet_stage2_16x8_sunrgbd-3d-10class.py')
model = MODELS.build(votenet_net_cfg)
data = [
_create_detector_inputs(
with_points=True,
with_img=True,
img_size=128,
bboxes_3d_type='depth')
]
if torch.cuda.is_available():
model = model.cuda()
# test simple_test
with torch.no_grad():
batch_inputs, data_samples = model.data_preprocessor(
data, True)
results = model.forward(
batch_inputs, data_samples, mode='predict')
self.assertEqual(len(results), len(data))
self.assertIn('bboxes_3d', results[0].pred_instances_3d)
self.assertIn('scores_3d', results[0].pred_instances_3d)
self.assertIn('labels_3d', results[0].pred_instances_3d)
# to save memory
with torch.no_grad():
losses = model.forward(batch_inputs, data_samples, mode='loss')
self.assertGreater(losses['vote_loss'], 0)
self.assertGreater(losses['objectness_loss'], 0)
self.assertGreater(losses['semantic_loss'], 0)
......@@ -75,6 +75,7 @@ def _get_detector_cfg(fname):
def _create_detector_inputs(seed=0,
with_points=True,
with_img=False,
img_size=10,
num_gt_instance=20,
num_points=10,
points_feat_dim=4,
......@@ -90,23 +91,46 @@ def _create_detector_inputs(seed=0,
'depth': DepthInstance3DBoxes,
'cam': CameraInstance3DBoxes
}
meta_info = dict()
meta_info['depth2img'] = np.array(
[[5.23289349e+02, 3.68831943e+02, 6.10469439e+01],
[1.09560138e+02, 1.97404735e+02, -5.47377738e+02],
[1.25930002e-02, 9.92229998e-01, -1.23769999e-01]])
meta_info['lidar2img'] = np.array(
[[5.23289349e+02, 3.68831943e+02, 6.10469439e+01],
[1.09560138e+02, 1.97404735e+02, -5.47377738e+02],
[1.25930002e-02, 9.92229998e-01, -1.23769999e-01]])
if with_points:
points = torch.rand([num_points, points_feat_dim])
else:
points = None
if with_img:
img = torch.rand(3, 10, 10)
img = torch.rand(3, img_size, img_size)
meta_info['img_shape'] = (img_size, img_size)
meta_info['ori_shape'] = (img_size, img_size)
meta_info['scale_factor'] = np.array([1., 1.])
else:
img = None
inputs_dict = dict(img=img, points=points)
gt_instance_3d = InstanceData()
gt_instance_3d.bboxes_3d = bbox_3d_class[bboxes_3d_type](
torch.rand([num_gt_instance, gt_bboxes_dim]), box_dim=gt_bboxes_dim)
gt_instance_3d.labels_3d = torch.randint(0, num_classes, [num_gt_instance])
data_sample = Det3DDataSample(
metainfo=dict(box_type_3d=bbox_3d_class[bboxes_3d_type]))
data_sample.set_metainfo(meta_info)
data_sample.gt_instances_3d = gt_instance_3d
gt_instance = InstanceData()
gt_instance.labels = torch.randint(0, num_classes, [num_gt_instance])
gt_instance.bboxes = torch.rand(num_gt_instance, 4)
gt_instance.bboxes[:,
2:] = gt_instance.bboxes[:, :2] + gt_instance.bboxes[:,
2:]
data_sample.gt_instances = gt_instance
data_sample.gt_pts_seg = PointData()
if with_pts_instance_mask:
pts_instance_mask = torch.randint(0, num_gt_instance, [num_points])
......
......@@ -652,21 +652,28 @@ def update_sunrgbd_infos(pkl_path, out_dir):
temp_data_info['images']['CAM0']['width'] = w
anns = ori_info_dict['annos']
num_instances = len(anns['name'])
ignore_class_name = set()
instance_list = []
for instance_id in range(num_instances):
empty_instance = get_empty_instance()
empty_instance['bbox_3d'] = anns['gt_boxes_upright_depth'][
instance_id].tolist()
if anns['name'][instance_id] in METAINFO['CLASSES']:
empty_instance['bbox_label_3d'] = METAINFO['CLASSES'].index(
anns['name'][instance_id])
else:
ignore_class_name.add(anns['name'][instance_id])
empty_instance['bbox_label_3d'] = -1
empty_instance = clear_instance_unused_keys(empty_instance)
instance_list.append(empty_instance)
if anns['gt_num'] == 0:
instance_list = []
else:
num_instances = len(anns['name'])
ignore_class_name = set()
instance_list = []
for instance_id in range(num_instances):
empty_instance = get_empty_instance()
empty_instance['bbox_3d'] = anns['gt_boxes_upright_depth'][
instance_id].tolist()
empty_instance['bbox'] = anns['bbox'][instance_id].tolist()
if anns['name'][instance_id] in METAINFO['CLASSES']:
empty_instance['bbox_label_3d'] = METAINFO[
'CLASSES'].index(anns['name'][instance_id])
empty_instance['bbox_label'] = empty_instance[
'bbox_label_3d']
else:
ignore_class_name.add(anns['name'][instance_id])
empty_instance['bbox_label_3d'] = -1
empty_instance['bbox_label'] = -1
empty_instance = clear_instance_unused_keys(empty_instance)
instance_list.append(empty_instance)
temp_data_info['instances'] = instance_list
temp_data_info, _ = clear_data_info_unused_keys(temp_data_info)
converted_list.append(temp_data_info)
......