Unverified commit 4eed122d authored by Yezhen Cong, committed by GitHub

[Feature] Support ImVoteNet complete model (#352)



* Added image loading in SUNRGB-D dataset (#195)

* image loading

* format and docstring fix

* removed irrelevant files

* removed irrelevant files

* load image only if modality is pc+img

* added modality like nuscenes

* Added imvotenet image branch pretrain (#217)

* image loading

* naive commit

* format and docstring fix

* removed irrelevant files

* removed irrelevant files

* load image only if modality is pc+img

* added modality like nuscenes

* pretrain_2d_model

* finetune sunrgbd

* finetune sunrgbd

* deleted unused code

* fixed a bug

* resolve conflict

* update config file

* fix docstring and configs

* integrated vote fusion

* coords transform and unit test

* Update docstring

* refactor and add unit test

* fix bug caused by mmcv upgrade; delete pdb breakpoint

* add point fusion unittest

* remove unused file

* fix typos

* updates

* add assertion info

* update

* add unittest

* add vote fusion unittest

* add vote fusion unittest

* [Refactor] VoteNet refactor (#322)

* votenet refactor

* remove file

* minor update

* docstring

* initial update of imvotenet

* [Feature] Support vote fusion (#297)

* integrated vote fusion

* coords transform and unit test

* Update docstring

* refactor and add unit test

* add point fusion unittest

* remove unused file

* updates

* add assertion info

* update

* add unittest

* add vote fusion unittest

* add vote fusion unittest

* minor update

* docstring

* change np ops to torch

* refactor test

* update

* refactor of image mlp and np random ops to torch

* add docstring

* add config and mod dataset

* fix bugs

* add_comments

* fix bugs

* fix_bug

* fix bug

* fix bug

* fix bug

* fix bug

* final fix

* fix bug

* ?

* add docstring

* move train/test cfg

* change img mlp default param

* rename config

* minor mod

* change config name

* move train/test cfg

* some fixes and 2d utils

* fix config name

* fix config override issue

* config simplify & reformat

* explicitly set eval mode->override train()

* add fix_img_branch to config

* remove set_img_branch_eval_mode

* temporal fix, change calibs to calib

* more docstring and view/reshape, expand/repeat change

* complete imvotenet docstring

* fix docstring

* add config and some minor fix

* rename config
Co-authored-by: ZwwWayne <wayne.zw@outlook.com>
parent 097b66ee
model = dict(
type='ImVoteNet',
img_backbone=dict(
type='ResNet',
depth=50,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type='BN', requires_grad=False),
norm_eval=True,
style='caffe'),
img_neck=dict(
type='FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=5),
img_rpn_head=dict(
type='RPNHead',
in_channels=256,
feat_channels=256,
anchor_generator=dict(
type='AnchorGenerator',
scales=[8],
ratios=[0.5, 1.0, 2.0],
strides=[4, 8, 16, 32, 64]),
bbox_coder=dict(
type='DeltaXYWHBBoxCoder',
target_means=[.0, .0, .0, .0],
target_stds=[1.0, 1.0, 1.0, 1.0]),
loss_cls=dict(
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
img_roi_head=dict(
type='StandardRoIHead',
bbox_roi_extractor=dict(
type='SingleRoIExtractor',
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
out_channels=256,
featmap_strides=[4, 8, 16, 32]),
bbox_head=dict(
type='Shared2FCBBoxHead',
in_channels=256,
fc_out_channels=1024,
roi_feat_size=7,
num_classes=10,
bbox_coder=dict(
type='DeltaXYWHBBoxCoder',
target_means=[0., 0., 0., 0.],
target_stds=[0.1, 0.1, 0.2, 0.2]),
reg_class_agnostic=False,
loss_cls=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
# model training and testing settings
train_cfg=dict(
img_rpn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.7,
neg_iou_thr=0.3,
min_pos_iou=0.3,
match_low_quality=True,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=256,
pos_fraction=0.5,
neg_pos_ub=-1,
add_gt_as_proposals=False),
allowed_border=-1,
pos_weight=-1,
debug=False),
img_rpn_proposal=dict(
nms_across_levels=False,
nms_pre=2000,
nms_post=1000,
max_num=1000,
nms_thr=0.7,
min_bbox_size=0),
img_rcnn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.5,
neg_iou_thr=0.5,
min_pos_iou=0.5,
match_low_quality=False,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
pos_weight=-1,
debug=False)),
test_cfg=dict(
img_rpn=dict(
nms_across_levels=False,
nms_pre=1000,
nms_post=1000,
max_num=1000,
nms_thr=0.7,
min_bbox_size=0),
img_rcnn=dict(
score_thr=0.05,
nms=dict(type='nms', iou_threshold=0.5),
max_per_img=100)))
_base_ = [
'../_base_/datasets/sunrgbd-3d-10class.py', '../_base_/default_runtime.py',
'../_base_/models/imvotenet_image.py'
]
# use caffe img_norm
img_norm_cfg = dict(
mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', with_bbox=True),
dict(
type='Resize',
img_scale=[(1333, 480), (1333, 504), (1333, 528), (1333, 552),
(1333, 576), (1333, 600)],
multiscale_mode='value',
keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(1333, 600),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img']),
])
]
data = dict(
samples_per_gpu=2,
workers_per_gpu=2,
train=dict(times=1, dataset=dict(pipeline=train_pipeline)),
val=dict(pipeline=test_pipeline),
test=dict(pipeline=test_pipeline))
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=None)
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[6])
total_epochs = 8
load_from = 'http://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_bbox_mAP-0.408__segm_mAP-0.37_20200504_163245-42aa3d00.pth' # noqa
_base_ = [
'../_base_/datasets/sunrgbd-3d-10class.py',
'../_base_/schedules/schedule_3x.py', '../_base_/default_runtime.py',
'../_base_/models/imvotenet_image.py'
]
class_names = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser',
'night_stand', 'bookshelf', 'bathtub')
# use caffe img_norm
img_norm_cfg = dict(
mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
model = dict(
pts_backbone=dict(
type='PointNet2SASSG',
in_channels=4,
num_points=(2048, 1024, 512, 256),
radius=(0.2, 0.4, 0.8, 1.2),
num_samples=(64, 32, 16, 16),
sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256),
(128, 128, 256)),
fp_channels=((256, 256), (256, 256)),
norm_cfg=dict(type='BN2d'),
sa_cfg=dict(
type='PointSAModule',
pool_mod='max',
use_xyz=True,
normalize_xyz=True)),
pts_bbox_heads=dict(
common=dict(
type='VoteHead',
num_classes=10,
bbox_coder=dict(
type='PartialBinBasedBBoxCoder',
num_sizes=10,
num_dir_bins=12,
with_rot=True,
mean_sizes=[[2.114256, 1.620300, 0.927272],
[0.791118, 1.279516, 0.718182],
[0.923508, 1.867419, 0.845495],
[0.591958, 0.552978, 0.827272],
[0.699104, 0.454178, 0.75625],
[0.69519, 1.346299, 0.736364],
[0.528526, 1.002642, 1.172878],
[0.500618, 0.632163, 0.683424],
[0.404671, 1.071108, 1.688889],
[0.76584, 1.398258, 0.472728]]),
pred_layer_cfg=dict(
in_channels=128, shared_conv_channels=(128, 128), bias=True),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
objectness_loss=dict(
type='CrossEntropyLoss',
class_weight=[0.2, 0.8],
reduction='sum',
loss_weight=5.0),
center_loss=dict(
type='ChamferDistance',
mode='l2',
reduction='sum',
loss_src_weight=10.0,
loss_dst_weight=10.0),
dir_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
dir_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
size_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
size_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0 / 3.0),
semantic_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
joint=dict(
vote_module_cfg=dict(
in_channels=512,
vote_per_seed=1,
gt_per_seed=3,
conv_channels=(512, 256),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
norm_feats=True,
vote_loss=dict(
type='ChamferDistance',
mode='l1',
reduction='none',
loss_dst_weight=10.0)),
vote_aggregation_cfg=dict(
type='PointSAModule',
num_point=256,
radius=0.3,
num_sample=16,
mlp_channels=[512, 128, 128, 128],
use_xyz=True,
normalize_xyz=True)),
pts=dict(
vote_module_cfg=dict(
in_channels=256,
vote_per_seed=1,
gt_per_seed=3,
conv_channels=(256, 256),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
norm_feats=True,
vote_loss=dict(
type='ChamferDistance',
mode='l1',
reduction='none',
loss_dst_weight=10.0)),
vote_aggregation_cfg=dict(
type='PointSAModule',
num_point=256,
radius=0.3,
num_sample=16,
mlp_channels=[256, 128, 128, 128],
use_xyz=True,
normalize_xyz=True)),
img=dict(
vote_module_cfg=dict(
in_channels=256,
vote_per_seed=1,
gt_per_seed=3,
conv_channels=(256, 256),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
norm_feats=True,
vote_loss=dict(
type='ChamferDistance',
mode='l1',
reduction='none',
loss_dst_weight=10.0)),
vote_aggregation_cfg=dict(
type='PointSAModule',
num_point=256,
radius=0.3,
num_sample=16,
mlp_channels=[256, 128, 128, 128],
use_xyz=True,
normalize_xyz=True)),
loss_weights=[0.4, 0.3, 0.3]),
img_mlp=dict(
in_channel=18,
conv_channels=(256, 256),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
act_cfg=dict(type='ReLU')),
fusion_layer=dict(
type='VoteFusion',
num_classes=len(class_names),
max_imvote_per_pixel=3),
num_sampled_seed=1024,
freeze_img_branch=True,
# model training and testing settings
train_cfg=dict(
pts=dict(
pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote')),
test_cfg=dict(
img_rcnn=dict(score_thr=0.1),
pts=dict(
sample_mod='seed',
nms_thr=0.25,
score_thr=0.05,
per_class_proposal=True)))
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='DEPTH',
shift_height=True,
load_dim=6,
use_dim=[0, 1, 2]),
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations3D'),
dict(type='LoadAnnotations', with_bbox=True),
dict(type='Resize', img_scale=(1333, 600), keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.0),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.523599, 0.523599],
scale_ratio_range=[0.85, 1.15],
shift_height=True),
dict(type='IndoorPointSample', num_points=20000),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
keys=[
'img', 'gt_bboxes', 'gt_labels', 'points', 'gt_bboxes_3d',
'gt_labels_3d', 'calib'
])
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='LoadPointsFromFile',
coord_type='DEPTH',
shift_height=True,
load_dim=6,
use_dim=[0, 1, 2]),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 600),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.0),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
),
dict(type='IndoorPointSample', num_points=20000),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['img', 'points', 'calib'])
]),
]
data = dict(
train=dict(dataset=dict(pipeline=train_pipeline)),
val=dict(pipeline=test_pipeline),
test=dict(pipeline=test_pipeline))
load_from = None # TODO after we update model zoo
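The merged config above can be loaded and built with the usual mmcv/mmdet3d machinery. A minimal sketch, assuming the config is saved under a hypothetical path (adjust it to the actual file name under configs/imvotenet/):

from mmcv import Config
from mmdet3d.models import build_detector

cfg = Config.fromfile('configs/imvotenet/imvotenet_sunrgbd.py')  # hypothetical path
detector = build_detector(cfg.model)  # train_cfg/test_cfg already live inside cfg.model here
print(type(detector).__name__)  # ImVoteNet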
@@ -182,7 +182,7 @@ class Coord3DMode(IntEnum):
"""Convert points from `src` mode to `dst` mode.
Args:
-box (tuple | list | np.dnarray |
point (tuple | list | np.dnarray |
torch.Tensor | BasePoints):
Can be a k-tuple, k-list or an Nxk array/tensor.
src (:obj:`CoordMode`): The src Point mode.
@@ -218,17 +218,25 @@ class Coord3DMode(IntEnum):
arr = point.clone()
# convert point from `src` mode to `dst` mode.
-if rt_mat is not None:
-if not isinstance(rt_mat, torch.Tensor):
-rt_mat = arr.new_tensor(rt_mat)
# TODO: LIDAR
# only implemented provided Rt matrix in cam-depth conversion
if src == Coord3DMode.LIDAR and dst == Coord3DMode.CAM:
rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]])
elif src == Coord3DMode.CAM and dst == Coord3DMode.LIDAR:
rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]])
elif src == Coord3DMode.DEPTH and dst == Coord3DMode.CAM:
-rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]])
if rt_mat is None:
rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]])
else:
rt_mat = rt_mat.new_tensor(
[[1, 0, 0], [0, 0, -1], [0, 1, 0]]) @ \
rt_mat.transpose(1, 0)
elif src == Coord3DMode.CAM and dst == Coord3DMode.DEPTH:
-rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]])
if rt_mat is None:
rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]])
else:
rt_mat = rt_mat @ rt_mat.new_tensor([[1, 0, 0], [0, 0, 1],
[0, -1, 0]])
elif src == Coord3DMode.LIDAR and dst == Coord3DMode.DEPTH:
rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]])
elif src == Coord3DMode.DEPTH and dst == Coord3DMode.LIDAR:
@@ -245,7 +253,7 @@ class Coord3DMode(IntEnum):
else:
xyz = arr[:, :3] @ rt_mat.t()
-remains = arr[..., 3:]
remains = arr[:, 3:]
arr = torch.cat([xyz[:, :3], remains], dim=-1)
# convert arr to the original type
...
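The corrected depth/camera matrices above can be sanity-checked with plain tensors. A standalone sketch, assuming the usual conventions (depth: x right, y forward, z up; camera: x right, y down, z forward) and the same xyz @ rt_mat.t() application as in the conversion code:

import torch

xyz_depth = torch.tensor([[1.0, 2.0, 3.0]])  # 1 m right, 2 m forward, 3 m up
rt_mat = xyz_depth.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]])  # DEPTH -> CAM
xyz_cam = xyz_depth @ rt_mat.t()
print(xyz_cam)  # tensor([[ 1., -3.,  2.]]): up becomes -y (down), forward becomes z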
@@ -122,7 +122,20 @@ def points_cam2img(points_3d, proj_mat):
torch.Tensor: Points in image coordinates with shape [N, 2].
"""
points_num = list(points_3d.shape)[:-1]
points_shape = np.concatenate([points_num, [1]], axis=0).tolist()
assert len(proj_mat.shape) == 2, f'The dimension of the projection'\
f'matrix should be 2 instead of {len(proj_mat.shape)}.'
d1, d2 = proj_mat.shape[:2]
assert (d1 == 3 and d2 == 3) or (d1 == 3 and d2 == 4) or (
d1 == 4 and d2 == 4), f'The shape of the projection matrix'\
f' ({d1}*{d2}) is not supported.'
if d1 == 3:
proj_mat_expanded = torch.eye(
4, device=proj_mat.device, dtype=proj_mat.dtype)
proj_mat_expanded[:d1, :d2] = proj_mat
proj_mat = proj_mat_expanded
# previous implementation use new_zeros, new_one yeilds better results
points_4 = torch.cat(
[points_3d, points_3d.new_ones(*points_shape)], dim=-1)
...
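The added checks let points_cam2img accept 3x3, 3x4 or 4x4 projection matrices by padding the intrinsics to 4x4. A standalone sketch of the underlying pinhole projection, with made-up intrinsics (not SUN RGB-D calibration):

import torch

K = torch.tensor([[530.0, 0.0, 365.0],
                  [0.0, 530.0, 265.0],
                  [0.0, 0.0, 1.0]])  # made-up 3x3 intrinsics
proj_mat = torch.eye(4)
proj_mat[:3, :3] = K  # same padding as above

xyz_cam = torch.tensor([[0.5, -0.2, 2.0]])  # one point in camera coordinates
xyz_hom = torch.cat([xyz_cam, xyz_cam.new_ones(1, 1)], dim=-1)
uvw = xyz_hom @ proj_mat.t()
uv = uvw[:, :2] / uvw[:, 2:3]  # divide by depth -> pixel coordinates
print(uv)  # roughly [[497.5, 212.0]]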
@@ -10,14 +10,14 @@ class BasePoints(object):
tensor (torch.Tensor | np.ndarray | list): a N x points_dim matrix.
points_dim (int): Number of the dimension of a point.
Each row is (x, y, z). Default to 3.
-attribute_dims (dict): Dictinory to indicate the meaning of extra
attribute_dims (dict): Dictionary to indicate the meaning of extra
dimension. Default to None.
Attributes:
tensor (torch.Tensor): Float matrix of N x points_dim.
points_dim (int): Integer indicating the dimension of a point.
Each row is (x, y, z, ...).
-attribute_dims (bool): Dictinory to indicate the meaning of extra
attribute_dims (bool): Dictionary to indicate the meaning of extra
dimension. Default to None.
rotation_axis (int): Default rotation axis for points rotation.
"""
...
@@ -8,14 +8,14 @@ class CameraPoints(BasePoints):
tensor (torch.Tensor | np.ndarray | list): a N x points_dim matrix.
points_dim (int): Number of the dimension of a point.
Each row is (x, y, z). Default to 3.
-attribute_dims (dict): Dictinory to indicate the meaning of extra
attribute_dims (dict): Dictionary to indicate the meaning of extra
dimension. Default to None.
Attributes:
tensor (torch.Tensor): Float matrix of N x points_dim.
points_dim (int): Integer indicating the dimension of a point.
Each row is (x, y, z, ...).
-attribute_dims (bool): Dictinory to indicate the meaning of extra
attribute_dims (bool): Dictionary to indicate the meaning of extra
dimension. Default to None.
rotation_axis (int): Default rotation axis for points rotation.
"""
...
@@ -8,14 +8,14 @@ class DepthPoints(BasePoints):
tensor (torch.Tensor | np.ndarray | list): a N x points_dim matrix.
points_dim (int): Number of the dimension of a point.
Each row is (x, y, z). Default to 3.
-attribute_dims (dict): Dictinory to indicate the meaning of extra
attribute_dims (dict): Dictionary to indicate the meaning of extra
dimension. Default to None.
Attributes:
tensor (torch.Tensor): Float matrix of N x points_dim.
points_dim (int): Integer indicating the dimension of a point.
Each row is (x, y, z, ...).
-attribute_dims (bool): Dictinory to indicate the meaning of extra
attribute_dims (bool): Dictionary to indicate the meaning of extra
dimension. Default to None.
rotation_axis (int): Default rotation axis for points rotation.
"""
...
@@ -8,14 +8,14 @@ class LiDARPoints(BasePoints):
tensor (torch.Tensor | np.ndarray | list): a N x points_dim matrix.
points_dim (int): Number of the dimension of a point.
Each row is (x, y, z). Default to 3.
-attribute_dims (dict): Dictinory to indicate the meaning of extra
attribute_dims (dict): Dictionary to indicate the meaning of extra
dimension. Default to None.
Attributes:
tensor (torch.Tensor): Float matrix of N x points_dim.
points_dim (int): Integer indicating the dimension of a point.
Each row is (x, y, z, ...).
-attribute_dims (bool): Dictinory to indicate the meaning of extra
attribute_dims (bool): Dictionary to indicate the meaning of extra
dimension. Default to None.
rotation_axis (int): Default rotation axis for points rotation.
"""
...
@@ -137,8 +137,8 @@ class Collect3D(object):
'pcd_horizontal_flip', 'pcd_vertical_flip',
'box_mode_3d', 'box_type_3d', 'img_norm_cfg',
'rect', 'Trv2c', 'P2', 'pcd_trans', 'sample_idx',
-'pcd_scale_factor', 'pcd_rotation',
-'pts_filename')):
'pcd_scale_factor', 'pcd_rotation', 'pts_filename',
'transformation_3d_flow')):
self.keys = keys
self.meta_keys = meta_keys
...
@@ -96,10 +96,15 @@ class RandomFlip3D(RandomFlip):
) < self.flip_ratio_bev_vertical else False
input_dict['pcd_vertical_flip'] = flip_vertical
if 'transformation_3d_flow' not in input_dict:
input_dict['transformation_3d_flow'] = []
if input_dict['pcd_horizontal_flip']:
self.random_flip_data_3d(input_dict, 'horizontal')
input_dict['transformation_3d_flow'].extend(['HF'])
if input_dict['pcd_vertical_flip']:
self.random_flip_data_3d(input_dict, 'vertical')
input_dict['transformation_3d_flow'].extend(['VF'])
return input_dict
def __repr__(self):
@@ -405,6 +410,9 @@ class GlobalRotScaleTrans(object):
'pcd_scale_factor', 'pcd_trans' and keys in \
input_dict['bbox3d_fields'] are updated in the result dict.
"""
if 'transformation_3d_flow' not in input_dict:
input_dict['transformation_3d_flow'] = []
self._rot_bbox_points(input_dict)
if 'pcd_scale_factor' not in input_dict:
@@ -412,6 +420,8 @@ class GlobalRotScaleTrans(object):
self._scale_bbox_points(input_dict)
self._trans_bbox_points(input_dict)
input_dict['transformation_3d_flow'].extend(['R', 'S', 'T'])
return input_dict
def __repr__(self):
...
import numpy as np
from collections import OrderedDict
from os import path as osp
from mmdet3d.core import show_result
from mmdet3d.core.bbox import DepthInstance3DBoxes
from mmdet.core import eval_map
from mmdet.datasets import DATASETS
from .custom_3d import Custom3DDataset
@@ -59,6 +61,52 @@ class SUNRGBDDataset(Custom3DDataset):
box_type_3d=box_type_3d,
filter_empty_gt=filter_empty_gt,
test_mode=test_mode)
if self.modality is None:
self.modality = dict(use_camera=True, use_lidar=True)
assert self.modality['use_camera'] or self.modality['use_lidar']
def get_data_info(self, index):
"""Get data info according to the given index.
Args:
index (int): Index of the sample data to get.
Returns:
dict: Data information that will be passed to the data \
preprocessing pipelines. It includes the following keys:
- sample_idx (str): Sample index.
- pts_filename (str, optional): Filename of point clouds.
- file_name (str, optional): Filename of point clouds.
- img_prefix (str | None, optional): Prefix of image files.
- img_info (dict, optional): Image info.
- calib (dict, optional): Camera calibration info.
- ann_info (dict): Annotation info.
"""
info = self.data_infos[index]
sample_idx = info['point_cloud']['lidar_idx']
assert info['point_cloud']['lidar_idx'] == info['image']['image_idx']
input_dict = dict(sample_idx=sample_idx)
if self.modality['use_lidar']:
pts_filename = osp.join(self.data_root, info['pts_path'])
input_dict['pts_filename'] = pts_filename
input_dict['file_name'] = pts_filename
if self.modality['use_camera']:
img_filename = osp.join(self.data_root,
info['image']['image_path'])
input_dict['img_prefix'] = None
input_dict['img_info'] = dict(filename=img_filename)
calib = info['calib']
input_dict['calib'] = calib
if not self.test_mode:
annos = self.get_ann_info(index)
input_dict['ann_info'] = annos
if self.filter_empty_gt and len(annos['gt_bboxes_3d']) == 0:
return None
return input_dict
def get_ann_info(self, index):
"""Get annotation info according to the given index.
@@ -91,6 +139,15 @@ class SUNRGBDDataset(Custom3DDataset):
anns_results = dict(
gt_bboxes_3d=gt_bboxes_3d, gt_labels_3d=gt_labels_3d)
if self.modality['use_camera']:
if info['annos']['gt_num'] != 0:
gt_bboxes_2d = info['annos']['bbox'].astype(np.float32)
else:
gt_bboxes_2d = np.zeros((0, 4), dtype=np.float32)
anns_results['bboxes'] = gt_bboxes_2d
anns_results['labels'] = gt_labels_3d
return anns_results
def show(self, results, out_dir, show=True):
@@ -114,3 +171,33 @@ class SUNRGBDDataset(Custom3DDataset):
pred_bboxes = result['boxes_3d'].tensor.numpy()
show_result(points, gt_bboxes, pred_bboxes, out_dir, file_name,
show)
def evaluate(self,
results,
metric=None,
iou_thr=(0.25, 0.5),
iou_thr_2d=(0.5, ),
logger=None,
show=False,
out_dir=None):
# evaluate 3D detection performance
if isinstance(results[0], dict):
return super().evaluate(results, metric, iou_thr, logger, show,
out_dir)
# evaluate 2D detection performance
else:
eval_results = OrderedDict()
annotations = [self.get_ann_info(i) for i in range(len(self))]
iou_thr_2d = (iou_thr_2d) if isinstance(iou_thr_2d,
float) else iou_thr_2d
for iou_thr_2d_single in iou_thr_2d:
mean_ap, _ = eval_map(
results,
annotations,
scale_ranges=None,
iou_thr=iou_thr_2d_single,
dataset=self.CLASSES,
logger=logger)
eval_results['mAP_' + str(iou_thr_2d_single)] = mean_ap
return eval_results
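evaluate() above dispatches on the per-sample result type: dicts (the 3D detector output also consumed by show()) go through the parent-class 3D mAP evaluation, while the 2D image-branch pretraining produces per-class box arrays scored with mmdet's eval_map. A hypothetical sketch of the 2D format:

import numpy as np

num_classes = 10
# One image's 2D result: one (N, 5) array of [x1, y1, x2, y2, score] per class,
# which is the format mmdet.core.eval_map consumes.
result_2d = [np.zeros((0, 5), dtype=np.float32) for _ in range(num_classes)]
print(isinstance(result_2d, dict))  # False -> 2D branch above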
@@ -119,9 +119,9 @@ class VoteHead(nn.Module):
torch.Tensor: Features of input points.
torch.Tensor: Indices of input points.
"""
-seed_points = feat_dict['fp_xyz'][-1]
-seed_features = feat_dict['fp_features'][-1]
-seed_indices = feat_dict['fp_indices'][-1]
seed_points = feat_dict['seed_points']
seed_features = feat_dict['seed_features']
seed_indices = feat_dict['seed_indices']
return seed_points, seed_features, seed_indices
...
@@ -2,6 +2,7 @@ from .base import Base3DDetector
from .centerpoint import CenterPoint
from .dynamic_voxelnet import DynamicVoxelNet
from .h3dnet import H3DNet
from .imvotenet import ImVoteNet
from .mvx_faster_rcnn import DynamicMVXFasterRCNN, MVXFasterRCNN
from .mvx_two_stage import MVXTwoStageDetector
from .parta2 import PartA2
@@ -10,7 +11,16 @@ from .votenet import VoteNet
from .voxelnet import VoxelNet
__all__ = [
-'Base3DDetector', 'VoxelNet', 'DynamicVoxelNet', 'MVXTwoStageDetector',
-'DynamicMVXFasterRCNN', 'MVXFasterRCNN', 'PartA2', 'VoteNet', 'H3DNet',
-'CenterPoint', 'SSD3DNet'
'Base3DDetector',
'VoxelNet',
'DynamicVoxelNet',
'MVXTwoStageDetector',
'DynamicMVXFasterRCNN',
'MVXFasterRCNN',
'PartA2',
'VoteNet',
'H3DNet',
'CenterPoint',
'SSD3DNet',
'ImVoteNet',
]
@@ -7,10 +7,7 @@ from .single_stage import SingleStage3DDetector
@DETECTORS.register_module()
class VoteNet(SingleStage3DDetector):
-"""VoteNet model.
-https://arxiv.org/pdf/1904.09664.pdf
-"""
r"""`VoteNet <https://arxiv.org/pdf/1904.09664.pdf>`_ for 3D detection."""
def __init__(self,
backbone,
@@ -25,6 +22,28 @@ class VoteNet(SingleStage3DDetector):
test_cfg=test_cfg,
pretrained=pretrained)
def extract_feat(self, points, img_metas=None):
"""Directly extract features from the backbone+neck.
Args:
points (torch.Tensor): Input points.
"""
x = self.backbone(points)
if self.with_neck:
x = self.neck(x)
seed_points = x['fp_xyz'][-1]
seed_features = x['fp_features'][-1]
seed_indices = x['fp_indices'][-1]
feat_dict = {
'seed_points': seed_points,
'seed_features': seed_features,
'seed_indices': seed_indices
}
return feat_dict
def forward_train(self,
points,
img_metas,
...
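The refactored extract_feat above repacks the PointNet++ backbone outputs into the seed_* keys that VoteHead now reads (see the VoteHead hunk earlier). A sketch of that contract with illustrative shapes (batch of 2, 1024 seeds, 256-channel features, roughly matching the SUN RGB-D backbone config):

import torch

backbone_out = dict(
    fp_xyz=[torch.rand(2, 1024, 3)],
    fp_features=[torch.rand(2, 256, 1024)],
    fp_indices=[torch.randint(0, 20000, (2, 1024))])
feat_dict = dict(
    seed_points=backbone_out['fp_xyz'][-1],        # seeds consumed by VoteHead
    seed_features=backbone_out['fp_features'][-1],
    seed_indices=backbone_out['fp_indices'][-1])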
from .coord_transform import (apply_3d_transformation, bbox_2d_transform,
coord_2d_transform)
from .point_fusion import PointFusion
from .vote_fusion import VoteFusion
-__all__ = ['PointFusion']
__all__ = [
'PointFusion', 'VoteFusion', 'apply_3d_transformation',
'bbox_2d_transform', 'coord_2d_transform'
]
import torch
from functools import partial
from mmdet3d.core.points import get_points_type
def apply_3d_transformation(pcd, coords_type, img_meta, reverse=False):
"""Apply transformation to input point cloud.
Args:
pcd (torch.Tensor): The point cloud to be transformed.
coords_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'
img_meta(dict): Meta info regarding data transformation.
reverse (bool): Reversed transformation or not.
Note:
The elements in img_meta['transformation_3d_flow']:
"T" stands for translation;
"S" stands for scale;
"R" stands for rotation;
"HF" stands for horizontal flip;
"VF" stands for vertical flip.
Returns:
torch.Tensor: The transformed point cloud.
"""
dtype = pcd.dtype
device = pcd.device
pcd_rotate_mat = (
torch.tensor(img_meta['pcd_rotation'], dtype=dtype, device=device)
if 'pcd_rotation' in img_meta else torch.eye(
3, dtype=dtype, device=device))
pcd_scale_factor = (
img_meta['pcd_scale_factor'] if 'pcd_scale_factor' in img_meta else 1.)
pcd_trans_factor = (
torch.tensor(img_meta['pcd_trans'], dtype=dtype, device=device)
if 'pcd_trans' in img_meta else torch.zeros(
(3), dtype=dtype, device=device))
pcd_horizontal_flip = img_meta[
'pcd_horizontal_flip'] if 'pcd_horizontal_flip' in \
img_meta else False
pcd_vertical_flip = img_meta[
'pcd_vertical_flip'] if 'pcd_vertical_flip' in \
img_meta else False
flow = img_meta['transformation_3d_flow'] \
if 'transformation_3d_flow' in img_meta else []
pcd = pcd.clone() # prevent inplace modification
pcd = get_points_type(coords_type)(pcd)
horizontal_flip_func = partial(pcd.flip, bev_direction='horizontal') \
if pcd_horizontal_flip else lambda: None
vertical_flip_func = partial(pcd.flip, bev_direction='vertical') \
if pcd_vertical_flip else lambda: None
if reverse:
scale_func = partial(pcd.scale, scale_factor=1.0 / pcd_scale_factor)
translate_func = partial(pcd.translate, trans_vector=-pcd_trans_factor)
# pcd_rotate_mat @ pcd_rotate_mat.inverse() is not
# exactly an identity matrix
# use angle to create the inverse rot matrix neither.
rotate_func = partial(pcd.rotate, rotation=pcd_rotate_mat.inverse())
# reverse the pipeline
flow = flow[::-1]
else:
scale_func = partial(pcd.scale, scale_factor=pcd_scale_factor)
translate_func = partial(pcd.translate, trans_vector=pcd_trans_factor)
rotate_func = partial(pcd.rotate, rotation=pcd_rotate_mat)
flow_mapping = {
'T': translate_func,
'S': scale_func,
'R': rotate_func,
'HF': horizontal_flip_func,
'VF': vertical_flip_func
}
for op in flow:
assert op in flow_mapping, f'This 3D data '\
f'transformation op ({op}) is not supported'
func = flow_mapping[op]
func()
return pcd.coord
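A hypothetical usage sketch for apply_3d_transformation (importable from mmdet3d.models.fusion_layers once this file is in place): img_meta is hand-built here with the same keys that RandomFlip3D and GlobalRotScaleTrans record in the pipeline changes above, and reversing the flow should recover the original points up to floating-point error.

import math
import torch

c, s = math.cos(0.3), math.sin(0.3)
img_meta = dict(
    pcd_rotation=torch.tensor([[c, -s, 0.0], [s, c, 0.0], [0.0, 0.0, 1.0]]),
    pcd_scale_factor=1.05,
    pcd_trans=torch.tensor([0.1, -0.2, 0.0]),
    pcd_horizontal_flip=True,
    transformation_3d_flow=['HF', 'R', 'S', 'T'])

points = torch.rand(100, 3)
aug = apply_3d_transformation(points, 'DEPTH', img_meta, reverse=False)
restored = apply_3d_transformation(aug, 'DEPTH', img_meta, reverse=True)
print(torch.allclose(restored, points, atol=1e-5))  # True up to float error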
def extract_2d_info(img_meta, tensor):
"""Extract image augmentation information from img_meta.
Args:
img_meta(dict): Meta info regarding data transformation.
tensor(torch.Tensor): Input tensor used to create new ones.
Returns:
(int, int, int, int, torch.Tensor, bool, torch.Tensor):
The extracted information.
"""
img_shape = img_meta['img_shape']
ori_shape = img_meta['ori_shape']
img_h, img_w, _ = img_shape
ori_h, ori_w, _ = ori_shape
img_scale_factor = (
tensor.new_tensor(img_meta['scale_factor'][:2])
if 'scale_factor' in img_meta else tensor.new_tensor([1.0, 1.0]))
img_flip = img_meta['flip'] if 'flip' in img_meta else False
img_crop_offset = (
tensor.new_tensor(img_meta['img_crop_offset'])
if 'img_crop_offset' in img_meta else tensor.new_tensor([0.0, 0.0]))
return (img_h, img_w, ori_h, ori_w, img_scale_factor, img_flip,
img_crop_offset)
def bbox_2d_transform(img_meta, bbox_2d, ori2new):
"""Transform 2d bbox according to img_meta.
Args:
img_meta(dict): Meta info regarding data transformation.
bbox_2d (torch.Tensor): Shape (..., >4)
The input 2d bboxes to transform.
ori2new (bool): Origin img coord system to new or not.
Returns:
torch.Tensor: The transformed 2d bboxes.
"""
img_h, img_w, ori_h, ori_w, img_scale_factor, img_flip, \
img_crop_offset = extract_2d_info(img_meta, bbox_2d)
bbox_2d_new = bbox_2d.clone()
if ori2new:
bbox_2d_new[:, 0] = bbox_2d_new[:, 0] * img_scale_factor[0]
bbox_2d_new[:, 2] = bbox_2d_new[:, 2] * img_scale_factor[0]
bbox_2d_new[:, 1] = bbox_2d_new[:, 1] * img_scale_factor[1]
bbox_2d_new[:, 3] = bbox_2d_new[:, 3] * img_scale_factor[1]
bbox_2d_new[:, 0] = bbox_2d_new[:, 0] + img_crop_offset[0]
bbox_2d_new[:, 2] = bbox_2d_new[:, 2] + img_crop_offset[0]
bbox_2d_new[:, 1] = bbox_2d_new[:, 1] + img_crop_offset[1]
bbox_2d_new[:, 3] = bbox_2d_new[:, 3] + img_crop_offset[1]
if img_flip:
bbox_2d_r = img_w - bbox_2d_new[:, 0]
bbox_2d_l = img_w - bbox_2d_new[:, 2]
bbox_2d_new[:, 0] = bbox_2d_l
bbox_2d_new[:, 2] = bbox_2d_r
else:
if img_flip:
bbox_2d_r = img_w - bbox_2d_new[:, 0]
bbox_2d_l = img_w - bbox_2d_new[:, 2]
bbox_2d_new[:, 0] = bbox_2d_l
bbox_2d_new[:, 2] = bbox_2d_r
bbox_2d_new[:, 0] = bbox_2d_new[:, 0] - img_crop_offset[0]
bbox_2d_new[:, 2] = bbox_2d_new[:, 2] - img_crop_offset[0]
bbox_2d_new[:, 1] = bbox_2d_new[:, 1] - img_crop_offset[1]
bbox_2d_new[:, 3] = bbox_2d_new[:, 3] - img_crop_offset[1]
bbox_2d_new[:, 0] = bbox_2d_new[:, 0] / img_scale_factor[0]
bbox_2d_new[:, 2] = bbox_2d_new[:, 2] / img_scale_factor[0]
bbox_2d_new[:, 1] = bbox_2d_new[:, 1] / img_scale_factor[1]
bbox_2d_new[:, 3] = bbox_2d_new[:, 3] / img_scale_factor[1]
return bbox_2d_new
def coord_2d_transform(img_meta, coord_2d, ori2new):
"""Transform 2d pixel coordinates according to img_meta.
Args:
img_meta(dict): Meta info regarding data transformation.
coord_2d (torch.Tensor): Shape (..., 2)
The input 2d coords to transform.
ori2new (bool): Origin img coord system to new or not.
Returns:
torch.Tensor: The transformed 2d coordinates.
"""
img_h, img_w, ori_h, ori_w, img_scale_factor, img_flip, \
img_crop_offset = extract_2d_info(img_meta, coord_2d)
coord_2d_new = coord_2d.clone()
if ori2new:
# TODO here we assume this order of transformation
coord_2d_new[..., 0] = coord_2d_new[..., 0] * img_scale_factor[0]
coord_2d_new[..., 1] = coord_2d_new[..., 1] * img_scale_factor[1]
coord_2d_new[..., 0] += img_crop_offset[0]
coord_2d_new[..., 1] += img_crop_offset[1]
# flip uv coordinates and bbox
if img_flip:
coord_2d_new[..., 0] = img_w - coord_2d_new[..., 0]
else:
if img_flip:
coord_2d_new[..., 0] = img_w - coord_2d_new[..., 0]
coord_2d_new[..., 0] -= img_crop_offset[0]
coord_2d_new[..., 1] -= img_crop_offset[1]
coord_2d_new[..., 0] = coord_2d_new[..., 0] / img_scale_factor[0]
coord_2d_new[..., 1] = coord_2d_new[..., 1] / img_scale_factor[1]
return coord_2d_new
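Similarly, a hypothetical round trip through coord_2d_transform: mapping pixel coordinates from the original image into the augmented one and back should be the identity. All img_meta values below are made up.

import torch

img_meta = dict(
    img_shape=(600, 800, 3),   # augmented image (h, w, c)
    ori_shape=(531, 730, 3),   # original image
    scale_factor=[1.096, 1.13, 1.096, 1.13],
    flip=True)
uv_ori = torch.tensor([[100.0, 200.0], [10.0, 20.0]])
uv_aug = coord_2d_transform(img_meta, uv_ori, True)    # original -> augmented
uv_back = coord_2d_transform(img_meta, uv_aug, False)  # augmented -> original
print(torch.allclose(uv_back, uv_ori, atol=1e-4))      # True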
@@ -4,18 +4,16 @@ from torch import nn as nn
from torch.nn import functional as F
from ..registry import FUSION_LAYERS
from . import apply_3d_transformation
def point_sample(
img_meta,
img_features,
points,
lidar2img_rt,
-pcd_rotate_mat,
img_scale_factor,
img_crop_offset,
-pcd_trans_factor,
-pcd_scale_factor,
-pcd_flip,
img_flip,
img_pad_shape,
img_shape,
@@ -26,19 +24,14 @@ def point_sample(
"""Obtain image features using points.
Args:
img_meta (dict): Meta info.
img_features (torch.Tensor): 1 x C x H x W image features.
points (torch.Tensor): Nx3 point cloud in LiDAR coordinates.
lidar2img_rt (torch.Tensor): 4x4 transformation matrix.
-pcd_rotate_mat (torch.Tensor): 3x3 rotation matrix of points
-during augmentation.
img_scale_factor (torch.Tensor): Scale factor with shape of \
(w_scale, h_scale).
img_crop_offset (torch.Tensor): Crop offset used to crop \
image during data augmentation with shape of (w_offset, h_offset).
-pcd_trans_factor ([type]): Translation of points in augmentation.
-pcd_scale_factor (float): Scale factor of points during.
-data augmentation
-pcd_flip (bool): Whether the points are flipped.
img_flip (bool): Whether the image is flipped.
img_pad_shape (tuple[int]): int tuple indicates the h & w after
padding, this is necessary to obtain features in feature map.
@@ -54,19 +47,9 @@ def point_sample(
Returns:
torch.Tensor: NxC image features sampled by point coordinates.
"""
-# aug order: flip -> trans -> scale -> rot
-# The transformation follows the augmentation order in data pipeline
-if pcd_flip:
-# if the points are flipped, flip them back first
-points[:, 1] = -points[:, 1]
-points -= pcd_trans_factor
-# the points should be scaled to the original scale in velo coordinate
-points /= pcd_scale_factor
-# the points should be rotated back
-# pcd_rotate_mat @ pcd_rotate_mat.inverse() is not exactly an identity
-# matrix, use angle to create the inverse rot matrix neither.
-points = points @ pcd_rotate_mat.inverse()
# apply transformation based on info in img_meta
points = apply_3d_transformation(points, 'LIDAR', img_meta, reverse=True)
# project points from velo coordinate to camera coordinate
num_points = points.shape[0]
@@ -298,34 +281,21 @@ class PointFusion(nn.Module):
Returns:
torch.Tensor: Single level image features of each point.
"""
-pcd_scale_factor = (
-img_meta['pcd_scale_factor']
-if 'pcd_scale_factor' in img_meta.keys() else 1)
-pcd_trans_factor = (
-pts.new_tensor(img_meta['pcd_trans'])
-if 'pcd_trans' in img_meta.keys() else 0)
-pcd_rotate_mat = (
-pts.new_tensor(img_meta['pcd_rotation']) if 'pcd_rotation'
-in img_meta.keys() else torch.eye(3).type_as(pts).to(pts.device))
# TODO: image transformation also extracted
img_scale_factor = (
pts.new_tensor(img_meta['scale_factor'][:2])
if 'scale_factor' in img_meta.keys() else 1)
-pcd_flip = img_meta['pcd_flip'] if 'pcd_flip' in img_meta.keys(
-) else False
img_flip = img_meta['flip'] if 'flip' in img_meta.keys() else False
img_crop_offset = (
pts.new_tensor(img_meta['img_crop_offset'])
if 'img_crop_offset' in img_meta.keys() else 0)
img_pts = point_sample(
img_meta,
img_feats,
pts,
pts.new_tensor(img_meta['lidar2img']),
-pcd_rotate_mat,
img_scale_factor,
img_crop_offset,
-pcd_trans_factor,
-pcd_scale_factor,
-pcd_flip=pcd_flip,
img_flip=img_flip,
img_pad_shape=img_meta['input_shape'][:2],
img_shape=img_meta['img_shape'][:2],
...
import torch
from torch import nn as nn
from mmdet3d.core.bbox import Coord3DMode, points_cam2img
from ..registry import FUSION_LAYERS
from . import apply_3d_transformation, bbox_2d_transform, coord_2d_transform
EPS = 1e-6
@FUSION_LAYERS.register_module()
class VoteFusion(nn.Module):
"""Fuse 2d features from 3d seeds.
Args:
num_classes (int): number of classes.
max_imvote_per_pixel (int): max number of imvotes.
"""
def __init__(self, num_classes=10, max_imvote_per_pixel=3):
super(VoteFusion, self).__init__()
self.num_classes = num_classes
self.max_imvote_per_pixel = max_imvote_per_pixel
def forward(self, imgs, bboxes_2d_rescaled, seeds_3d_depth, img_metas,
calibs):
"""Forward function.
Args:
imgs (list[torch.Tensor]): Image features.
bboxes_2d_rescaled (list[torch.Tensor]): 2D bboxes.
seeds_3d_depth (torch.Tensor): 3D seeds.
img_metas (list[dict]): Meta information of images.
calibs: Camera calibration information of the images.
Returns:
torch.Tensor: Concatenated cues of each point.
torch.Tensor: Validity mask of each feature.
"""
img_features = []
masks = []
for i, data in enumerate(
zip(imgs, bboxes_2d_rescaled, seeds_3d_depth, img_metas)):
img, bbox_2d_rescaled, seed_3d_depth, img_meta = data
bbox_num = bbox_2d_rescaled.shape[0]
seed_num = seed_3d_depth.shape[0]
img_shape = img_meta['img_shape']
img_h, img_w, _ = img_shape
# first reverse the data transformations
xyz_depth = apply_3d_transformation(
seed_3d_depth, 'DEPTH', img_meta, reverse=True)
# then convert from depth coords to camera coords
xyz_cam = Coord3DMode.convert_point(
xyz_depth,
Coord3DMode.DEPTH,
Coord3DMode.CAM,
rt_mat=calibs['Rt'][i])
# project to 2d to get image coords (uv)
uv_origin = points_cam2img(xyz_cam, calibs['K'][i])
uv_origin = (uv_origin - 1).round()
# rescale 2d coordinates and bboxes
uv_rescaled = coord_2d_transform(img_meta, uv_origin, True)
bbox_2d_origin = bbox_2d_transform(img_meta, bbox_2d_rescaled,
False)
if bbox_num == 0:
imvote_num = seed_num * self.max_imvote_per_pixel
# use zero features
two_cues = torch.zeros((15, imvote_num),
device=seed_3d_depth.device)
mask_zero = torch.zeros(
imvote_num - seed_num, device=seed_3d_depth.device).bool()
mask_one = torch.ones(
seed_num, device=seed_3d_depth.device).bool()
mask = torch.cat([mask_one, mask_zero], dim=0)
else:
# expand bboxes and seeds
bbox_expanded = bbox_2d_origin.view(1, bbox_num, -1).expand(
seed_num, -1, -1)
seed_2d_expanded = uv_origin.view(seed_num, 1,
-1).expand(-1, bbox_num, -1)
seed_2d_expanded_x, seed_2d_expanded_y = \
seed_2d_expanded.split(1, dim=-1)
bbox_expanded_l, bbox_expanded_t, bbox_expanded_r, \
bbox_expanded_b, bbox_expanded_conf, bbox_expanded_cls = \
bbox_expanded.split(1, dim=-1)
bbox_expanded_midx = (bbox_expanded_l + bbox_expanded_r) / 2
bbox_expanded_midy = (bbox_expanded_t + bbox_expanded_b) / 2
seed_2d_in_bbox_x = (seed_2d_expanded_x > bbox_expanded_l) * \
(seed_2d_expanded_x < bbox_expanded_r)
seed_2d_in_bbox_y = (seed_2d_expanded_y > bbox_expanded_t) * \
(seed_2d_expanded_y < bbox_expanded_b)
seed_2d_in_bbox = seed_2d_in_bbox_x * seed_2d_in_bbox_y
# semantic cues, dim=class_num
sem_cue = torch.zeros_like(bbox_expanded_conf).expand(
-1, -1, self.num_classes)
sem_cue = sem_cue.scatter(-1, bbox_expanded_cls.long(),
bbox_expanded_conf)
# bbox center - uv
delta_u = bbox_expanded_midx - seed_2d_expanded_x
delta_v = bbox_expanded_midy - seed_2d_expanded_y
seed_3d_expanded = seed_3d_depth.view(seed_num, 1, -1).expand(
-1, bbox_num, -1)
z_cam = xyz_cam[..., 2:3].view(seed_num, 1,
1).expand(-1, bbox_num, -1)
delta_u = delta_u * z_cam / calibs['K'][i, 0, 0]
delta_v = delta_v * z_cam / calibs['K'][i, 0, 0]
imvote = torch.cat(
[delta_u, delta_v,
torch.zeros_like(delta_v)], dim=-1).view(-1, 3)
# convert from camera coords to depth coords
imvote = Coord3DMode.convert_point(
imvote.view((-1, 3)),
Coord3DMode.CAM,
Coord3DMode.DEPTH,
rt_mat=calibs['Rt'][i])
# apply transformation to lifted imvotes
imvote = apply_3d_transformation(
imvote, 'DEPTH', img_meta, reverse=False)
seed_3d_expanded = seed_3d_expanded.reshape(imvote.shape)
# ray angle
ray_angle = seed_3d_expanded + imvote
ray_angle /= torch.sqrt(torch.sum(ray_angle**2, -1) +
EPS).unsqueeze(-1)
# imvote lifted to 3d
xz = ray_angle[:, [0, 2]] / (ray_angle[:, [1]] + EPS) \
* seed_3d_expanded[:, [1]] - seed_3d_expanded[:, [0, 2]]
# geometric cues, dim=5
geo_cue = torch.cat([xz, ray_angle],
dim=-1).view(seed_num, -1, 5)
two_cues = torch.cat([geo_cue, sem_cue], dim=-1)
# mask to 0 if seed not in bbox
two_cues = two_cues * seed_2d_in_bbox.float()
feature_size = two_cues.shape[-1]
# if bbox number is too small, append zeros
if bbox_num < self.max_imvote_per_pixel:
append_num = self.max_imvote_per_pixel - bbox_num
append_zeros = torch.zeros(
(seed_num, append_num, 1),
device=seed_2d_in_bbox.device).bool()
seed_2d_in_bbox = torch.cat(
[seed_2d_in_bbox, append_zeros], dim=1)
append_zeros = torch.zeros(
(seed_num, append_num, feature_size),
device=two_cues.device)
two_cues = torch.cat([two_cues, append_zeros], dim=1)
append_zeros = torch.zeros((seed_num, append_num, 1),
device=two_cues.device)
bbox_expanded_conf = torch.cat(
[bbox_expanded_conf, append_zeros], dim=1)
# sort the valid seed-bbox pair according to confidence
pair_score = seed_2d_in_bbox.float() + bbox_expanded_conf
# and find the largests
mask, indices = pair_score.topk(
self.max_imvote_per_pixel,
dim=1,
largest=True,
sorted=True)
indices_img = indices.expand(-1, -1, feature_size)
two_cues = two_cues.gather(dim=1, index=indices_img)
two_cues = two_cues.transpose(1, 0)
two_cues = two_cues.reshape(-1, feature_size).transpose(
1, 0).contiguous()
# since conf is ~ (0, 1), floor gives us validity
mask = mask.floor().int()
mask = mask.transpose(1, 0).reshape(-1).bool()
# clear the padding
img = img[:, :img_shape[0], :img_shape[1]]
img_flatten = img.reshape(3, -1).float()
img_flatten /= 255.
# take the normalized pixel value as texture cue
uv_flatten = uv_rescaled[:, 1].round() * \
img_shape[1] + uv_rescaled[:, 0].round()
uv_expanded = uv_flatten.unsqueeze(0).expand(3, -1).long()
txt_cue = torch.gather(img_flatten, dim=-1, index=uv_expanded)
txt_cue = txt_cue.unsqueeze(1).expand(-1,
self.max_imvote_per_pixel,
-1).reshape(3, -1)
# append texture cue
img_feature = torch.cat([two_cues, txt_cue], dim=0)
img_features.append(img_feature)
masks.append(mask)
return torch.stack(img_features, 0), torch.stack(masks, 0)
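For orientation, the cues assembled above have a fixed width: 5 geometric plus num_classes semantic channels (15 with the 10 SUN RGB-D classes) and 3 texture channels, which matches the 18-channel img_mlp input in the ImVoteNet config earlier. A hypothetical shape summary, not part of the commit:

# With the VoteFusion class above in scope:
fusion = VoteFusion(num_classes=10, max_imvote_per_pixel=3)
# forward(imgs, bboxes_2d_rescaled, seeds_3d_depth, img_metas, calibs) returns
#   features: (batch, 5 + 10 + 3 = 18, num_seeds * 3) fused cues per image vote
#   masks:    (batch, num_seeds * 3) validity of each seed-box pair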