Unverified Commit 6d3518d0 authored by lianqing01 and committed by GitHub

[Features] Support waymo challenge solution (#1716)



* update evaluation metric to support waymo cam only evaluation

* add transformation for bev detection

* add multiview dfm

* support multiview detection in datasets with transformation, dfm model and metric

* remove deprecated config and update doc string

* remove file_client_args=file_client_args and update docstring

* add doc string and remove pdb

* fix the doc string of voxel fusion

* add doc string

* remove lidar2img

* add doc string

* update doc string

* support waymo dataset for replace_ceph and modify path of pkl in config

* update evaluation metrics and the config for the waymo solution

* fix the index error in waymo metric and add lidar2img utils function

* replace __call__ with transform

* fix doc string

* rename configs

* update the config name

* update the lidar2cam calib in waymo data creator
Co-authored-by: lianqing <lianqing1997@gmail.com>
Co-authored-by: Tai-Wang <tab_wang@outlook.com>
parent 25e38012
...@@ -133,3 +133,4 @@ data/sunrgbd/OFFICIAL_SUNRGBD/
# Waymo evaluation
mmdet3d/core/evaluation/waymo_utils/compute_detection_metrics_main
mmdet3d/core/evaluation/waymo_utils/compute_detection_let_metrics_main
# dataset settings
# D3 in the config name means the whole dataset is divided into 3 folds
# We only use one fold for efficient experiments
dataset_type = 'WaymoDataset'
data_root = 'data/waymo/kitti_format/'
class_names = ['Car', 'Pedestrian', 'Cyclist']
input_modality = dict(use_lidar=False, use_camera=True)
file_client_args = dict(backend='disk')
# Uncomment the following if using ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
train_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='LoadAnnotations3D',
with_bbox=True,
with_label=True,
with_attr_label=False,
with_bbox_3d=True,
with_label_3d=True,
with_bbox_depth=True),
# base shape (1248, 832), scale (0.95, 1.05)
dict(
type='RandomResize3D',
        scale=(1248, 832),
ratio_range=(0.95, 1.05),
keep_ratio=True,
),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(
type='Pack3DDetInputs',
keys=[
'img', 'gt_bboxes', 'gt_bboxes_labels', 'gt_bboxes_3d',
'gt_labels_3d', 'centers_2d', 'depths'
]),
]
test_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='RandomResize3D',
scale=(1248, 832),
ratio_range=(1., 1.),
keep_ratio=True),
dict(type='Pack3DDetInputs', keys=['img']),
]
# construct a pipeline for data and gt loading in show function
# please keep its loading function consistent with test_pipeline (e.g. client)
eval_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='RandomResize3D',
scale=(1248, 832),
ratio_range=(1., 1.),
keep_ratio=True),
dict(type='Pack3DDetInputs', keys=['img']),
]
metainfo = dict(CLASSES=class_names)
train_dataloader = dict(
batch_size=3,
num_workers=3,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='waymo_infos_train.pkl',
data_prefix=dict(
pts='training/velodyne',
CAM_FRONT='training/image_0',
CAM_FRONT_RIGHT='training/image_1',
CAM_FRONT_LEFT='training/image_2',
CAM_SIDE_RIGHT='training/image_3',
CAM_SIDE_LEFT='training/image_4'),
pipeline=train_pipeline,
modality=input_modality,
test_mode=False,
metainfo=metainfo,
        # we use box_type_3d='LiDAR' in kitti and nuscenes datasets
        # and box_type_3d='Depth' in sunrgbd and scannet datasets.
box_type_3d='Camera',
task='mono3d',
# load one frame every three frames
load_interval=5))
val_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_prefix=dict(
pts='training/velodyne',
CAM_FRONT='training/image_0',
CAM_FRONT_RIGHT='training/image_1',
CAM_FRONT_LEFT='training/image_2',
CAM_SIDE_RIGHT='training/image_3',
CAM_SIDE_LEFT='training/image_4'),
ann_file='waymo_infos_val.pkl',
pipeline=eval_pipeline,
modality=input_modality,
test_mode=True,
metainfo=metainfo,
        # we use box_type_3d='LiDAR' in kitti and nuscenes datasets
        # and box_type_3d='Depth' in sunrgbd and scannet datasets.
box_type_3d='Camera',
task='mono3d',
))
test_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_prefix=dict(
pts='training/velodyne',
CAM_FRONT='training/image_0',
CAM_FRONT_RIGHT='training/image_1',
CAM_FRONT_LEFT='training/image_2',
CAM_SIDE_RIGHT='training/image_3',
CAM_SIDE_LEFT='training/image_4'),
ann_file='waymo_infos_val.pkl',
pipeline=eval_pipeline,
modality=input_modality,
test_mode=True,
metainfo=metainfo,
        # we use box_type_3d='LiDAR' in kitti and nuscenes datasets
        # and box_type_3d='Depth' in sunrgbd and scannet datasets.
box_type_3d='Camera',
task='mono3d',
))
val_evaluator = dict(
type='WaymoMetric',
ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl',
waymo_bin_file='./data/waymo/waymo_format/cam_gt.bin',
data_root='./data/waymo/waymo_format',
metric='LET_mAP',
task='mono3d')
test_evaluator = val_evaluator
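
For reference, a dataset/evaluator config like the one above is normally consumed through mmengine rather than imported directly. A minimal sketch, assuming the snippet above is saved as a standalone base config (the file path below is hypothetical):

# Minimal sketch: load the mono3d Waymo dataset config and inspect a few fields.
# The config path is a placeholder; point it at wherever the snippet above lives
# in your checkout.
from mmengine.config import Config

cfg = Config.fromfile('configs/_base_/datasets/waymo_mono3d_example.py')  # hypothetical path
print(cfg.train_dataloader.dataset.ann_file)  # waymo_infos_train.pkl
print(cfg.val_evaluator.metric)               # LET_mAP
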
# dataset settings
# D5 in the config name means the whole dataset is divided into 5 folds
# We only use one fold for efficient experiments
dataset_type = 'WaymoDataset'
data_root = 'data/waymo/kitti_format/'
file_client_args = dict(backend='disk')
# Uncomment the following if using ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
class_names = ['Car', 'Pedestrian', 'Cyclist']
input_modality = dict(use_lidar=False, use_camera=True)
point_cloud_range = [-35.0, -75.0, -2, 75.0, 75.0, 4]
train_transforms = [
dict(type='PhotoMetricDistortion3D'),
dict(
type='RandomResize3D',
scale=(1248, 832),
ratio_range=(0.95, 1.05),
keep_ratio=True),
dict(type='RandomCrop3D', crop_size=(720, 1080)),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5, flip_box3d=False),
]
train_pipeline = [
dict(type='LoadMultiViewImageFromFiles', to_float32=True),
dict(
type='LoadAnnotations3D',
with_bbox=True,
with_label=True,
with_attr_label=False,
with_bbox_3d=True,
with_label_3d=True,
with_bbox_depth=True),
dict(type='MultiViewWrapper', transforms=train_transforms),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectNameFilter', classes=class_names),
dict(
type='Pack3DDetInputs', keys=[
'img',
'gt_bboxes_3d',
'gt_labels_3d',
]),
]
test_transforms = [
dict(
type='RandomResize3D',
scale=(1248, 832),
ratio_range=(1., 1.),
keep_ratio=True)
]
test_pipeline = [
dict(type='LoadMultiViewImageFromFiles', to_float32=True),
dict(type='MultiViewWrapper', transforms=test_transforms),
dict(type='Pack3DDetInputs', keys=['img'])
]
# construct a pipeline for data and gt loading in show function
# please keep its loading function consistent with test_pipeline (e.g. client)
eval_pipeline = [
dict(type='LoadMultiViewImageFromFiles', to_float32=True),
dict(type='MultiViewWrapper', transforms=test_transforms),
dict(type='Pack3DDetInputs', keys=['img'])
]
metainfo = dict(CLASSES=class_names)
train_dataloader = dict(
batch_size=2,
num_workers=2,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='waymo_infos_train.pkl',
data_prefix=dict(
pts='training/velodyne',
CAM_FRONT='training/image_0',
CAM_FRONT_RIGHT='training/image_1',
CAM_FRONT_LEFT='training/image_2',
CAM_SIDE_RIGHT='training/image_3',
CAM_SIDE_LEFT='training/image_4',
),
pipeline=train_pipeline,
modality=input_modality,
test_mode=False,
metainfo=metainfo,
box_type_3d='Lidar',
load_interval=5,
))
val_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='waymo_infos_val.pkl',
data_prefix=dict(
pts='training/velodyne',
CAM_FRONT='training/image_0',
CAM_FRONT_RIGHT='training/image_1',
CAM_FRONT_LEFT='training/image_2',
CAM_SIDE_RIGHT='training/image_3',
CAM_SIDE_LEFT='training/image_4',
),
pipeline=eval_pipeline,
modality=input_modality,
test_mode=True,
metainfo=metainfo,
box_type_3d='Lidar',
))
test_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='waymo_infos_val.pkl',
data_prefix=dict(
pts='training/velodyne',
CAM_FRONT='training/image_0',
CAM_FRONT_RIGHT='training/image_1',
CAM_FRONT_LEFT='training/image_2',
CAM_SIDE_RIGHT='training/image_3',
CAM_SIDE_LEFT='training/image_4',
),
pipeline=eval_pipeline,
modality=input_modality,
test_mode=True,
metainfo=metainfo,
box_type_3d='Lidar',
))
val_evaluator = dict(
type='WaymoMetric',
ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl',
waymo_bin_file='./data/waymo/waymo_format/cam_gt.bin',
data_root='./data/waymo/waymo_format',
metric='LET_mAP')
test_evaluator = val_evaluator
model = dict(
type='MultiViewDfM',
data_preprocessor=dict(
type='Det3DDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True,
pad_size_divisor=32),
backbone=dict(
type='mmdet.ResNet',
depth=101,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type='BN', requires_grad=False),
norm_eval=True,
style='pytorch',
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'),
dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
stage_with_dcn=(False, False, True, True)),
neck=dict(
type='mmdet.FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=64,
num_outs=4),
neck_2d=None,
bbox_head_2d=None,
backbone_stereo=None,
depth_head=None,
backbone_3d=None,
neck_3d=dict(type='OutdoorImVoxelNeck', in_channels=64, out_channels=256),
valid_sample=True,
    voxel_size=(0.5, 0.5, 0.5),  # n_voxels=[220, 300, 12]
anchor_generator=dict(
type='AlignedAnchor3DRangeGenerator',
ranges=[[-35.0, -75.0, -2, 75.0, 75.0, 4]],
rotations=[.0]),
bbox_head=dict(
type='Anchor3DHead',
num_classes=3,
in_channels=256,
feat_channels=256,
use_direction_classifier=True,
anchor_generator=dict(
type='AlignedAnchor3DRangeGenerator',
ranges=[[-35.0, -75.0, -0.0345, 75.0, 75.0, -0.0345],
[-35.0, -75.0, 0, 75.0, 75.0, 0],
[-35.0, -75.0, -0.1188, 75.0, 75.0, -0.1188]],
sizes=[
[4.73, 2.08, 1.77], # car
[0.91, 0.84, 1.74], # pedestrian
[1.81, 0.84, 1.77], # cyclist
],
rotations=[0, 1.57],
reshape_out=False),
diff_rad_by_sin=True,
dir_offset=-0.7854, # -pi / 4
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
loss_cls=dict(
type='mmdet.FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(
type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
loss_dir=dict(
type='mmdet.CrossEntropyLoss', use_sigmoid=False,
loss_weight=0.2)),
train_cfg=dict(
assigner=[
dict( # for Car
type='Max3DIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.6,
neg_iou_thr=0.45,
min_pos_iou=0.45,
ignore_iof_thr=-1),
dict( # for Pedestrian
type='Max3DIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.5,
neg_iou_thr=0.35,
min_pos_iou=0.35,
ignore_iof_thr=-1),
dict( # for Cyclist
type='Max3DIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.5,
neg_iou_thr=0.35,
min_pos_iou=0.35,
ignore_iof_thr=-1),
],
allowed_border=0,
pos_weight=-1,
debug=False),
test_cfg=dict(
use_rotate_nms=True,
nms_across_levels=False,
nms_thr=0.05,
score_thr=0.001,
min_bbox_size=0,
nms_pre=500,
max_num=100))
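
The voxel grid noted in the comment next to `voxel_size` follows directly from the anchor range and the voxel size; the same arithmetic appears in `MultiViewDfM.__init__` further down in this diff. A quick standalone check:

# Standalone check of the BEV voxel grid implied by the model config above.
voxel_size = (0.5, 0.5, 0.5)
voxel_range = [-35.0, -75.0, -2, 75.0, 75.0, 4]  # [x_min, y_min, z_min, x_max, y_max, z_max]

n_voxels = [
    round((voxel_range[3] - voxel_range[0]) / voxel_size[0]),  # x: 220
    round((voxel_range[4] - voxel_range[1]) / voxel_size[1]),  # y: 300
    round((voxel_range[5] - voxel_range[2]) / voxel_size[2]),  # z: 12
]
print(n_voxels)  # [220, 300, 12]
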
_base_ = [
'../_base_/datasets/waymoD5-mv3d-3class.py',
'../_base_/models/multiview_dfm.py'
]
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='AdamW', lr=0.0005, weight_decay=0.0001),
paramwise_cfg=dict(
custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)}),
clip_grad=dict(max_norm=35., norm_type=2))
param_scheduler = [
dict(
type='MultiStepLR',
begin=0,
end=24,
by_epoch=True,
milestones=[16, 22],
gamma=0.1)
]
# hooks
default_hooks = dict(
timer=dict(type='IterTimerHook'),
logger=dict(type='LoggerHook', interval=50),
param_scheduler=dict(type='ParamSchedulerHook'),
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=1),
sampler_seed=dict(type='DistSamplerSeedHook'),
)
# training schedule for 2x
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=24, val_interval=24)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
# runtime
default_scope = 'mmdet3d'
env_cfg = dict(
cudnn_benchmark=False,
mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
dist_cfg=dict(backend='nccl'),
)
log_level = 'INFO'
load_from = None
resume = False
find_unused_parameters = True # only 1 of 4 FPN outputs is used
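
With the dataset, model, schedule and runtime pieces above merged via `_base_`, training is typically launched through mmengine's Runner. A minimal sketch, assuming the merged config is saved under the hypothetical location below, a local work directory, and that the Waymo kitti_format data has already been prepared:

# Minimal training sketch via mmengine; config location and work_dir are assumptions.
from mmengine.config import Config
from mmengine.runner import Runner

cfg = Config.fromfile(
    'configs/dfm/multiview-dfm_r101_dcn_2x16_waymoD5-3d-3class.py')  # hypothetical location
cfg.work_dir = './work_dirs/multiview_dfm_waymoD5'  # Runner requires a work_dir

runner = Runner.from_cfg(cfg)
runner.train()
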
_base_ = ['./multiview-dfm_r101_dcn_2x16_waymoD5-3d-3class.py']
model = dict(
bbox_head=dict(
_delete_=True,
type='CenterHead',
in_channels=256,
tasks=[
dict(num_class=1, class_names=['Pedestrian']),
dict(num_class=1, class_names=['Cyclist']),
dict(num_class=1, class_names=['Car']),
],
common_heads=dict(reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2)),
share_conv_channel=64,
bbox_coder=dict(
type='CenterPointBBoxCoder',
post_center_range=[-35.0, -75.0, -2, 75.0, 75.0, 4],
pc_range=[-35.0, -75.0, -2, 75.0, 75.0, 4],
max_num=2000,
score_threshold=0,
out_size_factor=1,
voxel_size=(.50, .50),
code_size=7),
separate_head=dict(
type='SeparateHead', init_bias=-2.19, final_kernel=3),
loss_cls=dict(type='mmdet.GaussianFocalLoss', reduction='mean'),
loss_bbox=dict(
type='mmdet.L1Loss', reduction='mean', loss_weight=0.25),
norm_bbox=True),
train_cfg=dict(
_delete_=True,
grid_size=[220, 300, 1],
voxel_size=(0.5, 0.5, 6),
out_size_factor=1,
dense_reg=1,
gaussian_overlap=0.1,
max_objs=500,
min_radius=2,
code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
point_cloud_range=[-35.0, -75.0, -2, 75.0, 75.0, 4]),
test_cfg=dict(
_delete_=True,
post_center_limit_range=[-35.0, -75.0, -2, 75.0, 75.0, 4],
max_per_img=4096,
max_pool_nms=False,
min_radius=[0.5, 2, 6],
score_threshold=0,
out_size_factor=1,
voxel_size=(0.5, 0.5),
nms_type='circle',
pre_max_size=2000,
post_max_size=200,
nms_thr=0.2))
...@@ -11,8 +11,9 @@ from .test_time_aug import MultiScaleFlipAug3D
from .transforms_3d import (AffineResize, BackgroundPointsFilter,
                            GlobalAlignment, GlobalRotScaleTrans,
                            IndoorPatchPointSample, IndoorPointSample,
                            MultiViewWrapper, ObjectNameFilter, ObjectNoise,
                            ObjectRangeFilter, ObjectSample,
                            PhotoMetricDistortion3D, PointSample, PointShuffle,
                            PointsRangeFilter, RandomDropPointsColor,
                            RandomFlip3D, RandomJitterPoints, RandomResize3D,
                            RandomShiftScale, Resize3D, VoxelBasedPointSampler)
...@@ -29,5 +30,6 @@ __all__ = [
    'VoxelBasedPointSampler', 'GlobalAlignment', 'IndoorPatchPointSample',
    'LoadImageFromFileMono3D', 'ObjectNameFilter', 'RandomDropPointsColor',
    'RandomJitterPoints', 'AffineResize', 'RandomShiftScale',
    'LoadPointsFromDict', 'Resize3D', 'RandomResize3D',
    'MultiViewWrapper', 'PhotoMetricDistortion3D'
]
...@@ -63,7 +63,6 @@ class Pack3DDetInputs(BaseTransform):
    def __init__(
        self,
        keys: tuple,
        meta_keys: tuple = ('img_path', 'ori_shape', 'img_shape', 'lidar2img',
                            'depth2img', 'cam2img', 'pad_shape',
                            'scale_factor', 'flip', 'pcd_horizontal_flip',
...@@ -72,8 +71,10 @@ class Pack3DDetInputs(BaseTransform):
                            'sample_idx', 'pcd_scale_factor', 'pcd_rotation',
                            'pcd_rotation_angle', 'lidar_path',
                            'transformation_3d_flow', 'trans_mat',
                            'affine_aug', 'sweep_img_metas', 'ori_cam2img',
                            'cam2global', 'crop_offset', 'img_crop_offset',
                            'resize_img_shape', 'lidar2cam', 'ori_lidar2img',
                            'num_ref_frames', 'num_views', 'ego2global')) -> None:
        self.keys = keys
        self.meta_keys = meta_keys
...
# Copyright (c) OpenMMLab. All rights reserved.
import copy
from typing import List, Optional, Union

import mmcv
import mmengine
...@@ -23,15 +24,38 @@ class LoadMultiViewImageFromFiles(BaseTransform):
            Defaults to False.
        color_type (str, optional): Color type of the file.
            Defaults to 'unchanged'.
        file_client_args (dict): Config dict of file clients, refer to
            https://github.com/open-mmlab/mmcv/blob/master/mmcv/fileio/file_client.py
            for more details. Defaults to dict(backend='disk').
        num_views (int): Number of views in a frame. Defaults to 5.
        num_ref_frames (int): Number of previous frames to load.
            Defaults to -1.
        test_mode (bool): Whether it is test mode during loading.
            Defaults to False.
        set_default_scale (bool): Whether to set the default scale.
            Defaults to True.
    """

    def __init__(self,
                 to_float32: bool = False,
                 color_type: str = 'unchanged',
                 file_client_args: dict = dict(backend='disk'),
                 num_views: int = 5,
                 num_ref_frames: int = -1,
                 test_mode: bool = False,
                 set_default_scale: bool = True) -> None:
        self.to_float32 = to_float32
        self.color_type = color_type
        self.file_client_args = file_client_args.copy()
        self.file_client = None
        self.num_views = num_views
        # num_ref_frames is used for multi-sweep loading
        self.num_ref_frames = num_ref_frames
        # when test_mode=False, we randomly select previous frames
        # otherwise, select the earliest one
        self.test_mode = test_mode
        self.set_default_scale = set_default_scale

    def transform(self, results: dict) -> Optional[dict]:
        """Call function to load multi-view images from files.
        Args:
...@@ -49,33 +73,151 @@ class LoadMultiViewImageFromFiles(BaseTransform):
            - scale_factor (float): Scale factor.
            - img_norm_cfg (dict): Normalization configuration of images.
        """
        # TODO: consider splitting the multi-sweep part out of this pipeline
# Derive the mask and transform for loading of multi-sweep data
if self.num_ref_frames > 0:
# init choice with the current frame
init_choice = np.array([0], dtype=np.int64)
num_frames = len(results['img_filename']) // self.num_views - 1
if num_frames == 0: # no previous frame, then copy cur frames
choices = np.random.choice(
1, self.num_ref_frames, replace=True)
elif num_frames >= self.num_ref_frames:
# NOTE: suppose the info is saved following the order
# from latest to earlier frames
if self.test_mode:
choices = np.arange(num_frames - self.num_ref_frames,
num_frames) + 1
# NOTE: +1 is for selecting previous frames
else:
choices = np.random.choice(
num_frames, self.num_ref_frames, replace=False) + 1
elif num_frames > 0 and num_frames < self.num_ref_frames:
if self.test_mode:
base_choices = np.arange(num_frames) + 1
random_choices = np.random.choice(
num_frames,
self.num_ref_frames - num_frames,
replace=True) + 1
choices = np.concatenate([base_choices, random_choices])
else:
choices = np.random.choice(
num_frames, self.num_ref_frames, replace=True) + 1
else:
raise NotImplementedError
choices = np.concatenate([init_choice, choices])
select_filename = []
for choice in choices:
select_filename += results['img_filename'][choice *
self.num_views:
(choice + 1) *
self.num_views]
results['img_filename'] = select_filename
for key in ['cam2img', 'lidar2cam']:
if key in results:
select_results = []
for choice in choices:
select_results += results[key][choice *
self.num_views:(choice +
1) *
self.num_views]
results[key] = select_results
for key in ['ego2global']:
if key in results:
select_results = []
for choice in choices:
select_results += [results[key][choice]]
results[key] = select_results
# Transform lidar2cam to
# [cur_lidar]2[prev_img] and [cur_lidar]2[prev_cam]
for key in ['lidar2cam']:
if key in results:
# only change matrices of previous frames
for choice_idx in range(1, len(choices)):
pad_prev_ego2global = np.eye(4)
prev_ego2global = results['ego2global'][choice_idx]
pad_prev_ego2global[:prev_ego2global.
shape[0], :prev_ego2global.
shape[1]] = prev_ego2global
pad_cur_ego2global = np.eye(4)
cur_ego2global = results['ego2global'][0]
pad_cur_ego2global[:cur_ego2global.
shape[0], :cur_ego2global.
shape[1]] = cur_ego2global
cur2prev = np.linalg.inv(pad_prev_ego2global).dot(
pad_cur_ego2global)
for result_idx in range(choice_idx * self.num_views,
(choice_idx + 1) *
self.num_views):
results[key][result_idx] = \
results[key][result_idx].dot(cur2prev)
# Support multi-view images with different shapes
# TODO: record the origin shape and padded shape
filename, cam2img, lidar2cam = [], [], []
for _, cam_item in results['images'].items():
filename.append(cam_item['img_path'])
cam2img.append(cam_item['cam2img'])
lidar2cam.append(cam_item['lidar2cam'])
results['filename'] = filename
results['cam2img'] = cam2img
results['lidar2cam'] = lidar2cam
results['ori_cam2img'] = copy.deepcopy(results['cam2img'])
if self.file_client is None:
self.file_client = mmcv.FileClient(**self.file_client_args)
        # img is of shape (h, w, c, num_views)
        # h and w can be different for different views
        img_bytes = [self.file_client.get(name) for name in filename]
imgs = [
mmcv.imfrombytes(img_byte, flag=self.color_type)
for img_byte in img_bytes
]
# handle the image with different shape
img_shapes = np.stack([img.shape for img in imgs], axis=0)
img_shape_max = np.max(img_shapes, axis=0)
img_shape_min = np.min(img_shapes, axis=0)
assert img_shape_min[-1] == img_shape_max[-1]
if not np.all(img_shape_max == img_shape_min):
pad_shape = img_shape_max[:2]
else:
pad_shape = None
if pad_shape is not None:
imgs = [
mmcv.impad(img, shape=pad_shape, pad_val=0) for img in imgs
]
img = np.stack(imgs, axis=-1)
        if self.to_float32:
            img = img.astype(np.float32)

        results['filename'] = filename
        # unravel to list, see `DefaultFormatBundle` in formatting.py
        # which will transpose each image separately and then stack into array
        results['img'] = [img[..., i] for i in range(img.shape[-1])]
        results['img_shape'] = img.shape
        results['ori_shape'] = img.shape
        # Set initial values for default meta_keys
        results['pad_shape'] = img.shape
        if self.set_default_scale:
            results['scale_factor'] = 1.0
        num_channels = 1 if len(img.shape) < 3 else img.shape[2]
        results['img_norm_cfg'] = dict(
            mean=np.zeros(num_channels, dtype=np.float32),
            std=np.ones(num_channels, dtype=np.float32),
            to_rgb=False)
results['num_views'] = self.num_views
results['num_ref_frames'] = self.num_ref_frames
        return results

    def __repr__(self):
        """str: Return a string that describes the module."""
        repr_str = self.__class__.__name__
        repr_str += f'(to_float32={self.to_float32}, '
        repr_str += f"color_type='{self.color_type}', "
repr_str += f'num_views={self.num_views}, '
repr_str += f'num_ref_frames={self.num_ref_frames}, '
repr_str += f'test_mode={self.test_mode})'
        return repr_str
...
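
The reference-frame sampling performed by the new multi-sweep branch above can be reproduced in isolation. A simplified standalone sketch of the selection logic (the helper name is illustrative, not part of the PR), under the same assumption that infos are stored from latest to earliest; the num_frames == 0 and the test-mode padding edge cases follow the diff above:

# Simplified sketch of the previous-frame selection used in the loading transform.
import numpy as np

def select_ref_frames(num_frames, num_ref_frames, test_mode):
    """Pick 1-based indices of previous frames (index 0 is the current frame)."""
    if num_frames >= num_ref_frames:
        if test_mode:
            # deterministic: take the earliest stored frames
            return np.arange(num_frames - num_ref_frames, num_frames) + 1
        # training: random subset of previous frames
        return np.random.choice(num_frames, num_ref_frames, replace=False) + 1
    # fewer previous frames than requested: sample with replacement
    return np.random.choice(num_frames, num_ref_frames, replace=True) + 1

print(select_ref_frames(num_frames=4, num_ref_frames=2, test_mode=True))  # [3 4]
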
...@@ -140,10 +140,10 @@ class WaymoDataset(KittiDataset):
        # convert gt_bboxes_3d to velodyne coordinates with `lidar2cam`
        if 'gt_bboxes' in ann_info:
            gt_bboxes = ann_info['gt_bboxes']
            gt_bboxes_labels = ann_info['gt_bboxes_labels']
        else:
            gt_bboxes = np.zeros((0, 4), dtype=np.float32)
            gt_bboxes_labels = np.zeros(0, dtype=np.int64)
        if 'centers_2d' in ann_info:
            centers_2d = ann_info['centers_2d']
            depths = ann_info['depths']
...@@ -169,7 +169,7 @@ class WaymoDataset(KittiDataset):
            gt_bboxes_3d=gt_bboxes_3d,
            gt_labels_3d=ann_info['gt_labels_3d'],
            gt_bboxes=gt_bboxes,
            gt_bboxes_labels=gt_bboxes_labels,
            centers_2d=centers_2d,
            depths=depths)
...
...@@ -66,7 +66,8 @@ class KittiMetric(BaseMetric):
        self.default_cam_key = default_cam_key
        self.file_client_args = file_client_args
        self.default_cam_key = default_cam_key

        allowed_metrics = ['bbox', 'img_bbox', 'mAP', 'LET_mAP']
        self.metrics = metric if isinstance(metric, list) else [metric]
        for metric in self.metrics:
            if metric not in allowed_metrics:
...
This diff is collapsed.
...@@ -14,6 +14,8 @@ from mmdet3d.registry import MODELS
from mmdet3d.utils import OptConfigType
from mmdet.models import DetDataPreprocessor
from mmdet.models.utils.misc import samplelist_boxtype2tensor

from .utils import multiview_img_stack_batch


@MODELS.register_module()
...@@ -144,7 +146,6 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
        data = self.collate_data(data)
        inputs, data_samples = data['inputs'], data['data_samples']
        batch_inputs = dict()

        if 'points' in inputs:
...@@ -185,6 +186,23 @@ class Det3DDataPreprocessor(DetDataPreprocessor):

        return {'inputs': batch_inputs, 'data_samples': data_samples}
def preprocess_img(self, _batch_img):
# channel transform
if self._channel_conversion:
_batch_img = _batch_img[[2, 1, 0], ...]
# Convert to float after channel conversion to ensure
# efficiency
_batch_img = _batch_img.float()
# Normalization.
if self._enable_normalize:
if self.mean.shape[0] == 3:
assert _batch_img.dim() == 3 and _batch_img.shape[0] == 3, (
'If the mean has 3 values, the input tensor '
'should in shape of (3, H, W), but got the '
f'tensor with shape {_batch_img.shape}')
_batch_img = (_batch_img - self.mean) / self.std
return _batch_img
    def collate_data(self, data: dict) -> dict:
        """Copy data to the target device and perform normalization,
        padding and bgr2rgb conversion and stacking based on
...@@ -203,30 +221,30 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
        if 'img' in data['inputs']:
            _batch_imgs = data['inputs']['img']

            # Process data with `pseudo_collate`.
            if is_list_of(_batch_imgs, torch.Tensor):
                batch_imgs = []
                img_dim = _batch_imgs[0].dim()
                for _batch_img in _batch_imgs:
                    if img_dim == 3:  # standard img
                        _batch_img = self.preprocess_img(_batch_img)
                    elif img_dim == 4:
                        _batch_img = [
                            self.preprocess_img(_img) for _img in _batch_img
                        ]

                        _batch_img = torch.stack(_batch_img, dim=0)

                    batch_imgs.append(_batch_img)

                # Pad and stack Tensor.
                if img_dim == 3:
                    batch_imgs = stack_batch(batch_imgs, self.pad_size_divisor,
                                             self.pad_value)
                elif img_dim == 4:
                    batch_imgs = multiview_img_stack_batch(
                        batch_imgs, self.pad_size_divisor, self.pad_value)

            # Process data with `default_collate`.
            elif isinstance(_batch_imgs, torch.Tensor):
                assert _batch_imgs.dim() == 4, (
...@@ -270,6 +288,10 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
        if is_list_of(_batch_inputs, torch.Tensor):
            batch_pad_shape = []
            for ori_input in _batch_inputs:
                if ori_input.dim() == 4:
                    # multi-view input: select one of the images
                    # to calculate the pad shape
                    ori_input = ori_input[0]
                pad_h = int(
                    np.ceil(ori_input.shape[1] /
                            self.pad_size_divisor)) * self.pad_size_divisor
...
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Union
import torch
import torch.nn.functional as F
def multiview_img_stack_batch(
tensor_list: List[torch.Tensor],
pad_size_divisor: int = 1,
pad_value: Union[int, float] = 0) -> torch.Tensor:
"""
Compared to the stack_batch in mmengine.model.utils,
multiview_img_stack_batch further handle the multiview images.
see diff of padded_sizes[:, :-2] = 0 vs padded_sizees[:, 0] = 0 in line 47
Stack multiple tensors to form a batch and pad the tensor to the max
shape use the right bottom padding mode in these images. If
``pad_size_divisor > 0``, add padding to ensure the shape of each dim is
divisible by ``pad_size_divisor``.
Args:
tensor_list (List[Tensor]): A list of tensors with the same dim.
pad_size_divisor (int): If ``pad_size_divisor > 0``, add padding
to ensure the shape of each dim is divisible by
``pad_size_divisor``. This depends on the model, and many
models need to be divisible by 32. Defaults to 1
pad_value (int, float): The padding value. Defaults to 0.
Returns:
Tensor: The n dim tensor.
"""
assert isinstance(
tensor_list,
list), (f'Expected input type to be list, but got {type(tensor_list)}')
assert tensor_list, '`tensor_list` could not be an empty list'
assert len({
tensor.ndim
for tensor in tensor_list
}) == 1, (f'Expected the dimensions of all tensors must be the same, '
f'but got {[tensor.ndim for tensor in tensor_list]}')
dim = tensor_list[0].dim()
num_img = len(tensor_list)
all_sizes: torch.Tensor = torch.Tensor(
[tensor.shape for tensor in tensor_list])
max_sizes = torch.ceil(
torch.max(all_sizes, dim=0)[0] / pad_size_divisor) * pad_size_divisor
padded_sizes = max_sizes - all_sizes
# The first dim normally means channel, which should not be padded.
padded_sizes[:, :-2] = 0
if padded_sizes.sum() == 0:
return torch.stack(tensor_list)
# `pad` is the second arguments of `F.pad`. If pad is (1, 2, 3, 4),
# it means that padding the last dim with 1(left) 2(right), padding the
# penultimate dim to 3(top) 4(bottom). The order of `pad` is opposite of
# the `padded_sizes`. Therefore, the `padded_sizes` needs to be reversed,
# and only odd index of pad should be assigned to keep padding "right" and
# "bottom".
pad = torch.zeros(num_img, 2 * dim, dtype=torch.int)
pad[:, 1::2] = padded_sizes[:, range(dim - 1, -1, -1)]
batch_tensor = []
for idx, tensor in enumerate(tensor_list):
batch_tensor.append(
F.pad(tensor, tuple(pad[idx].tolist()), value=pad_value))
return torch.stack(batch_tensor)
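
A usage sketch for the helper above: two multi-view image tensors with different spatial sizes are padded to a shared, divisor-aligned shape and stacked into a (B, N, C, H, W) batch. The import path is an assumption inferred from the `from .utils import multiview_img_stack_batch` line in the data preprocessor diff:

# Usage sketch for multiview_img_stack_batch (module path is an assumption).
import torch
from mmdet3d.models.data_preprocessors.utils import multiview_img_stack_batch

views_a = torch.rand(5, 3, 480, 640)  # 5 camera views, 480x640
views_b = torch.rand(5, 3, 458, 620)  # same cameras, smaller crop

batch = multiview_img_stack_batch([views_a, views_b],
                                  pad_size_divisor=32, pad_value=0)
print(batch.shape)  # torch.Size([2, 5, 3, 480, 640]) -- only H/W are padded
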
# Copyright (c) OpenMMLab. All rights reserved.
from .base import Base3DDetector
from .centerpoint import CenterPoint
from .dfm import DfM
from .dynamic_voxelnet import DynamicVoxelNet
from .fcos_mono3d import FCOSMono3D
from .groupfree3dnet import GroupFree3DNet
from .h3dnet import H3DNet
from .imvotenet import ImVoteNet
from .imvoxelnet import ImVoxelNet
from .multiview_dfm import MultiViewDfM
from .mvx_faster_rcnn import DynamicMVXFasterRCNN, MVXFasterRCNN
from .mvx_two_stage import MVXTwoStageDetector
from .parta2 import PartA2
...@@ -19,9 +21,25 @@ from .votenet import VoteNet
from .voxelnet import VoxelNet

__all__ = [
    'Base3DDetector',
    'DfM',
    'VoxelNet',
    'DynamicVoxelNet',
    'MVXTwoStageDetector',
    'DynamicMVXFasterRCNN',
    'MVXFasterRCNN',
    'MultiViewDfM',
    'PartA2',
    'VoteNet',
    'H3DNet',
    'CenterPoint',
    'SSD3DNet',
    'ImVoteNet',
    'SingleStageMono3DDetector',
    'FCOSMono3D',
    'ImVoxelNet',
    'GroupFree3DNet',
    'PointRCNN',
    'SMOKEMono3D',
    'SASSD',
]
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from mmdet3d.registry import MODELS
from mmdet3d.structures.ops import bbox3d2result
from mmdet3d.utils import ConfigType
from mmdet.models.detectors import BaseDetector
from ..builder import build_backbone, build_head, build_neck
@MODELS.register_module()
class DfM(BaseDetector):
r"""`Monocular 3D Object Detection with Depth from Motion.
<https://arxiv.org/abs/2207.12988>`_.
Args:
backbone (:obj:`ConfigDict` or dict): The backbone config.
neck (:obj:`ConfigDict` or dict): The neck config.
backbone_stereo (:obj:`ConfigDict` or dict): The stereo backbone
config.
backbone_3d (:obj:`ConfigDict` or dict): The 3d backbone config.
neck_3d (:obj:`ConfigDict` or dict): The 3D neck config.
bbox_head_3d (:obj:`ConfigDict` or dict): The 3d bbox head config.
neck_2d (:obj:`ConfigDict` or dict, optional): The 2D neck config
for 2D object detection. Defaults to None.
bbox_head_2d (:obj:`ConfigDict` or dict, optional): The 2D bbox
head config for 2D object detection. Defaults to None.
depth_head_2d (:obj:`ConfigDict` or dict, optional): The 2D depth
head config for depth estimation in fov space. Defaults to None.
        depth_head (:obj:`ConfigDict` or dict, optional): The depth head
            config for depth estimation in 3D voxel projected to fov space.
train_cfg (:obj:`ConfigDict` or dict, optional): Config dict of
training hyper-parameters. Defaults to None.
test_cfg (:obj:`ConfigDict` or dict, optional): Config dict of test
hyper-parameters. Defaults to None.
        pretrained (:obj:`ConfigDict` or dict, optional): The pretrained
config.
init_cfg (:obj:`ConfigDict` or dict, optional): The initialization
config. Defaults to None.
"""
def __init__(self,
backbone: ConfigType,
neck: ConfigType,
backbone_stereo: ConfigType,
backbone_3d: ConfigType,
neck_3d: ConfigType,
bbox_head_3d: ConfigType,
neck_2d=None,
bbox_head_2d=None,
depth_head_2d=None,
depth_head=None,
train_cfg=None,
test_cfg=None,
pretrained=None,
init_cfg=None):
super().__init__(init_cfg=init_cfg)
self.backbone = build_backbone(backbone)
self.neck = build_neck(neck)
if backbone_stereo is not None:
backbone_stereo.update(cat_img_feature=self.neck.cat_img_feature)
backbone_stereo.update(in_sem_channels=self.neck.sem_channels[-1])
self.backbone_stereo = build_backbone(backbone_stereo)
assert self.neck.cat_img_feature == \
self.backbone_stereo.cat_img_feature
assert self.neck.sem_channels[
-1] == self.backbone_stereo.in_sem_channels
if backbone_3d is not None:
self.backbone_3d = build_backbone(backbone_3d)
if neck_3d is not None:
self.neck_3d = build_neck(neck_3d)
if neck_2d is not None:
self.neck_2d = build_neck(neck_2d)
if bbox_head_2d is not None:
self.bbox_head_2d = build_head(bbox_head_2d)
if depth_head_2d is not None:
self.depth_head_2d = build_head(depth_head_2d)
if depth_head is not None:
self.depth_head = build_head(depth_head)
self.depth_samples = self.depth_head.depth_samples
self.train_cfg = train_cfg
self.test_cfg = test_cfg
bbox_head_3d.update(train_cfg=train_cfg)
bbox_head_3d.update(test_cfg=test_cfg)
self.bbox_head_3d = build_head(bbox_head_3d)
@property
def with_backbone_3d(self):
"""Whether the detector has a 3D backbone."""
return hasattr(self, 'backbone_3d') and self.backbone_3d is not None
@property
def with_neck_3d(self):
"""Whether the detector has a 3D neck."""
return hasattr(self, 'neck_3d') and self.neck_3d is not None
@property
def with_neck_2d(self):
"""Whether the detector has a 2D neck."""
return hasattr(self, 'neck_2d') and self.neck_2d is not None
@property
def with_bbox_head_2d(self):
"""Whether the detector has a 2D detection head."""
return hasattr(self, 'bbox_head_2d') and self.bbox_head_2d is not None
@property
def with_depth_head_2d(self):
"""Whether the detector has a image-based depth head."""
return hasattr(self,
'depth_head_2d') and self.depth_head_2d is not None
@property
def with_depth_head(self):
"""Whether the detector has a frustum-based depth head."""
return hasattr(self, 'depth_head') and self.depth_head is not None
def extract_feat(self, img, img_metas):
"""Feature extraction for perspective-view images.
Args:
img (torch.Tensor): Images of shape [B, N, C_in, H, W].
img_metas (list): Image meta information. Each element corresponds
to a group of images. len(img_metas) == B.
Returns:
torch.Tensor: bev feature with shape [B, C_out, N_y, N_x].
"""
# split input img into current and previous ones
batch_size, N, C_in, H, W = img.shape
cur_imgs = img[:, 0]
prev_imgs = img[:, 1] # TODO: to support multiple prev imgs
# 2D backbone for feature extraction
cur_feats = self.backbone(cur_imgs)
cur_feats = [cur_imgs] + list(cur_feats)
prev_feats = self.backbone(prev_imgs)
prev_feats = [prev_imgs] + list(prev_feats)
# SPP module as the feature neck
cur_stereo_feat, cur_sem_feat = self.neck(cur_feats)
prev_stereo_feat, prev_sem_feat = self.neck(prev_feats)
# derive cur2prevs
cur_pose = torch.tensor(
[img_meta['cam2global'] for img_meta in img_metas],
device=img.device)[:, None, :, :] # (B, 1, 4, 4)
prev_poses = []
for img_meta in img_metas:
sweep_img_metas = img_meta['sweep_img_metas']
prev_poses.append([
sweep_img_meta['cam2global']
for sweep_img_meta in sweep_img_metas
])
prev_poses = torch.tensor(prev_poses, device=img.device)
pad_prev_cam2global = torch.eye(4)[None, None].expand(
batch_size, N - 1, 4, 4).to(img.device)
pad_prev_cam2global[:, :, :prev_poses.shape[-2], :prev_poses.
shape[-1]] = prev_poses
pad_cur_cam2global = torch.eye(4)[None,
None].expand(batch_size, 1, 4,
4).to(img.device)
pad_cur_cam2global[:, :, :cur_pose.shape[-2], :cur_pose.
shape[-1]] = cur_pose
# (B, N-1, 4, 4) * (B, 1, 4, 4) -> (B, N-1, 4, 4)
# torch.linalg.solve is faster and more numerically stable
# than torch.matmul(torch.linalg.inv(A), B)
# empirical results show that torch.linalg.solve can derive
# almost the same result with np.linalg.inv
# while torch.linalg.inv can not
cur2prevs = torch.linalg.solve(pad_prev_cam2global, pad_cur_cam2global)
for meta_idx, img_meta in enumerate(img_metas):
img_meta['cur2prevs'] = cur2prevs[meta_idx]
# stereo backbone for depth estimation
# volume_feat: (batch_size, Cv, Nz, Ny, Nx)
volume_feat = self.backbone_stereo(cur_stereo_feat, prev_stereo_feat,
img_metas, cur_sem_feat)
# height compression
_, Cv, Nz, Ny, Nx = volume_feat.shape
bev_feat = volume_feat.view(batch_size, Cv * Nz, Ny, Nx)
bev_feat_prehg, bev_feat = self.neck_3d(bev_feat)
return bev_feat
def forward_train(self,
img,
img_metas,
gt_bboxes_3d,
gt_labels_3d,
depth_img=None,
**kwargs):
"""Forward function for training."""
bev_feat = self.extract_feat(img, img_metas)
outs = self.bbox_head_3d([bev_feat])
losses = self.bbox_head_3d.loss(*outs, gt_bboxes_3d, gt_labels_3d,
img_metas)
# TODO: loss_dense_depth, loss_2d, loss_imitation
return losses
def forward_test(self, img, img_metas, **kwargs):
"""Forward of testing.
Args:
img (torch.Tensor): Input images of shape (N, C_in, H, W).
img_metas (list): Image metas.
Returns:
list[dict]: Predicted 3d boxes.
"""
# not supporting aug_test for now
return self.simple_test(img, img_metas)
def simple_test(self, img, img_metas):
"""Simple inference forward without test time augmentation."""
bev_feat = self.extract_feat(img, img_metas)
# bbox_head takes a list of feature from different levels as input
# so need [bev_feat]
outs = self.bbox_head_3d([bev_feat])
bbox_list = self.bbox_head_3d.get_bboxes(*outs, img_metas)
bbox_results = [
bbox3d2result(det_bboxes, det_scores, det_labels)
for det_bboxes, det_scores, det_labels in bbox_list
]
# add pseudo-lidar label to each pred_dict for post-processing
for bbox_result in bbox_results:
bbox_result['pseudo_lidar'] = True
return bbox_results
def aug_test(self, imgs, img_metas, **kwargs):
"""Test with augmentations.
Args:
imgs (list[torch.Tensor]): Input images of shape (N, C_in, H, W).
img_metas (list): Image metas.
Returns:
list[dict]: Predicted 3d boxes.
"""
raise NotImplementedError
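
The `cur2prevs` computation in `extract_feat` above relies on `torch.linalg.solve(A, B)` returning `A^{-1} B`. A small, purely illustrative numerical check of that equivalence on padded 4x4 pose matrices (the `make_pose` helper and its values are made up for the example):

# Illustrative check that torch.linalg.solve(A, B) matches inv(A) @ B for poses.
import torch

def make_pose(yaw, t):
    """Build a 4x4 homogeneous pose from a yaw angle and a translation."""
    pose = torch.eye(4)
    c, s = torch.cos(torch.tensor(yaw)), torch.sin(torch.tensor(yaw))
    pose[0, 0], pose[0, 1], pose[1, 0], pose[1, 1] = c, -s, s, c
    pose[:3, 3] = torch.tensor(t)
    return pose

pad_prev_cam2global = make_pose(0.3, [1.0, 2.0, 0.5])
pad_cur_cam2global = make_pose(0.1, [1.5, 2.5, 0.5])

cur2prev_solve = torch.linalg.solve(pad_prev_cam2global, pad_cur_cam2global)
cur2prev_inv = torch.linalg.inv(pad_prev_cam2global) @ pad_cur_cam2global
print(torch.allclose(cur2prev_solve, cur2prev_inv, atol=1e-6))  # True
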
...@@ -2,12 +2,15 @@
from typing import List, Tuple, Union

import torch
from mmengine.structures import InstanceData

from mmdet3d.models.detectors import Base3DDetector
from mmdet3d.models.layers.fusion_layers.point_fusion import point_sample
from mmdet3d.registry import MODELS, TASK_UTILS
from mmdet3d.structures.det3d_data_sample import SampleList
from mmdet3d.utils import ConfigType, OptConfigType, OptInstanceList
from mmdet.models.detectors import BaseDetector


@MODELS.register_module()
...@@ -184,3 +187,64 @@ class ImVoxelNet(Base3DDetector):
        x = self.extract_feat(batch_inputs_dict, batch_data_samples)
        results = self.bbox_head.forward(x)
        return results
def convert_to_datasample(
self,
data_samples: SampleList,
data_instances_3d: OptInstanceList = None,
data_instances_2d: OptInstanceList = None,
) -> SampleList:
"""Convert results list to `Det3DDataSample`.
Subclasses could override it to be compatible for some multi-modality
3D detectors.
Args:
data_samples (list[:obj:`Det3DDataSample`]): The input data.
data_instances_3d (list[:obj:`InstanceData`], optional): 3D
Detection results of each sample.
data_instances_2d (list[:obj:`InstanceData`], optional): 2D
Detection results of each sample.
Returns:
list[:obj:`Det3DDataSample`]: Detection results of the
input. Each Det3DDataSample usually contains
'pred_instances_3d'. And the ``pred_instances_3d`` normally
contains following keys.
- scores_3d (Tensor): Classification scores, has a shape
(num_instance, )
- labels_3d (Tensor): Labels of 3D bboxes, has a shape
(num_instances, ).
- bboxes_3d (Tensor): Contains a tensor with shape
(num_instances, C) where C >=7.
                When there are image predictions in some models, it should
                contain ``pred_instances``, and the ``pred_instances`` normally
                contains the following keys.
- scores (Tensor): Classification scores of image, has a shape
(num_instance, )
- labels (Tensor): Predict Labels of 2D bboxes, has a shape
(num_instances, ).
- bboxes (Tensor): Contains a tensor with shape
(num_instances, 4).
"""
assert (data_instances_2d is not None) or \
(data_instances_3d is not None),\
'please pass at least one type of data_samples'
if data_instances_2d is None:
data_instances_2d = [
InstanceData() for _ in range(len(data_instances_3d))
]
if data_instances_3d is None:
data_instances_3d = [
InstanceData() for _ in range(len(data_instances_2d))
]
for i, data_sample in enumerate(data_samples):
data_sample.pred_instances_3d = data_instances_3d[i]
data_sample.pred_instances = data_instances_2d[i]
return data_samples
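
The pairing performed by `convert_to_datasample` can be seen with mmengine's data structures directly. A minimal sketch that builds one `Det3DDataSample` and attaches a 3D prediction, mirroring the loop at the end of the method (all tensor values below are dummies):

# Minimal sketch of attaching InstanceData predictions to a Det3DDataSample.
import torch
from mmengine.structures import InstanceData
from mmdet3d.structures import Det3DDataSample

data_sample = Det3DDataSample()
pred_3d = InstanceData()
pred_3d.scores_3d = torch.tensor([0.9, 0.4])  # dummy classification scores
pred_3d.labels_3d = torch.tensor([0, 2])      # dummy class indices
pred_3d.bboxes_3d = torch.rand(2, 7)          # dummy (x, y, z, dx, dy, dz, yaw) boxes

data_sample.pred_instances_3d = pred_3d       # same assignment as in the loop above
data_sample.pred_instances = InstanceData()   # empty 2D predictions, as in the method
print(data_sample.pred_instances_3d.labels_3d)
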
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import torch
from mmdet3d.models.layers.fusion_layers.point_fusion import (point_sample,
voxel_sample)
from mmdet3d.registry import MODELS, TASK_UTILS
from mmdet3d.structures.bbox_3d.utils import get_lidar2img
from mmdet3d.structures.det3d_data_sample import SampleList
from mmdet3d.utils import ConfigType, OptConfigType
from mmdet.models.detectors import BaseDetector
from .dfm import DfM
from .imvoxelnet import ImVoxelNet
@MODELS.register_module()
class MultiViewDfM(ImVoxelNet, DfM):
r"""Waymo challenge solution of `MV-FCOS3D++
<https://arxiv.org/abs/2207.12716>`_.
Args:
backbone (:obj:`ConfigDict` or dict): The backbone config.
neck (:obj:`ConfigDict` or dict): The neck config.
backbone_stereo (:obj:`ConfigDict` or dict): The stereo backbone
config.
backbone_3d (:obj:`ConfigDict` or dict): The 3d backbone config.
neck_3d (:obj:`ConfigDict` or dict): The 3D neck config.
bbox_head (:obj:`ConfigDict` or dict): The bbox head config.
voxel_size (:obj:`ConfigDict` or dict): The voxel size.
anchor_generator (:obj:`ConfigDict` or dict): The anchor generator
config.
neck_2d (:obj:`ConfigDict` or dict, optional): The 2D neck config
for 2D object detection. Defaults to None.
bbox_head_2d (:obj:`ConfigDict` or dict, optional): The 2D bbox
head config for 2D object detection. Defaults to None.
depth_head_2d (:obj:`ConfigDict` or dict, optional): The 2D depth
head config for depth estimation in fov space. Defaults to None.
        depth_head (:obj:`ConfigDict` or dict, optional): The depth head
            config for depth estimation in 3D voxel projected to fov space.
train_cfg (:obj:`ConfigDict` or dict, optional): Config dict of
training hyper-parameters. Defaults to None.
test_cfg (:obj:`ConfigDict` or dict, optional): Config dict of test
hyper-parameters. Defaults to None.
data_preprocessor (dict or ConfigDict, optional): The pre-process
            config of :class:`BaseDataPreprocessor`. It usually includes
``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``.
valid_sample (bool): Whether to filter invalid points in view
transformation. Defaults to True.
temporal_aggregate (str): Key to determine the aggregation way in
temporal fusion. Defaults to 'concat'.
transform_depth (bool): Key to determine the transformation of depth.
Defaults to True.
init_cfg (:obj:`ConfigDict` or dict, optional): The initialization
config. Defaults to None.
"""
def __init__(self,
backbone: ConfigType,
neck: ConfigType,
backbone_stereo: ConfigType,
backbone_3d: ConfigType,
neck_3d: ConfigType,
bbox_head: ConfigType,
voxel_size: ConfigType,
anchor_generator: ConfigType,
neck_2d: ConfigType = None,
bbox_head_2d: ConfigType = None,
depth_head_2d: ConfigType = None,
depth_head: ConfigType = None,
train_cfg: OptConfigType = None,
test_cfg: OptConfigType = None,
data_preprocessor: OptConfigType = None,
valid_sample: bool = True,
temporal_aggregate: str = 'concat',
transform_depth: bool = True,
init_cfg: OptConfigType = None):
# TODO merge with DFM
BaseDetector.__init__(
self, data_preprocessor=data_preprocessor, init_cfg=init_cfg)
self.backbone = MODELS.build(backbone)
self.neck = MODELS.build(neck)
if backbone_stereo is not None:
backbone_stereo.update(cat_img_feature=self.neck.cat_img_feature)
backbone_stereo.update(in_sem_channels=self.neck.sem_channels[-1])
self.backbone_stereo = MODELS.build(backbone_stereo)
assert self.neck.cat_img_feature == \
self.backbone_stereo.cat_img_feature
assert self.neck.sem_channels[
-1] == self.backbone_stereo.in_sem_channels
if backbone_3d is not None:
self.backbone_3d = MODELS.build(backbone_3d)
if neck_3d is not None:
self.neck_3d = MODELS.build(neck_3d)
if neck_2d is not None:
self.neck_2d = MODELS.build(neck_2d)
if bbox_head_2d is not None:
self.bbox_head_2d = MODELS.build(bbox_head_2d)
if depth_head_2d is not None:
self.depth_head_2d = MODELS.build(depth_head_2d)
if depth_head is not None:
self.depth_head = MODELS.build(depth_head)
self.depth_samples = self.depth_head.depth_samples
self.train_cfg = train_cfg
self.test_cfg = test_cfg
bbox_head.update(train_cfg=train_cfg)
bbox_head.update(test_cfg=test_cfg)
self.bbox_head = MODELS.build(bbox_head)
self.voxel_size = voxel_size
self.voxel_range = anchor_generator['ranges'][0]
self.n_voxels = [
round((self.voxel_range[3] - self.voxel_range[0]) /
self.voxel_size[0]),
round((self.voxel_range[4] - self.voxel_range[1]) /
self.voxel_size[1]),
round((self.voxel_range[5] - self.voxel_range[2]) /
self.voxel_size[2])
]
self.anchor_generator = TASK_UTILS.build(anchor_generator)
self.valid_sample = valid_sample
self.temporal_aggregate = temporal_aggregate
self.transform_depth = transform_depth
def extract_feat(self, batch_inputs_dict: dict,
batch_data_samples: SampleList):
"""Extract 3d features from the backbone -> fpn -> 3d projection.
Args:
batch_inputs_dict (dict): The model input dict which include
the 'imgs' key.
- imgs (torch.Tensor, optional): Image of each sample.
batch_data_samples (list[:obj:`DetDataSample`]): The batch
data samples. It usually includes information such
as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
Returns:
torch.Tensor: of shape (N, C_out, N_x, N_y, N_z)
"""
# TODO: Nt means the number of frames temporally
# num_views means the number of views of a frame
img = batch_inputs_dict['imgs']
batch_img_metas = [
data_samples.metainfo for data_samples in batch_data_samples
]
batch_size, _, C_in, H, W = img.shape
num_views = batch_img_metas[0]['num_views']
num_ref_frames = batch_img_metas[0]['num_ref_frames']
if num_ref_frames > 0:
num_frames = num_ref_frames + 1
else:
num_frames = 1
input_shape = img.shape[-2:]
# NOTE: input_shape is the largest pad_shape of the batch of images
for img_meta in batch_img_metas:
img_meta.update(input_shape=input_shape)
if num_ref_frames > 0:
cur_imgs = img[:, :num_views].reshape(-1, C_in, H, W)
prev_imgs = img[:, num_views:].reshape(-1, C_in, H, W)
cur_feats = self.backbone(cur_imgs)
cur_feats = self.neck(cur_feats)[0]
with torch.no_grad():
prev_feats = self.backbone(prev_imgs)
prev_feats = self.neck(prev_feats)[0]
_, C_feat, H_feat, W_feat = cur_feats.shape
cur_feats = cur_feats.view(batch_size, -1, C_feat, H_feat, W_feat)
prev_feats = prev_feats.view(batch_size, -1, C_feat, H_feat,
W_feat)
batch_feats = torch.cat([cur_feats, prev_feats], dim=1)
else:
batch_imgs = img.view(-1, C_in, H, W)
batch_feats = self.backbone(batch_imgs)
# TODO: support SPP module neck
batch_feats = self.neck(batch_feats)[0]
_, C_feat, H_feat, W_feat = batch_feats.shape
batch_feats = batch_feats.view(batch_size, -1, C_feat, H_feat,
W_feat)
# transform the feature to voxel & stereo space
transform_feats = self.feature_transformation(batch_feats,
batch_img_metas,
num_views, num_frames)
if self.with_depth_head_2d:
transform_feats += (batch_feats[:, :num_views], )
return transform_feats
def feature_transformation(self, batch_feats, batch_img_metas, num_views,
num_frames):
"""Feature transformation from perspective view to BEV.
Args:
batch_feats (torch.Tensor): Perspective view features of shape
(batch_size, num_views, C, H, W).
batch_img_metas (list[dict]): Image meta information. Each element
corresponds to a group of images. len(img_metas) == B.
num_views (int): Number of views.
num_frames (int): Number of consecutive frames.
Returns:
tuple[torch.Tensor]: Volume features and (optionally) stereo \
features.
"""
# TODO: support more complicated 2D feature sampling
points = self.anchor_generator.grid_anchors(
[self.n_voxels[::-1]], device=batch_feats.device)[0][:, :3]
volumes = []
img_scale_factors = []
img_flips = []
img_crop_offsets = []
for feature, img_meta in zip(batch_feats, batch_img_metas):
# TODO: remove feature sampling from back
# TODO: support different scale_factors/flip/crop_offset for
# different views
frame_volume = []
frame_valid_nums = []
for frame_idx in range(num_frames):
volume = []
valid_flags = []
if isinstance(img_meta['img_shape'], list):
img_shape = img_meta['img_shape'][frame_idx][:2]
else:
img_shape = img_meta['img_shape'][:2]
for view_idx in range(num_views):
sample_idx = frame_idx * num_views + view_idx
if 'scale_factor' in img_meta:
img_scale_factor = img_meta['scale_factor'][sample_idx]
if isinstance(img_scale_factor, np.ndarray) and \
len(img_meta['scale_factor']) >= 2:
img_scale_factor = (
points.new_tensor(img_scale_factor[:2]))
else:
img_scale_factor = (
points.new_tensor(img_scale_factor))
else:
img_scale_factor = (1)
img_flip = img_meta['flip'][sample_idx] \
if 'flip' in img_meta.keys() else False
img_crop_offset = (
points.new_tensor(
img_meta['img_crop_offset'][sample_idx])
if 'img_crop_offset' in img_meta.keys() else 0)
lidar2cam = points.new_tensor(
img_meta['lidar2cam'][sample_idx])
cam2img = points.new_tensor(
img_meta['ori_cam2img'][sample_idx])
# align the precision, the tensor is converted to float32
lidar2img = get_lidar2img(cam2img.double(),
lidar2cam.double())
lidar2img = lidar2img.float()
sample_results = point_sample(
img_meta,
img_features=feature[sample_idx][None, ...],
points=points,
proj_mat=lidar2img,
coord_type='LIDAR',
img_scale_factor=img_scale_factor,
img_crop_offset=img_crop_offset,
img_flip=img_flip,
img_pad_shape=img_meta['input_shape'],
img_shape=img_shape,
aligned=False,
valid_flag=self.valid_sample)
if self.valid_sample:
volume.append(sample_results[0])
valid_flags.append(sample_results[1])
else:
volume.append(sample_results)
# TODO: save valid flags, more reasonable feat fusion
if self.valid_sample:
valid_nums = torch.stack(
valid_flags, dim=0).sum(0) # (N, )
volume = torch.stack(volume, dim=0).sum(0)
valid_mask = valid_nums > 0
volume[~valid_mask] = 0
frame_valid_nums.append(valid_nums)
else:
volume = torch.stack(volume, dim=0).mean(0)
frame_volume.append(volume)
img_scale_factors.append(img_scale_factor)
img_flips.append(img_flip)
img_crop_offsets.append(img_crop_offset)
if self.valid_sample:
if self.temporal_aggregate == 'mean':
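# sum the per-frame sums, then divide by the total number of
# valid samples over all frames and views (mean fusion)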
frame_volume = torch.stack(frame_volume, dim=0).sum(0)
frame_valid_nums = torch.stack(
frame_valid_nums, dim=0).sum(0)
frame_valid_mask = frame_valid_nums > 0
frame_volume[~frame_valid_mask] = 0
frame_volume = frame_volume / torch.clamp(
frame_valid_nums[:, None], min=1)
elif self.temporal_aggregate == 'concat':
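# average over views within each frame, then concatenate the
# per-frame volumes along the channel dimension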
frame_valid_nums = torch.stack(frame_valid_nums, dim=1)
frame_volume = torch.stack(frame_volume, dim=1)
frame_valid_mask = frame_valid_nums > 0
frame_volume[~frame_valid_mask] = 0
frame_volume = (frame_volume / torch.clamp(
frame_valid_nums[:, :, None], min=1)).flatten(
start_dim=1, end_dim=2)
else:
frame_volume = torch.stack(frame_volume, dim=0).mean(0)
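# reshape the flat per-point features into a (n_z, n_y, n_x, C)
# volume and permute to (C, n_x, n_y, n_z)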
volumes.append(
frame_volume.reshape(self.n_voxels[::-1] + [-1]).permute(
3, 2, 1, 0))
volume_feat = torch.stack(volumes) # (B, C, N_x, N_y, N_z)
if self.with_backbone_3d:
outputs = self.backbone_3d(volume_feat)
volume_feat = outputs[0]
if self.backbone_3d.output_bev:
# use outputs[0] if len(outputs) == 1
# use outputs[1] if len(outputs) == 2
# TODO: unify the output formats
bev_feat = outputs[-1]
# grid_sample stereo features from the volume feature
# TODO: also support temporal modeling for depth head
if self.with_depth_head:
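# for each view, back-project the volume along camera rays at the
# configured depth bins to build plane-sweep (stereo) features
# of shape (1, C, D, H, W)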
batch_stereo_feats = []
for batch_idx in range(volume_feat.shape[0]):
stereo_feat = []
for view_idx in range(num_views):
img_scale_factor = img_scale_factors[batch_idx] \
if self.transform_depth else points.new_tensor(
[1., 1.])
img_crop_offset = img_crop_offsets[batch_idx] \
if self.transform_depth else points.new_tensor(
[0., 0.])
img_flip = img_flips[batch_idx] if self.transform_depth \
else False
img_pad_shape = batch_img_metas[batch_idx]['input_shape'] \
if self.transform_depth else \
batch_img_metas[batch_idx]['ori_shape'][:2]
lidar2cam = points.new_tensor(
batch_img_metas[batch_idx]['lidar2cam'][view_idx])
cam2img = points.new_tensor(
batch_img_metas[batch_idx]['ori_cam2img'][view_idx])
proj_mat = torch.matmul(cam2img, lidar2cam)
stereo_feat.append(
voxel_sample(
volume_feat[batch_idx][None],
voxel_range=self.voxel_range,
voxel_size=self.voxel_size,
depth_samples=volume_feat.new_tensor(
self.depth_samples),
proj_mat=proj_mat,
downsample_factor=self.depth_head.
downsample_factor,
img_scale_factor=img_scale_factor,
img_crop_offset=img_crop_offset,
img_flip=img_flip,
img_pad_shape=img_pad_shape,
img_shape=batch_img_metas[batch_idx]['img_shape']
[view_idx][:2],
aligned=True)) # TODO: study the aligned setting
batch_stereo_feats.append(torch.cat(stereo_feat))
# cat (N, C, D, H, W) -> (B*N, C, D, H, W)
batch_stereo_feats = torch.cat(batch_stereo_feats)
if self.with_neck_3d:
if self.with_backbone_3d and self.backbone_3d.output_bev:
spatial_features = self.neck_3d(bev_feat)
# TODO: unify the outputs of neck_3d
volume_feat = spatial_features[1]
else:
volume_feat = self.neck_3d(volume_feat)[0]
# TODO: unify the output format of neck_3d
transform_feats = (volume_feat, )
if self.with_depth_head:
transform_feats += (batch_stereo_feats, )
return transform_feats
def aug_test(self, imgs, img_metas, **kwargs):
"""Test with augmentations.
Args:
imgs (list[torch.Tensor]): Input images of shape (N, C_in, H, W).
img_metas (list): Image metas.
Returns:
list[dict]: Predicted 3d boxes.
"""
raise NotImplementedError
@@ -7,7 +7,7 @@ from torch.nn import functional as F
 from mmdet3d.registry import MODELS
 from mmdet3d.structures.bbox_3d import (get_proj_mat_by_coord_type,
-                                         points_cam2img)
+                                         points_cam2img, points_img2cam)
 from . import apply_3d_transformation
@@ -23,7 +23,8 @@ def point_sample(img_meta,
                  img_shape,
                  aligned=True,
                  padding_mode='zeros',
-                 align_corners=True):
+                 align_corners=True,
+                 valid_flag=False):
     """Obtain image features using points.
     Args:
@@ -41,12 +42,15 @@ def point_sample(img_meta,
             padding, this is necessary to obtain features in feature map.
         img_shape (tuple[int]): int tuple indicates the h & w before padding
             after scaling, this is necessary for flipping coordinates.
-        aligned (bool, optional): Whether use bilinear interpolation when
+        aligned (bool): Whether to use bilinear interpolation when
             sampling image features for each point. Defaults to True.
-        padding_mode (str, optional): Padding mode when padding values for
+        padding_mode (str): Padding mode when padding values for
             features of out-of-image points. Defaults to 'zeros'.
-        align_corners (bool, optional): Whether to align corners when
+        align_corners (bool): Whether to align corners when
             sampling image features for each point. Defaults to True.
+        valid_flag (bool): Whether to filter out points that lie outside
+            the image or have a depth smaller than 0. Defaults to False.
     Returns:
         torch.Tensor: NxC image features sampled by point coordinates.
@@ -57,7 +61,12 @@ def point_sample(img_meta,
         points, coord_type, img_meta, reverse=True)
     # project points to image coordinate
-    pts_2d = points_cam2img(points, proj_mat)
+    if valid_flag:
+        proj_pts = points_cam2img(points, proj_mat, with_depth=True)
+        pts_2d = proj_pts[..., :2]
+        depths = proj_pts[..., 2]
+    else:
+        pts_2d = points_cam2img(points, proj_mat)
     # img transformation: scale -> crop -> flip
     # the image is resized by img_scale_factor
@@ -70,13 +79,13 @@ def point_sample(img_meta,
     if img_flip:
         # by default we take it as horizontal flip
         # use img_shape before padding for flip
-        orig_h, orig_w = img_shape
-        coor_x = orig_w - coor_x
+        ori_h, ori_w = img_shape
+        coor_x = ori_w - coor_x
     h, w = img_pad_shape
-    coor_y = coor_y / h * 2 - 1
-    coor_x = coor_x / w * 2 - 1
-    grid = torch.cat([coor_x, coor_y],
+    norm_coor_y = coor_y / h * 2 - 1
+    norm_coor_x = coor_x / w * 2 - 1
+    grid = torch.cat([norm_coor_x, norm_coor_y],
                      dim=1).unsqueeze(0).unsqueeze(0)  # Nx2 -> 1x1xNx2
     # align_corner=True provides higher performance
@@ -88,6 +97,15 @@ def point_sample(img_meta,
         padding_mode=padding_mode,
         align_corners=align_corners)  # 1xCx1xN feats
+    if valid_flag:
+        # (N, ) mask of points inside the image with positive depth
+        valid = (coor_x.squeeze() < w) & (coor_x.squeeze() > 0) & (
+            coor_y.squeeze() < h) & (coor_y.squeeze() > 0) & (
+                depths > 0)
+        valid_features = point_features.squeeze().t()
+        valid_features[~valid] = 0
+        return valid_features, valid  # (N, C), (N, )
     return point_features.squeeze().t()
@@ -304,3 +322,94 @@ class PointFusion(BaseModule):
         align_corners=self.align_corners,
     )
     return img_pts
def voxel_sample(voxel_features,
voxel_range,
voxel_size,
depth_samples,
proj_mat,
downsample_factor,
img_scale_factor,
img_crop_offset,
img_flip,
img_pad_shape,
img_shape,
aligned=True,
padding_mode='zeros',
align_corners=True):
"""Obtain image features using points.
Args:
voxel_features (torch.Tensor): 1 x C x Nx x Ny x Nz voxel features.
voxel_range (list): The range of voxel features.
voxel_size (:obj:`ConfigDict` or dict): The voxel size of voxel
features.
depth_samples (torch.Tensor): N depth samples in LiDAR coordinates.
proj_mat (torch.Tensor): Original (pre-augmentation) lidar2img
projection matrix of the sampled view.
downsample_factor (int): The downsample factor in rescaling.
img_scale_factor (tuple[torch.Tensor]): Scale factor with shape of
(w_scale, h_scale).
img_crop_offset (tuple[torch.Tensor]): Crop offset used to crop
image during data augmentation with shape of (w_offset, h_offset).
img_flip (bool): Whether the image is flipped.
img_pad_shape (tuple[int]): int tuple indicates the h & w after
padding, this is necessary to obtain features in feature map.
img_shape (tuple[int]): int tuple indicates the h & w before padding
after scaling, this is necessary for flipping coordinates.
aligned (bool, optional): Whether use bilinear interpolation when
sampling image features for each point. Defaults to True.
padding_mode (str, optional): Padding mode when padding values for
features of out-of-image points. Defaults to 'zeros'.
align_corners (bool, optional): Whether to align corners when
sampling image features for each point. Defaults to True.
Returns:
torch.Tensor: 1xCxDxHxW frustum features sampled from voxel features.
"""
# construct frustum grid
device = voxel_features.device
h, w = img_pad_shape
h_out = round(h / downsample_factor)
w_out = round(w / downsample_factor)
ws = (torch.linspace(0, w_out - 1, w_out) * downsample_factor).to(device)
hs = (torch.linspace(0, h_out - 1, h_out) * downsample_factor).to(device)
depths = depth_samples[::downsample_factor]
num_depths = len(depths)
ds_3d, ys_3d, xs_3d = torch.meshgrid(depths, hs, ws)
# grid: (D, H_out, W_out, 3) -> (D*H_out*W_out, 3)
grid = torch.stack([xs_3d, ys_3d, ds_3d], dim=-1).view(-1, 3)
# recover the coordinates in the canonical space
# reverse order of augmentations: flip -> crop -> scale
if img_flip:
# by default we take it as horizontal flip
# use img_shape before padding for flip
ori_h, ori_w = img_shape
grid[:, 0] = ori_w - grid[:, 0]
grid[:, :2] += img_crop_offset
grid[:, :2] /= img_scale_factor
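# grid now holds (u, v, d): pixel coordinates in the original image
# paired with the sampled depth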
# grid3d: (D*H_out*W_out, 3) in LiDAR coordinate system
grid3d = points_img2cam(grid, proj_mat)
# convert the 3D point coordinates to voxel coordinates
voxel_range = torch.tensor(voxel_range).to(device).view(1, 6)
voxel_size = torch.tensor(voxel_size).to(device).view(1, 3)
# suppose the voxel grid is generated with AlignedAnchorGenerator
# -0.5 given each grid is located at the center of the grid
# TODO: study whether here needs -0.5
grid3d = (grid3d - voxel_range[:, :3]) / voxel_size - 0.5
grid_size = (voxel_range[:, 3:] - voxel_range[:, :3]) / voxel_size
# normalize grid3d to (-1, 1)
grid3d = grid3d / grid_size * 2 - 1
# (x, y, z) -> (z, y, x) for grid_sampling
grid3d = grid3d.view(1, num_depths, h_out, w_out, 3)[..., [2, 1, 0]]
# align_corner=True provides higher performance
mode = 'bilinear' if aligned else 'nearest'
frustum_features = F.grid_sample(
voxel_features,
grid3d,
mode=mode,
padding_mode=padding_mode,
align_corners=align_corners) # 1xCxDxHxW feats
return frustum_features
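# Illustrative usage sketch (editor's note, not part of the original file).
# All shapes, ranges and names below are invented for demonstration and
# must be replaced by the values coming from the actual config / img_metas.
#
#   volume = torch.rand(1, 64, 200, 200, 12)              # (1, C, Nx, Ny, Nz)
#   frustum = voxel_sample(
#       volume,
#       voxel_range=[-51.2, -51.2, -2.0, 51.2, 51.2, 4.0],  # hypothetical
#       voxel_size=[0.512, 0.512, 0.5],                      # hypothetical
#       depth_samples=torch.linspace(2.0, 59.6, 288),
#       proj_mat=lidar2img,          # (4, 4) original lidar2img matrix
#       downsample_factor=4,
#       img_scale_factor=volume.new_tensor([1., 1.]),
#       img_crop_offset=0,
#       img_flip=False,
#       img_pad_shape=(832, 1248),
#       img_shape=(832, 1248))
#   # frustum: (1, 64, 72, 208, 312) plane-sweep features for this view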