Commit ede10946 authored by Qing Lian, committed by ZwwWayne

[Fix] Fix some loading bugs and support fov_image_based mode in Waymo dataset. (#1942)



* modify sample_id to sample_idx and support fov_image_based mode on the Waymo dataset

* Update waymo_metric.py

* Minor fix

* Minor fix

* Minor fix

* Minor fix

* Minor fix

* Minor fix

* Minor fixes

* Minor fixes

* Remove optional

* fix dataset instances converting bugs

* Add a blank line to fix the doc compilation format

* Fix the bin file name in waymo_fov config

* Resolve conflicts

* fix ci and other things
Co-authored-by: Tai-Wang <tab_wang@outlook.com>
Co-authored-by: lianqing11 <lianqing11@foxmail.com>
Co-authored-by: ChaimZhu <zhuchenming@pjlab.org.cn>
parent 17ac0691
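The central change in this commit is renaming the `task` argument of the datasets and the Waymo metric to `load_type`, with a new third mode `fov_image_based`. A minimal sketch of the new option in a dataset config (values mirror the Waymo config below; this snippet itself is illustrative, not part of the diff):

train_dataset = dict(
    type='WaymoDataset',
    data_root='data/waymo/kitti_format/',
    ann_file='waymo_infos_train.pkl',
    # one of 'frame_based', 'mv_image_based', 'fov_image_based'
    load_type='fov_image_based',
    box_type_3d='Camera')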
@@ -52,6 +52,7 @@ train_dataloader = dict(
data_prefix=dict(img='training/image_2'),
pipeline=train_pipeline,
modality=input_modality,
load_type='fov_image_based',
test_mode=False,
metainfo=metainfo,
# we use box_type_3d='Camera' in monocular 3d
@@ -70,6 +71,7 @@ val_dataloader = dict(
ann_file='kitti_infos_val.pkl',
pipeline=test_pipeline,
modality=input_modality,
load_type='fov_image_based',
metainfo=metainfo,
test_mode=True,
box_type_3d='Camera'))
@@ -65,7 +65,7 @@ train_dataloader = dict(
CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
CAM_BACK_LEFT='samples/CAM_BACK_LEFT'),
ann_file='nuscenes_infos_train.pkl',
task='mono_det',
load_type='mv_image_based',
pipeline=train_pipeline,
metainfo=metainfo,
modality=input_modality,
@@ -92,7 +92,7 @@ val_dataloader = dict(
CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
CAM_BACK_LEFT='samples/CAM_BACK_LEFT'),
ann_file='nuscenes_infos_val.pkl',
task='mono_det',
load_type='mv_image_based',
pipeline=test_pipeline,
modality=input_modality,
metainfo=metainfo,
# dataset settings
# D5 in the config name means the whole dataset is divided into 5 folds
# We only use one fold for efficient experiments
dataset_type = 'WaymoDataset'
data_root = 'data/waymo/kitti_format/'
class_names = ['Car', 'Pedestrian', 'Cyclist']
input_modality = dict(use_lidar=False, use_camera=True)
file_client_args = dict(backend='disk')
# Uncomment the following if using Ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
train_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='LoadAnnotations3D',
with_bbox=True,
with_label=True,
with_attr_label=False,
with_bbox_3d=True,
with_label_3d=True,
with_bbox_depth=True),
# base shape (1248, 832), scale (0.95, 1.05)
dict(
type='RandomResize3D',
scale=(1248, 832),
ratio_range=(0.95, 1.05),
keep_ratio=True,
),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(
type='Pack3DDetInputs',
keys=[
'img', 'gt_bboxes', 'gt_bboxes_labels', 'gt_bboxes_3d',
'gt_labels_3d', 'centers_2d', 'depths'
]),
]
test_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='RandomResize3D',
scale=(1248, 832),
ratio_range=(1., 1.),
keep_ratio=True),
dict(type='Pack3DDetInputs', keys=['img']),
]
# construct a pipeline for data and gt loading in the show function
# please keep its loading function consistent with test_pipeline (e.g. client)
eval_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='RandomResize3D',
scale=(1248, 832),
ratio_range=(1., 1.),
keep_ratio=True),
dict(type='Pack3DDetInputs', keys=['img']),
]
metainfo = dict(CLASSES=class_names)
train_dataloader = dict(
batch_size=3,
num_workers=3,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='waymo_infos_train.pkl',
data_prefix=dict(
pts='training/velodyne',
CAM_FRONT='training/image_0',
CAM_FRONT_RIGHT='training/image_1',
CAM_FRONT_LEFT='training/image_2',
CAM_SIDE_RIGHT='training/image_3',
CAM_SIDE_LEFT='training/image_4'),
pipeline=train_pipeline,
modality=input_modality,
test_mode=False,
metainfo=metainfo,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='Camera',
load_type='fov_image_based',
# load one frame every five frames
load_interval=5))
val_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_prefix=dict(
pts='training/velodyne',
CAM_FRONT='training/image_0',
CAM_FRONT_RIGHT='training/image_1',
CAM_FRONT_LEFT='training/image_2',
CAM_SIDE_RIGHT='training/image_3',
CAM_SIDE_LEFT='training/image_4'),
ann_file='waymo_infos_val.pkl',
pipeline=eval_pipeline,
modality=input_modality,
test_mode=True,
metainfo=metainfo,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='Camera',
load_type='fov_image_based',
))
test_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_prefix=dict(
pts='training/velodyne',
CAM_FRONT='training/image_0',
CAM_FRONT_RIGHT='training/image_1',
CAM_FRONT_LEFT='training/image_2',
CAM_SIDE_RIGHT='training/image_3',
CAM_SIDE_LEFT='training/image_4'),
ann_file='waymo_infos_val.pkl',
pipeline=eval_pipeline,
modality=input_modality,
test_mode=True,
metainfo=metainfo,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='Camera',
load_type='fov_image_based',
))
val_evaluator = dict(
type='WaymoMetric',
ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl',
waymo_bin_file='./data/waymo/waymo_format/fov_gt.bin',
data_root='./data/waymo/waymo_format',
metric='LET_mAP',
load_type='fov_image_based',
)
test_evaluator = val_evaluator
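For reference, a sketch of how this new base config could be consumed, assuming a standard mmengine/mmdet3d dev-1.x setup and prepared Waymo data; the path is the base file referenced by the PGD config later in this commit:

from mmengine.config import Config
from mmdet3d.registry import DATASETS
from mmdet3d.utils import register_all_modules

register_all_modules()  # register mmdet3d datasets, transforms, etc.
cfg = Config.fromfile(
    'configs/_base_/datasets/waymoD5-fov-mono3d-3class.py')
# builds a WaymoDataset with load_type='fov_image_based'
dataset = DATASETS.build(cfg.train_dataloader.dataset)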
@@ -81,7 +81,7 @@ train_dataloader = dict(
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='Camera',
task='mono_det',
load_type='mv_image_based',
# load one frame every five frames
load_interval=5))
@@ -109,7 +109,7 @@ val_dataloader = dict(
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='Camera',
task='mono_det',
load_type='mv_image_based',
))
test_dataloader = dict(
@@ -136,7 +136,7 @@ test_dataloader = dict(
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='Camera',
task='mono_det',
load_type='mv_image_based',
))
val_evaluator = dict(
@@ -145,5 +145,6 @@ val_evaluator = dict(
waymo_bin_file='./data/waymo/waymo_format/cam_gt.bin',
data_root='./data/waymo/waymo_format',
metric='LET_mAP',
task='mono_det')
load_type='mv_image_based',
)
test_evaluator = val_evaluator
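The FOV config above evaluates against fov_gt.bin, while this multi-view config uses cam_gt.bin. A hypothetical helper summarizing which ground-truth bin file pairs with which load_type (the first two file names are taken from these configs; gt.bin for frame_based is an assumption):

def waymo_gt_bin(load_type: str) -> str:
    # map each load_type to its Waymo ground-truth bin file
    return {
        'frame_based': './data/waymo/waymo_format/gt.bin',  # assumed
        'mv_image_based': './data/waymo/waymo_format/cam_gt.bin',
        'fov_image_based': './data/waymo/waymo_format/fov_gt.bin',
    }[load_type]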
_base_ = [
'../_base_/datasets/waymoD5-fov-mono3d-3class.py',
'../_base_/models/pgd.py', '../_base_/schedules/mmdet-schedule-1x.py',
'../_base_/default_runtime.py'
]
# model settings
model = dict(
backbone=dict(
type='mmdet.ResNet',
depth=101,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type='BN', requires_grad=True),
norm_eval=True,
style='pytorch',
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'),
dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
stage_with_dcn=(False, False, True, True)),
neck=dict(num_outs=3),
bbox_head=dict(
num_classes=3,
bbox_code_size=7,
pred_attrs=False,
pred_velo=False,
pred_bbox2d=True,
use_onlyreg_proj=True,
strides=(8, 16, 32),
regress_ranges=((-1, 128), (128, 256), (256, 1e8)),
group_reg_dims=(2, 1, 3, 1, 16,
4), # offset, depth, size, rot, kpts, bbox2d
reg_branch=(
(256, ), # offset
(256, ), # depth
(256, ), # size
(256, ), # rot
(256, ), # kpts
(256, ) # bbox2d
),
centerness_branch=(256, ),
loss_cls=dict(
type='mmdet.FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(
type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_dir=dict(
type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
loss_centerness=dict(
type='mmdet.CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
use_depth_classifier=True,
depth_branch=(256, ),
depth_range=(0, 50),
depth_unit=10,
division='uniform',
depth_bins=6,
pred_keypoints=True,
weight_dim=1,
loss_depth=dict(
type='UncertainSmoothL1Loss', alpha=1.0, beta=3.0,
loss_weight=1.0),
loss_bbox2d=dict(
type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.0),
loss_consistency=dict(type='mmdet.GIoULoss', loss_weight=0.0),
bbox_coder=dict(
type='PGDBBoxCoder',
base_depths=((41.01, 18.44), ),
base_dims=(
(4.73, 1.77, 2.08),
(0.91, 1.74, 0.84),
(1.81, 1.77, 0.84),
),
code_size=7)),
# set weight 1.0 for base 7 dims (offset, depth, size, rot)
# 0.2 for 16-dim keypoint offsets and 1.0 for 4-dim 2D distance targets
train_cfg=dict(code_weight=[
1.0, 1.0, 0.2, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2,
0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 1.0, 1.0, 1.0, 1.0
]),
test_cfg=dict(nms_pre=100, nms_thr=0.05, score_thr=0.001, max_per_img=20))
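A quick consistency check on the head configuration: the regression groups sum to the 27 entries of code_weight above.

# offset(2) + depth(1) + size(3) + rot(1) + kpts(16) + bbox2d(4) = 27
group_reg_dims = (2, 1, 3, 1, 16, 4)
code_weight_len = 27  # the list above has 7 + 16 + 4 = 27 entries
assert sum(group_reg_dims) == code_weight_len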
# optimizer
optim_wrapper = dict(
optimizer=dict(
type='SGD',
lr=0.008,
),
paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.),
clip_grad=dict(max_norm=35, norm_type=2))
param_scheduler = [
dict(
type='LinearLR',
start_factor=1.0 / 3,
by_epoch=False,
begin=0,
end=500),
dict(
type='MultiStepLR',
begin=0,
end=24,
by_epoch=True,
milestones=[16, 22],
gamma=0.1)
]
total_epochs = 24
runner = dict(max_epochs=total_epochs)
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=24, val_interval=24)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
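The schedule above is a 500-iteration linear warmup into a two-milestone step decay; a quick arithmetic check of the resulting learning rates, assuming the base lr of 0.008 from the optimizer:

base_lr = 0.008
print(base_lr / 3)     # ~0.00267 at iteration 0 (start_factor=1/3)
print(base_lr)         # 0.008 after warmup, until epoch 16
print(base_lr * 0.1)   # 0.0008 from epoch 16 to epoch 22
print(base_lr * 0.01)  # 8e-05 from epoch 22 to epoch 24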
_base_ = [
'../_base_/datasets/waymoD5-mv-mono3d-3class.py',
'../_base_/models/pgd.py', '../_base_/schedules/mmdet-schedule-1x.py',
'../_base_/default_runtime.py'
]
# model settings
model = dict(
backbone=dict(
type='mmdet.ResNet',
depth=101,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type='BN', requires_grad=True),
norm_eval=True,
style='pytorch',
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'),
dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
stage_with_dcn=(False, False, True, True)),
neck=dict(num_outs=3),
bbox_head=dict(
num_classes=3,
bbox_code_size=7,
pred_attrs=False,
pred_velo=False,
pred_bbox2d=True,
use_onlyreg_proj=True,
strides=(8, 16, 32),
regress_ranges=((-1, 128), (128, 256), (256, 1e8)),
group_reg_dims=(2, 1, 3, 1, 16,
4), # offset, depth, size, rot, kpts, bbox2d
reg_branch=(
(256, ), # offset
(256, ), # depth
(256, ), # size
(256, ), # rot
(256, ), # kpts
(256, ) # bbox2d
),
centerness_branch=(256, ),
loss_cls=dict(
type='mmdet.FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(
type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_dir=dict(
type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
loss_centerness=dict(
type='mmdet.CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
use_depth_classifier=True,
depth_branch=(256, ),
depth_range=(0, 50),
depth_unit=10,
division='uniform',
depth_bins=6,
pred_keypoints=True,
weight_dim=1,
loss_depth=dict(
type='UncertainSmoothL1Loss', alpha=1.0, beta=3.0,
loss_weight=1.0),
loss_bbox2d=dict(
type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.0),
loss_consistency=dict(type='mmdet.GIoULoss', loss_weight=0.0),
bbox_coder=dict(
type='PGDBBoxCoder',
base_depths=((41.01, 18.44), ),
base_dims=(
(4.73, 1.77, 2.08),
(0.91, 1.74, 0.84),
(1.81, 1.77, 0.84),
),
code_size=7)),
# set weight 1.0 for base 7 dims (offset, depth, size, rot)
# 0.2 for 16-dim keypoint offsets and 1.0 for 4-dim 2D distance targets
train_cfg=dict(code_weight=[
1.0, 1.0, 0.2, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2,
0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 1.0, 1.0, 1.0, 1.0
]),
test_cfg=dict(nms_pre=100, nms_thr=0.05, score_thr=0.001, max_per_img=20))
# optimizer
optim_wrapper = dict(
optimizer=dict(
type='SGD',
lr=0.008,
),
paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.),
clip_grad=dict(max_norm=35, norm_type=2))
param_scheduler = [
dict(
type='LinearLR',
start_factor=1.0 / 3,
by_epoch=False,
begin=0,
end=500),
dict(
type='MultiStepLR',
begin=0,
end=24,
by_epoch=True,
milestones=[16, 22],
gamma=0.1)
]
total_epochs = 24
runner = dict(max_epochs=total_epochs)
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=24, val_interval=24)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
@@ -32,6 +32,15 @@ class KittiDataset(Det3DDataset):
- 'LiDAR': Box in LiDAR coordinates.
- 'Depth': Box in depth coordinates, usually for indoor dataset.
- 'Camera': Box in camera coordinates.
load_type (str): Type of loading mode. Defaults to 'frame_based'.
- 'frame_based': Load all of the instances in the frame.
- 'mv_image_based': Load all of the instances in the frame and convert
them to the FOV-based data format to support image-based detectors.
- 'fov_image_based': Only load the instances inside the default cam
and convert them to the FOV-based data format to support image-based
detectors.
filter_empty_gt (bool): Whether to filter the data with empty GT.
If it's set to True, examples with empty annotations after the
data pipeline will be dropped and a random example will be chosen
@@ -54,7 +63,7 @@ class KittiDataset(Det3DDataset):
pipeline: List[Union[dict, Callable]] = [],
modality: dict = dict(use_lidar=True),
default_cam_key: str = 'CAM2',
task: str = 'lidar_det',
load_type: str = 'frame_based',
box_type_3d: str = 'LiDAR',
filter_empty_gt: bool = True,
test_mode: bool = False,
@@ -62,8 +71,9 @@ class KittiDataset(Det3DDataset):
**kwargs) -> None:
self.pcd_limit_range = pcd_limit_range
assert task in ('lidar_det', 'mono_det')
self.task = task
assert load_type in ('frame_based', 'mv_image_based',
'fov_image_based')
self.load_type = load_type
super().__init__(
data_root=data_root,
ann_file=ann_file,
@@ -113,7 +123,7 @@ class KittiDataset(Det3DDataset):
info['plane'] = plane_lidar
if self.task == 'mono_det' and self.load_eval_anns:
if self.load_type == 'fov_image_based' and self.load_eval_anns:
info['instances'] = info['cam_instances'][self.default_cam_key]
info = super().parse_data_info(info)
@@ -144,7 +154,7 @@ class KittiDataset(Det3DDataset):
ann_info['gt_bboxes_3d'] = np.zeros((0, 7), dtype=np.float32)
ann_info['gt_labels_3d'] = np.zeros(0, dtype=np.int64)
if self.task == 'mono_det':
if self.load_type in ['fov_image_based', 'mv_image_based']:
ann_info['gt_bboxes'] = np.zeros((0, 4), dtype=np.float32)
ann_info['gt_bboxes_labels'] = np.zeros(0, dtype=np.int64)
ann_info['centers_2d'] = np.zeros((0, 2), dtype=np.float32)
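A brief illustration of the shape contract behind these placeholders (and behind the np.zeros fix above): an image with no instances needs length-0 arrays, not 0-d scalars, so that downstream len() and concatenation work:

import numpy as np

empty_labels = np.zeros(0, dtype=np.int64)  # shape (0,): len() == 0
scalar_label = np.array(0, dtype=np.int64)  # shape (): 0-d, len() raises TypeError
assert empty_labels.shape == (0,)
assert scalar_label.shape == ()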
@@ -22,7 +22,6 @@ class NuScenesDataset(Det3DDataset):
Args:
data_root (str): Path of dataset root.
ann_file (str): Path of annotation file.
task (str): Detection task. Defaults to 'lidar_det'.
pipeline (list[dict]): Pipeline used for data processing.
Defaults to [].
box_type_3d (str): Type of 3D box of this dataset.
@@ -33,6 +32,15 @@ class NuScenesDataset(Det3DDataset):
- 'LiDAR': Box in LiDAR coordinates.
- 'Depth': Box in depth coordinates, usually for indoor dataset.
- 'Camera': Box in camera coordinates.
load_type (str): Type of loading mode. Defaults to 'frame_based'.
- 'frame_based': Load all of the instances in the frame.
- 'mv_image_based': Load all of the instances in the frame and convert
them to the FOV-based data format to support image-based detectors.
- 'fov_image_based': Only load the instances inside the default cam
and convert them to the FOV-based data format to support image-based
detectors.
modality (dict): Modality to specify the sensor data used as input.
Defaults to dict(use_camera=False, use_lidar=True).
filter_empty_gt (bool): Whether to filter the data with empty GT.
@@ -58,9 +66,9 @@ class NuScenesDataset(Det3DDataset):
def __init__(self,
data_root: str,
ann_file: str,
task: str = 'lidar_det',
pipeline: List[Union[dict, Callable]] = [],
box_type_3d: str = 'LiDAR',
load_type: str = 'frame_based',
modality: dict = dict(
use_camera=False,
use_lidar=True,
@@ -74,8 +82,9 @@ class NuScenesDataset(Det3DDataset):
self.with_velocity = with_velocity
# TODO: Redesign multi-view data process in the future
assert task in ('lidar_det', 'mono_det', 'multi-view_det')
self.task = task
assert load_type in ('frame_based', 'mv_image_based',
'fov_image_based')
self.load_type = load_type
assert box_type_3d.lower() in ('lidar', 'camera')
super().__init__(
@@ -144,7 +153,7 @@ class NuScenesDataset(Det3DDataset):
ann_info['gt_bboxes_3d'] = np.zeros((0, 7), dtype=np.float32)
ann_info['gt_labels_3d'] = np.zeros(0, dtype=np.int64)
if self.task == 'mono3d':
if self.load_type in ['fov_image_based', 'mv_image_based']:
ann_info['gt_bboxes'] = np.zeros((0, 4), dtype=np.float32)
ann_info['gt_bboxes_labels'] = np.zeros(0, dtype=np.int64)
ann_info['attr_labels'] = np.zeros(0, dtype=np.int64)
@@ -154,7 +163,7 @@ class NuScenesDataset(Det3DDataset):
# the nuscenes box center is [0.5, 0.5, 0.5], we change it to be
# the same as KITTI (0.5, 0.5, 0)
# TODO: Unify the coordinates
if self.task == 'mono_det':
if self.load_type in ['fov_image_based', 'mv_image_based']:
gt_bboxes_3d = CameraInstance3DBoxes(
ann_info['gt_bboxes_3d'],
box_dim=ann_info['gt_bboxes_3d'].shape[-1],
@@ -182,7 +191,7 @@ class NuScenesDataset(Det3DDataset):
dict: Has `ann_info` in training stage. And all paths have
been converted to absolute paths.
"""
if self.task == 'mono_det':
if self.load_type == 'mv_image_based':
data_list = []
if self.modality['use_lidar']:
info['lidar_points']['lidar_path'] = \
@@ -45,6 +45,15 @@ class WaymoDataset(KittiDataset):
- 'LiDAR': Box in LiDAR coordinates.
- 'Depth': Box in depth coordinates, usually for indoor dataset.
- 'Camera': Box in camera coordinates.
load_type (str): Type of loading mode. Defaults to 'frame_based'.
- 'frame_based': Load all of the instances in the frame.
- 'mv_image_based': Load all of the instances in the frame and convert
them to the FOV-based data format to support image-based detectors.
- 'fov_image_based': Only load the instances inside the default cam
and convert them to the FOV-based data format to support image-based
detectors.
filter_empty_gt (bool): Whether to filter the data with empty GT.
If it's set to True, examples with empty annotations after the
data pipeline will be dropped and a random example will be chosen
@@ -57,10 +66,6 @@ class WaymoDataset(KittiDataset):
cam_sync_instances (bool): Whether to use the camera-synced labels
supported from Waymo version 1.3.1. Defaults to False.
load_interval (int): Interval of frame loading. Defaults to 1.
task (str): task for 3D detection (lidar, mono3d).
lidar: take all the ground trurh in the frame.
mono3d: take the groundtruth that can be seen in the cam.
Defaults to 'lidar_det'.
max_sweeps (int): max sweep for each frame. Defaults to 0.
"""
METAINFO = {'classes': ('Car', 'Pedestrian', 'Cyclist')}
@@ -79,12 +84,12 @@ class WaymoDataset(KittiDataset):
modality: dict = dict(use_lidar=True),
default_cam_key: str = 'CAM_FRONT',
box_type_3d: str = 'LiDAR',
load_type: str = 'frame_based',
filter_empty_gt: bool = True,
test_mode: bool = False,
pcd_limit_range: List[float] = [0, -40, -3, 70.4, 40, 0.0],
cam_sync_instances: bool = False,
load_interval: int = 1,
task: str = 'lidar_det',
max_sweeps: int = 0,
**kwargs) -> None:
self.load_interval = load_interval
@@ -108,7 +113,7 @@ class WaymoDataset(KittiDataset):
default_cam_key=default_cam_key,
data_prefix=data_prefix,
test_mode=test_mode,
task=task,
load_type=load_type,
**kwargs)
def parse_ann_info(self, info: dict) -> dict:
@@ -151,7 +156,7 @@ class WaymoDataset(KittiDataset):
centers_2d = np.zeros((0, 2), dtype=np.float32)
depths = np.zeros((0), dtype=np.float32)
if self.task == 'mono_det':
if self.load_type in ['fov_image_based', 'mv_image_based']:
gt_bboxes_3d = CameraInstance3DBoxes(
ann_info['gt_bboxes_3d'],
box_dim=ann_info['gt_bboxes_3d'].shape[-1],
@@ -186,10 +191,19 @@ class WaymoDataset(KittiDataset):
def parse_data_info(self, info: dict) -> dict:
"""if task is lidar or multiview det, use super() method elif task is
mono3d, split the info from frame-wise to img-wise."""
if self.task != 'mono_det':
if self.cam_sync_instances:
# use the cam sync labels
info['instances'] = info['cam_sync_instances']
if self.load_type == 'frame_based':
return super().parse_data_info(info)
elif self.load_type == 'fov_image_based':
# only loading the fov image and the fov instance
new_image_info = {}
new_image_info[self.default_cam_key] = \
info['images'][self.default_cam_key]
info['images'] = new_image_info
info['instances'] = info['cam_instances'][self.default_cam_key]
return super().parse_data_info(info)
else:
# in mv_image_based mode, the instances come from the cam sync labels.
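The truncated else branch above handles mv_image_based by splitting one frame-level info dict into one entry per camera. Roughly, as a simplified sketch rather than the verbatim implementation (lidar paths and eval annotations are elided):

def split_to_per_camera(info: dict) -> list:
    # one output entry per camera image in the frame
    data_list = []
    for cam_key, img_info in info['images'].items():
        camera_info = dict()
        camera_info['images'] = {cam_key: img_info}
        if 'cam_instances' in info and cam_key in info['cam_instances']:
            camera_info['instances'] = info['cam_instances'][cam_key]
        camera_info['sample_idx'] = info['sample_idx']
        data_list.append(camera_info)
    return data_list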
@@ -222,7 +236,7 @@ class WaymoDataset(KittiDataset):
# TODO: check whether the sample idx needs to be modified
# TODO: check when it is used other than in evaluation.
camera_info['sample_id'] = info['sample_id']
camera_info['sample_idx'] = info['sample_idx']
if not self.test_mode:
# used in training
@@ -45,8 +45,15 @@ class WaymoMetric(KittiMetric):
submission_prefix (str, optional): The prefix of submission data.
If not specified, the submission data will not be generated.
Default: None.
task: (str, optional): task for 3D detection, if cam, would filter
the points that outside the image.
load_type (str, optional): Type of loading mode during training.
- 'frame_based': Load all of the instances in the frame.
- 'mv_image_based': Load all of the instances in the frame and convert
them to the FOV-based data format to support image-based detectors.
- 'fov_image_based': Only load the instances inside the default cam
and convert them to the FOV-based data format to support image-based
detectors.
default_cam_key (str, optional): The default camera for lidar to
camera conversion. By default, KITTI: CAM2, Waymo: CAM_FRONT.
use_pred_sample_idx (bool, optional): In formatting results, use the
@@ -76,7 +83,7 @@ class WaymoMetric(KittiMetric):
prefix: Optional[str] = None,
pklfile_prefix: str = None,
submission_prefix: str = None,
task='lidar_det',
load_type: str = 'frame_based',
default_cam_key: str = 'CAM_FRONT',
use_pred_sample_idx: bool = False,
collect_device: str = 'cpu',
@@ -85,7 +92,7 @@ class WaymoMetric(KittiMetric):
self.waymo_bin_file = waymo_bin_file
self.data_root = data_root
self.split = split
self.task = task
self.load_type = load_type
self.use_pred_sample_idx = use_pred_sample_idx
self.convert_kitti_format = convert_kitti_format
@@ -124,8 +131,8 @@ class WaymoMetric(KittiMetric):
assert len(results) == len(self.data_infos), \
'invalid list length of network outputs'
# different from KITTI, Waymo does not need to convert the ann file
# handle the mono3d task
if self.task == 'mono_det':
# handle the mv_image_based load_type
if self.load_type == 'mv_image_based':
new_data_infos = []
for info in self.data_infos:
height = info['images'][self.default_cam_key]['height']
@@ -425,7 +432,7 @@ class WaymoMetric(KittiMetric):
lidar2cam = cam0_info['images'][self.default_cam_key]['lidar2img']
lidar2cam = np.array(lidar2cam).astype(np.float32)
box_preds_camera = box_preds_lidar.convert_to(
Box3DMode.CAM, np.linalg.inv(lidar2cam), correct_yaw=True)
Box3DMode.CAM, lidar2cam, correct_yaw=True)
# Note: bbox is meaningless in final evaluation, set to 0
merged_box_dict = dict(
bbox=np.zeros([box_preds_lidar.tensor.shape[0], 4]),
@@ -470,7 +477,7 @@ class WaymoMetric(KittiMetric):
sample_idx = sample_id_list[idx]
info = self.data_infos[sample_idx]
if self.task == 'mono_det':
if self.load_type == 'mv_image_based':
if idx % self.num_cams == 0:
box_dict_per_frame = []
cam0_key = list(info['images'].keys())[0]
@@ -487,7 +494,7 @@ class WaymoMetric(KittiMetric):
# If you want to use another camera, please modify it.
image_shape = (info['images'][self.default_cam_key]['height'],
info['images'][self.default_cam_key]['width'])
if self.task == 'mono_det':
if self.load_type == 'mv_image_based':
box_dict_per_frame.append(box_dict)
if (idx + 1) % self.num_cams != 0:
continue
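The modulo bookkeeping above regroups per-camera predictions into frames before evaluation; schematically, as a standalone sketch assuming five cameras per Waymo frame and stand-in data:

num_cams = 5
per_camera_results = [dict(cam=i % num_cams) for i in range(10)]  # stand-in
frames = []
box_dict_per_frame = []
for idx, box_dict in enumerate(per_camera_results):
    if idx % num_cams == 0:
        box_dict_per_frame = []  # a new frame starts
    box_dict_per_frame.append(box_dict)
    if (idx + 1) % num_cams != 0:
        continue  # keep collecting until all five cameras have arrived
    frames.append(box_dict_per_frame)  # the metric merges these into one frame
assert len(frames) == 2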
@@ -587,7 +594,7 @@ class WaymoMetric(KittiMetric):
def convert_valid_bboxes(self, box_dict: dict, info: dict):
"""Convert the predicted boxes into valid ones. Should handle the
different task mode (mono3d, mv3d, lidar), separately.
load_type (frame_based, mv_image_based, fov_image_based), separately.
Args:
box_dict (dict): Box dictionaries to be converted.
@@ -624,11 +631,11 @@ class WaymoMetric(KittiMetric):
scores=np.zeros([0]),
label_preds=np.zeros([0, 4]),
sample_idx=sample_idx)
# Here default used 'CAM2' to compute metric. If you want to
# Here default used 'CAM_FRONT' to compute metric. If you want to
# use another camera, please modify it.
if self.task in ['mv3d_det', 'lidar_det']:
if self.load_type in ['frame_based', 'fov_image_based']:
cam_key = self.default_cam_key
elif self.task == 'mono_det':
elif self.load_type == 'mv_image_based':
cam_key = list(info['images'].keys())[0]
else:
raise NotImplementedError
@@ -661,12 +668,12 @@ class WaymoMetric(KittiMetric):
(box_2d_preds[:, 1] < image_shape[0]) &
(box_2d_preds[:, 2] > 0) & (box_2d_preds[:, 3] > 0))
# check box_preds_lidar
if self.task in ['mv3d_det', 'lidar_det']:
if self.load_type in ['frame_based']:
limit_range = box_preds.tensor.new_tensor(self.pcd_limit_range)
valid_pcd_inds = ((box_preds_lidar.center > limit_range[:3]) &
(box_preds_lidar.center < limit_range[3:]))
valid_inds = valid_pcd_inds.all(-1)
elif self.task == 'mono_det':
if self.load_type in ['mv_image_based', 'fov_image_based']:
valid_inds = valid_cam_inds
if valid_inds.sum() > 0:
@@ -133,6 +133,7 @@ class Waymo2KITTI(object):
self.save_image(frame, file_idx, frame_idx)
self.save_calib(frame, file_idx, frame_idx)
if 'testing_3d_camera_only_detection' not in self.load_dir:
# the camera only split doesn't contain lidar points.
self.save_lidar(frame, file_idx, frame_idx)
self.save_pose(frame, file_idx, frame_idx)
self.save_timestamp(frame, file_idx, frame_idx)