"tests/vscode:/vscode.git/clone" did not exist on "3a063f5976781ed30c1c36b65fe837d63e76b94e"
Commit b496f579 authored by ZCMax, committed by ChaimZhu

[Refactor] Refactor Mono3D models

parent 35667791
dataset_type = 'KittiMonoDataset'
dataset_type = 'KittiDataset'
data_root = 'data/kitti/'
class_names = ['Pedestrian', 'Cyclist', 'Car']
input_modality = dict(use_lidar=False, use_camera=True)
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
metainfo = dict(CLASSES=class_names)
# file_client_args = dict(backend='disk')
# Uncomment the following if you use Ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
file_client_args = dict(
backend='petrel',
path_mapping=dict({
'./data/kitti/':
's3://openmmlab/datasets/detection3d/kitti/',
'data/kitti/':
's3://openmmlab/datasets/detection3d/kitti/'
}))
train_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(type='LoadImageFromFileMono3D', file_client_args=file_client_args),
dict(
type='LoadAnnotations3D',
with_bbox=True,
......@@ -14,79 +27,60 @@ train_pipeline = [
with_bbox_3d=True,
with_label_3d=True,
with_bbox_depth=True),
dict(type='Resize', img_scale=(1242, 375), keep_ratio=True),
dict(type='Resize', scale=(1242, 375), keep_ratio=True),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
type='Pack3DDetInputs',
keys=[
'img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_3d', 'gt_labels_3d',
'centers2d', 'depths'
'centers_2d', 'depths'
]),
]
test_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='MultiScaleFlipAug',
img_scale=(1242, 375),
flip=False,
transforms=[
dict(type='RandomFlip3D'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['img']),
])
]
# construct a pipeline for data and gt loading in show function
# please keep its loading function consistent with test_pipeline (e.g. client)
eval_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['img'])
dict(type='LoadImageFromFileMono3D', file_client_args=file_client_args),
dict(type='Resize', scale=(1242, 375), keep_ratio=True),
dict(type='Pack3DDetInputs', keys=['img'])
]
data = dict(
samples_per_gpu=2,
workers_per_gpu=2,
train=dict(
train_dataloader = dict(
batch_size=2,
num_workers=2,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'kitti_infos_train_mono3d.coco.json',
info_file=data_root + 'kitti_infos_train.pkl',
img_prefix=data_root,
classes=class_names,
ann_file='kitti_infos_train.pkl',
data_prefix=dict(img='training/image_2'),
pipeline=train_pipeline,
modality=input_modality,
test_mode=False,
box_type_3d='Camera'),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'kitti_infos_val_mono3d.coco.json',
info_file=data_root + 'kitti_infos_val.pkl',
img_prefix=data_root,
classes=class_names,
pipeline=test_pipeline,
modality=input_modality,
test_mode=True,
box_type_3d='Camera'),
test=dict(
metainfo=metainfo,
# we use box_type_3d='Camera' in monocular 3d
# detection task
box_type_3d='Camera'))
val_dataloader = dict(
batch_size=1,
num_workers=2,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'kitti_infos_val_mono3d.coco.json',
info_file=data_root + 'kitti_infos_val.pkl',
img_prefix=data_root,
classes=class_names,
data_prefix=dict(img='training/image_2'),
ann_file='kitti_infos_val.pkl',
pipeline=test_pipeline,
modality=input_modality,
metainfo=metainfo,
test_mode=True,
box_type_3d='Camera'))
evaluation = dict(interval=2)
test_dataloader = val_dataloader
val_evaluator = dict(
type='KittiMetric',
ann_file=data_root + 'kitti_infos_val.pkl',
metric='bbox',
pred_box_type_3d='Camera')
test_evaluator = val_evaluator
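Illustrative sketch, not part of the refactored configs themselves: the dataloader and evaluator settings above can be loaded and inspected with MMEngine's Config API. The file path below is a hypothetical example.

from mmengine.config import Config

# Load the KITTI mono3D dataset/evaluator config defined above
# (hypothetical path; adjust to where the config actually lives).
cfg = Config.fromfile('configs/_base_/datasets/kitti-mono3d.py')
print(cfg.train_dataloader.dataset.type)   # 'KittiDataset'
print(cfg.val_evaluator.type)              # 'KittiMetric'
print(cfg.val_evaluator.pred_box_type_3d)  # 'Camera'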
dataset_type = 'NuScenesMonoDataset'
dataset_type = 'NuScenesDataset'
data_root = 'data/nuscenes/'
class_names = [
'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
]
metainfo = dict(CLASSES=class_names)
# Input modality for the nuScenes dataset; this is consistent with the submission
# format, which requires the information in input_modality.
input_modality = dict(
use_lidar=False,
use_camera=True,
use_radar=False,
use_map=False,
use_external=False)
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
input_modality = dict(use_lidar=False, use_camera=True)
# file_client_args = dict(backend='disk')
# Uncomment the following if you use Ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
file_client_args = dict(
backend='petrel',
path_mapping=dict({
'./data/nuscenes/':
's3://openmmlab/datasets/detection3d/nuscenes/',
'data/nuscenes/':
's3://openmmlab/datasets/detection3d/nuscenes/'
}))
train_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(type='LoadImageFromFileMono3D', file_client_args=file_client_args),
dict(
type='LoadAnnotations3D',
with_bbox=True,
......@@ -26,75 +34,77 @@ train_pipeline = [
with_bbox_depth=True),
dict(type='Resize', img_scale=(1600, 900), keep_ratio=True),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
type='Pack3DDetInputs',
keys=[
'img', 'gt_bboxes', 'gt_labels', 'attr_labels', 'gt_bboxes_3d',
'gt_labels_3d', 'centers2d', 'depths'
'gt_labels_3d', 'centers_2d', 'depths'
]),
]
test_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='MultiScaleFlipAug',
scale_factor=1.0,
flip=False,
transforms=[
dict(type='RandomFlip3D'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['img']),
])
]
# construct a pipeline for data and gt loading in show function
# please keep its loading function consistent with test_pipeline (e.g. client)
eval_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['img'])
dict(type='LoadImageFromFileMono3D', file_client_args=file_client_args),
dict(type='mmdet.Resize', scale=(1600, 900), keep_ratio=True),
dict(type='Pack3DDetInputs', keys=['img'])
]
data = dict(
samples_per_gpu=2,
workers_per_gpu=2,
train=dict(
train_dataloader = dict(
batch_size=2,
num_workers=2,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_train_mono3d.coco.json',
img_prefix=data_root,
classes=class_names,
data_prefix=dict(
pts='',
CAM_FRONT='samples/CAM_FRONT',
CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT',
CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT',
CAM_BACK='samples/CAM_BACK',
CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
CAM_BACK_LEFT='samples/CAM_BACK_LEFT'),
ann_file='nuscenes_infos_train.pkl',
task='mono3d',
pipeline=train_pipeline,
metainfo=metainfo,
modality=input_modality,
test_mode=False,
box_type_3d='Camera'),
val=dict(
# we use box_type_3d='Camera' in monocular 3d
# detection task
box_type_3d='Camera',
use_valid_flag=True))
val_dataloader = dict(
batch_size=1,
num_workers=2,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_val_mono3d.coco.json',
img_prefix=data_root,
classes=class_names,
data_prefix=dict(
pts='',
CAM_FRONT='samples/CAM_FRONT',
CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT',
CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT',
CAM_BACK='samples/CAM_BACK',
CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
CAM_BACK_LEFT='samples/CAM_BACK_LEFT'),
ann_file='nuscenes_infos_val.pkl',
task='mono3d',
pipeline=test_pipeline,
modality=input_modality,
metainfo=metainfo,
test_mode=True,
box_type_3d='Camera'),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_val_mono3d.coco.json',
img_prefix=data_root,
classes=class_names,
pipeline=test_pipeline,
modality=input_modality,
test_mode=True,
box_type_3d='Camera'))
evaluation = dict(interval=2)
box_type_3d='Camera',
use_valid_flag=True))
test_dataloader = val_dataloader
val_evaluator = dict(
type='NuScenesMetric',
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_val.pkl',
metric='bbox')
test_evaluator = val_evaluator
# model settings
model = dict(
type='FCOSMono3D',
data_preprocessor=dict(
type='Det3DDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True,
pad_size_divisor=32),
backbone=dict(
type='ResNet',
type='mmdet.ResNet',
depth=101,
num_stages=4,
out_indices=(0, 1, 2, 3),
......@@ -13,7 +20,7 @@ model = dict(
type='Pretrained',
checkpoint='open-mmlab://detectron2/resnet101_caffe')),
neck=dict(
type='FPN',
type='mmdet.FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
start_level=1,
......@@ -45,18 +52,19 @@ model = dict(
dir_branch=(256, ),
attr_branch=(256, ),
loss_cls=dict(
type='FocalLoss',
type='mmdet.FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_bbox=dict(
type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
loss_attr=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
loss_centerness=dict(
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
type='mmdet.CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
bbox_coder=dict(type='FCOS3DBBoxCoder', code_size=9),
norm_on_bbox=True,
centerness_on_reg=True,
......
......@@ -28,18 +28,19 @@ model = dict(
dir_branch=(256, ),
attr_branch=(256, ),
loss_cls=dict(
type='FocalLoss',
type='mmdet.FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_bbox=dict(
type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
loss_attr=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
loss_centerness=dict(
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
type='mmdet.CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
norm_on_bbox=True,
centerness_on_reg=True,
center_sampling=True,
......
# model settings
model = dict(
type='SMOKEMono3D',
data_preprocessor=dict(
type='Det3DDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True,
pad_size_divisor=32),
backbone=dict(
type='DLANet',
depth=34,
......@@ -42,10 +49,11 @@ model = dict(
base_dims=((0.88, 1.73, 0.67), (1.78, 1.70, 0.58), (3.88, 1.63,
1.53)),
code_size=7),
loss_cls=dict(type='GaussianFocalLoss', loss_weight=1.0),
loss_bbox=dict(type='L1Loss', reduction='sum', loss_weight=1 / 300),
loss_cls=dict(type='mmdet.GaussianFocalLoss', loss_weight=1.0),
loss_bbox=dict(
type='mmdet.L1Loss', reduction='sum', loss_weight=1 / 300),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
loss_attr=None,
conv_bias=True,
dcn_on_last_conv=False),
......
# training schedule for 1x
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=1)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
# learning rate
param_scheduler = [
dict(
type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
dict(
type='MultiStepLR',
begin=0,
end=12,
by_epoch=True,
milestones=[8, 11],
gamma=0.1)
]
# optimizer
optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[8, 11])
runner = dict(type='EpochBasedRunner', max_epochs=12)
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001))
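Illustrative sketch, not part of the schedule config itself: the optim_wrapper plus param_scheduler pair above reproduces the old warmup-and-step lr_config, i.e. a linear warmup over the first 500 iterations followed by a 10x decay at epochs 8 and 11. A rough calculation of the effective learning rate, assuming 1000 iterations per epoch purely for illustration:

base_lr = 0.02  # from the SGD optimizer above

def lr_at(iteration, iters_per_epoch=1000):
    # LinearLR: factor ramps from start_factor (0.001) to 1.0 over 500 iterations
    if iteration < 500:
        factor = 0.001 + (1.0 - 0.001) * iteration / 500
    else:
        factor = 1.0
    # MultiStepLR: multiply by gamma (0.1) for every milestone epoch already passed
    epoch = iteration // iters_per_epoch
    factor *= 0.1 ** sum(epoch >= m for m in (8, 11))
    return base_lr * factor

print(lr_at(0), lr_at(500), lr_at(8000), lr_at(11000))
# ~2e-05    0.02        0.002       0.0002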
......@@ -4,18 +4,31 @@ _base_ = [
]
# model settings
model = dict(
data_preprocessor=dict(
type='Det3DDataPreprocessor',
mean=[103.530, 116.280, 123.675],
std=[1.0, 1.0, 1.0],
bgr_to_rgb=False,
pad_size_divisor=32),
backbone=dict(
dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
stage_with_dcn=(False, False, True, True)))
class_names = [
'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
]
img_norm_cfg = dict(
mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
# file_client_args = dict(backend='disk')
# Uncomment the following if you use Ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
file_client_args = dict(
backend='petrel',
path_mapping=dict({
'./data/nuscenes/':
's3://openmmlab/datasets/detection3d/nuscenes/',
'data/nuscenes/':
's3://openmmlab/datasets/detection3d/nuscenes/'
}))
train_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(type='LoadImageFromFileMono3D', file_client_args=file_client_args),
dict(
type='LoadAnnotations3D',
with_bbox=True,
......@@ -24,52 +37,47 @@ train_pipeline = [
with_bbox_3d=True,
with_label_3d=True,
with_bbox_depth=True),
dict(type='Resize', img_scale=(1600, 900), keep_ratio=True),
dict(type='mmdet.Resize', scale=(1600, 900), keep_ratio=True),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
type='Pack3DDetInputs',
keys=[
'img', 'gt_bboxes', 'gt_labels', 'attr_labels', 'gt_bboxes_3d',
'gt_labels_3d', 'centers2d', 'depths'
'gt_labels_3d', 'centers_2d', 'depths'
]),
]
test_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='MultiScaleFlipAug',
scale_factor=1.0,
flip=False,
transforms=[
dict(type='RandomFlip3D'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['img']),
])
dict(type='LoadImageFromFileMono3D', file_client_args=file_client_args),
dict(type='mmdet.Resize', scale_factor=1.0),
dict(type='Pack3DDetInputs', keys=['img'])
]
data = dict(
samples_per_gpu=2,
workers_per_gpu=2,
train=dict(pipeline=train_pipeline),
val=dict(pipeline=test_pipeline),
test=dict(pipeline=test_pipeline))
train_dataloader = dict(
batch_size=2,
num_workers=2,
dataset=dict(dataset=dict(pipeline=train_pipeline)))
test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
# optimizer
optimizer = dict(
lr=0.002, paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.))
optimizer_config = dict(
_delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=1.0 / 3,
step=[8, 11])
total_epochs = 12
evaluation = dict(interval=2)
optim_wrapper = dict(
optimizer=dict(lr=0.002),
paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.),
clip_grad=dict(max_norm=35, norm_type=2))
# learning rate
param_scheduler = [
dict(
type='LinearLR',
start_factor=1.0 / 3,
by_epoch=False,
begin=0,
end=500),
dict(
type='MultiStepLR',
begin=0,
end=12,
by_epoch=True,
milestones=[8, 11],
gamma=0.1)
]
......@@ -4,6 +4,12 @@ _base_ = [
]
# model settings
model = dict(
data_preprocessor=dict(
type='Det3DDataPreprocessor',
mean=[103.530, 116.280, 123.675],
std=[1.0, 1.0, 1.0],
bgr_to_rgb=False,
pad_size_divisor=32),
backbone=dict(frozen_stages=0),
neck=dict(start_level=0, num_outs=4),
bbox_head=dict(
......@@ -27,16 +33,17 @@ model = dict(
),
centerness_branch=(256, ),
loss_cls=dict(
type='FocalLoss',
type='mmdet.FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_bbox=dict(
type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
loss_centerness=dict(
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
type='mmdet.CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
use_depth_classifier=True,
depth_branch=(256, ),
depth_range=(0, 70),
......@@ -61,11 +68,21 @@ model = dict(
]),
test_cfg=dict(nms_pre=100, nms_thr=0.05, score_thr=0.001, max_per_img=20))
class_names = ['Pedestrian', 'Cyclist', 'Car']
img_norm_cfg = dict(
mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
# file_client_args = dict(backend='disk')
# Uncomment the following if you use Ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
file_client_args = dict(
backend='petrel',
path_mapping=dict({
'./data/kitti/':
's3://openmmlab/datasets/detection3d/kitti/',
'data/kitti/':
's3://openmmlab/datasets/detection3d/kitti/'
}))
train_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(type='LoadImageFromFileMono3D', file_client_args=file_client_args),
dict(
type='LoadAnnotations3D',
with_bbox=True,
......@@ -74,54 +91,47 @@ train_pipeline = [
with_bbox_3d=True,
with_label_3d=True,
with_bbox_depth=True),
dict(type='Resize', img_scale=(1242, 375), keep_ratio=True),
dict(type='mmdet.Resize', scale=(1242, 375), keep_ratio=True),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
type='Pack3DDetInputs',
keys=[
'img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_3d', 'gt_labels_3d',
'centers2d', 'depths'
'centers_2d', 'depths'
]),
]
test_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='MultiScaleFlipAug',
scale_factor=1.0,
flip=False,
transforms=[
dict(type='RandomFlip3D'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['img']),
])
dict(type='LoadImageFromFileMono3D', file_client_args=file_client_args),
dict(type='mmdet.Resize', scale_factor=1.0),
dict(type='Pack3DDetInputs', keys=['img'])
]
data = dict(
samples_per_gpu=3,
workers_per_gpu=3,
train=dict(pipeline=train_pipeline),
val=dict(pipeline=test_pipeline),
test=dict(pipeline=test_pipeline))
train_dataloader = dict(
batch_size=3, num_workers=3, dataset=dict(pipeline=train_pipeline))
test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
# optimizer
optimizer = dict(
lr=0.001, paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.))
optimizer_config = dict(
_delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=1.0 / 3,
step=[32, 44])
total_epochs = 48
runner = dict(type='EpochBasedRunner', max_epochs=48)
evaluation = dict(interval=2)
checkpoint_config = dict(interval=8)
optim_wrapper = dict(
optimizer=dict(lr=0.01),
paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.),
clip_grad=dict(max_norm=35, norm_type=2))
# learning rate
param_scheduler = [
dict(
type='LinearLR',
start_factor=1.0 / 3,
by_epoch=False,
begin=0,
end=500),
dict(
type='MultiStepLR',
begin=0,
end=48,
by_epoch=True,
milestones=[32, 44],
gamma=0.1)
]
train_cfg = dict(max_epochs=48)
......@@ -3,21 +3,21 @@ _base_ = [
'../_base_/default_runtime.py'
]
# optimizer
optimizer = dict(type='Adam', lr=2.5e-4)
optimizer_config = dict(grad_clip=None)
lr_config = dict(policy='step', warmup=None, step=[50])
# runtime settings
runner = dict(type='EpochBasedRunner', max_epochs=72)
log_config = dict(interval=10)
# file_client_args = dict(backend='disk')
# Uncomment the following if you use Ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
file_client_args = dict(
backend='petrel',
path_mapping=dict({
'./data/kitti/':
's3://openmmlab/datasets/detection3d/kitti/',
'data/kitti/':
's3://openmmlab/datasets/detection3d/kitti/'
}))
find_unused_parameters = True
class_names = ['Pedestrian', 'Cyclist', 'Car']
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(type='LoadImageFromFileMono3D', file_client_args=file_client_args),
dict(
type='LoadAnnotations3D',
with_bbox=True,
......@@ -29,36 +29,42 @@ train_pipeline = [
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(type='RandomShiftScale', shift_scale=(0.2, 0.4), aug_prob=0.3),
dict(type='AffineResize', img_scale=(1280, 384), down_ratio=4),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
type='Pack3DDetInputs',
keys=[
'img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_3d', 'gt_labels_3d',
'centers2d', 'depths'
'centers_2d', 'depths'
]),
]
test_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(type='LoadImageFromFileMono3D', file_client_args=file_client_args),
dict(type='AffineResize', img_scale=(1280, 384), down_ratio=4),
dict(type='Pack3DDetInputs', keys=['img'])
]
train_dataloader = dict(
batch_size=8, num_workers=4, dataset=dict(pipeline=train_pipeline))
test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
# training schedule for 1x
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=1)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
# learning rate
param_scheduler = [
dict(
type='MultiScaleFlipAug',
img_scale=(1280, 384),
flip=False,
transforms=[
dict(type='AffineResize', img_scale=(1280, 384), down_ratio=4),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['img']),
])
type='MultiStepLR',
begin=0,
end=12,
by_epoch=True,
milestones=[8, 11],
gamma=0.1)
]
data = dict(
samples_per_gpu=8,
workers_per_gpu=4,
train=dict(pipeline=train_pipeline),
val=dict(pipeline=test_pipeline),
test=dict(pipeline=test_pipeline))
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='Adam', lr=2.5e-4),
clip_grad=None)
# Copyright (c) OpenMMLab. All rights reserved.
from .builder import DATASETS, PIPELINES, build_dataset
from .convert_utils import get_2d_boxes
from .dataset_wrappers import CBGSDataset
from .det3d_dataset import Det3DDataset
from .kitti_dataset import KittiDataset
......@@ -41,5 +42,5 @@ __all__ = [
'LoadPointsFromMultiSweeps', 'WaymoDataset', 'BackgroundPointsFilter',
'VoxelBasedPointSampler', 'get_loading_pipeline', 'RandomDropPointsColor',
'RandomJitterPoints', 'ObjectNameFilter', 'AffineResize',
'RandomShiftScale', 'LoadPointsFromDict', 'PIPELINES'
'RandomShiftScale', 'LoadPointsFromDict', 'PIPELINES', 'get_2d_boxes'
]
# Copyright (c) OpenMMLab. All rights reserved.
from collections import OrderedDict
from typing import List, Tuple, Union
import numpy as np
from nuscenes.utils.geometry_utils import view_points
from pyquaternion import Quaternion
from shapely.geometry import MultiPoint, box
from mmdet3d.core.bbox import points_cam2img
nus_categories = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle',
'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone',
'barrier')
nus_attributes = ('cycle.with_rider', 'cycle.without_rider',
'pedestrian.moving', 'pedestrian.standing',
'pedestrian.sitting_lying_down', 'vehicle.moving',
'vehicle.parked', 'vehicle.stopped', 'None')
NameMapping = {
'movable_object.barrier': 'barrier',
'vehicle.bicycle': 'bicycle',
'vehicle.bus.bendy': 'bus',
'vehicle.bus.rigid': 'bus',
'vehicle.car': 'car',
'vehicle.construction': 'construction_vehicle',
'vehicle.motorcycle': 'motorcycle',
'human.pedestrian.adult': 'pedestrian',
'human.pedestrian.child': 'pedestrian',
'human.pedestrian.construction_worker': 'pedestrian',
'human.pedestrian.police_officer': 'pedestrian',
'movable_object.trafficcone': 'traffic_cone',
'vehicle.trailer': 'trailer',
'vehicle.truck': 'truck'
}
def get_2d_boxes(nusc, sample_data_token: str, visibilities: List[str]):
"""Get the 2D annotation records for a given `sample_data_token`.
Args:
nusc (:obj:`NuScenes`): NuScenes devkit instance.
sample_data_token (str): Sample data token belonging to a camera
keyframe.
visibilities (list[str]): Visibility filter.
Returns:
list[dict]: List of 2D annotation records that belong to the input
`sample_data_token`.
"""
# Get the sample data and the sample corresponding to that sample data.
sd_rec = nusc.get('sample_data', sample_data_token)
assert sd_rec[
'sensor_modality'] == 'camera', 'Error: get_2d_boxes only works' \
' for camera sample_data!'
if not sd_rec['is_key_frame']:
raise ValueError(
'The 2D re-projections are available only for keyframes.')
s_rec = nusc.get('sample', sd_rec['sample_token'])
# Get the calibrated sensor and ego pose
# record to get the transformation matrices.
cs_rec = nusc.get('calibrated_sensor', sd_rec['calibrated_sensor_token'])
pose_rec = nusc.get('ego_pose', sd_rec['ego_pose_token'])
camera_intrinsic = np.array(cs_rec['camera_intrinsic'])
# Get all the annotations with the specified visibilities.
ann_recs = [
nusc.get('sample_annotation', token) for token in s_rec['anns']
]
ann_recs = [
ann_rec for ann_rec in ann_recs
if (ann_rec['visibility_token'] in visibilities)
]
repro_recs = []
for ann_rec in ann_recs:
# Augment sample_annotation with token information.
ann_rec['sample_annotation_token'] = ann_rec['token']
ann_rec['sample_data_token'] = sample_data_token
# Get the box in global coordinates.
box = nusc.get_box(ann_rec['token'])
# Move them to the ego-pose frame.
box.translate(-np.array(pose_rec['translation']))
box.rotate(Quaternion(pose_rec['rotation']).inverse)
# Move them to the calibrated sensor frame.
box.translate(-np.array(cs_rec['translation']))
box.rotate(Quaternion(cs_rec['rotation']).inverse)
# Filter out the corners that are not in front of the calibrated
# sensor.
corners_3d = box.corners()
in_front = np.argwhere(corners_3d[2, :] > 0).flatten()
corners_3d = corners_3d[:, in_front]
# Project 3d box to 2d.
corner_coords = view_points(corners_3d, camera_intrinsic,
True).T[:, :2].tolist()
# Keep only corners that fall within the image.
final_coords = post_process_coords(corner_coords)
# Skip if the convex hull of the re-projected corners
# does not intersect the image canvas.
if final_coords is None:
continue
else:
min_x, min_y, max_x, max_y = final_coords
# Generate dictionary record to be included in the .json file.
repro_rec = generate_record(ann_rec, min_x, min_y, max_x, max_y,
sample_data_token, sd_rec['filename'])
# if repro_rec is None, we do not append it to repro_recs
if repro_rec is not None:
loc = box.center.tolist()
dim = box.wlh
dim[[0, 1, 2]] = dim[[1, 2, 0]] # convert wlh to our lhw
dim = dim.tolist()
rot = box.orientation.yaw_pitch_roll[0]
rot = [-rot] # convert the rot to our cam coordinate
global_velo2d = nusc.box_velocity(box.token)[:2]
global_velo3d = np.array([*global_velo2d, 0.0])
e2g_r_mat = Quaternion(pose_rec['rotation']).rotation_matrix
c2e_r_mat = Quaternion(cs_rec['rotation']).rotation_matrix
cam_velo3d = global_velo3d @ np.linalg.inv(
e2g_r_mat).T @ np.linalg.inv(c2e_r_mat).T
velo = cam_velo3d[0::2].tolist()
repro_rec['bbox_3d'] = loc + dim + rot
repro_rec['velocity'] = velo
center_3d = np.array(loc).reshape([1, 3])
center_2d_with_depth = points_cam2img(
center_3d, camera_intrinsic, with_depth=True)
center_2d_with_depth = center_2d_with_depth.squeeze().tolist()
repro_rec['center_2d'] = center_2d_with_depth[:2]
repro_rec['depth'] = center_2d_with_depth[2]
# projected 2D center + depth
# samples with depth <= 0 are removed
if repro_rec['depth'] <= 0:
continue
ann_token = nusc.get('sample_annotation',
box.token)['attribute_tokens']
if len(ann_token) == 0:
attr_name = 'None'
else:
attr_name = nusc.get('attribute', ann_token[0])['name']
attr_id = nus_attributes.index(attr_name)
# repro_rec['attribute_name'] = attr_name
repro_rec['attr_label'] = attr_id
repro_recs.append(repro_rec)
return repro_recs
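Illustrative sketch of how get_2d_boxes might be called with the nuScenes devkit; the dataset version, data root and visibility filter below are assumptions made for the example only.

from nuscenes.nuscenes import NuScenes

nusc = NuScenes(version='v1.0-mini', dataroot='data/nuscenes', verbose=False)
# sample_data token of the front camera of the first sample (example only)
cam_token = nusc.sample[0]['data']['CAM_FRONT']
recs = get_2d_boxes(nusc, cam_token, visibilities=['2', '3', '4'])
print(len(recs))  # number of visible annotations that survive projection
if recs:
    print(recs[0]['bbox'], recs[0]['bbox_label'], recs[0]['depth'])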
def post_process_coords(
corner_coords: List, imsize: Tuple[int, int] = (1600, 900)
) -> Union[Tuple[float, float, float, float], None]:
"""Get the intersection of the convex hull of the reprojected bbox corners
and the image canvas, return None if no intersection.
Args:
corner_coords (list[list[float]]): Corner coordinates of the reprojected
bounding box.
imsize (tuple[int]): Size of the image canvas.
Returns:
tuple[float] or None: Intersection of the convex hull of the 2D box
corners and the image canvas, or None if they do not intersect.
"""
polygon_from_2d_box = MultiPoint(corner_coords).convex_hull
img_canvas = box(0, 0, imsize[0], imsize[1])
if polygon_from_2d_box.intersects(img_canvas):
img_intersection = polygon_from_2d_box.intersection(img_canvas)
intersection_coords = np.array(
[coord for coord in img_intersection.exterior.coords])
min_x = min(intersection_coords[:, 0])
min_y = min(intersection_coords[:, 1])
max_x = max(intersection_coords[:, 0])
max_y = max(intersection_coords[:, 1])
return min_x, min_y, max_x, max_y
else:
return None
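Illustrative sketch, not part of the converter itself: post_process_coords clips the convex hull of the projected corners against the image canvas and returns None when they do not overlap.

# Corners partly outside a 1600x900 canvas: the returned box is clipped at 0
corner_coords = [[-50.0, 100.0], [300.0, -40.0], [350.0, 500.0], [-20.0, 450.0]]
print(post_process_coords(corner_coords))  # approximately (0.0, 0.0, 350.0, 500.0)

# A hull lying entirely outside the canvas yields None, so get_2d_boxes skips
# the corresponding annotation
print(post_process_coords([[-100.0, -100.0], [-50.0, -80.0], [-60.0, -10.0]]))  # None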
def generate_record(ann_rec: dict, x1: float, y1: float, x2: float, y2: float,
sample_data_token: str, filename: str) -> OrderedDict:
"""Generate one 2D annotation record given various information on top of
the 2D bounding box coordinates.
Args:
ann_rec (dict): Original 3d annotation record.
x1 (float): Minimum value of the x coordinate.
y1 (float): Minimum value of the y coordinate.
x2 (float): Maximum value of the x coordinate.
y2 (float): Maximum value of the y coordinate.
sample_data_token (str): Sample data token.
filename (str): The corresponding image file where the annotation
is present.
Returns:
dict: A sample mono3D annotation record.
- bbox_label (int): 2d box label id
- bbox_label_3d (int): 3d box label id
- bbox (list[float]): left x, top y, right x, bottom y
of 2d box
- bbox_3d_isvalid (bool): whether the box is valid
"""
repro_rec = OrderedDict()
repro_rec['sample_data_token'] = sample_data_token
coco_rec = dict()
relevant_keys = [
'attribute_tokens',
'category_name',
'instance_token',
'next',
'num_lidar_pts',
'num_radar_pts',
'prev',
'sample_annotation_token',
'sample_data_token',
'visibility_token',
]
for key, value in ann_rec.items():
if key in relevant_keys:
repro_rec[key] = value
repro_rec['bbox_corners'] = [x1, y1, x2, y2]
repro_rec['filename'] = filename
if repro_rec['category_name'] not in NameMapping:
return None
cat_name = NameMapping[repro_rec['category_name']]
coco_rec['bbox_label'] = nus_categories.index(cat_name)
coco_rec['bbox_label_3d'] = nus_categories.index(cat_name)
coco_rec['bbox'] = [x1, y1, x2, y2]
coco_rec['bbox_3d_isvalid'] = True
return coco_rec
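Illustrative sketch with hypothetical values: generate_record keeps only categories covered by NameMapping and maps them to indices of nus_categories.

ann = {
    'category_name': 'vehicle.car',
    'visibility_token': '4',
    'num_lidar_pts': 30,
    'num_radar_pts': 2,
}
rec = generate_record(ann, 100.0, 200.0, 400.0, 350.0,
                      'fake_sample_data_token', 'samples/CAM_FRONT/img.jpg')
# rec == {'bbox_label': 0, 'bbox_label_3d': 0,
#         'bbox': [100.0, 200.0, 400.0, 350.0], 'bbox_3d_isvalid': True}
# ('car' is index 0 in nus_categories)

# Unmapped categories (e.g. 'animal') return None and are dropped by the caller
assert generate_record({'category_name': 'animal'}, 0, 0, 1, 1, 't', 'f') is None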
......@@ -197,6 +197,7 @@ class Det3DDataset(BaseDataset):
ann_info = dict()
for ann_name in keys:
temp_anns = [item[ann_name] for item in instances]
# map the original dataset label to training label
if 'label' in ann_name:
temp_anns = [
self.label_mapping[item] for item in temp_anns
......
# Copyright (c) OpenMMLab. All rights reserved.
from os import path as osp
from typing import Dict, List
import numpy as np
from mmdet3d.core.bbox.structures.cam_box3d import CameraInstance3DBoxes
from mmdet3d.registry import DATASETS
from ..core.bbox import LiDARInstance3DBoxes
from .det3d_dataset import Det3DDataset
......@@ -53,6 +55,7 @@ class NuScenesDataset(Det3DDataset):
def __init__(self,
data_root: str,
ann_file: str,
task: str = '3d',
pipeline: List[dict] = None,
box_type_3d: str = 'LiDAR',
modality: Dict = dict(
......@@ -66,7 +69,12 @@ class NuScenesDataset(Det3DDataset):
**kwargs):
self.use_valid_flag = use_valid_flag
self.with_velocity = with_velocity
assert box_type_3d.lower() == 'lidar'
# TODO: Redesign multi-view data process in the future
assert task in ('3d', 'mono3d', 'multi-view')
self.task = task
assert box_type_3d.lower() in ('lidar', 'camera')
super().__init__(
data_root=data_root,
ann_file=ann_file,
......@@ -97,6 +105,7 @@ class NuScenesDataset(Det3DDataset):
anns_results['gt_bboxes_3d'] = np.zeros((0, 7), dtype=np.float32)
anns_results['gt_labels_3d'] = np.zeros(0, dtype=np.int64)
return anns_results
if self.use_valid_flag:
mask = ann_info['bbox_3d_isvalid']
else:
......@@ -104,6 +113,22 @@ class NuScenesDataset(Det3DDataset):
gt_bboxes_3d = ann_info['gt_bboxes_3d'][mask]
gt_labels_3d = ann_info['gt_labels_3d'][mask]
if 'gt_bboxes' in ann_info:
gt_bboxes = ann_info['gt_bboxes'][mask]
gt_labels = ann_info['gt_labels'][mask]
attr_labels = ann_info['attr_labels'][mask]
else:
gt_bboxes = np.zeros((0, 4), dtype=np.float32)
gt_labels = np.array([], dtype=np.int64)
attr_labels = np.array([], dtype=np.int64)
if 'centers_2d' in ann_info:
centers_2d = ann_info['centers_2d'][mask]
depths = ann_info['depths'][mask]
else:
centers_2d = np.zeros((0, 2), dtype=np.float32)
depths = np.zeros((0), dtype=np.float32)
if self.with_velocity:
gt_velocity = ann_info['velocity'][mask]
nan_mask = np.isnan(gt_velocity[:, 0])
......@@ -112,11 +137,82 @@ class NuScenesDataset(Det3DDataset):
# the nuscenes box center is [0.5, 0.5, 0.5], we change it to be
# the same as KITTI (0.5, 0.5, 0)
gt_bboxes_3d = LiDARInstance3DBoxes(
gt_bboxes_3d,
box_dim=gt_bboxes_3d.shape[-1],
origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d)
# TODO: Unify the coordinates
if self.task == 'mono3d':
gt_bboxes_3d = CameraInstance3DBoxes(
gt_bboxes_3d,
box_dim=gt_bboxes_3d.shape[-1],
origin=(0.5, 0.5, 0.5))
else:
gt_bboxes_3d = LiDARInstance3DBoxes(
gt_bboxes_3d,
box_dim=gt_bboxes_3d.shape[-1],
origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d)
anns_results = dict(
gt_bboxes_3d=gt_bboxes_3d, gt_labels_3d=gt_labels_3d)
gt_bboxes_3d=gt_bboxes_3d,
gt_labels_3d=gt_labels_3d,
gt_bboxes=gt_bboxes,
gt_labels=gt_labels,
attr_labels=attr_labels,
centers_2d=centers_2d,
depths=depths)
return anns_results
def parse_data_info(self, info: dict) -> dict:
"""Process the raw data info.
The only difference from `Det3DDataset` is the dedicated processing for
the `mono3d` task, which splits each frame into per-camera data infos.
Args:
info (dict): Raw info dict.
Returns:
dict or list[dict]: Has `ann_info` in the training stage, and
all paths have been converted to absolute paths.
"""
if self.task == 'mono3d':
data_list = []
if self.modality['use_lidar']:
info['lidar_points']['lidar_path'] = \
osp.join(
self.data_prefix.get('pts', ''),
info['lidar_points']['lidar_path'])
if self.modality['use_camera']:
for cam_id, img_info in info['images'].items():
if 'img_path' in img_info:
if cam_id in self.data_prefix:
cam_prefix = self.data_prefix[cam_id]
else:
cam_prefix = self.data_prefix.get('img', '')
img_info['img_path'] = osp.join(
cam_prefix, img_info['img_path'])
for idx, (cam_id, img_info) in enumerate(info['images'].items()):
camera_info = dict()
camera_info['images'] = dict()
camera_info['images'][cam_id] = img_info
if 'cam_instances' in info and cam_id in info['cam_instances']:
camera_info['instances'] = info['cam_instances'][cam_id]
else:
camera_info['instances'] = []
# TODO: check whether to change sample_idx for 6 cameras
# in one frame
camera_info['sample_idx'] = info['sample_idx'] * 6 + idx
camera_info['token'] = info['token']
camera_info['ego2global'] = info['ego2global']
if not self.test_mode:
# used in training
camera_info['ann_info'] = self.parse_ann_info(camera_info)
if self.test_mode and self.load_eval_anns:
camera_info['eval_ann_info'] = \
self.parse_ann_info(camera_info)
data_list.append(camera_info)
return data_list
else:
data_info = super().parse_data_info(info)
return data_info
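Illustrative sketch, not part of the dataset code itself: in the mono3d branch above each frame is fanned out into one data info per camera, and sample_idx is remapped so that camera k of frame n becomes n * 6 + k. NuScenesMetric later uses sample_id % 6 to regroup the six views of a frame. The camera order below is only an example; the real order follows info['images'].

frame_sample_idx = 42
cameras = ['CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_FRONT_LEFT',
           'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_BACK_RIGHT']
for idx, cam_id in enumerate(cameras):
    print(cam_id, frame_sample_idx * 6 + idx)  # 252, 253, ..., 257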
......@@ -122,7 +122,7 @@ class Pack3DDetInputs(BaseTransform):
for key in [
'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels',
'gt_labels_3d', 'attr_labels', 'pts_instance_mask',
'pts_semantic_mask', 'centers2d', 'depths'
'pts_semantic_mask', 'centers_2d', 'depths'
]:
if key not in results:
continue
......
......@@ -86,7 +86,7 @@ class LoadImageFromFileMono3D(LoadImageFromFile):
:class:`LoadImageFromFile`.
"""
def __call__(self, results):
def transform(self, results: dict) -> dict:
"""Call functions to load image and get image meta information.
Args:
......@@ -95,8 +95,32 @@ class LoadImageFromFileMono3D(LoadImageFromFile):
Returns:
dict: The dict contains loaded image and meta information.
"""
super().__call__(results)
results['cam2img'] = results['img_info']['cam_intrinsic']
# TODO: load different camera image from data info,
# for kitti dataset, we load 'CAM2' image.
# for nuscenes dataset, we load 'CAM_FRONT' image.
if 'CAM2' in results['images']:
filename = results['images']['CAM2']['img_path']
results['cam2img'] = results['images']['CAM2']['cam2img']
elif len(list(results['images'].keys())) == 1:
camera_type = list(results['images'].keys())[0]
filename = results['images'][camera_type]['img_path']
results['cam2img'] = results['images'][camera_type]['cam2img']
else:
raise NotImplementedError(
'Currently we only support loading images from the KITTI '
'and nuScenes datasets')
img_bytes = self.file_client.get(filename)
img = mmcv.imfrombytes(
img_bytes, flag=self.color_type, backend=self.imdecode_backend)
if self.to_float32:
img = img.astype(np.float32)
results['img'] = img
results['img_shape'] = img.shape[:2]
results['ori_shape'] = img.shape[:2]
return results
......@@ -608,6 +632,34 @@ class LoadAnnotations3D(LoadAnnotations):
self.with_seg_3d = with_seg_3d
self.seg_3d_dtype = seg_3d_dtype
def _load_bboxes(self, results: dict) -> None:
"""Private function to load bounding box annotations.
Rewrite `_load_bboxes` since mmdet3d uses `parse_ann_info` in
datasets.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict contains loaded bounding box annotations.
"""
results['gt_bboxes'] = results['ann_info']['gt_bboxes']
def _load_labels(self, results: dict) -> None:
"""Private function to load label annotations.
Rewrite `_load_labels` since mmdet3d uses `parse_ann_info` in
datasets.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict contains loaded label annotations.
"""
results['gt_labels'] = results['ann_info']['gt_labels']
def _load_bboxes_3d(self, results: dict) -> dict:
"""Private function to move the 3D bounding box annotation from
`ann_info` field to the root of `results`.
......
......@@ -1579,7 +1579,7 @@ class VoxelBasedPointSampler(object):
@TRANSFORMS.register_module()
class AffineResize(object):
class AffineResize(BaseTransform):
"""Get the affine transform matrices to the target size.
Different from :class:`RandomAffine` in MMDetection, this class can
......@@ -1596,13 +1596,16 @@ class AffineResize(object):
outside the border of the image. Defaults to True.
"""
def __init__(self, img_scale, down_ratio, bbox_clip_border=True):
def __init__(self,
img_scale: Tuple,
down_ratio: int,
bbox_clip_border: bool = True) -> None:
self.img_scale = img_scale
self.down_ratio = down_ratio
self.bbox_clip_border = bbox_clip_border
def __call__(self, results):
def transform(self, results: dict) -> dict:
"""Call function to do affine transform to input image and labels.
Args:
......@@ -1647,39 +1650,38 @@ class AffineResize(object):
results['pad_shape'] = img.shape
results['trans_mat'] = trans_mat
self._affine_bboxes(results, trans_affine)
if 'gt_bboxes' in results:
self._affine_bboxes(results, trans_affine)
if 'centers2d' in results:
centers2d = self._affine_transform(results['centers2d'],
if 'centers_2d' in results:
centers2d = self._affine_transform(results['centers_2d'],
trans_affine)
valid_index = (centers2d[:, 0] >
0) & (centers2d[:, 0] <
self.img_scale[0]) & (centers2d[:, 1] > 0) & (
centers2d[:, 1] < self.img_scale[1])
results['centers2d'] = centers2d[valid_index]
for key in results.get('bbox_fields', []):
if key in ['gt_bboxes']:
results[key] = results[key][valid_index]
if 'gt_labels' in results:
results['gt_labels'] = results['gt_labels'][
valid_index]
if 'gt_masks' in results:
raise NotImplementedError(
'AffineResize only supports bbox.')
for key in results.get('bbox3d_fields', []):
if key in ['gt_bboxes_3d']:
results[key].tensor = results[key].tensor[valid_index]
if 'gt_labels_3d' in results:
results['gt_labels_3d'] = results['gt_labels_3d'][
valid_index]
results['centers_2d'] = centers2d[valid_index]
if 'gt_bboxes' in results:
results['gt_bboxes'] = results['gt_bboxes'][valid_index]
if 'gt_labels' in results:
results['gt_labels'] = results['gt_labels'][valid_index]
if 'gt_masks' in results:
raise NotImplementedError(
'AffineResize only supports bbox.')
if 'gt_bboxes_3d' in results:
results['gt_bboxes_3d'].tensor = results[
'gt_bboxes_3d'].tensor[valid_index]
if 'gt_labels_3d' in results:
results['gt_labels_3d'] = results['gt_labels_3d'][
valid_index]
results['depths'] = results['depths'][valid_index]
return results
def _affine_bboxes(self, results, matrix):
def _affine_bboxes(self, results: dict, matrix: np.ndarray) -> None:
"""Affine transform bboxes to input image.
Args:
......@@ -1689,20 +1691,18 @@ class AffineResize(object):
shape: (3, 3)
"""
for key in results.get('bbox_fields', []):
bboxes = results[key]
bboxes[:, :2] = self._affine_transform(bboxes[:, :2], matrix)
bboxes[:, 2:] = self._affine_transform(bboxes[:, 2:], matrix)
if self.bbox_clip_border:
bboxes[:,
[0, 2]] = bboxes[:,
[0, 2]].clip(0, self.img_scale[0] - 1)
bboxes[:,
[1, 3]] = bboxes[:,
[1, 3]].clip(0, self.img_scale[1] - 1)
results[key] = bboxes
def _affine_transform(self, points, matrix):
bboxes = results['gt_bboxes']
bboxes[:, :2] = self._affine_transform(bboxes[:, :2], matrix)
bboxes[:, 2:] = self._affine_transform(bboxes[:, 2:], matrix)
if self.bbox_clip_border:
bboxes[:, [0, 2]] = bboxes[:, [0, 2]].clip(0,
self.img_scale[0] - 1)
bboxes[:, [1, 3]] = bboxes[:, [1, 3]].clip(0,
self.img_scale[1] - 1)
results['gt_bboxes'] = bboxes
def _affine_transform(self, points: np.ndarray,
matrix: np.ndarray) -> np.ndarray:
"""Affine transform bbox points to input image.
Args:
......@@ -1721,7 +1721,8 @@ class AffineResize(object):
affined_points = np.matmul(matrix, hom_points_2d).T
return affined_points[:, :2]
def _get_transform_matrix(self, center, scale, output_scale):
def _get_transform_matrix(self, center: Tuple, scale: Tuple,
output_scale: Tuple[float]) -> np.ndarray:
"""Get affine transform matrix.
Args:
......@@ -1756,7 +1757,8 @@ class AffineResize(object):
return matrix.astype(np.float32)
def _get_ref_point(self, ref_point1, ref_point2):
def _get_ref_point(self, ref_point1: np.ndarray,
ref_point2: np.ndarray) -> np.ndarray:
"""Get reference point to calculate affine transform matrix.
While using opencv to calculate the affine matrix, we need at least
......@@ -1775,7 +1777,7 @@ class AffineResize(object):
@TRANSFORMS.register_module()
class RandomShiftScale(object):
class RandomShiftScale(BaseTransform):
"""Random shift scale.
Different from the normal shift and scale function, it doesn't
......@@ -1788,12 +1790,12 @@ class RandomShiftScale(object):
aug_prob (float): The shifting and scaling probability.
"""
def __init__(self, shift_scale, aug_prob):
def __init__(self, shift_scale: Tuple[float], aug_prob: float):
self.shift_scale = shift_scale
self.aug_prob = aug_prob
def __call__(self, results):
def transform(self, results: dict) -> dict:
"""Call function to record random shift and scale infos.
Args:
......
......@@ -45,6 +45,7 @@ class KittiMetric(BaseMetric):
def __init__(self,
ann_file: str,
metric: Union[str, List[str]] = 'bbox',
pred_box_type_3d: str = 'LiDAR',
pcd_limit_range: List[float] = [0, -40, -3, 70.4, 40, 0.0],
prefix: Optional[str] = None,
pklfile_prefix: str = None,
......@@ -57,6 +58,7 @@ class KittiMetric(BaseMetric):
self.ann_file = ann_file
self.pklfile_prefix = pklfile_prefix
self.submission_prefix = submission_prefix
self.pred_box_type_3d = pred_box_type_3d
allowed_metrics = ['bbox', 'img_bbox', 'mAP']
self.metrics = metric if isinstance(metric, list) else [metric]
......
......@@ -7,12 +7,15 @@ from typing import Dict, List, Optional, Sequence, Tuple, Union
import mmcv
import numpy as np
import pyquaternion
import torch
from mmengine.evaluator import BaseMetric
from mmengine.logging import MMLogger
from nuscenes.eval.detection.config import config_factory
from nuscenes.eval.detection.data_classes import DetectionConfig
from nuscenes.utils.data_classes import Box as NuScenesBox
from mmdet3d.core import bbox3d2result, box3d_multiclass_nms, xywhr2xyxyr
from mmdet3d.core.bbox import CameraInstance3DBoxes, LiDARInstance3DBoxes
from mmdet3d.registry import METRICS
......@@ -288,21 +291,144 @@ class NuScenesMetric(BaseMetric):
for name in results[0]:
if 'pred' in name and '3d' in name and name[0] != '_':
# format result of model output in Det3dDataSample,
# include 'pred_instances_3d','pts_pred_instances_3d',
# 'img_pred_instances_3d'
print(f'\nFormatting bboxes of {name}')
results_ = [out[name] for out in results]
tmp_file_ = osp.join(jsonfile_prefix, name)
result_dict[name] = self._format_bbox(results_, sample_id_list,
classes, tmp_file_)
box_type_3d = type(results_[0]['bboxes_3d'])
if box_type_3d == LiDARInstance3DBoxes:
result_dict[name] = self._format_lidar_bbox(
results_, sample_id_list, classes, tmp_file_)
elif box_type_3d == CameraInstance3DBoxes:
result_dict[name] = self._format_camera_bbox(
results_, sample_id_list, classes, tmp_file_)
return result_dict, tmp_dir
def _format_bbox(self,
results: List[dict],
sample_id_list: List[int],
classes: List[str] = None,
jsonfile_prefix: str = None) -> str:
def _format_camera_bbox(self,
results: List[dict],
sample_id_list: List[int],
classes: List[str] = None,
jsonfile_prefix: str = None) -> str:
"""Convert the results to the standard format.
Args:
results (list[dict]): Testing results of the dataset.
jsonfile_prefix (str): The prefix of the output jsonfile.
You can specify the output directory/filename by
modifying the jsonfile_prefix. Default: None.
Returns:
str: Path of the output json file.
"""
nusc_annos = {}
print('Start to convert detection format...')
# Camera types in the nuScenes dataset
camera_types = [
'CAM_FRONT',
'CAM_FRONT_RIGHT',
'CAM_FRONT_LEFT',
'CAM_BACK',
'CAM_BACK_LEFT',
'CAM_BACK_RIGHT',
]
CAM_NUM = 6
for i, det in enumerate(mmcv.track_iter_progress(results)):
sample_id = sample_id_list[i]
camera_type_id = sample_id % CAM_NUM
if camera_type_id == 0:
boxes_per_frame = []
attrs_per_frame = []
# need to merge results from images of the same sample
annos = []
boxes, attrs = output_to_nusc_box(det)
sample_token = self.data_infos[sample_id]['token']
camera_type = camera_types[camera_type_id]
boxes, attrs = cam_nusc_box_to_global(
self.data_infos[sample_id - camera_type_id], boxes, attrs,
camera_type, classes, self.eval_detection_configs)
boxes_per_frame.extend(boxes)
attrs_per_frame.extend(attrs)
# Remove redundant predictions caused by overlap of images
if (sample_id + 1) % CAM_NUM != 0:
continue
boxes = global_nusc_box_to_cam(
self.data_infos[sample_id + 1 - CAM_NUM], boxes_per_frame,
classes, self.eval_detection_configs)
cam_boxes3d, scores, labels = nusc_box_to_cam_box3d(boxes)
# box nms 3d over 6 images in a frame
# TODO: move this global setting into config
nms_cfg = dict(
use_rotate_nms=True,
nms_across_levels=False,
nms_pre=4096,
nms_thr=0.05,
score_thr=0.01,
min_bbox_size=0,
max_per_frame=500)
from mmcv import Config
nms_cfg = Config(nms_cfg)
cam_boxes3d_for_nms = xywhr2xyxyr(cam_boxes3d.bev)
boxes3d = cam_boxes3d.tensor
# generate attr scores from attr labels
attrs = labels.new_tensor([attr for attr in attrs_per_frame])
boxes3d, scores, labels, attrs = box3d_multiclass_nms(
boxes3d,
cam_boxes3d_for_nms,
scores,
nms_cfg.score_thr,
nms_cfg.max_per_frame,
nms_cfg,
mlvl_attr_scores=attrs)
cam_boxes3d = CameraInstance3DBoxes(boxes3d, box_dim=9)
det = bbox3d2result(cam_boxes3d, scores, labels, attrs)
boxes, attrs = output_to_nusc_box(det)
boxes, attrs = cam_nusc_box_to_global(
self.data_infos[sample_id + 1 - CAM_NUM], boxes, attrs,
classes, self.eval_detection_configs)
for i, box in enumerate(boxes):
name = classes[box.label]
attr = self.get_attr_name(attrs[i], name)
nusc_anno = dict(
sample_token=sample_token,
translation=box.center.tolist(),
size=box.wlh.tolist(),
rotation=box.orientation.elements.tolist(),
velocity=box.velocity[:2].tolist(),
detection_name=name,
detection_score=box.score,
attribute_name=attr)
annos.append(nusc_anno)
# other views results of the same frame should be concatenated
if sample_token in nusc_annos:
nusc_annos[sample_token].extend(annos)
else:
nusc_annos[sample_token] = annos
nusc_submissions = {
'meta': self.modality,
'results': nusc_annos,
}
mmcv.mkdir_or_exist(jsonfile_prefix)
res_path = osp.join(jsonfile_prefix, 'results_nusc.json')
print('Results written to', res_path)
mmcv.dump(nusc_submissions, res_path)
return res_path
def _format_lidar_bbox(self,
results: List[dict],
sample_id_list: List[int],
classes: List[str] = None,
jsonfile_prefix: str = None) -> str:
"""Convert the results to the standard format.
Args:
......@@ -389,27 +515,59 @@ def output_to_nusc_box(detection: dict) -> List[NuScenesBox]:
bbox3d = detection['bboxes_3d']
scores = detection['scores_3d'].numpy()
labels = detection['labels_3d'].numpy()
attrs = None
if 'attr_labels' in detection:
attrs = detection['attr_labels'].numpy()
box_gravity_center = bbox3d.gravity_center.numpy()
box_dims = bbox3d.dims.numpy()
box_yaw = bbox3d.yaw.numpy()
# our LiDAR coordinate system -> nuScenes box coordinate system
nus_box_dims = box_dims[:, [1, 0, 2]]
box_list = []
for i in range(len(bbox3d)):
quat = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw[i])
velocity = (*bbox3d.tensor[i, 7:9], 0.0)
box = NuScenesBox(
box_gravity_center[i],
nus_box_dims[i],
quat,
label=labels[i],
score=scores[i],
velocity=velocity)
box_list.append(box)
return box_list
if type(bbox3d) == LiDARInstance3DBoxes:
# our LiDAR coordinate system -> nuScenes box coordinate system
nus_box_dims = box_dims[:, [1, 0, 2]]
for i in range(len(bbox3d)):
quat = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw[i])
velocity = (*bbox3d.tensor[i, 7:9], 0.0)
# velo_val = np.linalg.norm(box3d[i, 7:9])
# velo_ori = box3d[i, 6]
# velocity = (
# velo_val * np.cos(velo_ori), velo_val * np.sin(velo_ori), 0.0)
box = NuScenesBox(
box_gravity_center[i],
nus_box_dims[i],
quat,
label=labels[i],
score=scores[i],
velocity=velocity)
box_list.append(box)
elif type(bbox3d) == CameraInstance3DBoxes:
# our Camera coordinate system -> nuScenes box coordinate system
# convert the dim/rot to nuscbox convention
nus_box_dims = box_dims[:, [2, 0, 1]]
nus_box_yaw = -box_yaw
for i in range(len(bbox3d)):
q1 = pyquaternion.Quaternion(
axis=[0, 0, 1], radians=nus_box_yaw[i])
q2 = pyquaternion.Quaternion(axis=[1, 0, 0], radians=np.pi / 2)
quat = q2 * q1
velocity = (bbox3d.tensor[i, 7], 0.0, bbox3d.tensor[i, 8])
box = NuScenesBox(
box_gravity_center[i],
nus_box_dims[i],
quat,
label=labels[i],
score=scores[i],
velocity=velocity)
box_list.append(box)
else:
raise NotImplementedError(
f'Converting {type(bbox3d)} bboxes to standard '
'NuScenesBoxes is not supported.')
return box_list, attrs
def lidar_nusc_box_to_global(
......@@ -448,3 +606,117 @@ def lidar_nusc_box_to_global(
box.translate(ego2global[:3, 3])
box_list.append(box)
return box_list
def cam_nusc_box_to_global(info: dict, boxes: List[NuScenesBox],
attrs: List[str], camera_type: str,
classes: List[str],
eval_configs: DetectionConfig) -> Tuple[List[NuScenesBox], List[str]]:
"""Convert the box from camera to global coordinate.
Args:
info (dict): Info for a specific sample data, including the
calibration information.
boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes.
attrs (list[str]): List of attributes.
camera_type (str): Type of camera.
classes (list[str]): Mapped classes in the evaluation.
eval_configs (object): Evaluation configuration object.
Returns:
tuple[list, list]: List of standard NuScenesBoxes in the global
coordinate and the corresponding list of attributes.
"""
box_list = []
attr_list = []
for (box, attr) in zip(boxes, attrs):
# Move box to ego vehicle coord system
cam2ego = np.array(info['images'][camera_type]['cam2ego'])
box.rotate(
pyquaternion.Quaternion(matrix=cam2ego, rtol=1e-05, atol=1e-07))
box.translate(cam2ego[:3, 3])
# filter det in ego.
cls_range_map = eval_configs.class_range
radius = np.linalg.norm(box.center[:2], 2)
det_range = cls_range_map[classes[box.label]]
if radius > det_range:
continue
# Move box to global coord system
ego2global = np.array(info['ego2global'])
box.rotate(
pyquaternion.Quaternion(matrix=ego2global, rtol=1e-05, atol=1e-07))
box.translate(ego2global[:3, 3])
box_list.append(box)
attr_list.append(attr)
return box_list, attr_list
def global_nusc_box_to_cam(info: dict, boxes: List[NuScenesBox],
classes: List[str],
eval_configs: DetectionConfig) -> List[NuScenesBox]:
"""Convert the box from global to camera coordinate.
Args:
info (dict): Info for a specific sample data, including the
calibration information.
boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes.
classes (list[str]): Mapped classes in the evaluation.
eval_configs (object): Evaluation configuration object.
Returns:
list: List of standard NuScenesBoxes in the camera
coordinate.
"""
box_list = []
for box in boxes:
# Move box to ego vehicle coord system
ego2global = np.array(info['ego2global'])
box.translate(-ego2global[:3, 3])
box.rotate(
pyquaternion.Quaternion(matrix=ego2global, rtol=1e-05,
atol=1e-07).inverse)
# filter det in ego.
cls_range_map = eval_configs.class_range
radius = np.linalg.norm(box.center[:2], 2)
det_range = cls_range_map[classes[box.label]]
if radius > det_range:
continue
# Move box to camera coord system
cam2ego = np.array(info['images']['CAM_FRONT']['cam2ego'])
box.translate(-cam2ego[:3, 3])
box.rotate(
pyquaternion.Quaternion(matrix=cam2ego, rtol=1e-05,
atol=1e-07).inverse)
box_list.append(box)
return box_list
def nusc_box_to_cam_box3d(boxes: List[NuScenesBox]):
"""Convert boxes from :obj:`NuScenesBox` to :obj:`CameraInstance3DBoxes`.
Args:
boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes.
Returns:
tuple (:obj:`CameraInstance3DBoxes` | torch.Tensor | torch.Tensor):
Converted 3D bounding boxes, scores and labels.
"""
locs = torch.Tensor([b.center for b in boxes]).view(-1, 3)
dims = torch.Tensor([b.wlh for b in boxes]).view(-1, 3)
rots = torch.Tensor([b.orientation.yaw_pitch_roll[0]
for b in boxes]).view(-1, 1)
velocity = torch.Tensor([b.velocity[0::2] for b in boxes]).view(-1, 2)
# convert nusbox to cambox convention
dims[:, [0, 1, 2]] = dims[:, [1, 2, 0]]
rots = -rots
boxes_3d = torch.cat([locs, dims, rots, velocity], dim=1).cuda()
cam_boxes3d = CameraInstance3DBoxes(
boxes_3d, box_dim=9, origin=(0.5, 0.5, 0.5))
scores = torch.Tensor([b.score for b in boxes]).cuda()
labels = torch.LongTensor([b.label for b in boxes]).cuda()
nms_scores = scores.new_zeros(scores.shape[0], 10 + 1)
indices = labels.new_tensor(list(range(scores.shape[0])))
nms_scores[indices, labels] = scores
return cam_boxes3d, nms_scores, labels
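Illustrative sketch, not part of the metric code itself: the last lines of nusc_box_to_cam_box3d scatter each box score into the column of its predicted class, producing the per-class score layout consumed by box3d_multiclass_nms. A CPU-only version of that scatter:

import torch

scores = torch.tensor([0.9, 0.4, 0.7])
labels = torch.tensor([0, 3, 5])
nms_scores = scores.new_zeros(scores.shape[0], 10 + 1)  # 10 classes + 1 background slot
nms_scores[torch.arange(scores.shape[0]), labels] = scores
# Row i holds the score of box i only in the column of its predicted class:
# nms_scores[0, 0] == 0.9, nms_scores[1, 3] == 0.4, nms_scores[2, 5] == 0.7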
......@@ -106,8 +106,7 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
if 'points' in inputs_dict[0].keys():
points = [input['points'] for input in inputs_dict]
else:
raise KeyError(
"Model input dict needs to include the 'points' key.")
points = None
if 'img' in inputs_dict[0].keys():
......
# Copyright (c) OpenMMLab. All rights reserved.
from abc import abstractmethod
from typing import Any, List, Sequence, Tuple, Union
import torch
from mmcv.cnn import ConvModule, bias_init_with_prob, normal_init
from mmcv.runner import force_fp32
from torch import Tensor
from torch import nn as nn
from mmdet3d.core.utils import ConfigType, InstanceList, OptConfigType
from mmdet3d.registry import MODELS
from mmdet.core import multi_apply
from ..builder import build_loss
from .base_mono3d_dense_head import BaseMono3DDenseHead
......@@ -20,39 +21,41 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead):
num_classes (int): Number of categories excluding the background
category.
in_channels (int): Number of channels in the input feature map.
feat_channels (int, optional): Number of hidden channels.
feat_channels (int): Number of hidden channels.
Used in child classes. Defaults to 256.
stacked_convs (int, optional): Number of stacking convs of the head.
strides (tuple, optional): Downsample factor of each feature map.
dcn_on_last_conv (bool, optional): If true, use dcn in the last
stacked_convs (int): Number of stacking convs of the head.
strides (Sequence[int] or Sequence[Tuple[int, int]]): Downsample
factor of each feature map.
dcn_on_last_conv (bool): If true, use dcn in the last
layer of towers. Default: False.
conv_bias (bool | str, optional): If specified as `auto`, it will be
conv_bias (bool or str): If specified as `auto`, it will be
decided by the norm_cfg. Bias of conv will be set as True
if `norm_cfg` is None, otherwise False. Default: 'auto'.
background_label (int, optional): Label ID of background,
background_label (int, Optional): Label ID of background,
set as 0 for RPN and num_classes for other heads.
It will automatically be set to `num_classes` if None is given.
use_direction_classifier (bool, optional):
use_direction_classifier (bool):
Whether to add a direction classifier.
diff_rad_by_sin (bool, optional): Whether to change the difference
diff_rad_by_sin (bool): Whether to change the difference
into sin difference for box regression loss. Defaults to True.
dir_offset (float, optional): Parameter used in direction
dir_offset (float): Parameter used in direction
classification. Defaults to 0.
dir_limit_offset (float, optional): Parameter used in direction
dir_limit_offset (float): Parameter used in direction
classification. Defaults to 0.
loss_cls (dict, optional): Config of classification loss.
loss_bbox (dict, optional): Config of localization loss.
loss_dir (dict, optional): Config of direction classifier loss.
loss_attr (dict, optional): Config of attribute classifier loss,
which is only active when `pred_attrs=True`.
bbox_code_size (int, optional): Dimensions of predicted bounding boxes.
pred_attrs (bool, optional): Whether to predict attributes.
loss_cls (:obj:`ConfigDict` or dict): Config of classification loss.
loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss.
loss_dir (:obj:`ConfigDict` or dict): Config of direction classifier
loss.
loss_attr (:obj:`ConfigDict` or dict): Config of attribute classifier
loss, which is only active when `pred_attrs=True`.
bbox_code_size (int): Dimensions of predicted bounding boxes.
pred_attrs (bool): Whether to predict attributes.
Defaults to False.
num_attrs (int, optional): The number of attributes to be predicted.
num_attrs (int): The number of attributes to be predicted.
Default: 9.
pred_velo (bool, optional): Whether to predict velocity.
pred_velo (bool): Whether to predict velocity.
Defaults to False.
pred_bbox2d (bool, optional): Whether to predict 2D boxes.
pred_bbox2d (bool): Whether to predict 2D boxes.
Defaults to False.
group_reg_dims (tuple[int], optional): The dimension of each regression
target group. Default: (2, 1, 3, 1, 2).
......@@ -66,68 +69,77 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead):
(64, ), # rot
() # velo
),
dir_branch (tuple[int], optional): Channels for direction
dir_branch (Sequence[int]): Channels for direction
classification branch. Default: (64, ).
attr_branch (tuple[int], optional): Channels for classification branch.
attr_branch (Sequence[int]): Channels for attribute classification branch.
Default: (64, ).
conv_cfg (dict, optional): Config dict for convolution layer.
Default: None.
norm_cfg (dict, optional): Config dict for normalization layer.
Default: None.
train_cfg (dict, optional): Training config of anchor head.
test_cfg (dict, optional): Testing config of anchor head.
conv_cfg (:obj:`ConfigDict` or dict, Optional): Config dict for
convolution layer. Default: None.
norm_cfg (:obj:`ConfigDict` or dict, Optional): Config dict for
normalization layer. Default: None.
train_cfg (:obj:`ConfigDict` or dict, Optional): Training config
of anchor head.
test_cfg (:obj:`ConfigDict` or dict, Optional): Testing config of
anchor head.
init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
dict]): Initialization config dict.
""" # noqa: W605
_version = 1
def __init__(
self,
num_classes,
in_channels,
feat_channels=256,
stacked_convs=4,
strides=(4, 8, 16, 32, 64),
dcn_on_last_conv=False,
conv_bias='auto',
background_label=None,
use_direction_classifier=True,
diff_rad_by_sin=True,
dir_offset=0,
dir_limit_offset=0,
loss_cls=dict(
type='FocalLoss',
num_classes: int,
in_channels: int,
feat_channels: int = 256,
stacked_convs: int = 4,
strides: Sequence[int] = (4, 8, 16, 32, 64),
dcn_on_last_conv: bool = False,
conv_bias: Union[bool, str] = 'auto',
background_label: Optional[int] = None,
use_direction_classifier: bool = True,
diff_rad_by_sin: bool = True,
dir_offset: float = 0,
dir_limit_offset: float = 0,
loss_cls: ConfigType = dict(
type='mmdet.FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(
type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
loss_attr=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
bbox_code_size=9, # For nuscenes
pred_attrs=False,
num_attrs=9, # For nuscenes
pred_velo=False,
pred_bbox2d=False,
group_reg_dims=(2, 1, 3, 1, 2), # offset, depth, size, rot, velo,
cls_branch=(128, 64),
reg_branch=(
loss_bbox: ConfigType = dict(
type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_dir: ConfigType = dict(
type='mmdet.CrossEntropyLoss',
use_sigmoid=False,
loss_weight=1.0),
loss_attr: ConfigType = dict(
type='mmdet.CrossEntropyLoss',
use_sigmoid=False,
loss_weight=1.0),
bbox_code_size: int = 9, # For nuscenes
pred_attrs: bool = False,
num_attrs: int = 9, # For nuscenes
pred_velo: bool = False,
pred_bbox2d: bool = False,
group_reg_dims: Sequence[int] = (
2, 1, 3, 1, 2), # offset, depth, size, rot, velo,
cls_branch: Sequence[int] = (128, 64),
reg_branch: Sequence[Tuple[int, int]] = (
(128, 64), # offset
(128, 64), # depth
(64, ), # size
(64, ), # rot
() # velo
),
dir_branch=(64, ),
attr_branch=(64, ),
conv_cfg=None,
norm_cfg=None,
train_cfg=None,
test_cfg=None,
init_cfg=None):
super(AnchorFreeMono3DHead, self).__init__(init_cfg=init_cfg)
dir_branch: Sequence[int] = (64, ),
attr_branch: Sequence[int] = (64, ),
conv_cfg: OptConfigType = None,
norm_cfg: OptConfigType = None,
train_cfg: OptConfigType = None,
test_cfg: OptConfigType = None,
init_cfg: OptConfigType = None) -> None:
super().__init__(init_cfg=init_cfg)
self.num_classes = num_classes
self.cls_out_channels = num_classes
self.in_channels = in_channels
......@@ -141,9 +153,9 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead):
self.diff_rad_by_sin = diff_rad_by_sin
self.dir_offset = dir_offset
self.dir_limit_offset = dir_limit_offset
self.loss_cls = build_loss(loss_cls)
self.loss_bbox = build_loss(loss_bbox)
self.loss_dir = build_loss(loss_dir)
self.loss_cls = MODELS.build(loss_cls)
self.loss_bbox = MODELS.build(loss_bbox)
self.loss_dir = MODELS.build(loss_dir)
self.bbox_code_size = bbox_code_size
self.group_reg_dims = list(group_reg_dims)
self.cls_branch = cls_branch
......@@ -174,7 +186,7 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead):
self.num_attrs = num_attrs
if self.pred_attrs:
self.attr_background_label = num_attrs
self.loss_attr = build_loss(loss_attr)
self.loss_attr = MODELS.build(loss_attr)
self.attr_branch = attr_branch
self._init_layers()
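# A hedged config sketch (illustrative, not taken from this commit) of how a
# concrete child head could be configured after the refactor: losses are built
# through the MODELS registry, so detection losses now use the 'mmdet.' scope.
bbox_head = dict(
    type='FCOSMono3DHead',  # assumed child head name, for illustration only
    num_classes=10,
    in_channels=256,
    feat_channels=256,
    stacked_convs=2,
    strides=(8, 16, 32, 64, 128),
    loss_cls=dict(
        type='mmdet.FocalLoss',
        use_sigmoid=True,
        gamma=2.0,
        alpha=0.25,
        loss_weight=1.0),
    loss_bbox=dict(type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
    loss_dir=dict(
        type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0))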
......@@ -316,11 +328,13 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead):
if self.pred_attrs:
normal_init(self.conv_attr, std=0.01, bias=bias_cls)
def forward(self, feats):
def forward(
self, x: Tuple[Tensor]
) -> Tuple[List[Tensor], List[Tensor], List[Tensor], List[Tensor]]:
"""Forward features from the upstream network.
Args:
feats (tuple[Tensor]): Features from the upstream network, each is
x (tuple[Tensor]): Features from the upstream network, each is
a 4D-tensor.
Returns:
......@@ -339,9 +353,9 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead):
level, each is a 4D-tensor, the channel number is
num_points * num_attrs.
"""
return multi_apply(self.forward_single, feats)[:5]
return multi_apply(self.forward_single, x)[:5]
def forward_single(self, x):
def forward_single(self, x: Tensor) -> Tuple[Tensor, ...]:
"""Forward features of a single scale level.
Args:
......@@ -394,77 +408,8 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead):
reg_feat
@abstractmethod
@force_fp32(apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds'))
def loss(self,
cls_scores,
bbox_preds,
dir_cls_preds,
attr_preds,
batch_gt_instances_3d,
batch_img_metas,
batch_gt_instances_ignore=None):
"""Compute loss of the head.
Args:
cls_scores (list[Tensor]): Box scores for each scale level,
each is a 4D-tensor, the channel number is
num_points * num_classes.
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level, each is a 4D-tensor, the channel number is
num_points * bbox_code_size.
dir_cls_preds (list[Tensor]): Box scores for direction class
predictions on each scale level, each is a 4D-tensor,
the channel number is num_points * 2. (bin = 2)
attr_preds (list[Tensor]): Box scores for each scale level,
each is a 4D-tensor, the channel number is
num_points * num_attrs.
batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
gt_instance_3d. It usually includes ``bboxes``, ``labels``,
``bboxes_3d``, ``labels_3d``, ``depths``, ``centers_2d`` and
attributes.
batch_img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional):
Batch of gt_instances_ignore. It includes ``bboxes`` attribute
data that is ignored during training and testing.
Defaults to None.
"""
raise NotImplementedError
@abstractmethod
@force_fp32(apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds'))
def get_results(self,
cls_scores,
bbox_preds,
dir_cls_preds,
attr_preds,
batch_img_metas,
cfg=None,
rescale=None):
"""Transform network output for a batch into bbox predictions.
Args:
cls_scores (list[Tensor]): Box scores for each scale level.
Has shape (N, num_points * num_classes, H, W).
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level with shape (N, num_points * bbox_code_size, H, W)
dir_cls_preds (list[Tensor]): Box scores for direction class
predictions on each scale level, each is a 4D-tensor,
the channel number is num_points * 2. (bin = 2)
attr_preds (list[Tensor]): Attribute scores for each scale level.
Has shape (N, num_points * num_attrs, H, W).
batch_img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
cfg (mmcv.Config): Test / postprocessing configuration.
If None, test_cfg would be used.
rescale (bool): If True, return boxes in the original image space.
"""
raise NotImplementedError
@abstractmethod
def get_targets(self, points, batch_gt_instances_3d):
def get_targets(self, points: List[Tensor],
batch_gt_instances_3d: InstanceList) -> Any:
"""Compute regression, classification and centerss targets for points
in multiple images.
......@@ -473,18 +418,32 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead):
(num_points, 2).
batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
gt_instance_3d. It usually includes ``bboxes``, ``labels``,
``bboxes_3d``, ``labels3d``, ``depths``, ``centers2d`` and
attributes.
``bboxes_3d``, ``labels_3d``, ``depths``, ``centers_2d``
and attributes.
"""
raise NotImplementedError
# TODO: Refactor using MlvlPointGenerator in MMDet.
def _get_points_single(self,
featmap_size,
stride,
dtype,
device,
flatten=False):
"""Get points of a single scale level."""
featmap_size: Tuple[int],
stride: int,
dtype: torch.dtype,
device: torch.device,
flatten: bool = False) -> Tuple[Tensor, Tensor]:
"""Get points of a single scale level.
Args:
featmap_size (tuple[int]): Single scale level feature map
size.
stride (int): Downsample factor of the feature map.
dtype (torch.dtype): Type of points.
device (torch.device): Device of points.
flatten (bool): Whether to flatten the tensor.
Defaults to False.
Returns:
tuple[Tensor, Tensor]: y and x coordinates of the points on this scale level.
"""
h, w = featmap_size
x_range = torch.arange(w, dtype=dtype, device=device)
y_range = torch.arange(h, dtype=dtype, device=device)
......@@ -494,16 +453,23 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead):
x = x.flatten()
return y, x
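# A minimal standalone sketch (illustrative, not from this diff) of the grid
# that _get_points_single builds for one level; child heads typically scale the
# flattened (y, x) indices by `stride` and shift them to cell centers.
import torch

h, w, stride = 2, 3, 8
# indexing='ij' needs a recent PyTorch (>= 1.10)
y, x = torch.meshgrid(torch.arange(h), torch.arange(w), indexing='ij')
points = torch.stack(
    (x.flatten() * stride, y.flatten() * stride), dim=-1) + stride // 2
print(points)
# -> six (x, y) cell centers: (4, 4), (12, 4), (20, 4), (4, 12), (12, 12), (20, 12)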
def get_points(self, featmap_sizes, dtype, device, flatten=False):
# TODO: Refactor using MlvlPointGenerator in MMDet.
def get_points(self,
featmap_sizes: List[Tuple[int]],
dtype: torch.dtype,
device: torch.device,
flatten: bool = False) -> List[Tuple[Tensor, Tensor]]:
"""Get points according to feature map sizes.
Args:
featmap_sizes (list[tuple]): Multi-level feature map sizes.
dtype (torch.dtype): Type of points.
device (torch.device): Device of points.
flatten (bool): Whether to flatten the tensor.
Defaults to False.
Returns:
tuple: points of each image.
list[tuple]: points of each image.
"""
mlvl_points = []
for i in range(len(featmap_sizes)):
......