Commit b496f579 authored by ZCMax, committed by ChaimZhu

[Refactor] Refactor Mono3D models

parent 35667791
-dataset_type = 'KittiMonoDataset'
dataset_type = 'KittiDataset'
data_root = 'data/kitti/'
class_names = ['Pedestrian', 'Cyclist', 'Car']
input_modality = dict(use_lidar=False, use_camera=True)
-img_norm_cfg = dict(
-    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
metainfo = dict(CLASSES=class_names)
# file_client_args = dict(backend='disk')
# Uncomment the following if use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
file_client_args = dict(
backend='petrel',
path_mapping=dict({
'./data/kitti/':
's3://openmmlab/datasets/detection3d/kitti/',
'data/kitti/':
's3://openmmlab/datasets/detection3d/kitti/'
}))
train_pipeline = [
-    dict(type='LoadImageFromFileMono3D'),
    dict(type='LoadImageFromFileMono3D', file_client_args=file_client_args),
    dict(
        type='LoadAnnotations3D',
        with_bbox=True,
@@ -14,79 +27,60 @@ train_pipeline = [
        with_bbox_3d=True,
        with_label_3d=True,
        with_bbox_depth=True),
-    dict(type='Resize', img_scale=(1242, 375), keep_ratio=True),
    dict(type='Resize', scale=(1242, 375), keep_ratio=True),
    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
-    dict(type='Normalize', **img_norm_cfg),
-    dict(type='Pad', size_divisor=32),
-    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
-        type='Collect3D',
        type='Pack3DDetInputs',
        keys=[
            'img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_3d', 'gt_labels_3d',
-            'centers2d', 'depths'
            'centers_2d', 'depths'
        ]),
]
test_pipeline = [
-    dict(type='LoadImageFromFileMono3D'),
-    dict(
-        type='MultiScaleFlipAug',
-        img_scale=(1242, 375),
-        flip=False,
-        transforms=[
-            dict(type='RandomFlip3D'),
-            dict(type='Normalize', **img_norm_cfg),
-            dict(type='Pad', size_divisor=32),
-            dict(
-                type='DefaultFormatBundle3D',
-                class_names=class_names,
-                with_label=False),
-            dict(type='Collect3D', keys=['img']),
-        ])
-]
-# construct a pipeline for data and gt loading in show function
-# please keep its loading function consistent with test_pipeline (e.g. client)
-eval_pipeline = [
-    dict(type='LoadImageFromFileMono3D'),
-    dict(
-        type='DefaultFormatBundle3D',
-        class_names=class_names,
-        with_label=False),
-    dict(type='Collect3D', keys=['img'])
    dict(type='LoadImageFromFileMono3D', file_client_args=file_client_args),
    dict(type='Resize', scale=(1242, 375), keep_ratio=True),
    dict(type='Pack3DDetInputs', keys=['img'])
]
-data = dict(
-    samples_per_gpu=2,
-    workers_per_gpu=2,
-    train=dict(
-        type=dataset_type,
-        data_root=data_root,
-        ann_file=data_root + 'kitti_infos_train_mono3d.coco.json',
-        info_file=data_root + 'kitti_infos_train.pkl',
-        img_prefix=data_root,
-        classes=class_names,
-        pipeline=train_pipeline,
-        modality=input_modality,
-        test_mode=False,
-        box_type_3d='Camera'),
-    val=dict(
-        type=dataset_type,
-        data_root=data_root,
-        ann_file=data_root + 'kitti_infos_val_mono3d.coco.json',
-        info_file=data_root + 'kitti_infos_val.pkl',
-        img_prefix=data_root,
-        classes=class_names,
-        pipeline=test_pipeline,
-        modality=input_modality,
-        test_mode=True,
-        box_type_3d='Camera'),
-    test=dict(
-        type=dataset_type,
-        data_root=data_root,
-        ann_file=data_root + 'kitti_infos_val_mono3d.coco.json',
-        info_file=data_root + 'kitti_infos_val.pkl',
-        img_prefix=data_root,
-        classes=class_names,
-        pipeline=test_pipeline,
-        modality=input_modality,
-        test_mode=True,
-        box_type_3d='Camera'))
-evaluation = dict(interval=2)
train_dataloader = dict(
    batch_size=2,
    num_workers=2,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=True),
    dataset=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file='kitti_infos_train.pkl',
        data_prefix=dict(img='training/image_2'),
        pipeline=train_pipeline,
        modality=input_modality,
        test_mode=False,
        metainfo=metainfo,
        # we use box_type_3d='Camera' in monocular 3d
        # detection task
        box_type_3d='Camera'))
val_dataloader = dict(
    batch_size=1,
    num_workers=2,
    persistent_workers=True,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type=dataset_type,
        data_root=data_root,
        data_prefix=dict(img='training/image_2'),
        ann_file='kitti_infos_val.pkl',
        pipeline=test_pipeline,
        modality=input_modality,
        metainfo=metainfo,
        test_mode=True,
        box_type_3d='Camera'))
test_dataloader = val_dataloader
val_evaluator = dict(
type='KittiMetric',
ann_file=data_root + 'kitti_infos_val.pkl',
metric='bbox',
pred_box_type_3d='Camera')
test_evaluator = val_evaluator
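The same renames repeat through every config in this commit, so the remaining dataset, model and schedule diffs below follow one pattern. A compact reading aid (an editorial summary of the diffs themselves, not code from the commit):

legacy_to_new = {
    'samples_per_gpu / workers_per_gpu': 'batch_size / num_workers',
    'data.train / data.val / data.test': 'train_dataloader / val_dataloader / test_dataloader (+ sampler)',
    'DefaultFormatBundle3D + Collect3D': 'Pack3DDetInputs',
    'Normalize + Pad in the pipeline, img_norm_cfg': 'model.data_preprocessor (mean/std/bgr_to_rgb/pad_size_divisor)',
    'evaluation = dict(interval=...)': 'val_evaluator / test_evaluator (+ val_interval in train_cfg)',
    'optimizer + optimizer_config': 'optim_wrapper (optimizer, paramwise_cfg, clip_grad)',
    'lr_config + runner / total_epochs': 'param_scheduler + train_cfg / val_cfg / test_cfg',
    "'centers2d'": "'centers_2d'",
    'FocalLoss / SmoothL1Loss / CrossEntropyLoss / ...': "the same types with an 'mmdet.' registry prefix",
}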
-dataset_type = 'NuScenesMonoDataset'
dataset_type = 'NuScenesDataset'
data_root = 'data/nuscenes/'
class_names = [
    'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
    'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
]
metainfo = dict(CLASSES=class_names)
# Input modality for nuScenes dataset, this is consistent with the submission
# format which requires the information in input_modality.
-input_modality = dict(
-    use_lidar=False,
-    use_camera=True,
-    use_radar=False,
-    use_map=False,
-    use_external=False)
input_modality = dict(use_lidar=False, use_camera=True)
-img_norm_cfg = dict(
-    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
# file_client_args = dict(backend='disk')
# Uncomment the following if use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
file_client_args = dict(
    backend='petrel',
path_mapping=dict({
'./data/nuscenes/':
's3://openmmlab/datasets/detection3d/nuscenes/',
'data/nuscenes/':
's3://openmmlab/datasets/detection3d/nuscenes/'
}))
train_pipeline = [
-    dict(type='LoadImageFromFileMono3D'),
    dict(type='LoadImageFromFileMono3D', file_client_args=file_client_args),
    dict(
        type='LoadAnnotations3D',
        with_bbox=True,
@@ -26,75 +34,77 @@ train_pipeline = [
        with_bbox_depth=True),
    dict(type='Resize', img_scale=(1600, 900), keep_ratio=True),
    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
-    dict(type='Normalize', **img_norm_cfg),
-    dict(type='Pad', size_divisor=32),
-    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
-        type='Collect3D',
        type='Pack3DDetInputs',
        keys=[
            'img', 'gt_bboxes', 'gt_labels', 'attr_labels', 'gt_bboxes_3d',
-            'gt_labels_3d', 'centers2d', 'depths'
            'gt_labels_3d', 'centers_2d', 'depths'
        ]),
]
test_pipeline = [
-    dict(type='LoadImageFromFileMono3D'),
-    dict(
-        type='MultiScaleFlipAug',
-        scale_factor=1.0,
-        flip=False,
-        transforms=[
-            dict(type='RandomFlip3D'),
-            dict(type='Normalize', **img_norm_cfg),
-            dict(type='Pad', size_divisor=32),
-            dict(
-                type='DefaultFormatBundle3D',
-                class_names=class_names,
-                with_label=False),
-            dict(type='Collect3D', keys=['img']),
-        ])
-]
-# construct a pipeline for data and gt loading in show function
-# please keep its loading function consistent with test_pipeline (e.g. client)
-eval_pipeline = [
-    dict(type='LoadImageFromFileMono3D'),
-    dict(
-        type='DefaultFormatBundle3D',
-        class_names=class_names,
-        with_label=False),
-    dict(type='Collect3D', keys=['img'])
    dict(type='LoadImageFromFileMono3D', file_client_args=file_client_args),
    dict(type='mmdet.Resize', scale=(1600, 900), keep_ratio=True),
    dict(type='Pack3DDetInputs', keys=['img'])
]
-data = dict(
-    samples_per_gpu=2,
-    workers_per_gpu=2,
-    train=dict(
-        type=dataset_type,
-        data_root=data_root,
-        ann_file=data_root + 'nuscenes_infos_train_mono3d.coco.json',
-        img_prefix=data_root,
-        classes=class_names,
-        pipeline=train_pipeline,
-        modality=input_modality,
-        test_mode=False,
-        box_type_3d='Camera'),
-    val=dict(
-        type=dataset_type,
-        data_root=data_root,
-        ann_file=data_root + 'nuscenes_infos_val_mono3d.coco.json',
-        img_prefix=data_root,
-        classes=class_names,
-        pipeline=test_pipeline,
-        modality=input_modality,
-        test_mode=True,
-        box_type_3d='Camera'),
-    test=dict(
-        type=dataset_type,
-        data_root=data_root,
-        ann_file=data_root + 'nuscenes_infos_val_mono3d.coco.json',
-        img_prefix=data_root,
-        classes=class_names,
-        pipeline=test_pipeline,
-        modality=input_modality,
-        test_mode=True,
-        box_type_3d='Camera'))
-evaluation = dict(interval=2)
train_dataloader = dict(
    batch_size=2,
    num_workers=2,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=True),
    dataset=dict(
        type=dataset_type,
        data_root=data_root,
        data_prefix=dict(
            pts='',
            CAM_FRONT='samples/CAM_FRONT',
            CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT',
            CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT',
            CAM_BACK='samples/CAM_BACK',
            CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
            CAM_BACK_LEFT='samples/CAM_BACK_LEFT'),
        ann_file='nuscenes_infos_train.pkl',
        task='mono3d',
        pipeline=train_pipeline,
        metainfo=metainfo,
        modality=input_modality,
        test_mode=False,
        # we use box_type_3d='Camera' in monocular 3d
        # detection task
        box_type_3d='Camera',
        use_valid_flag=True))
val_dataloader = dict(
    batch_size=1,
    num_workers=2,
    persistent_workers=True,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type=dataset_type,
        data_root=data_root,
        data_prefix=dict(
            pts='',
            CAM_FRONT='samples/CAM_FRONT',
            CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT',
            CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT',
            CAM_BACK='samples/CAM_BACK',
            CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
            CAM_BACK_LEFT='samples/CAM_BACK_LEFT'),
        ann_file='nuscenes_infos_val.pkl',
        task='mono3d',
        pipeline=test_pipeline,
        modality=input_modality,
        metainfo=metainfo,
        test_mode=True,
        box_type_3d='Camera',
        use_valid_flag=True))
test_dataloader = val_dataloader

val_evaluator = dict(
    type='NuScenesMetric',
    data_root=data_root,
    ann_file=data_root + 'nuscenes_infos_val.pkl',
    metric='bbox')
test_evaluator = val_evaluator
# model settings
model = dict(
    type='FCOSMono3D',
data_preprocessor=dict(
type='Det3DDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True,
pad_size_divisor=32),
    backbone=dict(
-        type='ResNet',
        type='mmdet.ResNet',
        depth=101,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
@@ -13,7 +20,7 @@ model = dict(
            type='Pretrained',
            checkpoint='open-mmlab://detectron2/resnet101_caffe')),
    neck=dict(
-        type='FPN',
        type='mmdet.FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        start_level=1,
@@ -45,18 +52,19 @@ model = dict(
        dir_branch=(256, ),
        attr_branch=(256, ),
        loss_cls=dict(
-            type='FocalLoss',
            type='mmdet.FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),
-        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
        loss_bbox=dict(
            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
        loss_dir=dict(
-            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
            type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
        loss_attr=dict(
-            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
            type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
        loss_centerness=dict(
-            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
            type='mmdet.CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        bbox_coder=dict(type='FCOS3DBBoxCoder', code_size=9),
        norm_on_bbox=True,
        centerness_on_reg=True,
...
@@ -28,18 +28,19 @@ model = dict(
        dir_branch=(256, ),
        attr_branch=(256, ),
        loss_cls=dict(
-            type='FocalLoss',
            type='mmdet.FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),
-        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
        loss_bbox=dict(
            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
        loss_dir=dict(
-            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
            type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
        loss_attr=dict(
-            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
            type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
        loss_centerness=dict(
-            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
            type='mmdet.CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        norm_on_bbox=True,
        centerness_on_reg=True,
        center_sampling=True,
...
# model settings
model = dict(
    type='SMOKEMono3D',
data_preprocessor=dict(
type='Det3DDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True,
pad_size_divisor=32),
    backbone=dict(
        type='DLANet',
        depth=34,
@@ -42,10 +49,11 @@ model = dict(
            base_dims=((0.88, 1.73, 0.67), (1.78, 1.70, 0.58), (3.88, 1.63,
                                                                1.53)),
            code_size=7),
-        loss_cls=dict(type='GaussianFocalLoss', loss_weight=1.0),
        loss_cls=dict(type='mmdet.GaussianFocalLoss', loss_weight=1.0),
-        loss_bbox=dict(type='L1Loss', reduction='sum', loss_weight=1 / 300),
        loss_bbox=dict(
            type='mmdet.L1Loss', reduction='sum', loss_weight=1 / 300),
        loss_dir=dict(
-            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
            type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
        loss_attr=None,
        conv_bias=True,
        dcn_on_last_conv=False),
...
# training schedule for 1x
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=1)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
# learning rate
param_scheduler = [
dict(
type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
dict(
type='MultiStepLR',
begin=0,
end=12,
by_epoch=True,
milestones=[8, 11],
gamma=0.1)
]
# optimizer
-optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
-optimizer_config = dict(grad_clip=None)
-# learning policy
-lr_config = dict(
-    policy='step',
-    warmup='linear',
-    warmup_iters=500,
-    warmup_ratio=0.001,
-    step=[8, 11])
-runner = dict(type='EpochBasedRunner', max_epochs=12)
optim_wrapper = dict(
    type='OptimWrapper',
    optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001))
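The warmup numbers carry over directly. Assuming mmengine's LinearLR scales the learning rate by a factor interpolated linearly from start_factor to 1.0 over its begin/end iteration window (an assumption about the scheduler, not something stated in this diff), the new scheduler pair reproduces the old warmup_ratio=0.001 / warmup_iters=500 followed by the step decay at epochs 8 and 11:

base_lr = 0.02          # optimizer lr above
start_factor = 0.001    # old warmup_ratio
warmup_iters = 500      # old warmup_iters, new end=500
for it in (0, 250, 500):
    factor = start_factor + (1 - start_factor) * min(it, warmup_iters) / warmup_iters
    print(it, base_lr * factor)  # 2e-05 -> ~0.01 -> 0.02, the old linear warmup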
@@ -4,18 +4,31 @@ _base_ = [
]
# model settings
model = dict(
data_preprocessor=dict(
type='Det3DDataPreprocessor',
mean=[103.530, 116.280, 123.675],
std=[1.0, 1.0, 1.0],
bgr_to_rgb=False,
pad_size_divisor=32),
backbone=dict( backbone=dict(
dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
stage_with_dcn=(False, False, True, True))) stage_with_dcn=(False, False, True, True)))
class_names = [ # file_client_args = dict(backend='disk')
'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', # Uncomment the following if use ceph or other file clients.
'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' # See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
] # for more details.
img_norm_cfg = dict( file_client_args = dict(
mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) backend='petrel',
path_mapping=dict({
'./data/nuscenes/':
's3://openmmlab/datasets/detection3d/nuscenes/',
'data/nuscenes/':
's3://openmmlab/datasets/detection3d/nuscenes/'
}))
train_pipeline = [
-    dict(type='LoadImageFromFileMono3D'),
    dict(type='LoadImageFromFileMono3D', file_client_args=file_client_args),
    dict(
        type='LoadAnnotations3D',
        with_bbox=True,
@@ -24,52 +37,47 @@ train_pipeline = [
        with_bbox_3d=True,
        with_label_3d=True,
        with_bbox_depth=True),
-    dict(type='Resize', img_scale=(1600, 900), keep_ratio=True),
    dict(type='mmdet.Resize', scale=(1600, 900), keep_ratio=True),
    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
-    dict(type='Normalize', **img_norm_cfg),
-    dict(type='Pad', size_divisor=32),
-    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
-        type='Collect3D',
        type='Pack3DDetInputs',
        keys=[
            'img', 'gt_bboxes', 'gt_labels', 'attr_labels', 'gt_bboxes_3d',
-            'gt_labels_3d', 'centers2d', 'depths'
            'gt_labels_3d', 'centers_2d', 'depths'
        ]),
]
test_pipeline = [
-    dict(type='LoadImageFromFileMono3D'),
-    dict(
-        type='MultiScaleFlipAug',
-        scale_factor=1.0,
-        flip=False,
-        transforms=[
-            dict(type='RandomFlip3D'),
-            dict(type='Normalize', **img_norm_cfg),
-            dict(type='Pad', size_divisor=32),
-            dict(
-                type='DefaultFormatBundle3D',
-                class_names=class_names,
-                with_label=False),
-            dict(type='Collect3D', keys=['img']),
-        ])
    dict(type='LoadImageFromFileMono3D', file_client_args=file_client_args),
    dict(type='mmdet.Resize', scale_factor=1.0),
    dict(type='Pack3DDetInputs', keys=['img'])
]
-data = dict(
-    samples_per_gpu=2,
-    workers_per_gpu=2,
-    train=dict(pipeline=train_pipeline),
-    val=dict(pipeline=test_pipeline),
-    test=dict(pipeline=test_pipeline))
train_dataloader = dict(
    batch_size=2,
    num_workers=2,
    dataset=dict(dataset=dict(pipeline=train_pipeline)))
test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
# optimizer
-optimizer = dict(
-    lr=0.002, paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.))
-optimizer_config = dict(
-    _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
optim_wrapper = dict(
    optimizer=dict(lr=0.002),
    paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.),
    clip_grad=dict(max_norm=35, norm_type=2))
-# learning policy
-lr_config = dict(
-    policy='step',
-    warmup='linear',
-    warmup_iters=500,
-    warmup_ratio=1.0 / 3,
-    step=[8, 11])
-total_epochs = 12
-evaluation = dict(interval=2)
# learning rate
param_scheduler = [
    dict(
        type='LinearLR',
        start_factor=1.0 / 3,
        by_epoch=False,
        begin=0,
        end=500),
dict(
type='MultiStepLR',
begin=0,
end=12,
by_epoch=True,
milestones=[8, 11],
gamma=0.1)
]
@@ -4,6 +4,12 @@ _base_ = [
]
# model settings
model = dict(
data_preprocessor=dict(
type='Det3DDataPreprocessor',
mean=[103.530, 116.280, 123.675],
std=[1.0, 1.0, 1.0],
bgr_to_rgb=False,
pad_size_divisor=32),
    backbone=dict(frozen_stages=0),
    neck=dict(start_level=0, num_outs=4),
    bbox_head=dict(
@@ -27,16 +33,17 @@ model = dict(
        ),
        centerness_branch=(256, ),
        loss_cls=dict(
-            type='FocalLoss',
            type='mmdet.FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),
-        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
        loss_bbox=dict(
            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
        loss_dir=dict(
-            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
            type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
        loss_centerness=dict(
-            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
            type='mmdet.CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        use_depth_classifier=True,
        depth_branch=(256, ),
        depth_range=(0, 70),
@@ -61,11 +68,21 @@ model = dict(
        ]),
    test_cfg=dict(nms_pre=100, nms_thr=0.05, score_thr=0.001, max_per_img=20))
-class_names = ['Pedestrian', 'Cyclist', 'Car']
-img_norm_cfg = dict(
-    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
# file_client_args = dict(backend='disk')
# Uncomment the following if use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
file_client_args = dict(
backend='petrel',
path_mapping=dict({
'./data/kitti/':
's3://openmmlab/datasets/detection3d/kitti/',
'data/kitti/':
's3://openmmlab/datasets/detection3d/kitti/'
}))
train_pipeline = [
-    dict(type='LoadImageFromFileMono3D'),
    dict(type='LoadImageFromFileMono3D', file_client_args=file_client_args),
    dict(
        type='LoadAnnotations3D',
        with_bbox=True,
@@ -74,54 +91,47 @@ train_pipeline = [
        with_bbox_3d=True,
        with_label_3d=True,
        with_bbox_depth=True),
-    dict(type='Resize', img_scale=(1242, 375), keep_ratio=True),
    dict(type='mmdet.Resize', scale=(1242, 375), keep_ratio=True),
    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
-    dict(type='Normalize', **img_norm_cfg),
-    dict(type='Pad', size_divisor=32),
-    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
-        type='Collect3D',
        type='Pack3DDetInputs',
        keys=[
            'img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_3d', 'gt_labels_3d',
-            'centers2d', 'depths'
            'centers_2d', 'depths'
        ]),
]
test_pipeline = [
-    dict(type='LoadImageFromFileMono3D'),
-    dict(
-        type='MultiScaleFlipAug',
-        scale_factor=1.0,
-        flip=False,
-        transforms=[
-            dict(type='RandomFlip3D'),
-            dict(type='Normalize', **img_norm_cfg),
-            dict(type='Pad', size_divisor=32),
-            dict(
-                type='DefaultFormatBundle3D',
-                class_names=class_names,
-                with_label=False),
-            dict(type='Collect3D', keys=['img']),
-        ])
    dict(type='LoadImageFromFileMono3D', file_client_args=file_client_args),
    dict(type='mmdet.Resize', scale_factor=1.0),
    dict(type='Pack3DDetInputs', keys=['img'])
]
-data = dict(
-    samples_per_gpu=3,
-    workers_per_gpu=3,
-    train=dict(pipeline=train_pipeline),
-    val=dict(pipeline=test_pipeline),
-    test=dict(pipeline=test_pipeline))
train_dataloader = dict(
    batch_size=3, num_workers=3, dataset=dict(pipeline=train_pipeline))
test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
# optimizer
-optimizer = dict(
-    lr=0.001, paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.))
-optimizer_config = dict(
-    _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
optim_wrapper = dict(
    optimizer=dict(lr=0.01),
    paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.),
    clip_grad=dict(max_norm=35, norm_type=2))
-# learning policy
-lr_config = dict(
-    policy='step',
-    warmup='linear',
-    warmup_iters=500,
-    warmup_ratio=1.0 / 3,
-    step=[32, 44])
-total_epochs = 48
-runner = dict(type='EpochBasedRunner', max_epochs=48)
-evaluation = dict(interval=2)
-checkpoint_config = dict(interval=8)
# learning rate
param_scheduler = [
    dict(
        type='LinearLR',
        start_factor=1.0 / 3,
        by_epoch=False,
        begin=0,
        end=500),
    dict(
        type='MultiStepLR',
begin=0,
end=48,
by_epoch=True,
milestones=[32, 44],
gamma=0.1)
]
train_cfg = dict(max_epochs=48)
@@ -3,21 +3,21 @@ _base_ = [
    '../_base_/default_runtime.py'
]
-# optimizer
-optimizer = dict(type='Adam', lr=2.5e-4)
-optimizer_config = dict(grad_clip=None)
-lr_config = dict(policy='step', warmup=None, step=[50])
-
-# runtime settings
-runner = dict(type='EpochBasedRunner', max_epochs=72)
-log_config = dict(interval=10)
# file_client_args = dict(backend='disk')
# Uncomment the following if use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
file_client_args = dict(
    backend='petrel',
    path_mapping=dict({
        './data/kitti/':
's3://openmmlab/datasets/detection3d/kitti/',
'data/kitti/':
's3://openmmlab/datasets/detection3d/kitti/'
}))
-find_unused_parameters = True
-class_names = ['Pedestrian', 'Cyclist', 'Car']
-img_norm_cfg = dict(
-    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
-    dict(type='LoadImageFromFileMono3D'),
    dict(type='LoadImageFromFileMono3D', file_client_args=file_client_args),
    dict(
        type='LoadAnnotations3D',
        with_bbox=True,
@@ -29,36 +29,42 @@ train_pipeline = [
    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
    dict(type='RandomShiftScale', shift_scale=(0.2, 0.4), aug_prob=0.3),
    dict(type='AffineResize', img_scale=(1280, 384), down_ratio=4),
-    dict(type='Normalize', **img_norm_cfg),
-    dict(type='Pad', size_divisor=32),
-    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
-        type='Collect3D',
        type='Pack3DDetInputs',
        keys=[
            'img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_3d', 'gt_labels_3d',
-            'centers2d', 'depths'
            'centers_2d', 'depths'
        ]),
]
test_pipeline = [
-    dict(type='LoadImageFromFileMono3D'),
    dict(type='LoadImageFromFileMono3D', file_client_args=file_client_args),
dict(type='AffineResize', img_scale=(1280, 384), down_ratio=4),
dict(type='Pack3DDetInputs', keys=['img'])
]
train_dataloader = dict(
batch_size=8, num_workers=4, dataset=dict(pipeline=train_pipeline))
test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
# training schedule for 1x
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=1)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
# learning rate
param_scheduler = [
    dict(
        type='MultiStepLR',
        begin=0,
        end=12,
        by_epoch=True,
        milestones=[8, 11],
        gamma=0.1)
]

# optimizer
optim_wrapper = dict(
    type='OptimWrapper',
    optimizer=dict(type='Adam', lr=2.5e-4),
    clip_grad=None)
-    dict(
-        type='MultiScaleFlipAug',
-        img_scale=(1280, 384),
-        flip=False,
-        transforms=[
-            dict(type='AffineResize', img_scale=(1280, 384), down_ratio=4),
-            dict(type='Normalize', **img_norm_cfg),
-            dict(type='Pad', size_divisor=32),
-            dict(
-                type='DefaultFormatBundle3D',
-                class_names=class_names,
-                with_label=False),
-            dict(type='Collect3D', keys=['img']),
-        ])
-]
-data = dict(
-    samples_per_gpu=8,
-    workers_per_gpu=4,
-    train=dict(pipeline=train_pipeline),
-    val=dict(pipeline=test_pipeline),
-    test=dict(pipeline=test_pipeline))
# Copyright (c) OpenMMLab. All rights reserved.
from .builder import DATASETS, PIPELINES, build_dataset
from .convert_utils import get_2d_boxes
from .dataset_wrappers import CBGSDataset
from .det3d_dataset import Det3DDataset
from .kitti_dataset import KittiDataset
@@ -41,5 +42,5 @@ __all__ = [
    'LoadPointsFromMultiSweeps', 'WaymoDataset', 'BackgroundPointsFilter',
    'VoxelBasedPointSampler', 'get_loading_pipeline', 'RandomDropPointsColor',
    'RandomJitterPoints', 'ObjectNameFilter', 'AffineResize',
-    'RandomShiftScale', 'LoadPointsFromDict', 'PIPELINES'
    'RandomShiftScale', 'LoadPointsFromDict', 'PIPELINES', 'get_2d_boxes'
]
# Copyright (c) OpenMMLab. All rights reserved.
from collections import OrderedDict
from typing import List, Tuple, Union
import numpy as np
from nuscenes.utils.geometry_utils import view_points
from pyquaternion import Quaternion
from shapely.geometry import MultiPoint, box
from mmdet3d.core.bbox import points_cam2img
nus_categories = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle',
'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone',
'barrier')
nus_attributes = ('cycle.with_rider', 'cycle.without_rider',
'pedestrian.moving', 'pedestrian.standing',
'pedestrian.sitting_lying_down', 'vehicle.moving',
'vehicle.parked', 'vehicle.stopped', 'None')
NameMapping = {
'movable_object.barrier': 'barrier',
'vehicle.bicycle': 'bicycle',
'vehicle.bus.bendy': 'bus',
'vehicle.bus.rigid': 'bus',
'vehicle.car': 'car',
'vehicle.construction': 'construction_vehicle',
'vehicle.motorcycle': 'motorcycle',
'human.pedestrian.adult': 'pedestrian',
'human.pedestrian.child': 'pedestrian',
'human.pedestrian.construction_worker': 'pedestrian',
'human.pedestrian.police_officer': 'pedestrian',
'movable_object.trafficcone': 'traffic_cone',
'vehicle.trailer': 'trailer',
'vehicle.truck': 'truck'
}
def get_2d_boxes(nusc, sample_data_token: str, visibilities: List[str]):
"""Get the 2D annotation records for a given `sample_data_token`.
    Args:
        nusc (:obj:`NuScenes`): NuScenes devkit instance providing the sample,
            annotation and calibration records.
        sample_data_token (str): Sample data token belonging to a camera
            keyframe.
        visibilities (list[str]): Visibility filter.
    Returns:
        list[dict]: List of 2D annotation records that belong to the input
            `sample_data_token`.
"""
# Get the sample data and the sample corresponding to that sample data.
sd_rec = nusc.get('sample_data', sample_data_token)
assert sd_rec[
'sensor_modality'] == 'camera', 'Error: get_2d_boxes only works' \
' for camera sample_data!'
if not sd_rec['is_key_frame']:
raise ValueError(
'The 2D re-projections are available only for keyframes.')
s_rec = nusc.get('sample', sd_rec['sample_token'])
# Get the calibrated sensor and ego pose
# record to get the transformation matrices.
cs_rec = nusc.get('calibrated_sensor', sd_rec['calibrated_sensor_token'])
pose_rec = nusc.get('ego_pose', sd_rec['ego_pose_token'])
camera_intrinsic = np.array(cs_rec['camera_intrinsic'])
    # Get all the annotations with the specified visibilities.
ann_recs = [
nusc.get('sample_annotation', token) for token in s_rec['anns']
]
ann_recs = [
ann_rec for ann_rec in ann_recs
if (ann_rec['visibility_token'] in visibilities)
]
repro_recs = []
for ann_rec in ann_recs:
# Augment sample_annotation with token information.
ann_rec['sample_annotation_token'] = ann_rec['token']
ann_rec['sample_data_token'] = sample_data_token
# Get the box in global coordinates.
box = nusc.get_box(ann_rec['token'])
# Move them to the ego-pose frame.
box.translate(-np.array(pose_rec['translation']))
box.rotate(Quaternion(pose_rec['rotation']).inverse)
# Move them to the calibrated sensor frame.
box.translate(-np.array(cs_rec['translation']))
box.rotate(Quaternion(cs_rec['rotation']).inverse)
# Filter out the corners that are not in front of the calibrated
# sensor.
corners_3d = box.corners()
in_front = np.argwhere(corners_3d[2, :] > 0).flatten()
corners_3d = corners_3d[:, in_front]
# Project 3d box to 2d.
corner_coords = view_points(corners_3d, camera_intrinsic,
True).T[:, :2].tolist()
# Keep only corners that fall within the image.
final_coords = post_process_coords(corner_coords)
# Skip if the convex hull of the re-projected corners
# does not intersect the image canvas.
if final_coords is None:
continue
else:
min_x, min_y, max_x, max_y = final_coords
# Generate dictionary record to be included in the .json file.
repro_rec = generate_record(ann_rec, min_x, min_y, max_x, max_y,
sample_data_token, sd_rec['filename'])
            # if repro_rec is None, we do not append it into repro_recs
if repro_rec is not None:
loc = box.center.tolist()
dim = box.wlh
dim[[0, 1, 2]] = dim[[1, 2, 0]] # convert wlh to our lhw
dim = dim.tolist()
rot = box.orientation.yaw_pitch_roll[0]
rot = [-rot] # convert the rot to our cam coordinate
global_velo2d = nusc.box_velocity(box.token)[:2]
global_velo3d = np.array([*global_velo2d, 0.0])
e2g_r_mat = Quaternion(pose_rec['rotation']).rotation_matrix
c2e_r_mat = Quaternion(cs_rec['rotation']).rotation_matrix
cam_velo3d = global_velo3d @ np.linalg.inv(
e2g_r_mat).T @ np.linalg.inv(c2e_r_mat).T
velo = cam_velo3d[0::2].tolist()
repro_rec['bbox_3d'] = loc + dim + rot
repro_rec['velocity'] = velo
center_3d = np.array(loc).reshape([1, 3])
center_2d_with_depth = points_cam2img(
center_3d, camera_intrinsic, with_depth=True)
center_2d_with_depth = center_2d_with_depth.squeeze().tolist()
repro_rec['center_2d'] = center_2d_with_depth[:2]
repro_rec['depth'] = center_2d_with_depth[2]
                # projected center_2d together with its depth;
                # samples with depth <= 0 will be removed
if repro_rec['depth'] <= 0:
continue
ann_token = nusc.get('sample_annotation',
box.token)['attribute_tokens']
if len(ann_token) == 0:
attr_name = 'None'
else:
attr_name = nusc.get('attribute', ann_token[0])['name']
attr_id = nus_attributes.index(attr_name)
# repro_rec['attribute_name'] = attr_name
repro_rec['attr_label'] = attr_id
repro_recs.append(repro_rec)
return repro_recs
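# Usage sketch (editorial illustration, not part of this commit): given a
# NuScenes devkit instance, the helper above collects the per-camera 2D
# records used for the mono3d annotations. The version/dataroot values below
# are placeholders.
#
#   from nuscenes import NuScenes
#
#   nusc = NuScenes(version='v1.0-mini', dataroot='data/nuscenes')
#   cam_front_token = nusc.sample[0]['data']['CAM_FRONT']
#   records = get_2d_boxes(nusc, cam_front_token, visibilities=['2', '3', '4'])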
def post_process_coords(
corner_coords: List, imsize: Tuple[int, int] = (1600, 900)
) -> Union[Tuple[float, float, float, float], None]:
"""Get the intersection of the convex hull of the reprojected bbox corners
and the image canvas, return None if no intersection.
Args:
corner_coords (list[int]): Corner coordinates of reprojected
bounding box.
imsize (tuple[int]): Size of the image canvas.
    Returns:
        tuple[float]: Intersection of the convex hull of the 2D box
corners and the image canvas.
"""
polygon_from_2d_box = MultiPoint(corner_coords).convex_hull
img_canvas = box(0, 0, imsize[0], imsize[1])
if polygon_from_2d_box.intersects(img_canvas):
img_intersection = polygon_from_2d_box.intersection(img_canvas)
intersection_coords = np.array(
[coord for coord in img_intersection.exterior.coords])
min_x = min(intersection_coords[:, 0])
min_y = min(intersection_coords[:, 1])
max_x = max(intersection_coords[:, 0])
max_y = max(intersection_coords[:, 1])
return min_x, min_y, max_x, max_y
else:
return None
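# Worked example (editorial illustration, not part of this commit): a box
# whose reprojected corners are [(-100, 50), (200, 50), (200, 400), (-100, 400)]
# is clipped by the 1600x900 canvas to (0.0, 50.0, 200.0, 400.0); a box whose
# convex hull lies entirely outside the canvas yields None.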
def generate_record(ann_rec: dict, x1: float, y1: float, x2: float, y2: float,
sample_data_token: str, filename: str) -> OrderedDict:
"""Generate one 2D annotation record given various information on top of
the 2D bounding box coordinates.
Args:
ann_rec (dict): Original 3d annotation record.
x1 (float): Minimum value of the x coordinate.
y1 (float): Minimum value of the y coordinate.
x2 (float): Maximum value of the x coordinate.
y2 (float): Maximum value of the y coordinate.
sample_data_token (str): Sample data token.
        filename (str): The corresponding image file where the annotation
is present.
Returns:
dict: A sample mono3D annotation record.
- bbox_label (int): 2d box label id
- bbox_label_3d (int): 3d box label id
- bbox (list[float]): left x, top y, right x, bottom y
of 2d box
- bbox_3d_isvalid (bool): whether the box is valid
"""
repro_rec = OrderedDict()
repro_rec['sample_data_token'] = sample_data_token
coco_rec = dict()
relevant_keys = [
'attribute_tokens',
'category_name',
'instance_token',
'next',
'num_lidar_pts',
'num_radar_pts',
'prev',
'sample_annotation_token',
'sample_data_token',
'visibility_token',
]
for key, value in ann_rec.items():
if key in relevant_keys:
repro_rec[key] = value
repro_rec['bbox_corners'] = [x1, y1, x2, y2]
repro_rec['filename'] = filename
if repro_rec['category_name'] not in NameMapping:
return None
cat_name = NameMapping[repro_rec['category_name']]
coco_rec['bbox_label'] = nus_categories.index(cat_name)
coco_rec['bbox_label_3d'] = nus_categories.index(cat_name)
coco_rec['bbox'] = [x1, y1, x2, y2]
coco_rec['bbox_3d_isvalid'] = True
return coco_rec
@@ -197,6 +197,7 @@ class Det3DDataset(BaseDataset):
        ann_info = dict()
        for ann_name in keys:
            temp_anns = [item[ann_name] for item in instances]
            # map the original dataset label to training label
            if 'label' in ann_name:
                temp_anns = [
                    self.label_mapping[item] for item in temp_anns
...
# Copyright (c) OpenMMLab. All rights reserved.
from os import path as osp
from typing import Dict, List
import numpy as np
from mmdet3d.core.bbox.structures.cam_box3d import CameraInstance3DBoxes
from mmdet3d.registry import DATASETS
from ..core.bbox import LiDARInstance3DBoxes
from .det3d_dataset import Det3DDataset
@@ -53,6 +55,7 @@ class NuScenesDataset(Det3DDataset):
    def __init__(self,
                 data_root: str,
                 ann_file: str,
                 task: str = '3d',
                 pipeline: List[dict] = None,
                 box_type_3d: str = 'LiDAR',
                 modality: Dict = dict(
@@ -66,7 +69,12 @@ class NuScenesDataset(Det3DDataset):
                 **kwargs):
        self.use_valid_flag = use_valid_flag
        self.with_velocity = with_velocity
-        assert box_type_3d.lower() == 'lidar'
        # TODO: Redesign multi-view data process in the future
        assert task in ('3d', 'mono3d', 'multi-view')
        self.task = task
        assert box_type_3d.lower() in ('lidar', 'camera')
        super().__init__(
            data_root=data_root,
            ann_file=ann_file,
@@ -97,6 +105,7 @@ class NuScenesDataset(Det3DDataset):
            anns_results['gt_bboxes_3d'] = np.zeros((0, 7), dtype=np.float32)
            anns_results['gt_labels_3d'] = np.zeros(0, dtype=np.int64)
            return anns_results
        if self.use_valid_flag:
            mask = ann_info['bbox_3d_isvalid']
        else:
@@ -104,6 +113,22 @@ class NuScenesDataset(Det3DDataset):
        gt_bboxes_3d = ann_info['gt_bboxes_3d'][mask]
        gt_labels_3d = ann_info['gt_labels_3d'][mask]
if 'gt_bboxes' in ann_info:
gt_bboxes = ann_info['gt_bboxes'][mask]
gt_labels = ann_info['gt_labels'][mask]
attr_labels = ann_info['attr_labels'][mask]
else:
gt_bboxes = np.zeros((0, 4), dtype=np.float32)
gt_labels = np.array([], dtype=np.int64)
attr_labels = np.array([], dtype=np.int64)
if 'centers_2d' in ann_info:
centers_2d = ann_info['centers_2d'][mask]
depths = ann_info['depths'][mask]
else:
centers_2d = np.zeros((0, 2), dtype=np.float32)
depths = np.zeros((0), dtype=np.float32)
        if self.with_velocity:
            gt_velocity = ann_info['velocity'][mask]
            nan_mask = np.isnan(gt_velocity[:, 0])
@@ -112,11 +137,82 @@ class NuScenesDataset(Det3DDataset):
        # the nuscenes box center is [0.5, 0.5, 0.5], we change it to be
        # the same as KITTI (0.5, 0.5, 0)
-        gt_bboxes_3d = LiDARInstance3DBoxes(
-            gt_bboxes_3d,
-            box_dim=gt_bboxes_3d.shape[-1],
-            origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d)
        # TODO: Unify the coordinates
        if self.task == 'mono3d':
            gt_bboxes_3d = CameraInstance3DBoxes(
                gt_bboxes_3d,
                box_dim=gt_bboxes_3d.shape[-1],
                origin=(0.5, 0.5, 0.5))
        else:
            gt_bboxes_3d = LiDARInstance3DBoxes(
                gt_bboxes_3d,
                box_dim=gt_bboxes_3d.shape[-1],
                origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d)
        anns_results = dict(
-            gt_bboxes_3d=gt_bboxes_3d, gt_labels_3d=gt_labels_3d)
            gt_bboxes_3d=gt_bboxes_3d,
gt_labels_3d=gt_labels_3d,
gt_bboxes=gt_bboxes,
gt_labels=gt_labels,
attr_labels=attr_labels,
centers_2d=centers_2d,
depths=depths)
        return anns_results
def parse_data_info(self, info: dict) -> dict:
"""Process the raw data info.
The only difference with it in `Det3DDataset`
is the specific process for `plane`.
Args:
info (dict): Raw info dict.
Returns:
dict: Has `ann_info` in training stage. And
all path has been converted to absolute path.
"""
if self.task == 'mono3d':
data_list = []
if self.modality['use_lidar']:
info['lidar_points']['lidar_path'] = \
osp.join(
self.data_prefix.get('pts', ''),
info['lidar_points']['lidar_path'])
if self.modality['use_camera']:
for cam_id, img_info in info['images'].items():
if 'img_path' in img_info:
if cam_id in self.data_prefix:
cam_prefix = self.data_prefix[cam_id]
else:
cam_prefix = self.data_prefix.get('img', '')
img_info['img_path'] = osp.join(
cam_prefix, img_info['img_path'])
for idx, (cam_id, img_info) in enumerate(info['images'].items()):
camera_info = dict()
camera_info['images'] = dict()
camera_info['images'][cam_id] = img_info
if 'cam_instances' in info and cam_id in info['cam_instances']:
camera_info['instances'] = info['cam_instances'][cam_id]
else:
camera_info['instances'] = []
# TODO: check whether to change sample_idx for 6 cameras
# in one frame
camera_info['sample_idx'] = info['sample_idx'] * 6 + idx
camera_info['token'] = info['token']
camera_info['ego2global'] = info['ego2global']
if not self.test_mode:
                # used in training
camera_info['ann_info'] = self.parse_ann_info(camera_info)
if self.test_mode and self.load_eval_anns:
camera_info['eval_ann_info'] = \
self.parse_ann_info(camera_info)
data_list.append(camera_info)
return data_list
else:
data_info = super().parse_data_info(info)
return data_info
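The sample_idx bookkeeping above is what lets the camera branch of NuScenesMetric further down regroup per-image predictions into frames. A small numeric illustration under the six-camera layout both pieces of code assume (editorial, not part of the commit):

frame_idx, cam_idx = 10, 3
sample_idx = frame_idx * 6 + cam_idx   # 63, as packed by parse_data_info above
assert sample_idx % 6 == cam_idx       # camera id recovered in the metric
assert (sample_idx + 1) % 6 != 0       # a frame is merged only at the last camera
assert 65 + 1 - 6 == 60                # index of the frame's first camera entry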
@@ -122,7 +122,7 @@ class Pack3DDetInputs(BaseTransform):
        for key in [
                'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels',
                'gt_labels_3d', 'attr_labels', 'pts_instance_mask',
-                'pts_semantic_mask', 'centers2d', 'depths'
                'pts_semantic_mask', 'centers_2d', 'depths'
        ]:
            if key not in results:
                continue
...
@@ -86,7 +86,7 @@ class LoadImageFromFileMono3D(LoadImageFromFile):
    :class:`LoadImageFromFile`.
    """
-    def __call__(self, results):
    def transform(self, results: dict) -> dict:
        """Call functions to load image and get image meta information.
        Args:
@@ -95,8 +95,32 @@ class LoadImageFromFileMono3D(LoadImageFromFile):
        Returns:
            dict: The dict contains loaded image and meta information.
        """
-        super().__call__(results)
-        results['cam2img'] = results['img_info']['cam_intrinsic']
        # TODO: load different camera image from data info,
        # for kitti dataset, we load 'CAM2' image.
        # for nuscenes dataset, we load 'CAM_FRONT' image.
if 'CAM2' in results['images']:
filename = results['images']['CAM2']['img_path']
results['cam2img'] = results['images']['CAM2']['cam2img']
elif len(list(results['images'].keys())) == 1:
camera_type = list(results['images'].keys())[0]
filename = results['images'][camera_type]['img_path']
results['cam2img'] = results['images'][camera_type]['cam2img']
else:
raise NotImplementedError(
'Currently we only support load image from kitti and'
'nuscenes datasets')
img_bytes = self.file_client.get(filename)
img = mmcv.imfrombytes(
img_bytes, flag=self.color_type, backend=self.imdecode_backend)
if self.to_float32:
img = img.astype(np.float32)
results['img'] = img
results['img_shape'] = img.shape[:2]
results['ori_shape'] = img.shape[:2]
        return results
@@ -608,6 +632,34 @@ class LoadAnnotations3D(LoadAnnotations):
        self.with_seg_3d = with_seg_3d
        self.seg_3d_dtype = seg_3d_dtype
def _load_bboxes(self, results: dict) -> None:
"""Private function to load bounding box annotations.
        Rewrite `_load_bboxes` since mmdet3d uses `parse_ann_info` in
        datasets.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict contains loaded bounding box annotations.
"""
results['gt_bboxes'] = results['ann_info']['gt_bboxes']
def _load_labels(self, results: dict) -> None:
"""Private function to load label annotations.
        Rewrite `_load_labels` since mmdet3d uses `parse_ann_info` in
        datasets.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict contains loaded label annotations.
"""
results['gt_labels'] = results['ann_info']['gt_labels']
    def _load_bboxes_3d(self, results: dict) -> dict:
        """Private function to move the 3D bounding box annotation from
        `ann_info` field to the root of `results`.
...
@@ -1579,7 +1579,7 @@ class VoxelBasedPointSampler(object):
@TRANSFORMS.register_module()
-class AffineResize(object):
class AffineResize(BaseTransform):
    """Get the affine transform matrices to the target size.
    Different from :class:`RandomAffine` in MMDetection, this class can
@@ -1596,13 +1596,16 @@ class AffineResize(object):
            outside the border of the image. Defaults to True.
    """
-    def __init__(self, img_scale, down_ratio, bbox_clip_border=True):
    def __init__(self,
                 img_scale: Tuple,
                 down_ratio: int,
                 bbox_clip_border: bool = True) -> None:
        self.img_scale = img_scale
        self.down_ratio = down_ratio
        self.bbox_clip_border = bbox_clip_border
-    def __call__(self, results):
    def transform(self, results: dict) -> dict:
        """Call function to do affine transform to input image and labels.
        Args:
@@ -1647,39 +1650,38 @@ class AffineResize(object):
        results['pad_shape'] = img.shape
        results['trans_mat'] = trans_mat
-        self._affine_bboxes(results, trans_affine)
-        if 'centers2d' in results:
-            centers2d = self._affine_transform(results['centers2d'],
-                                               trans_affine)
        if 'gt_bboxes' in results:
            self._affine_bboxes(results, trans_affine)
        if 'centers_2d' in results:
            centers2d = self._affine_transform(results['centers_2d'],
                                               trans_affine)
            valid_index = (centers2d[:, 0] >
                           0) & (centers2d[:, 0] <
                               self.img_scale[0]) & (centers2d[:, 1] > 0) & (
                                   centers2d[:, 1] < self.img_scale[1])
-            results['centers2d'] = centers2d[valid_index]
-            for key in results.get('bbox_fields', []):
-                if key in ['gt_bboxes']:
-                    results[key] = results[key][valid_index]
-                    if 'gt_labels' in results:
-                        results['gt_labels'] = results['gt_labels'][
-                            valid_index]
-                    if 'gt_masks' in results:
-                        raise NotImplementedError(
-                            'AffineResize only supports bbox.')
-            for key in results.get('bbox3d_fields', []):
-                if key in ['gt_bboxes_3d']:
-                    results[key].tensor = results[key].tensor[valid_index]
-                    if 'gt_labels_3d' in results:
-                        results['gt_labels_3d'] = results['gt_labels_3d'][
-                            valid_index]
            results['centers_2d'] = centers2d[valid_index]
            if 'gt_bboxes' in results:
                results['gt_bboxes'] = results['gt_bboxes'][valid_index]
                if 'gt_labels' in results:
                    results['gt_labels'] = results['gt_labels'][valid_index]
                if 'gt_masks' in results:
                    raise NotImplementedError(
                        'AffineResize only supports bbox.')
            if 'gt_bboxes_3d' in results:
                results['gt_bboxes_3d'].tensor = results[
                    'gt_bboxes_3d'].tensor[valid_index]
                if 'gt_labels_3d' in results:
                    results['gt_labels_3d'] = results['gt_labels_3d'][
                        valid_index]
            results['depths'] = results['depths'][valid_index]
        return results
-    def _affine_bboxes(self, results, matrix):
    def _affine_bboxes(self, results: dict, matrix: np.ndarray) -> None:
        """Affine transform bboxes to input image.
        Args:
@@ -1689,20 +1691,18 @@ class AffineResize(object):
                shape: (3, 3)
        """
-        for key in results.get('bbox_fields', []):
-            bboxes = results[key]
-            bboxes[:, :2] = self._affine_transform(bboxes[:, :2], matrix)
-            bboxes[:, 2:] = self._affine_transform(bboxes[:, 2:], matrix)
-            if self.bbox_clip_border:
-                bboxes[:,
-                       [0, 2]] = bboxes[:,
-                                        [0, 2]].clip(0, self.img_scale[0] - 1)
-                bboxes[:,
-                       [1, 3]] = bboxes[:,
-                                        [1, 3]].clip(0, self.img_scale[1] - 1)
-            results[key] = bboxes
        bboxes = results['gt_bboxes']
        bboxes[:, :2] = self._affine_transform(bboxes[:, :2], matrix)
        bboxes[:, 2:] = self._affine_transform(bboxes[:, 2:], matrix)
        if self.bbox_clip_border:
            bboxes[:, [0, 2]] = bboxes[:, [0, 2]].clip(0,
                                                       self.img_scale[0] - 1)
            bboxes[:, [1, 3]] = bboxes[:, [1, 3]].clip(0,
                                                       self.img_scale[1] - 1)
        results['gt_bboxes'] = bboxes
-    def _affine_transform(self, points, matrix):
    def _affine_transform(self, points: np.ndarray,
                          matrix: np.ndarray) -> np.ndarray:
        """Affine transform bbox points to input image.
        Args:
@@ -1721,7 +1721,8 @@ class AffineResize(object):
        affined_points = np.matmul(matrix, hom_points_2d).T
        return affined_points[:, :2]
-    def _get_transform_matrix(self, center, scale, output_scale):
    def _get_transform_matrix(self, center: Tuple, scale: Tuple,
                              output_scale: Tuple[float]) -> np.ndarray:
        """Get affine transform matrix.
        Args:
@@ -1756,7 +1757,8 @@ class AffineResize(object):
        return matrix.astype(np.float32)
-    def _get_ref_point(self, ref_point1, ref_point2):
    def _get_ref_point(self, ref_point1: np.ndarray,
                       ref_point2: np.ndarray) -> np.ndarray:
        """Get reference point to calculate affine transform matrix.
        While using opencv to calculate the affine matrix, we need at least
@@ -1775,7 +1777,7 @@ class AffineResize(object):
@TRANSFORMS.register_module()
-class RandomShiftScale(object):
class RandomShiftScale(BaseTransform):
    """Random shift scale.
    Different from the normal shift and scale function, it doesn't
@@ -1788,12 +1790,12 @@ class RandomShiftScale(object):
        aug_prob (float): The shifting and scaling probability.
    """
-    def __init__(self, shift_scale, aug_prob):
    def __init__(self, shift_scale: Tuple[float], aug_prob: float):
        self.shift_scale = shift_scale
        self.aug_prob = aug_prob
-    def __call__(self, results):
    def transform(self, results: dict) -> dict:
        """Call function to record random shift and scale infos.
        Args:
...
@@ -45,6 +45,7 @@ class KittiMetric(BaseMetric):
    def __init__(self,
                 ann_file: str,
                 metric: Union[str, List[str]] = 'bbox',
                 pred_box_type_3d: str = 'LiDAR',
                 pcd_limit_range: List[float] = [0, -40, -3, 70.4, 40, 0.0],
                 prefix: Optional[str] = None,
                 pklfile_prefix: str = None,
@@ -57,6 +58,7 @@ class KittiMetric(BaseMetric):
        self.ann_file = ann_file
        self.pklfile_prefix = pklfile_prefix
        self.submission_prefix = submission_prefix
        self.pred_box_type_3d = pred_box_type_3d
        allowed_metrics = ['bbox', 'img_bbox', 'mAP']
        self.metrics = metric if isinstance(metric, list) else [metric]
...
@@ -7,12 +7,15 @@ from typing import Dict, List, Optional, Sequence, Tuple, Union
import mmcv
import numpy as np
import pyquaternion
import torch
from mmengine.evaluator import BaseMetric
from mmengine.logging import MMLogger
from nuscenes.eval.detection.config import config_factory
from nuscenes.eval.detection.data_classes import DetectionConfig
from nuscenes.utils.data_classes import Box as NuScenesBox
from mmdet3d.core import bbox3d2result, box3d_multiclass_nms, xywhr2xyxyr
from mmdet3d.core.bbox import CameraInstance3DBoxes, LiDARInstance3DBoxes
from mmdet3d.registry import METRICS
@@ -288,21 +291,144 @@ class NuScenesMetric(BaseMetric):
        for name in results[0]:
            if 'pred' in name and '3d' in name and name[0] != '_':
                # format result of model output in Det3dDataSample,
                # include 'pred_instances_3d','pts_pred_instances_3d',
                # 'img_pred_instances_3d'
                print(f'\nFormating bboxes of {name}')
                results_ = [out[name] for out in results]
                tmp_file_ = osp.join(jsonfile_prefix, name)
-                result_dict[name] = self._format_bbox(results_, sample_id_list,
-                                                       classes, tmp_file_)
                box_type_3d = type(results_[0]['bboxes_3d'])
                if box_type_3d == LiDARInstance3DBoxes:
                    result_dict[name] = self._format_lidar_bbox(
                        results_, sample_id_list, classes, tmp_file_)
                elif box_type_3d == CameraInstance3DBoxes:
                    result_dict[name] = self._format_camera_bbox(
                        results_, sample_id_list, classes, tmp_file_)
        return result_dict, tmp_dir
-    def _format_bbox(self,
-                     results: List[dict],
-                     sample_id_list: List[int],
-                     classes: List[str] = None,
-                     jsonfile_prefix: str = None) -> str:
    def _format_camera_bbox(self,
                            results: List[dict],
                            sample_id_list: List[int],
                            classes: List[str] = None,
                            jsonfile_prefix: str = None) -> str:
"""Convert the results to the standard format.
Args:
results (list[dict]): Testing results of the dataset.
sample_id_list (list[int]): List of sample indices that correspond
to each testing result.
classes (list[str], optional): Mapped classes in the evaluation.
Defaults to None.
jsonfile_prefix (str, optional): The prefix of the output jsonfile.
You can specify the output directory/filename by
modifying the jsonfile_prefix. Defaults to None.
Returns:
str: Path of the output json file.
"""
nusc_annos = {}
print('Start to convert detection format...')
# Camera types in the nuScenes dataset
camera_types = [
'CAM_FRONT',
'CAM_FRONT_RIGHT',
'CAM_FRONT_LEFT',
'CAM_BACK',
'CAM_BACK_LEFT',
'CAM_BACK_RIGHT',
]
CAM_NUM = 6
for i, det in enumerate(mmcv.track_iter_progress(results)):
sample_id = sample_id_list[i]
camera_type_id = sample_id % CAM_NUM
if camera_type_id == 0:
boxes_per_frame = []
attrs_per_frame = []
# need to merge results from images of the same sample
annos = []
boxes, attrs = output_to_nusc_box(det)
sample_token = self.data_infos[sample_id]['token']
camera_type = camera_types[camera_type_id]
boxes, attrs = cam_nusc_box_to_global(
self.data_infos[sample_id - camera_type_id], boxes, attrs,
camera_type, classes, self.eval_detection_configs)
boxes_per_frame.extend(boxes)
attrs_per_frame.extend(attrs)
# Remove redundant predictions caused by overlap of images
if (sample_id + 1) % CAM_NUM != 0:
continue
boxes = global_nusc_box_to_cam(
self.data_infos[sample_id + 1 - CAM_NUM], boxes_per_frame,
classes, self.eval_detection_configs)
cam_boxes3d, scores, labels = nusc_box_to_cam_box3d(boxes)
# box nms 3d over 6 images in a frame
# TODO: move this global setting into config
nms_cfg = dict(
use_rotate_nms=True,
nms_across_levels=False,
nms_pre=4096,
nms_thr=0.05,
score_thr=0.01,
min_bbox_size=0,
max_per_frame=500)
from mmcv import Config
nms_cfg = Config(nms_cfg)
cam_boxes3d_for_nms = xywhr2xyxyr(cam_boxes3d.bev)
boxes3d = cam_boxes3d.tensor
# generate attr scores from attr labels
attrs = labels.new_tensor([attr for attr in attrs_per_frame])
boxes3d, scores, labels, attrs = box3d_multiclass_nms(
boxes3d,
cam_boxes3d_for_nms,
scores,
nms_cfg.score_thr,
nms_cfg.max_per_frame,
nms_cfg,
mlvl_attr_scores=attrs)
cam_boxes3d = CameraInstance3DBoxes(boxes3d, box_dim=9)
det = bbox3d2result(cam_boxes3d, scores, labels, attrs)
boxes, attrs = output_to_nusc_box(det)
boxes, attrs = cam_nusc_box_to_global(
self.data_infos[sample_id + 1 - CAM_NUM], boxes, attrs,
classes, self.eval_detection_configs)
for i, box in enumerate(boxes):
name = classes[box.label]
attr = self.get_attr_name(attrs[i], name)
nusc_anno = dict(
sample_token=sample_token,
translation=box.center.tolist(),
size=box.wlh.tolist(),
rotation=box.orientation.elements.tolist(),
velocity=box.velocity[:2].tolist(),
detection_name=name,
detection_score=box.score,
attribute_name=attr)
annos.append(nusc_anno)
# other views results of the same frame should be concatenated
if sample_token in nusc_annos:
nusc_annos[sample_token].extend(annos)
else:
nusc_annos[sample_token] = annos
nusc_submissions = {
'meta': self.modality,
'results': nusc_annos,
}
mmcv.mkdir_or_exist(jsonfile_prefix)
res_path = osp.join(jsonfile_prefix, 'results_nusc.json')
print('Results writes to', res_path)
mmcv.dump(nusc_submissions, res_path)
return res_path
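A minimal illustrative sketch (not part of this diff) of the six-camera grouping performed above, assuming sample ids enumerate the cameras of one frame consecutively; group_by_frame is a hypothetical helper:
CAM_NUM = 6

def group_by_frame(sample_ids):
    """Yield lists of sample ids that belong to the same frame."""
    frame = []
    for sid in sample_ids:
        if sid % CAM_NUM == 0:        # first camera of a new frame
            frame = []
        frame.append(sid)
        if (sid + 1) % CAM_NUM == 0:  # last camera of the frame
            yield frame

# list(group_by_frame(range(12))) == [[0, 1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]]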
def _format_lidar_bbox(self,
results: List[dict],
sample_id_list: List[int],
classes: List[str] = None,
jsonfile_prefix: str = None) -> str:
"""Convert the results to the standard format. """Convert the results to the standard format.
Args: Args:
...@@ -389,27 +515,59 @@ def output_to_nusc_box(detection: dict) -> List[NuScenesBox]: ...@@ -389,27 +515,59 @@ def output_to_nusc_box(detection: dict) -> List[NuScenesBox]:
bbox3d = detection['bboxes_3d'] bbox3d = detection['bboxes_3d']
scores = detection['scores_3d'].numpy() scores = detection['scores_3d'].numpy()
labels = detection['labels_3d'].numpy() labels = detection['labels_3d'].numpy()
attrs = None
if 'attr_labels' in detection:
attrs = detection['attr_labels'].numpy()
box_gravity_center = bbox3d.gravity_center.numpy() box_gravity_center = bbox3d.gravity_center.numpy()
box_dims = bbox3d.dims.numpy() box_dims = bbox3d.dims.numpy()
box_yaw = bbox3d.yaw.numpy() box_yaw = bbox3d.yaw.numpy()
# our LiDAR coordinate system -> nuScenes box coordinate system
nus_box_dims = box_dims[:, [1, 0, 2]]
box_list = [] box_list = []
for i in range(len(bbox3d)):
quat = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw[i]) if type(bbox3d) == LiDARInstance3DBoxes:
velocity = (*bbox3d.tensor[i, 7:9], 0.0) # our LiDAR coordinate system -> nuScenes box coordinate system
box = NuScenesBox( nus_box_dims = box_dims[:, [1, 0, 2]]
box_gravity_center[i], for i in range(len(bbox3d)):
nus_box_dims[i], quat = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw[i])
quat, velocity = (*bbox3d.tensor[i, 7:9], 0.0)
label=labels[i], # velo_val = np.linalg.norm(box3d[i, 7:9])
score=scores[i], # velo_ori = box3d[i, 6]
velocity=velocity) # velocity = (
box_list.append(box) # velo_val * np.cos(velo_ori), velo_val * np.sin(velo_ori), 0.0)
return box_list box = NuScenesBox(
box_gravity_center[i],
nus_box_dims[i],
quat,
label=labels[i],
score=scores[i],
velocity=velocity)
box_list.append(box)
elif type(bbox3d) == CameraInstance3DBoxes:
# our Camera coordinate system -> nuScenes box coordinate system
# convert the dim/rot to nuscbox convention
nus_box_dims = box_dims[:, [2, 0, 1]]
nus_box_yaw = -box_yaw
for i in range(len(bbox3d)):
q1 = pyquaternion.Quaternion(
axis=[0, 0, 1], radians=nus_box_yaw[i])
q2 = pyquaternion.Quaternion(axis=[1, 0, 0], radians=np.pi / 2)
quat = q2 * q1
velocity = (bbox3d.tensor[i, 7], 0.0, bbox3d.tensor[i, 8])
box = NuScenesBox(
box_gravity_center[i],
nus_box_dims[i],
quat,
label=labels[i],
score=scores[i],
velocity=velocity)
box_list.append(box)
else:
raise NotImplementedError(
f'Do not support convert {type(bbox3d)} bboxes'
'to standard NuScenesBoxes.')
return box_list, attrs
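An illustrative consistency check (not part of this diff): the camera-branch dims permutation [2, 0, 1] above and the [1, 2, 0] permutation in nusc_box_to_cam_box3d below are inverses, and the yaw is negated in both directions, so a camera box should round-trip through NuScenesBox unchanged:
import numpy as np

cam_dims = np.array([[1.8, 1.6, 4.2]])  # arbitrary (x_size, y_size, z_size)
nus_dims = cam_dims[:, [2, 0, 1]]       # camera box -> NuScenesBox ordering
back = nus_dims[:, [1, 2, 0]]           # NuScenesBox -> camera box ordering
assert np.allclose(back, cam_dims)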
def lidar_nusc_box_to_global( def lidar_nusc_box_to_global(
...@@ -448,3 +606,117 @@ def lidar_nusc_box_to_global( ...@@ -448,3 +606,117 @@ def lidar_nusc_box_to_global(
box.translate(ego2global[:3, 3]) box.translate(ego2global[:3, 3])
box_list.append(box) box_list.append(box)
return box_list return box_list
def cam_nusc_box_to_global(info: dict, boxes: List[NuScenesBox],
attrs: List[str], camera_type: str,
classes: List[str],
eval_configs: DetectionConfig) -> Tuple[List[NuScenesBox], List[str]]:
"""Convert the box from camera to global coordinate.
Args:
info (dict): Info for a specific sample data, including the
calibration information.
boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes.
attrs (list[str]): List of attributes.
camera_type (str): Type of camera.
classes (list[str]): Mapped classes in the evaluation.
eval_configs (object): Evaluation configuration object.
Returns:
tuple (list[:obj:`NuScenesBox`], list[str]): List of standard
NuScenesBoxes in the global coordinate and the list of
corresponding attribute names.
"""
box_list = []
attr_list = []
for (box, attr) in zip(boxes, attrs):
# Move box to ego vehicle coord system
cam2ego = np.array(info['images'][camera_type]['cam2ego'])
box.rotate(
pyquaternion.Quaternion(matrix=cam2ego, rtol=1e-05, atol=1e-07))
box.translate(cam2ego[:3, 3])
# filter det in ego.
cls_range_map = eval_configs.class_range
radius = np.linalg.norm(box.center[:2], 2)
det_range = cls_range_map[classes[box.label]]
if radius > det_range:
continue
# Move box to global coord system
ego2global = np.array(info['ego2global'])
box.rotate(
pyquaternion.Quaternion(matrix=ego2global, rtol=1e-05, atol=1e-07))
box.translate(ego2global[:3, 3])
box_list.append(box)
attr_list.append(attr)
return box_list, attr_list
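A minimal illustrative sketch (not part of this diff) of the same camera -> ego -> global chain applied to a single point with plain 4x4 homogeneous matrices; the function above does the equivalent with NuScenesBox.rotate/translate and additionally drops boxes outside the per-class evaluation range:
import numpy as np

def cam_point_to_global(pt_cam, cam2ego, ego2global):
    """pt_cam: (3,) point in camera coords; cam2ego/ego2global: 4x4 arrays."""
    pt = np.append(pt_cam, 1.0)      # homogeneous coordinates
    pt_ego = cam2ego @ pt            # camera frame -> ego vehicle frame
    pt_global = ego2global @ pt_ego  # ego vehicle frame -> global frame
    return pt_global[:3]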
def global_nusc_box_to_cam(info: dict, boxes: List[NuScenesBox],
classes: List[str],
eval_configs: DetectionConfig) -> List[NuScenesBox]:
"""Convert the box from global to camera coordinate.
Args:
info (dict): Info for a specific sample data, including the
calibration information.
boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes.
classes (list[str]): Mapped classes in the evaluation.
eval_configs (object): Evaluation configuration object.
Returns:
list: List of standard NuScenesBoxes in the camera
coordinate.
"""
box_list = []
for box in boxes:
# Move box to ego vehicle coord system
ego2global = np.array(info['ego2global'])
box.translate(-ego2global[:3, 3])
box.rotate(
pyquaternion.Quaternion(matrix=ego2global, rtol=1e-05,
atol=1e-07).inverse)
# filter det in ego.
cls_range_map = eval_configs.class_range
radius = np.linalg.norm(box.center[:2], 2)
det_range = cls_range_map[classes[box.label]]
if radius > det_range:
continue
# Move box to camera coord system
cam2ego = np.array(info['images']['CAM_FRONT']['cam2ego'])
box.translate(-cam2ego[:3, 3])
box.rotate(
pyquaternion.Quaternion(matrix=cam2ego, rtol=1e-05,
atol=1e-07).inverse)
box_list.append(box)
return box_list
def nusc_box_to_cam_box3d(boxes: List[NuScenesBox]):
"""Convert boxes from :obj:`NuScenesBox` to :obj:`CameraInstance3DBoxes`.
Args:
boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes.
Returns:
tuple (:obj:`CameraInstance3DBoxes` | torch.Tensor | torch.Tensor):
Converted 3D bounding boxes, scores and labels.
"""
locs = torch.Tensor([b.center for b in boxes]).view(-1, 3)
dims = torch.Tensor([b.wlh for b in boxes]).view(-1, 3)
rots = torch.Tensor([b.orientation.yaw_pitch_roll[0]
for b in boxes]).view(-1, 1)
velocity = torch.Tensor([b.velocity[0::2] for b in boxes]).view(-1, 2)
# convert nusbox to cambox convention
dims[:, [0, 1, 2]] = dims[:, [1, 2, 0]]
rots = -rots
boxes_3d = torch.cat([locs, dims, rots, velocity], dim=1).cuda()
cam_boxes3d = CameraInstance3DBoxes(
boxes_3d, box_dim=9, origin=(0.5, 0.5, 0.5))
scores = torch.Tensor([b.score for b in boxes]).cuda()
labels = torch.LongTensor([b.label for b in boxes]).cuda()
nms_scores = scores.new_zeros(scores.shape[0], 10 + 1)
indices = labels.new_tensor(list(range(scores.shape[0])))
nms_scores[indices, labels] = scores
return cam_boxes3d, nms_scores, labels
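An illustrative sketch (not part of this diff) of the (N, num_classes + 1) score matrix built above: each box keeps its score only in its own class column, and the extra last column appears to be the background slot expected by box3d_multiclass_nms:
import torch

scores = torch.tensor([0.9, 0.4])
labels = torch.tensor([2, 0])
num_classes = 10  # nuScenes detection classes
nms_scores = scores.new_zeros(len(scores), num_classes + 1)
nms_scores[torch.arange(len(scores)), labels] = scores
# nms_scores[0, 2] == 0.9 and nms_scores[1, 0] == 0.4; all other entries are 0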
...@@ -106,8 +106,7 @@ class Det3DDataPreprocessor(DetDataPreprocessor): ...@@ -106,8 +106,7 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
if 'points' in inputs_dict[0].keys(): if 'points' in inputs_dict[0].keys():
points = [input['points'] for input in inputs_dict] points = [input['points'] for input in inputs_dict]
else: else:
raise KeyError( points = None
"Model input dict needs to include the 'points' key.")
if 'img' in inputs_dict[0].keys(): if 'img' in inputs_dict[0].keys():
......
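An illustrative sketch (not part of this diff) of the behavioural effect of the preprocessor change above: image-only inputs, e.g. for monocular 3D detection, no longer raise a KeyError and simply carry no points:
inputs_dict = [dict(img='<image tensor placeholder>')]  # no 'points' key
points = ([d['points'] for d in inputs_dict]
          if 'points' in inputs_dict[0] else None)
assert points is None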
# Copyright (c) OpenMMLab. All rights reserved. # Copyright (c) OpenMMLab. All rights reserved.
from abc import abstractmethod from abc import abstractmethod
from typing import Any, List, Optional, Sequence, Tuple, Union
import torch import torch
from mmcv.cnn import ConvModule, bias_init_with_prob, normal_init from mmcv.cnn import ConvModule, bias_init_with_prob, normal_init
from mmcv.runner import force_fp32 from torch import Tensor
from torch import nn as nn from torch import nn as nn
from mmdet3d.core.utils import ConfigType, InstanceList, OptConfigType
from mmdet3d.registry import MODELS from mmdet3d.registry import MODELS
from mmdet.core import multi_apply from mmdet.core import multi_apply
from ..builder import build_loss
from .base_mono3d_dense_head import BaseMono3DDenseHead from .base_mono3d_dense_head import BaseMono3DDenseHead
...@@ -20,39 +21,41 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead): ...@@ -20,39 +21,41 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead):
num_classes (int): Number of categories excluding the background num_classes (int): Number of categories excluding the background
category. category.
in_channels (int): Number of channels in the input feature map. in_channels (int): Number of channels in the input feature map.
feat_channels (int, optional): Number of hidden channels. feat_channels (int): Number of hidden channels.
Used in child classes. Defaults to 256. Used in child classes. Defaults to 256.
stacked_convs (int, optional): Number of stacking convs of the head. stacked_convs (int): Number of stacking convs of the head.
strides (tuple, optional): Downsample factor of each feature map. strides (Sequence[int] or Sequence[Tuple[int, int]]): Downsample
dcn_on_last_conv (bool, optional): If true, use dcn in the last factor of each feature map.
dcn_on_last_conv (bool): If true, use dcn in the last
layer of towers. Default: False. layer of towers. Default: False.
conv_bias (bool | str, optional): If specified as `auto`, it will be conv_bias (bool or str): If specified as `auto`, it will be
decided by the norm_cfg. Bias of conv will be set as True decided by the norm_cfg. Bias of conv will be set as True
if `norm_cfg` is None, otherwise False. Default: 'auto'. if `norm_cfg` is None, otherwise False. Default: 'auto'.
background_label (int, optional): Label ID of background, background_label (int, optional): Label ID of background,
set as 0 for RPN and num_classes for other heads. set as 0 for RPN and num_classes for other heads.
It will automatically set as `num_classes` if None is given. It will automatically set as `num_classes` if None is given.
use_direction_classifier (bool, optional): use_direction_classifier (bool):
Whether to add a direction classifier. Whether to add a direction classifier.
diff_rad_by_sin (bool, optional): Whether to change the difference diff_rad_by_sin (bool): Whether to change the difference
into sin difference for box regression loss. Defaults to True. into sin difference for box regression loss. Defaults to True.
dir_offset (float, optional): Parameter used in direction dir_offset (float): Parameter used in direction
classification. Defaults to 0. classification. Defaults to 0.
dir_limit_offset (float, optional): Parameter used in direction dir_limit_offset (float): Parameter used in direction
classification. Defaults to 0. classification. Defaults to 0.
loss_cls (dict, optional): Config of classification loss. loss_cls (:obj:`ConfigDict` or dict): Config of classification loss.
loss_bbox (dict, optional): Config of localization loss. loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss.
loss_dir (dict, optional): Config of direction classifier loss. loss_dir (:obj:`ConfigDict` or dict): Config of direction classifier
loss_attr (dict, optional): Config of attribute classifier loss, loss.
which is only active when `pred_attrs=True`. loss_attr (:obj:`ConfigDict` or dict): Config of attribute classifier
bbox_code_size (int, optional): Dimensions of predicted bounding boxes. loss, which is only active when `pred_attrs=True`.
pred_attrs (bool, optional): Whether to predict attributes. bbox_code_size (int): Dimensions of predicted bounding boxes.
pred_attrs (bool): Whether to predict attributes.
Defaults to False. Defaults to False.
num_attrs (int, optional): The number of attributes to be predicted. num_attrs (int): The number of attributes to be predicted.
Default: 9. Default: 9.
pred_velo (bool, optional): Whether to predict velocity. pred_velo (bool): Whether to predict velocity.
Defaults to False. Defaults to False.
pred_bbox2d (bool, optional): Whether to predict 2D boxes. pred_bbox2d (bool): Whether to predict 2D boxes.
Defaults to False. Defaults to False.
group_reg_dims (tuple[int], optional): The dimension of each regression group_reg_dims (tuple[int], optional): The dimension of each regression
target group. Default: (2, 1, 3, 1, 2). target group. Default: (2, 1, 3, 1, 2).
...@@ -66,68 +69,77 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead): ...@@ -66,68 +69,77 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead):
(64, ), # rot (64, ), # rot
() # velo () # velo
), ),
dir_branch (tuple[int], optional): Channels for direction dir_branch (Sequence[int]): Channels for direction
classification branch. Default: (64, ). classification branch. Default: (64, ).
attr_branch (tuple[int], optional): Channels for classification branch. attr_branch (Sequence[int]): Channels for classification branch.
Default: (64, ). Default: (64, ).
conv_cfg (dict, optional): Config dict for convolution layer. conv_cfg (:obj:`ConfigDict` or dict, Optional): Config dict for
Default: None. convolution layer. Default: None.
norm_cfg (dict, optional): Config dict for normalization layer. norm_cfg (:obj:`ConfigDict` or dict, Optional): Config dict for
Default: None. normalization layer. Default: None.
train_cfg (dict, optional): Training config of anchor head. train_cfg (:obj:`ConfigDict` or dict, Optional): Training config
test_cfg (dict, optional): Testing config of anchor head. of anchor head.
test_cfg (:obj:`ConfigDict` or dict, Optional): Testing config of
anchor head.
init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
dict]): Initialization config dict.
""" # noqa: W605 """ # noqa: W605
_version = 1 _version = 1
def __init__( def __init__(
self, self,
num_classes, num_classes: int,
in_channels, in_channels: int,
feat_channels=256, feat_channels: int = 256,
stacked_convs=4, stacked_convs: int = 4,
strides=(4, 8, 16, 32, 64), strides: Sequence[int] = (4, 8, 16, 32, 64),
dcn_on_last_conv=False, dcn_on_last_conv: bool = False,
conv_bias='auto', conv_bias: Union[bool, str] = 'auto',
background_label=None, background_label: Optional[int] = None,
use_direction_classifier=True, use_direction_classifier: bool = True,
diff_rad_by_sin=True, diff_rad_by_sin: bool = True,
dir_offset=0, dir_offset: int = 0,
dir_limit_offset=0, dir_limit_offset: int = 0,
loss_cls=dict( loss_cls: ConfigType = dict(
type='FocalLoss', type='mmdet.FocalLoss',
use_sigmoid=True, use_sigmoid=True,
gamma=2.0, gamma=2.0,
alpha=0.25, alpha=0.25,
loss_weight=1.0), loss_weight=1.0),
loss_bbox=dict( loss_bbox: ConfigType = dict(
type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_dir=dict( loss_dir: ConfigType = dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), type='mmdet.CrossEntropyLoss',
loss_attr=dict( use_sigmoid=False,
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_weight=1.0),
bbox_code_size=9, # For nuscenes loss_attr: ConfigType = dict(
pred_attrs=False, type='mmdet.CrossEntropyLoss',
num_attrs=9, # For nuscenes use_sigmoid=False,
pred_velo=False, loss_weight=1.0),
pred_bbox2d=False, bbox_code_size: int = 9, # For nuscenes
group_reg_dims=(2, 1, 3, 1, 2), # offset, depth, size, rot, velo, pred_attrs: bool = False,
cls_branch=(128, 64), num_attrs: int = 9, # For nuscenes
reg_branch=( pred_velo: bool = False,
pred_bbox2d: bool = False,
group_reg_dims: Sequence[int] = (
2, 1, 3, 1, 2), # offset, depth, size, rot, velo,
cls_branch: Sequence[int] = (128, 64),
reg_branch: Sequence[Tuple[int, int]] = (
(128, 64), # offset (128, 64), # offset
(128, 64), # depth (128, 64), # depth
(64, ), # size (64, ), # size
(64, ), # rot (64, ), # rot
() # velo () # velo
), ),
dir_branch=(64, ), dir_branch: Sequence[int] = (64, ),
attr_branch=(64, ), attr_branch: Sequence[int] = (64, ),
conv_cfg=None, conv_cfg: OptConfigType = None,
norm_cfg=None, norm_cfg: OptConfigType = None,
train_cfg=None, train_cfg: OptConfigType = None,
test_cfg=None, test_cfg: OptConfigType = None,
init_cfg=None): init_cfg: OptConfigType = None) -> None:
super(AnchorFreeMono3DHead, self).__init__(init_cfg=init_cfg) super().__init__(init_cfg=init_cfg)
self.num_classes = num_classes self.num_classes = num_classes
self.cls_out_channels = num_classes self.cls_out_channels = num_classes
self.in_channels = in_channels self.in_channels = in_channels
...@@ -141,9 +153,9 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead): ...@@ -141,9 +153,9 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead):
self.diff_rad_by_sin = diff_rad_by_sin self.diff_rad_by_sin = diff_rad_by_sin
self.dir_offset = dir_offset self.dir_offset = dir_offset
self.dir_limit_offset = dir_limit_offset self.dir_limit_offset = dir_limit_offset
self.loss_cls = build_loss(loss_cls) self.loss_cls = MODELS.build(loss_cls)
self.loss_bbox = build_loss(loss_bbox) self.loss_bbox = MODELS.build(loss_bbox)
self.loss_dir = build_loss(loss_dir) self.loss_dir = MODELS.build(loss_dir)
self.bbox_code_size = bbox_code_size self.bbox_code_size = bbox_code_size
self.group_reg_dims = list(group_reg_dims) self.group_reg_dims = list(group_reg_dims)
self.cls_branch = cls_branch self.cls_branch = cls_branch
...@@ -174,7 +186,7 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead): ...@@ -174,7 +186,7 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead):
self.num_attrs = num_attrs self.num_attrs = num_attrs
if self.pred_attrs: if self.pred_attrs:
self.attr_background_label = num_attrs self.attr_background_label = num_attrs
self.loss_attr = build_loss(loss_attr) self.loss_attr = MODELS.build(loss_attr)
self.attr_branch = attr_branch self.attr_branch = attr_branch
self._init_layers() self._init_layers()
...@@ -316,11 +328,13 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead): ...@@ -316,11 +328,13 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead):
if self.pred_attrs: if self.pred_attrs:
normal_init(self.conv_attr, std=0.01, bias=bias_cls) normal_init(self.conv_attr, std=0.01, bias=bias_cls)
def forward(self, feats): def forward(
self, x: Tuple[Tensor]
) -> Tuple[List[Tensor], List[Tensor], List[Tensor], List[Tensor]]:
"""Forward features from the upstream network. """Forward features from the upstream network.
Args: Args:
feats (tuple[Tensor]): Features from the upstream network, each is x (tuple[Tensor]): Features from the upstream network, each is
a 4D-tensor. a 4D-tensor.
Returns: Returns:
...@@ -339,9 +353,9 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead): ...@@ -339,9 +353,9 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead):
level, each is a 4D-tensor, the channel number is level, each is a 4D-tensor, the channel number is
num_points * num_attrs. num_points * num_attrs.
""" """
return multi_apply(self.forward_single, feats)[:5] return multi_apply(self.forward_single, x)[:5]
def forward_single(self, x): def forward_single(self, x: Tensor) -> Tuple[Tensor, ...]:
"""Forward features of a single scale level. """Forward features of a single scale level.
Args: Args:
...@@ -394,77 +408,8 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead): ...@@ -394,77 +408,8 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead):
reg_feat reg_feat
@abstractmethod @abstractmethod
@force_fp32(apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds')) def get_targets(self, points: List[Tensor],
def loss(self, batch_gt_instances: InstanceList) -> Any:
cls_scores,
bbox_preds,
dir_cls_preds,
attr_preds,
batch_gt_instances_3d,
batch_img_metas,
batch_gt_instances_ignore=None):
"""Compute loss of the head.
Args:
cls_scores (list[Tensor]): Box scores for each scale level,
each is a 4D-tensor, the channel number is
num_points * num_classes.
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level, each is a 4D-tensor, the channel number is
num_points * bbox_code_size.
dir_cls_preds (list[Tensor]): Box scores for direction class
predictions on each scale level, each is a 4D-tensor,
the channel number is num_points * 2. (bin = 2)
attr_preds (list[Tensor]): Box scores for each scale level,
each is a 4D-tensor, the channel number is
num_points * num_attrs.
batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
gt_instance_3d. It usually includes ``bboxes``、``labels``
、``bboxes_3d``、``labels3d``、``depths``、``centers2d`` and
attributes.
batch_img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional):
Batch of gt_instances_ignore. It includes ``bboxes`` attribute
data that is ignored during training and testing.
Defaults to None.
"""
raise NotImplementedError
@abstractmethod
@force_fp32(apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds'))
def get_results(self,
cls_scores,
bbox_preds,
dir_cls_preds,
attr_preds,
batch_img_metas,
cfg=None,
rescale=None):
"""Transform network output for a batch into bbox predictions.
Args:
cls_scores (list[Tensor]): Box scores for each scale level
Has shape (N, num_points * num_classes, H, W)
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level with shape (N, num_points * bbox_code_size, H, W)
dir_cls_preds (list[Tensor]): Box scores for direction class
predictions on each scale level, each is a 4D-tensor,
the channel number is num_points * 2. (bin = 2)
attr_preds (list[Tensor]): Attribute scores for each scale level
Has shape (N, num_points * num_attrs, H, W)
batch_img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
cfg (mmcv.Config): Test / postprocessing configuration,
if None, test_cfg would be used
rescale (bool): If True, return boxes in original image space
"""
raise NotImplementedError
@abstractmethod
def get_targets(self, points, batch_gt_instances_3d):
"""Compute regression, classification and centerss targets for points """Compute regression, classification and centerss targets for points
in multiple images. in multiple images.
...@@ -473,18 +418,32 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead): ...@@ -473,18 +418,32 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead):
(num_points, 2). (num_points, 2).
batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
gt_instance_3d. It usually includes ``bboxes``、``labels`` gt_instance_3d. It usually includes ``bboxes``、``labels``
、``bboxes_3d``、``labels3d``、``depths``、``centers2d`` and 、``bboxes_3d``、``labels_3d``、``depths``、``centers_2d``
attributes. and attributes.
""" """
raise NotImplementedError raise NotImplementedError
# TODO: Refactor using MlvlPointGenerator in MMDet.
def _get_points_single(self, def _get_points_single(self,
featmap_size, featmap_size: Tuple[int],
stride, stride: int,
dtype, dtype: torch.dtype,
device, device: torch.device,
flatten=False): flatten: bool = False) -> Tuple[Tensor, Tensor]:
"""Get points of a single scale level.""" """Get points of a single scale level.
Args:
featmap_size (tuple[int]): Single scale level feature map
size.
stride (int): Downsample factor of the feature map.
dtype (torch.dtype): Type of points.
device (torch.device): Device of points.
flatten (bool): Whether to flatten the tensor.
Defaults to False.
Returns:
tuple: points of each image.
"""
h, w = featmap_size h, w = featmap_size
x_range = torch.arange(w, dtype=dtype, device=device) x_range = torch.arange(w, dtype=dtype, device=device)
y_range = torch.arange(h, dtype=dtype, device=device) y_range = torch.arange(h, dtype=dtype, device=device)
...@@ -494,16 +453,23 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead): ...@@ -494,16 +453,23 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead):
x = x.flatten() x = x.flatten()
return y, x return y, x
def get_points(self, featmap_sizes, dtype, device, flatten=False): # TODO: Refactor using MlvlPointGenerator in MMDet.
def get_points(self,
featmap_sizes: List[Tuple[int]],
dtype: torch.dtype,
device: torch.device,
flatten: bool = False) -> List[Tuple[Tensor, Tensor]]:
"""Get points according to feature map sizes. """Get points according to feature map sizes.
Args: Args:
featmap_sizes (list[tuple]): Multi-level feature map sizes. featmap_sizes (list[tuple]): Multi-level feature map sizes.
dtype (torch.dtype): Type of points. dtype (torch.dtype): Type of points.
device (torch.device): Device of points. device (torch.device): Device of points.
flatten (bool): Whether to flatten the tensor.
Defaults to False.
Returns: Returns:
tuple: points of each image. list[tuple]: points of each image.
""" """
mlvl_points = [] mlvl_points = []
for i in range(len(featmap_sizes)): for i in range(len(featmap_sizes)):
......
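An illustrative sketch (not part of this diff) of what _get_points_single above produces for a single 2x3 feature map; the stride-scaled point coordinates appear to be computed later by the concrete heads:
import torch

h, w = 2, 3
x_range = torch.arange(w, dtype=torch.float32)
y_range = torch.arange(h, dtype=torch.float32)
y, x = torch.meshgrid(y_range, x_range)
# x == [[0., 1., 2.], [0., 1., 2.]], y == [[0., 0., 0.], [1., 1., 1.]]
y, x = y.flatten(), x.flatten()  # the flatten=True path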